In [1]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
# Location of the chemical-process CSV. The original absolute path is kept as
# the default; set the CHEM_PROCESS_CSV environment variable to load the file
# from a different machine or folder without editing the notebook.
chem_csv_path = os.environ.get(
    "CHEM_PROCESS_CSV",
    r"C:\Training\Academy\Statistics (Python)\Cases\Chemical Process Data\ChemicalProcess.csv",
)
chem = pd.read_csv(chem_csv_path)
chem.shape  # (rows, columns) of the raw table

(176, 58)

In [3]:
# Show the column labels of the raw table: 'Yield' (used later as the target)
# followed by the predictor columns.
chem.columns

Index(['Yield', 'BiologicalMaterial01', 'BiologicalMaterial02',
       'BiologicalMaterial03', 'BiologicalMaterial04', 'BiologicalMaterial05',
       'BiologicalMaterial06', 'BiologicalMaterial07', 'BiologicalMaterial08',
       'BiologicalMaterial09', 'BiologicalMaterial10', 'BiologicalMaterial11',
       'BiologicalMaterial12', 'ManufacturingProcess01',
       'ManufacturingProcess02', 'ManufacturingProcess03',
       'ManufacturingProcess04', 'ManufacturingProcess05',
       'ManufacturingProcess06', 'ManufacturingProcess07',
       'ManufacturingProcess08', 'ManufacturingProcess09',
       'ManufacturingProcess10', 'ManufacturingProcess11',
       'ManufacturingProcess12', 'ManufacturingProcess13',
       'ManufacturingProcess14', 'ManufacturingProcess15',
       'ManufacturingProcess16', 'ManufacturingProcess17',
       'ManufacturingProcess18', 'ManufacturingProcess19',
       'ManufacturingProcess20', 'ManufacturingProcess21',
       'ManufacturingProcess22', 'ManufacturingProce

In [4]:
# Count missing values per column. The DataFrame method is called directly
# instead of np.sum(frame): pandas 2.1+ deprecates the axis=None reduction that
# np.sum(df) triggers, and a future version will collapse it to one scalar.
chem.isnull().sum()

Yield                      0
BiologicalMaterial01       0
BiologicalMaterial02       0
BiologicalMaterial03       0
BiologicalMaterial04       0
BiologicalMaterial05       0
BiologicalMaterial06       0
BiologicalMaterial07       0
BiologicalMaterial08       0
BiologicalMaterial09       0
BiologicalMaterial10       0
BiologicalMaterial11       0
BiologicalMaterial12       0
ManufacturingProcess01     1
ManufacturingProcess02     3
ManufacturingProcess03    15
ManufacturingProcess04     1
ManufacturingProcess05     1
ManufacturingProcess06     2
ManufacturingProcess07     1
ManufacturingProcess08     1
ManufacturingProcess09     0
ManufacturingProcess10     9
ManufacturingProcess11    10
ManufacturingProcess12     1
ManufacturingProcess13     0
ManufacturingProcess14     1
ManufacturingProcess15     0
ManufacturingProcess16     0
ManufacturingProcess17     0
ManufacturingProcess18     0
ManufacturingProcess19     0
ManufacturingProcess20     0
ManufacturingProcess21     0
ManufacturingP

### Mean Imputation

In [5]:
# Mean imputation: every missing entry is replaced by the mean of its column.
# NOTE(review): the imputer is fitted on all of `chem`, before the train/test
# split done in the following cell, so held-out rows influence the fill
# values -- confirm this is acceptable for the comparison being made.
imp = SimpleImputer(strategy='mean')
imp.set_output(transform="pandas")  # return a DataFrame rather than an ndarray
imputed = imp.fit_transform(chem)
type(imputed)

pandas.core.frame.DataFrame

In [6]:
# Separate the target ('Yield') from the predictors, then hold out 30% of the
# rows for testing; the fixed random_state keeps the partition repeatable.
y = imputed['Yield']
X = imputed.drop(columns='Yield')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=23
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((123, 57), (53, 57), (123,), (53,))

In [7]:
# Baseline: plain linear regression on the mean-imputed predictors, scored by
# R^2 on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
ycap = lr.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

0.407181682727197


In [8]:
# Degree-2 polynomial expansion of the predictors feeding a linear regression,
# scored by R^2 on the held-out rows.
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
ycap = pipe.fit(X_train, y_train).predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

-13.94621386275129


In [9]:
# Same pipeline as the degree-2 cell, but with a degree-3 polynomial expansion;
# scored by R^2 on the held-out rows.
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
ycap = pipe.fit(X_train, y_train).predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

-13.325020122456074


### Median Imputation

In [10]:
# Median imputation: every missing entry is replaced by the median of its
# column.
# NOTE(review): the imputer is fitted on all of `chem`, before the train/test
# split done in the following cell, so held-out rows influence the fill
# values -- confirm this is acceptable for the comparison being made.
imp = SimpleImputer(strategy='median')
imp.set_output(transform="pandas")  # return a DataFrame rather than an ndarray
imputed = imp.fit_transform(chem)
type(imputed)

pandas.core.frame.DataFrame

In [11]:
# Rebuild predictors and target from the median-imputed frame and redo the
# 70/30 split with the same seed as the mean-imputation section.
y = imputed['Yield']
X = imputed.drop(columns='Yield')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=23
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((123, 57), (53, 57), (123,), (53,))

In [12]:
# Baseline: plain linear regression on the median-imputed predictors, scored by
# R^2 on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
ycap = lr.predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

0.39479825978664973


In [13]:
# Degree-2 polynomial expansion of the median-imputed predictors feeding a
# linear regression, scored by R^2 on the held-out rows.
poly = PolynomialFeatures(degree=2)
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
ycap = pipe.fit(X_train, y_train).predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

-13.252078970983781


In [14]:
# Same pipeline as the degree-2 cell, but with a degree-3 polynomial expansion
# of the median-imputed predictors; scored by R^2 on the held-out rows.
poly = PolynomialFeatures(degree=3)
lr = LinearRegression()
pipe = Pipeline(steps=[('POLY', poly), ('LR', lr)])
ycap = pipe.fit(X_train, y_train).predict(X_test)
print(r2_score(y_true=y_test, y_pred=ycap))

-12.430061268901452
