# A few examples of preprocessing

In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from error_metrics import *

# Load the cars dataset, then blank out two cells (given as positional
# (row, column) pairs) so the imputation step has missing values to fill.
df = pd.read_csv("./data/cars.csv")
for row_pos, col_pos in ((3, 3), (4, 5)):
    df.iloc[row_pos, col_pos] = float('NaN')
# Preview the first rows; the injected NaNs fall inside this window.
df.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110.0,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110.0,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93.0,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175.0,3.15,,17.02,0,0,3,2


In [2]:
# The first column is the regression target; every remaining column is a feature.
column_names = list(df)
data_x = df[column_names[1:]]
data_y = df[column_names[0]]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=4,
)

In [3]:
# Create the preprocessing pipeline.
# Step 1: impute each missing value with the mean of its column.
# SimpleImputer replaces preprocessing.Imputer, which was deprecated in
# scikit-learn 0.20 and removed in 0.22. It always imputes column-wise (what
# axis=0 meant on the old class) and treats np.nan as missing by default.
imp = SimpleImputer(strategy='mean')
# Step 2: a standard z-score scaler.
scaler = preprocessing.StandardScaler()

# NOTE: we could easily put the features on a 0-1 scale instead by using a
# MinMaxScaler object - see the sklearn preprocessing documentation.
#min_max_scaler = preprocessing.MinMaxScaler()

# Run the training data through the pipeline first. fit_transform learns the
# imputation means and scaling statistics from the training split only.
train_x_pp = imp.fit_transform(x_train)
train_x_pp = scaler.fit_transform(train_x_pp)



In [4]:
# Fit an ordinary least-squares regression on the imputed and scaled training
# features. The fit call is left as the cell's last expression so the fitted
# estimator is displayed.
model = linear_model.LinearRegression()
model.fit(X=train_x_pp, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [5]:
# Push the held-out data through the already-fitted imputer and scaler.
# Calling transform (rather than fit_transform) reuses what was learned from
# the training data instead of re-fitting on the test set.
x_test_pp = scaler.transform(imp.transform(x_test))

# Predict on the transformed test set and report the error metrics.
# NOTE(review): print_reg_error_metrics presumably comes from the star import
# of error_metrics; confirm it expects (predictions, truth) in this order.
preds = model.predict(x_test_pp)
print_reg_error_metrics(preds, y_test)


MSE, MAE, R^2, EVS: [23.364246852031126, 4.138178780601052, 0.6515431243685778, 0.7618751855927595]
