# Data Preprocessing

This notebook walks through a few examples of data preprocessing with scikit-learn.

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from data_util import *

# Load the raw car data, then split its columns: the first column becomes
# data_y and every remaining column becomes part of data_x.
cars = pd.read_csv('./data/mtcars-dirty.csv')
first_col, *other_cols = cars.columns
data_x = cars[other_cols]
data_y = cars[first_col]

In [37]:
# Preview the first five rows of the feature frame.
data_x.head(n=5)

Unnamed: 0,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,6,160.0,110,3.9,2.875,,0,1,4,4
2,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### 1. Impute Missing Values and Scale

In [38]:
# Step 1: replace every NaN with the mean of its column.
# NOTE(review): the imputer is fitted on all rows, before the train/test split in
# the next section, so the column means include rows that end up in the test set.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
data_x = imputer.fit_transform(data_x)

# Step 2: normalize the imputed values.
# NOTE(review): with its default arguments preprocessing.normalize rescales each
# row (sample) to unit L2 norm; it does not scale each feature column. Confirm
# this is what the "Scale" heading intends.
data_x = preprocessing.normalize(data_x)

### 2. Fit a linear regression with a proper train/test split

In [39]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=4,
)

# Fit a linear regression on the training rows and predict the held-out rows.
base_mod = linear_model.LinearRegression().fit(x_train, y_train)
preds = base_mod.predict(x_test)

# NOTE(review): print_regression_error_report is not defined in this notebook
# (presumably it comes from the data_util star import). Predictions are passed
# first and true values second, whereas scikit-learn metrics take
# (y_true, y_pred) and R^2 / explained variance are not symmetric in their
# arguments. Confirm the helper's parameter order.
print_regression_error_report(preds, y_test)

MSE, MAE, R^2, EVS: [7.354720361097279, 2.36396579333422, 0.8026332671527819, 0.8086594792791698]
