# PreProcessing

This notebook houses preprocessing pipelines for the Boston Housing data. The pipelines handle skewness, distribution shape, and scaling of the data.

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [71]:
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, PowerTransformer 
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, QuantileTransformer, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from scipy import stats



# Preprocessing Pipelines

Let's establish some Preprocessing pipelines.

First, make each feature approximately Gaussian by transforming it with a `PowerTransformer` or a `QuantileTransformer`,
then apply a `StandardScaler`,
then handle outliers by capping their z-scores at ±3.


In [7]:
from sklearn.datasets import load_boston

# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires scikit-learn < 1.2.
data = load_boston()

# Features as a DataFrame labelled with the dataset's feature names;
# target as a Series named 'MEDV'.
X = pd.DataFrame(data['data'], columns=data.feature_names)
y = pd.Series(data['target'], name='MEDV')

# Peek at the first few target values (the features are shown in the next cell).
print(y.head())

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64


In [8]:
# Show the first five feature rows as a sanity check on the loaded frame.
X.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [9]:
# Split features and target into train/test partitions with a fixed seed.
# NOTE(review): test_size=0.75 holds out 75% of the rows for testing and
# leaves only 25% for fitting - confirm this is intended rather than
# train_size=0.75.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.75,
    random_state=777,
)

In [66]:
# Power transforms that reshape each feature toward a Gaussian.
# standardize=False skips the transformer's own zero-mean/unit-variance step;
# scaling is done by the separate StandardScaler defined below.
# NOTE: Box-Cox only accepts strictly positive data. The X.head() output above
# shows zeros in ZN and CHAS, so box_cox cannot be fitted on the raw frame;
# Yeo-Johnson has no such restriction.
box_cox = PowerTransformer(method='box-cox', standardize=False)
yeo_john = PowerTransformer(method='yeo-johnson', standardize=False)

# output_distribution must be the lowercase string 'normal' (or 'uniform');
# 'Normal' is rejected when the transformer is fitted.
quantile = QuantileTransformer(output_distribution='normal', random_state=777)

scaler = StandardScaler()

def z_score_outliers(df, threshold=3, columns=None):
    """Convert a 2-D table to column-wise z-scores and cap extreme values.

    Each column is standardised with ``scipy.stats.zscore`` using the
    statistics of the data passed in, then every z-score outside
    ``[-threshold, threshold]`` is replaced by the nearest bound.

    Note that the z-scores are always computed from ``df`` itself, so when
    this is used as a pipeline step the values seen at predict time are
    standardised with that batch's own mean and standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Numeric data with samples in rows and features in columns.
    threshold : float, default 3
        Absolute z-score beyond which values are capped.
    columns : sequence, optional
        Column labels for the result. Defaults to ``df.columns`` when ``df``
        is a DataFrame; otherwise pandas' default integer labels are used.

    Returns
    -------
    pandas.DataFrame
        Capped z-scores with the same shape as ``df``. When ``df`` is a
        DataFrame its index is carried over to the result.
    """
    values = np.asarray(df)

    # np.clip caps both tails in one pass; NaNs (e.g. from a constant column,
    # whose standard deviation is zero) pass through unchanged.
    capped = np.clip(stats.zscore(values), -threshold, threshold)

    # Take labels from the input rather than from a notebook-level global, so
    # the function works for any frame and on a fresh kernel.
    index = None
    if isinstance(df, pd.DataFrame):
        index = df.index
        if columns is None:
            columns = df.columns

    return pd.DataFrame(capped, index=index, columns=columns)

# Wrap the z-score capping function so it can be used as a Pipeline step.
func = FunctionTransformer(func=z_score_outliers)

In [70]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()

# Yeo-Johnson transform -> standard scaling -> z-score capping -> linear regression.
pipe = Pipeline(steps=[
    ('yeo', yeo_john),
    ('scale', scaler),
    ('outlier', func),
    ('reg', reg),
])

# Fit every stage on the training split, then predict the held-out split.
y_pred = pipe.fit(X_train, y_train).predict(X_test)



In [72]:
# Mean squared error of the pipeline's predictions on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

22.16750845883944

In [73]:
# Display the raw predictions the pipeline produced for X_test.
y_pred

array([16.57706385, 15.04822888, 14.73983914, 23.19941911, 35.21123251,
       18.56638904, 11.83571603, 46.1292165 , 13.60938717, 38.98806985,
       30.96580536, 28.62998342, 16.66637679, 18.84012537, 17.56877437,
       37.5834715 , 10.68305793, 20.61085772, 23.85854468, 16.282259  ,
       16.28260397, 18.18595234, 12.60427838, 30.02932614, 20.05279035,
       24.81123369, 23.22117603, 32.46635806, 22.24161938, 21.9451746 ,
       29.83617667, 30.56139346, 25.85748031, 32.15000915, 24.26241655,
       31.3780724 , 13.67074548, 23.6244546 , 20.17822929, 18.80427538,
       20.76699273, 17.68046   , 10.76375031, 18.8055579 , 13.88169002,
       22.55588355, 14.47030561, 24.02081307, 20.64925368, 18.56335508,
       31.72250172, 36.03056034, 18.66382105, 17.29408071, 22.41797222,
       26.69857404, 19.09263162, 19.81290634, 29.0579199 , 15.44858624,
       30.29898058, 18.28270987, 26.17170167, 18.85126965, 15.67771551,
       36.5896564 , 36.32481761, 17.42496027, 25.96481384, 14.72