# Module 5: Preprocessing Numerical Features, Pipelines and Hyperparameter Optimization

### The Importance of Preprocessing

Transformers: Scaling example

In [1]:
from sklearn.preprocessing import StandardScaler

In [2]:
scaler = StandardScaler() # Create feature transformer object

scaler.fit(X_train); # Fitting the transformer on the train split

X_train_scaled = scaler.transform(X_train)

X_test_scale = scaler.transform(X_test)

pd.DataFrame(X_train_scaled, columns = X_train.columns).head()

model.fit(X_train, y_train)

X_train_predictions = model.predict(X_train)

X_test_predictions = model.predict(X_test)

transformer.fit(X_train, [y_train])

x_train_transformed = transformer.transform(X_train)

knn_scaled = KNeighborsClassifier()

knn_scaled.fit(X_train_scaled, y_train)

print('Train score: ', (knn_scaled.score(X_train_scaled, y_train).round(2)))

print('Test score: ', (knn_scaled.score(X_test_scaled, y_test).round(2)))

### Case Study: Preprocessing with Imputation

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
housing_df = pd.read_csv("data/housing.csv")

In [5]:
train_df, test_df = train_test_split(housing_df, test_size=0.1, random_state=123)
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
6051,-117.75,34.04,22.0,2948.0,636.0,2600.0,602.0,3.125,113600.0,INLAND
20113,-119.57,37.94,17.0,346.0,130.0,51.0,20.0,3.4861,137500.0,INLAND
14289,-117.13,32.74,46.0,3355.0,768.0,1457.0,708.0,2.6604,170100.0,NEAR OCEAN
13665,-117.31,34.02,18.0,1634.0,274.0,899.0,285.0,5.2139,129300.0,INLAND
14471,-117.23,32.88,18.0,5566.0,1465.0,6303.0,1458.0,1.858,205000.0,NEAR OCEAN


In [6]:
train_df = train_df.assign(rooms_per_household = train_df["total_rooms"]/train_df["households"],
                           bedrooms_per_household = train_df["total_bedrooms"]/train_df["households"],
                           population_per_household = train_df["population"]/train_df["households"])

test_df = train_df.assign(rooms_per_household = test_df["total_rooms"]/test_df["households"],
                           bedrooms_per_household = test_df["total_bedrooms"]/test_df["households"],
                           population_per_household = test_df["population"]/test_df["households"])

train_df = train_df.drop(columns=['total_rooms', 'total_bedrooms', 'population'])
test_df = test_df.drop(columns=['total_rooms', 'total_bedrooms', 'population'])

train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_household,population_per_household
6051,-117.75,34.04,22.0,602.0,3.125,113600.0,INLAND,4.89701,1.056478,4.318937
20113,-119.57,37.94,17.0,20.0,3.4861,137500.0,INLAND,17.3,6.5,2.55
14289,-117.13,32.74,46.0,708.0,2.6604,170100.0,NEAR OCEAN,4.738701,1.084746,2.05791
13665,-117.31,34.02,18.0,285.0,5.2139,129300.0,INLAND,5.733333,0.961404,3.154386
14471,-117.23,32.88,18.0,1458.0,1.858,205000.0,NEAR OCEAN,3.817558,1.004801,4.323045


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18576 entries, 6051 to 19966
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 18576 non-null  float64
 1   latitude                  18576 non-null  float64
 2   housing_median_age        18576 non-null  float64
 3   households                18576 non-null  float64
 4   median_income             18576 non-null  float64
 5   median_house_value        18576 non-null  float64
 6   ocean_proximity           18576 non-null  object 
 7   rooms_per_household       18576 non-null  float64
 8   bedrooms_per_household    18391 non-null  float64
 9   population_per_household  18576 non-null  float64
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [8]:
train_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,median_house_value,rooms_per_household,bedrooms_per_household,population_per_household
count,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0,18576.0,18391.0,18576.0
mean,-119.565888,35.627966,28.622255,500.0611,3.862552,206292.067991,5.426067,1.097516,3.052349
std,1.999622,2.134658,12.588307,383.044313,1.892491,115083.856175,2.512319,0.486266,10.020873
min,-124.35,32.54,1.0,1.0,0.4999,14999.0,0.846154,0.333333,0.692308
25%,-121.79,33.93,18.0,280.0,2.560225,119400.0,4.43936,1.005888,2.430323
50%,-118.49,34.25,29.0,410.0,3.5275,179300.0,5.226415,1.04886,2.818868
75%,-118.01,37.71,37.0,606.0,4.7369,263600.0,6.05162,1.099723,3.283921
max,-114.31,41.95,52.0,6082.0,15.0001,500001.0,141.909091,34.066667,1243.333333


In [9]:
X_train = train_df.drop(columns=["median_house_value", "ocean_proximity"])
y_train = train_df["median_house_value"]

X_test = test_df.drop(columns=["median_house_value", "ocean_proximity"])
y_test = test_df["median_house_value"]

In [10]:
# knn = KNeighborsRegressor()
# knn.fit(X_train, y_train)

This will throw error since there are NA values in the bedrooms column.

Removing NA values:

In [11]:
train_df["bedrooms_per_household"].isnull().sum()

185

In [12]:
X_train.shape

(18576, 8)

In [13]:
X_train_no_nan = X_train.dropna()
y_train_no_nan = y_train.dropna()

In [14]:
X_train_no_nan.shape

(18391, 8)

Dropping a column:

In [15]:
X_train.shape

(18576, 8)

In [16]:
X_train_no_col = X_train.dropna(axis=1)

In [17]:
X_train_no_col.shape

(18576, 7)

Imputation:

In [18]:
from sklearn.impute import SimpleImputer

In [19]:
X_train.sort_values('bedrooms_per_household').tail(10)

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,rooms_per_household,bedrooms_per_household,population_per_household
18786,-122.42,40.44,16.0,181.0,2.1875,5.491713,,2.734807
17923,-121.97,37.35,30.0,386.0,4.6328,5.064767,,2.588083
16880,-122.39,37.59,32.0,715.0,6.1323,6.28951,,2.581818
4309,-118.32,34.09,44.0,726.0,1.676,3.672176,,3.163912
538,-122.28,37.78,29.0,1273.0,2.5762,4.048704,,2.938727
4591,-118.28,34.06,42.0,1179.0,1.2254,2.096692,,3.21883
19485,-120.98,37.66,10.0,255.0,0.9336,3.662745,,1.572549
6962,-118.05,33.99,38.0,357.0,3.7328,4.535014,,2.481793
14970,-117.01,32.74,31.0,677.0,2.6973,5.129985,,3.098966
7763,-118.1,33.91,36.0,130.0,3.6389,5.584615,,3.769231


In [20]:
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train);
X_train_imp = imputer.transform(X_train)
X_test_imp = imputer.transform(X_test)

In [21]:
X_train_imp

array([[-117.75      ,   34.04      ,   22.        , ...,    4.89700997,
           1.05647841,    4.31893688],
       [-119.57      ,   37.94      ,   17.        , ...,   17.3       ,
           6.5       ,    2.55      ],
       [-117.13      ,   32.74      ,   46.        , ...,    4.73870056,
           1.08474576,    2.0579096 ],
       ...,
       [-121.76      ,   37.33      ,    5.        , ...,    5.95839311,
           1.03156385,    3.49354376],
       [-122.44      ,   37.78      ,   44.        , ...,    4.7392638 ,
           1.02453988,    1.7208589 ],
       [-119.08      ,   36.21      ,   20.        , ...,    5.49137931,
           1.11781609,    3.56609195]])

In [22]:
X_train_imp_df = pd.DataFrame(X_train_imp, columns = X_train.columns, index = X_train.index)
X_train_imp_df.loc[[7763]]

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,rooms_per_household,bedrooms_per_household,population_per_household
7763,-118.1,33.91,36.0,130.0,3.6389,5.584615,1.04886,3.769231


In [23]:
X_train.loc[[7763]]

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,rooms_per_household,bedrooms_per_household,population_per_household
7763,-118.1,33.91,36.0,130.0,3.6389,5.584615,,3.769231


In [24]:
from sklearn.neighbors import KNeighborsRegressor

In [25]:
knn = KNeighborsRegressor();
knn.fit(X_train_imp, y_train);

In [26]:
knn.score(X_train_imp, y_train)

0.5609808539232339

### Case Study: Preprocessing with Scaling

In [27]:
X_train_imp_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,rooms_per_household,bedrooms_per_household,population_per_household
6051,-117.75,34.04,22.0,602.0,3.125,4.89701,1.056478,4.318937
20113,-119.57,37.94,17.0,20.0,3.4861,17.3,6.5,2.55
14289,-117.13,32.74,46.0,708.0,2.6604,4.738701,1.084746,2.05791
13665,-117.31,34.02,18.0,285.0,5.2139,5.733333,0.961404,3.154386
14471,-117.23,32.88,18.0,1458.0,1.858,3.817558,1.004801,4.323045


**StandardScaler (Standardization)**
* will proudce values where the range depends on the values in the data

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)
pd.DataFrame(X_train_scaled, columns = X_train.columns, index=X_train.index).head()

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,rooms_per_household,bedrooms_per_household,population_per_household
6051,0.90814,-0.743917,-0.526078,0.266135,-0.389736,-0.210591,-0.083813,0.126398
20113,-0.002057,1.083123,-0.923283,-1.253312,-0.198924,4.726412,11.166631,-0.050132
14289,1.218207,-1.35293,1.380504,0.542873,-0.635239,-0.273606,-0.025391,-0.09924
13665,1.128188,-0.753286,-0.843842,-0.561467,0.714077,0.122307,-0.28031,0.010183
14471,1.168196,-1.287344,-0.843842,2.500924,-1.059242,-0.640266,-0.190617,0.126808


In [30]:
knn = KNeighborsRegressor()
knn.fit(X_train_scaled, y_train);
print(knn.score(X_test_scaled, y_test).round(3))

0.688


**MinMaxScaler (Normalization)** 
* will never produce negative values
* will never produce values greater than 1

In [31]:
from sklearn.preprocessing import MinMaxScaler

In [32]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_imp)
X_test_scaled = scaler.transform(X_test_imp)
pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index).head()

Unnamed: 0,longitude,latitude,housing_median_age,households,median_income,rooms_per_household,bedrooms_per_household,population_per_household
6051,0.657371,0.159405,0.411765,0.098832,0.181039,0.028717,0.021437,0.002918
20113,0.476096,0.573858,0.313725,0.003124,0.205942,0.116642,0.182806,0.001495
14289,0.719124,0.021254,0.882353,0.116264,0.148998,0.027594,0.022275,0.001099
13665,0.701195,0.157279,0.333333,0.046703,0.325099,0.034645,0.018619,0.001981
14471,0.709163,0.036132,0.333333,0.239599,0.093661,0.021064,0.019905,0.002922


In [33]:
knn = KNeighborsRegressor()
knn.fit(X_train_scaled, y_train);
print(knn.score(X_test_scaled, y_test).round(3))

0.798


### Case Study: Pipelines

In [34]:
from sklearn.model_selection import cross_validate

In [35]:
knn = KNeighborsRegressor()
scores = cross_validate(knn, X_train_scaled, y_train, return_train_score=True)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.036612,0.126434,0.69497,0.796888
1,0.009262,0.117264,0.684024,0.79844
2,0.012408,0.111511,0.699858,0.794654
3,0.010833,0.129993,0.683428,0.79774
4,0.00947,0.097312,0.685468,0.796949


In [36]:
from sklearn.pipeline import Pipeline

In [37]:
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("reg", KNeighborsRegressor())
])

In [38]:
pipe.fit(X_train, y_train)

In [39]:
pipe.predict(X_train)

array([126500., 117380., 187700., ..., 259500., 308120.,  60860.])

In [40]:
scores_processed = cross_validate(pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(scores_processed)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.109604,0.384434,0.693883,0.792395
1,0.036648,0.337399,0.685017,0.789108
2,0.026963,0.265972,0.694409,0.787796
3,0.043619,0.342401,0.677055,0.792444
4,0.0311,0.264119,0.714494,0.823421


In [41]:
pd.DataFrame(scores_processed).mean()

fit_time       0.049587
score_time     0.318865
test_score     0.692972
train_score    0.797033
dtype: float64

### Automated Hyperparameter Optimization

In [42]:
cities_df = pd.read_csv("https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/canada_usa_cities.csv")

In [43]:
train_df, test_df = train_test_split(cities_df, test_size=0.2, random_state=123)
X_train, y_train = train_df.drop(columns=['country']), train_df['country']
X_test, y_test = test_df.drop(columns=['country']), test_df['country']
X_train.head()

Unnamed: 0,longitude,latitude
160,-76.4813,44.2307
127,-81.2496,42.9837
169,-66.058,45.2788
188,-73.2533,45.3057
187,-67.9245,47.1652


In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [45]:
param_grid = {
    "gamma": [0.1, 1.0, 10, 100]
}

In [48]:
svc = SVC()
grid_search = GridSearchCV(svc, param_grid, verbose=1)

In [49]:
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [50]:
param_grid = {
    "gamma": [0.1, 1.0, 10, 100],
    "C": [0.1, 1.0, 10, 100]
}

In [51]:
svc = SVC()
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=1, n_jobs=-1)

In [52]:
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [53]:
pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("svc", SVC())])

In [54]:
param_grid = {
    "svc__gamma": [0.1, 1.0, 10, 100],
    "svc__C": [0.1, 1.0, 10, 100]
}

In [55]:
grid_search = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [56]:
grid_search.best_params_

{'svc__C': 10, 'svc__gamma': 1.0}

In [57]:
grid_search.best_score_

0.8208556149732621

In [58]:
best_model = grid_search.best_estimator_

In [59]:
best_model.fit(X_train, y_train)

In [60]:
best_model.score(X_test, y_test)

0.8333333333333334

In [61]:
grid_search.score(X_test, y_test)

0.8333333333333334

In [62]:
best_model.predict(X_test)

array(['Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'USA', 'Canada', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada',
       'Canada', 'Canada'], dtype=object)

In [63]:
grid_search.predict(X_test)

array(['Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'USA', 'Canada', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada',
       'Canada', 'Canada'], dtype=object)

In [64]:
from sklearn.model_selection import RandomizedSearchCV

In [65]:
param_grid

{'svc__gamma': [0.1, 1.0, 10, 100], 'svc__C': [0.1, 1.0, 10, 100]}

In [66]:
random_search = RandomizedSearchCV(pipe, param_grid, cv=5, verbose=1, n_jobs=-1, n_iter=10)
random_search.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [67]:
random_search.score(X_test, y_test)

0.8333333333333334