In [1]:
# import the requisites
import pandas as pd
from sklearn.datasets import load_boston

In [2]:
bos = load_boston()

In [3]:
X = pd.DataFrame( bos.data, columns = bos.feature_names )
y = pd.Series( bos.target, name = 'medv' )

In [4]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
X.CHAS.value_counts(dropna=False)

0.0    471
1.0     35
Name: CHAS, dtype: int64

In [6]:
X.ZN.value_counts(dropna = False)

0.0      372
20.0      21
80.0      15
12.5      10
22.0      10
25.0      10
40.0       7
45.0       6
30.0       6
90.0       5
21.0       4
95.0       4
60.0       4
33.0       4
75.0       3
35.0       3
28.0       3
55.0       3
52.5       3
70.0       3
34.0       3
85.0       2
82.5       2
17.5       1
100.0      1
18.0       1
Name: ZN, dtype: int64

In [7]:
X.RAD.value_counts()

24.0    132
5.0     115
4.0     110
3.0      38
6.0      26
8.0      24
2.0      24
1.0      20
7.0      17
Name: RAD, dtype: int64

In [5]:
# try to keep interpretable
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split( X,y, test_size = 0.2  )

## Transform the training feature matrix

#### One hot encoding

In [7]:
## FOR OLDER VERSIONS OF SKLEARN
# One hot encoding of the RAD column
from sklearn.preprocessing import OneHotEncoder
RAD_ohe = OneHotEncoder( categories = 'auto', drop='first', sparse = False )
RAD_enc = RAD_ohe.fit_transform( Xtrain[['RAD']] )
RAD_enc = pd.DataFrame( RAD_enc, columns=RAD_ohe.get_feature_names(['RAD'])[1:] )
Xtrain = pd.concat( (Xtrain.drop(["RAD"], axis=1).reset_index(drop = True), RAD_enc), axis = 1 )
Xtrain.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,RAD_2.0,RAD_3.0,RAD_4.0,RAD_5.0,RAD_6.0,RAD_7.0,RAD_8.0,RAD_24.0
0,0.08707,0.0,12.83,0.0,0.437,6.14,45.8,4.0905,398.0,18.7,386.96,10.27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.06617,0.0,3.24,0.0,0.46,5.868,25.8,5.2146,430.0,16.9,382.44,9.97,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.02543,55.0,3.78,0.0,0.484,6.696,56.4,5.7321,370.0,17.6,396.9,7.18,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.01301,35.0,1.52,0.0,0.442,7.241,49.3,7.0379,284.0,15.5,394.74,5.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.59005,0.0,21.89,0.0,0.624,6.372,97.9,2.3274,437.0,21.2,385.76,11.12,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
## FOR NEWER VERSIONS OF SKLEARN
#One hot encoding of the RAD column
#from sklearn.preprocessing import OneHotEncoder
RAD_ohe = OneHotEncoder( drop='first', sparse = False )
RAD_enc = RAD_ohe.fit_transform( Xtrain[['RAD']] )
RAD_enc = pd.DataFrame( RAD_enc, columns=RAD_ohe.get_feature_names(['RAD']) )
Xtrain = pd.concat( (Xtrain.drop(["RAD"], axis=1).reset_index(drop = True), RAD_enc), axis = 1 )
Xtrain.head()

In [8]:
## FOR OLDER VERSIONS OF SKLEARN
'''
def repro_RAD_ohe( X_df ):
    df = X_df.copy()
    RAD_enc = RAD_ohe.transform( df[['RAD']] )
    RAD_enc = pd.DataFrame( RAD_enc, columns=RAD_ohe.get_feature_names(['RAD'])[1:] )
    df = pd.concat( (df.drop(["RAD"], axis=1).reset_index(drop = True), RAD_enc), axis = 1 )
    return df
'''

In [None]:
## FOR NEWER VERSIONS OF SKLEARN
#def repro_RAD_ohe( X_df ):
#    df = X_df.copy()
#    RAD_enc = RAD_ohe.transform( df[['RAD']] )
#    RAD_enc = pd.DataFrame( RAD_enc, columns=RAD_ohe.get_feature_names(['RAD']) )
#    df = pd.concat( (df.drop(["RAD"], axis=1).reset_index(drop = True), RAD_enc), axis = 1 )
#    return df

In [12]:
repro_RAD_ohe(Xtest.iloc[3:4])

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,RAD_2.0,RAD_3.0,RAD_4.0,RAD_5.0,RAD_6.0,RAD_7.0,RAD_8.0,RAD_24.0
0,0.15098,0.0,10.01,0.0,0.547,6.021,82.6,2.7474,432.0,17.8,394.51,10.3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Transform and Remove Cols ZN and CHAS

In [13]:
Xtrain['ZN_bool'] = Xtrain.ZN.apply( lambda x: 1 if x > 0 else 0 )
Xtrain = Xtrain.drop(['ZN', 'CHAS'], axis = 1)
Xtrain.head()

Unnamed: 0,CRIM,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,RAD_2.0,RAD_3.0,RAD_4.0,RAD_5.0,RAD_6.0,RAD_7.0,RAD_8.0,RAD_24.0,ZN_bool
0,0.08707,12.83,0.437,6.14,45.8,4.0905,398.0,18.7,386.96,10.27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,0.06617,3.24,0.46,5.868,25.8,5.2146,430.0,16.9,382.44,9.97,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,0.02543,3.78,0.484,6.696,56.4,5.7321,370.0,17.6,396.9,7.18,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
3,0.01301,1.52,0.442,7.241,49.3,7.0379,284.0,15.5,394.74,5.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.59005,21.89,0.624,6.372,97.9,2.3274,437.0,21.2,385.76,11.12,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


In [14]:
def repro_ZN_CHAS( X_df ):
    df = X_df.copy()
    df['ZN_bool'] = df.ZN.apply( lambda x: 1 if x > 0 else 0 )
    return df.drop( ['ZN','CHAS'], axis = 1 )

#### Standardize Scale Columns

In [15]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

Xtrain = pd.DataFrame( ss.fit_transform( Xtrain ), columns = Xtrain.columns )

Xtrain.head()

Unnamed: 0,CRIM,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,RAD_2.0,RAD_3.0,RAD_4.0,RAD_5.0,RAD_6.0,RAD_7.0,RAD_8.0,RAD_24.0,ZN_bool
0,-0.458933,0.246458,-0.996666,-0.202272,-0.771223,0.119037,-0.078972,0.106428,0.342351,-0.325517,-0.239983,-0.272888,-0.500773,1.841549,-0.228218,-0.203069,-0.209589,-0.611678,-0.607858
1,-0.461667,-1.151097,-0.801629,-0.591011,-1.47398,0.645012,0.109987,-0.733904,0.295129,-0.367229,-0.239983,-0.272888,1.996911,-0.543021,-0.228218,-0.203069,-0.209589,-0.611678,-0.607858
2,-0.466997,-1.072403,-0.598114,0.592355,-0.398762,0.887155,-0.24431,-0.407108,0.446199,-0.755152,-0.239983,-0.272888,-0.500773,1.841549,-0.228218,-0.203069,-0.209589,-0.611678,1.645121
3,-0.468621,-1.401754,-0.954266,1.371262,-0.648241,1.498149,-0.752137,-1.387496,0.423632,-0.99013,-0.239983,-0.272888,-0.500773,-0.543021,-0.228218,-0.203069,-0.209589,-0.611678,1.645121
4,-0.393133,1.566775,0.589063,0.129299,1.059458,-0.705931,0.151322,1.273557,0.329814,-0.207332,-0.239983,-0.272888,1.996911,-0.543021,-0.228218,-0.203069,-0.209589,-0.611678,-0.607858


In [16]:
def repro_SS( X_df ):
    df = X_df.copy()
    return pd.DataFrame( ss.transform( df ), columns = df.columns )

## Create a Preprocessing Pipeline for all future testing

In [18]:
def preprocess_new_data( X_mat ):
    df = X_mat.copy()
    df = repro_RAD_ohe( df )
    df = repro_ZN_CHAS( df )
    df = repro_SS( df )
    return df

In [19]:
preprocess_new_data( Xtest ).head()

Unnamed: 0,CRIM,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,RAD_2.0,RAD_3.0,RAD_4.0,RAD_5.0,RAD_6.0,RAD_7.0,RAD_8.0,RAD_24.0,ZN_bool
0,-0.367734,-0.437018,-0.140203,-0.41665,0.490225,0.197365,-0.616323,1.180187,0.340157,0.286262,-0.239983,-0.272888,1.996911,-0.543021,-0.228218,-0.203069,-0.209589,-0.611678,-0.607858
1,-0.341017,-0.437018,-0.140203,-0.669616,1.133247,0.121236,-0.616323,1.180187,0.421543,1.010663,-0.239983,-0.272888,1.996911,-0.543021,-0.228218,-0.203069,-0.209589,-0.611678,-0.607858
2,-0.278886,1.230138,0.427946,1.725701,0.809979,-0.872741,-0.049447,-1.760977,0.211445,-1.512923,-0.239983,-0.272888,-0.500773,1.841549,-0.228218,-0.203069,-0.209589,-0.611678,-0.607858
3,-0.450572,-0.164502,-0.063884,-0.372346,0.521849,-0.50941,0.121797,-0.313738,0.421229,-0.321345,-0.239983,-0.272888,-0.500773,-0.543021,4.38178,-0.203069,-0.209589,-0.611678,-0.607858
4,-0.432268,1.566775,0.589063,-0.15368,0.908365,-1.04072,0.151322,1.273557,0.354052,1.605757,-0.239983,-0.272888,1.996911,-0.543021,-0.228218,-0.203069,-0.209589,-0.611678,-0.607858


## Lasso Regression

In [20]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [21]:
lasso = Lasso()

params = {'alpha':(0.1,0.5,1,2,10), 'fit_intercept':[True,False]}
grid_search_lasso = GridSearchCV( estimator=lasso, param_grid=params )
grid_search_lasso.fit(Xtrain,ytrain)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': (0.1, 0.5, 1, 2, 10),
                         'fit_intercept': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [22]:
grid_search_lasso.best_params_

{'alpha': 0.1, 'fit_intercept': True}

In [23]:
grid_search_lasso.best_score_

0.7003738455703424

## KNN regressor

In [24]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn_params = {'n_neighbors':[1,2,5,10]}
grid_search_knn = GridSearchCV( estimator=knn, param_grid=knn_params )
grid_search_knn.fit(Xtrain,ytrain)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': [1, 2, 5, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [25]:
grid_search_knn.best_params_

{'n_neighbors': 1}

In [26]:
grid_search_knn.best_score_

0.7765970955349616

In [27]:
### CHOOSE KNN (BETTER CV SCORE)
knn_model = grid_search_knn.best_estimator_

In [28]:
knn_model.score( preprocess_new_data(Xtest), ytest )

0.8706179391229838

In [29]:
## JUST CHECKING
grid_search_lasso.best_estimator_.score(preprocess_new_data(Xtest),ytest)

0.7020902198407593