In [40]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import pandas as pd
import numpy as np
import matplotlib as pltb

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [42]:
df = pd.read_csv('dataset-mpg.csv')
df.describe(include='all')

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
unique,,,94.0,,,,,
top,,,150.0,,,,,
freq,,,22.0,,,,,
mean,5.454774,193.425879,,2970.424623,15.56809,76.01005,1.572864,23.514573
std,1.701004,104.269838,,846.841774,2.757689,3.697627,0.802055,7.815984
min,3.0,68.0,,1613.0,8.0,70.0,1.0,9.0
25%,4.0,104.25,,2223.75,13.825,73.0,1.0,17.5
50%,4.0,148.5,,2803.5,15.5,76.0,1.0,23.0
75%,8.0,262.0,,3608.0,17.175,79.0,2.0,29.0


In [43]:
# Error Estimation
## OLS
## 1NN
## Unweighted 3NN
## Distance-Weighted 3NN
## Unweighted 10NN
## Distance-Weighted 10NN

In [44]:
print(df.shape)
df.dtypes

(398, 8)


cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
year              int64
origin            int64
mpg             float64
dtype: object

In [45]:
#Horsepower needs to be either deleted or replaced.....
df[df['horsepower'] == '?']

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg
32,4,98.0,?,2046.0,19.0,71,1,25.0
126,6,200.0,?,2875.0,17.0,74,1,21.0
330,4,85.0,?,1835.0,17.3,80,2,40.9
336,4,140.0,?,2905.0,14.3,80,1,23.6
354,4,100.0,?,2320.0,15.8,81,2,34.5
374,4,151.0,?,3035.0,20.5,82,1,23.0


In [46]:
#Delete all the rows with the anomylous horsepower rows.

df_delete = (df[(df['horsepower'] != '?')]).copy()
df_delete.reset_index(drop = True, inplace = True)
df_delete.describe(include = 'all')

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
unique,,,93.0,,,,,
top,,,150.0,,,,,
freq,,,22.0,,,,,
mean,5.471939,194.41199,,2977.584184,15.541327,75.979592,1.576531,23.445918
std,1.705783,104.644004,,849.40256,2.758864,3.683737,0.805518,7.805007
min,3.0,68.0,,1613.0,8.0,70.0,1.0,9.0
25%,4.0,105.0,,2225.25,13.775,73.0,1.0,17.0
50%,4.0,151.0,,2803.5,15.5,76.0,1.0,22.75
75%,8.0,275.75,,3614.75,17.025,79.0,2.0,29.0


In [47]:
# change non-numerics to NaN
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(subset=['horsepower'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.shape

(392, 8)

In [48]:
ols = Pipeline([('impute_means', Imputer(missing_values='?', strategy='mean')),
                ('impute_modes', Imputer(missing_values=-1, strategy='most_frequent')),
                ('standardize', StandardScaler()),
                ('estimator', LinearRegression())])
 
 
lassocv = Pipeline([('impute_means', Imputer(missing_values='?', strategy='mean')),
                    ('impute_modes', Imputer(missing_values=-1, strategy='most_frequent')),
                    ('standardize', StandardScaler()),
                    ('estimator', LassoCV(cv=10))])
 
 
ridgecv = Pipeline([('impute_means', Imputer(missing_values='?', strategy='mean')),
                    ('impute_modes', Imputer(missing_values=-1, strategy='most_frequent')),
                    ('standardize', StandardScaler()),
                    ('estimator', RidgeCV(cv=10))])
 
 
knn = Pipeline([('impute_means', Imputer(missing_values='?', strategy='mean')),
                ('impute_modes', Imputer(missing_values=-1, strategy='most_frequent')),
                ('standardize', StandardScaler()),
                ('estimator', KNeighborsRegressor(weights = 'distance'))])
 

#Remeber that it's estimator__n_neighbors
knn_hyperparameters = {'estimator__n_neighbors' : [1, 2, 3, 4, 5, 6, 7,8, 9, 10]}
knn_gs = GridSearchCV(knn, knn_hyperparameters, scoring = 'neg_mean_squared_error', cv = 10)

In [51]:
def compute_error(estimator, data, input_cv=10):
    features = data.copy().drop('mpg', axis=1)
    dependentVar = data['mpg']
    mse_test = cross_val_score(estimator, features, dependentVar, scoring='neg_mean_squared_error', cv=10)
    return np.mean(mse_test)

In [52]:
ols_mses_test = compute_error(ols, df)
ls_mean_mse_test = np.mean(ols_mses_test)

lassocv_mses_test = compute_error(lassocv, df)
lassocv_mean_mse_test = np.mean(lassocv_mses_test)

ridgecv_mses_test = compute_error(oridgecv, df)
ridgecv_mean_mse_test = np.mean(ridgecv_mses_test)

knn_gs = GridSearchCV(knn, knn_hyperparameters, scoring = 'neg_mean_squared_error', cv = 10)
knn_mses_test = cross_val_score(knn_gs, X, y, scoring = 'neg_mean_squared_error', cv = 10)

                            
knn_mean_mse_test = np.mean(knn_mses_test)
print('OLS %4f\nLasso %4f\nRidge %4f\nkNN %4f' %
(ols_mean_mse_test, lassocv_mean_mse_test, ridgecv_mean_mse_test,knn_mean_mse_test))
print("-- 10FOLD CROSS-VAL --")

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''