In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn import neighbors
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.decomposition import PCA
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression

In [2]:
auto_mpg_df = pd.read_table('auto-mpg.data-original.csv', delimiter=',').dropna()
auto_mpg_df.head()

Unnamed: 0,MPG,Cylinders,Displacement,horsepower,Weight,Acceleration,Model_Year,Origin,Car Name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [3]:
auto_mpg_df['horsepower'] = np.log(auto_mpg_df['horsepower'])
auto_mpg_df['Weight'] = np.log(auto_mpg_df['Weight'])

In [4]:
#Drop all the cylinder 3 and 5 data because we don't have enough of them to make good prediction for them
cy5 = auto_mpg_df[auto_mpg_df['Cylinders'] == 5]
cy3 = auto_mpg_df[auto_mpg_df['Cylinders'] == 3]
cytodrop = pd.concat([cy3, cy5])
cyindex = cytodrop.index.values
auto_mpg_df = auto_mpg_df.drop(cyindex)

In [5]:
X_noDummie = auto_mpg_df.drop(['MPG', 'Car Name'], 1)
y_noDummie = auto_mpg_df['MPG']

X_train, X_test, y_train, y_test = train_test_split(X_noDummie, y_noDummie, test_size=0.2)
X_noDummie.head()

Unnamed: 0,Cylinders,Displacement,horsepower,Weight,Acceleration,Model_Year,Origin
0,8,307.0,4.867534,8.16166,12.0,70,1
1,8,350.0,5.105945,8.214194,11.5,70,1
2,8,318.0,5.010635,8.142063,11.0,70,1
3,8,304.0,5.010635,8.14119,12.0,70,1
4,8,302.0,4.941642,8.14584,10.5,70,1


In [6]:
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(X_train, y_train)

print('R² for the model with few features:')
print(lass.score(X_test, y_test))
origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(str(origparams) + '\n')

print(cross_val_score(lassfit, X_test, y_test, cv=5))

R² for the model with few features:
0.734527182811

Parameter estimates for the model with few features:
[ -0.          -0.05136758  -0.          -0.          -0.10001546
   0.62827577   0.76112067 -13.90313327]

[ 0.82788278  0.47022996  0.78134143  0.5465996   0.90842871]


In [7]:
# Create a set of dummy variables from the sex variable
df_Cylinders = pd.get_dummies(auto_mpg_df['Cylinders'])
df_Model_Year = pd.get_dummies(auto_mpg_df['Model_Year'])
df_Origin = pd.get_dummies(auto_mpg_df['Origin'])

# Join the dummy variables to the main dataframe
auto_mpg_df = pd.concat([auto_mpg_df, df_Cylinders, df_Model_Year, df_Origin], axis=1)
auto_mpg_df = auto_mpg_df.drop(['Cylinders', 'Model_Year', 'Origin'], 1).drop('Car Name', 1)
auto_mpg_df.head()

Unnamed: 0,MPG,Displacement,horsepower,Weight,Acceleration,4,6,8,70,71,...,76,77,78,79,80,81,82,1,2,3
0,18.0,307.0,4.867534,8.16166,12.0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
1,15.0,350.0,5.105945,8.214194,11.5,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
2,18.0,318.0,5.010635,8.142063,11.0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,16.0,304.0,5.010635,8.14119,12.0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
4,17.0,302.0,4.941642,8.14584,10.5,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
X = auto_mpg_df.drop('MPG', 1)
y = auto_mpg_df['MPG']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2)

In [9]:
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(X_train2, y_train2)

print('R² for the model with few features:')
print(lass.score(X_test2, y_test2))
origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(str(origparams) + '\n')

print(cross_val_score(lassfit, X_test2, y_test2, cv=5))

R² for the model with few features:
0.633111260834

Parameter estimates for the model with few features:
[ -6.10966091e-02  -0.00000000e+00  -0.00000000e+00  -2.06603309e-02
   0.00000000e+00  -2.65652106e-01   0.00000000e+00  -0.00000000e+00
  -0.00000000e+00  -0.00000000e+00  -0.00000000e+00  -0.00000000e+00
  -0.00000000e+00  -0.00000000e+00   0.00000000e+00  -0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   4.36315471e-01
  -0.00000000e+00  -0.00000000e+00   0.00000000e+00   3.57149920e+01]

[ 0.71814557  0.57375344  0.42083126  0.6557793   0.58843184]


In [16]:
# ###### High multi-corrilation, applying PCA
# Standardizing the features
X_pca = auto_mpg_df.drop('MPG', 1)
X_pca = pd.DataFrame(StandardScaler().fit_transform(X_pca))

pca = PCA(n_components=0.90, svd_solver='full')
principalComponents = pca.fit_transform(X_pca)
principalDf = pd.DataFrame(data = principalComponents)
#principalDf.columns = ['zero','one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen']
#principalDf['MPG'] = auto_mpg_df['MPG']
principalDf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,3.065539,-1.644062,-0.246035,-0.599913,2.226263,-0.104289,-0.557016,-0.33832,-0.159757,-1.324875,0.158612,0.004024,0.209633,0.901393
1,3.614392,-1.820793,-0.270346,-0.700441,2.231959,-0.099278,-0.551501,-0.354165,-0.190633,-1.316976,0.156038,0.000852,0.181589,0.841318
2,3.332767,-1.831178,-0.29595,-0.681544,2.28862,-0.068929,-0.555703,-0.346077,-0.167189,-1.321294,0.15825,0.003951,0.229365,0.818449
3,3.185257,-1.711765,-0.256264,-0.629804,2.238786,-0.097967,-0.546326,-0.343912,-0.171112,-1.322332,0.159764,0.006519,0.21072,0.881338
4,3.245548,-1.848881,-0.309319,-0.682898,2.316378,-0.054127,-0.560938,-0.342788,-0.158061,-1.323631,0.15916,0.004699,0.251039,0.797715


In [17]:
X_cpa = principalDf
y_cpa = auto_mpg_df['MPG']

X_train3, X_test3, y_train3, y_test3 = train_test_split(X_cpa, y_cpa, test_size=0.2)

In [29]:
lass = linear_model.Lasso(alpha=2)
lassfit = lass.fit(X_train3, y_train3)

print('R² for the model with few features:')
print(lass.score(X_test3, y_test3))
origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(str(origparams) + '\n')

print(cross_val_score(lassfit, X_test3, y_test3, cv=5))

R² for the model with few features:
0.765245787149

Parameter estimates for the model with few features:
[ -2.50021797  -0.          -0.           0.           0.           0.           0.
   0.           0.          -0.           0.          -0.          -0.           0.
  23.53802773]

[ 0.79293425  0.82317169  0.7354133   0.58394816  0.7201671 ]
