# Final Exam 
## Working with Auto Data (Horsepower, MPG, Etc.)

In [88]:
import pandas as pd

# Load the comma-separated auto-mpg dataset and preview its first five rows.
csv_path = "auto-mpg.csv"
auto = pd.read_csv(csv_path, sep=",")
print(auto.head(5))

    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0    3504          12.0   
1  15.0          8         350.0       165.0    3693          11.5   
2  18.0          8         318.0       150.0    3436          11.0   
3  16.0          8         304.0       150.0    3433          12.0   
4  17.0          8         302.0       140.0    3449          10.5   

   model year  origin                   car name  
0          70       1  chevrolet chevelle malibu  
1          70       1          buick skylark 320  
2          70       1         plymouth satellite  
3          70       1              amc rebel sst  
4          70       1                ford torino  


### Train/Test Data Split

In [89]:
from sklearn.model_selection import train_test_split

# Make horsepower numeric BEFORE splitting. The split frames are taken from
# `auto` as it is right now; the later cleaning cells only re-assign
# auto['horsepower'], which is not reflected in train_set/test_set.
# Unparseable tokens (e.g. '?') and non-positive placeholders become NaN.
horsepower = pd.to_numeric(auto['horsepower'], errors='coerce')
auto_numeric = auto.assign(horsepower=horsepower.where(horsepower > 0))

train_set, test_set = train_test_split(auto_numeric,
                        test_size=0.2, random_state=123)

# Fill the gaps with the median of the TRAINING rows only, so the imputed
# value carries no information from the held-out test rows. The explicit
# copies keep the column assignments local to the two split frames.
train_set, test_set = train_set.copy(), test_set.copy()
hp_train_median = train_set['horsepower'].median()
train_set['horsepower'] = train_set['horsepower'].fillna(hp_train_median)
test_set['horsepower'] = test_set['horsepower'].fillna(hp_train_median)

print('Train size: ', len(train_set), 'Test size: ', len(test_set))

Train size:  318 Test size:  80


In [90]:
# Mark unparseable horsepower entries (the '?' placeholder) as missing (NaN)
# rather than substituting the string '0': a zero looks like a real reading,
# pulls the median down, and is invisible to NaN-based imputers.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the auto['horsepower'] accessor, which is
# chained assignment and does not write through to `auto` under pandas
# Copy-on-Write.
auto['horsepower'] = pd.to_numeric(auto['horsepower'], errors='coerce')

In [91]:
# Cast the horsepower column to 64-bit floats and store it back on the frame.
horsepower_as_float = auto['horsepower'].astype('float64')
auto['horsepower'] = horsepower_as_float

In [112]:
# Show the median of the horsepower column.
horsepower_median = auto['horsepower'].median()
print(horsepower_median)
        

95.0


In [104]:
 # Fill missing horsepower with the median of the valid readings.
 # Missing entries are detected structurally (NaN, unparseable, or a
 # non-positive placeholder) instead of by equality with the hard-coded float
 # 102.894472, which matches nothing unless an earlier imputation happened to
 # leave exactly that value in the column.
 horsepower = pd.to_numeric(auto['horsepower'], errors='coerce')
 is_valid = horsepower > 0
 auto['horsepower'] = horsepower.where(is_valid, horsepower[is_valid].median())

### Linear Regression Model

In [113]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Features and target for the plain linear-regression baseline.
feature_cols = ['horsepower', 'weight']
X, y = train_set[feature_cols], train_set['mpg']
X_test, y_test = test_set[feature_cols], test_set['mpg']

# fit() returns the estimator, so construction and fitting are chained.
lr_model = LinearRegression().fit(X, y)


def _report_errors(actual, predicted):
    """Print MAE, RMSE, MSE and R^2 of `predicted` measured against `actual`."""
    mse = mean_squared_error(actual, predicted)
    print('MAE is ', mean_absolute_error(actual, predicted))
    print('RMSE is ', np.sqrt(mse))
    print('MSE is ', mse)
    print('R^2 ', r2_score(actual, predicted))


# Training data: fitted parameters followed by the error metrics.
y_pred = lr_model.predict(X)
print('Results for linear regression on training data')
print('Input: horsepower, weight')
print(' Default settings')
print('Internal parameters:')
print(' Bias is ', lr_model.intercept_)
print(' Coefficients', lr_model.coef_)
print(' Score', lr_model.score(X, y))
_report_errors(y, y_pred)

# Held-out test data: error metrics only.
y_test_pred = lr_model.predict(X_test)
print()
print('Results for linear regression on test data')
print('Input: horsepower, weight')
_report_errors(y_test, y_test_pred)


Results for linear regression on training data
Input: horsepower, weight
 Default settings
Internal parameters:
 Bias is  45.87411050864164
 Coefficients [-0.04074079 -0.00605784]
 Score 0.7055752526651475
MAE is  3.241837988715952
RMSE is  4.256668029366336
MSE is  18.119222712229487
R^2  0.7055752526651475

Results for linear regression on test data
Input: horsepower, weight
MAE is  3.3556244275847575
RMSE is  4.225971657558403
MSE is  17.858836450486915
R^2  0.6938079256709418


### Pipelined Model: Pipeline #1

In [116]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Pipeline #1: median imputation -> standardisation -> linear regression.
X = train_set[['horsepower', 'weight']]
y = train_set['mpg']

X_test = test_set[['horsepower', 'weight']]
y_test = test_set['mpg']

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
scale = StandardScaler()
lrpipe_model = LinearRegression()

stages = [('imp_median', imp_median),
 ('scale', scale),
 ('lrpipe_model', lrpipe_model),
 ]
pipe_model = Pipeline(stages)

pipe_model.fit(X,y)

y_pred = pipe_model.predict(X)
mse_train = mean_squared_error(y, y_pred)
print('Results for pipeline linear regression on training data')
# The bias in raw feature units is the prediction at horsepower=0, weight=0.
# The probe row is a DataFrame with the training column names so the fitted
# steps do not warn about missing feature names (a bare list has none).
origin = pd.DataFrame([[0, 0]], columns=X.columns)
print('   Bias is ', pipe_model.predict(origin))
# A Pipeline has no coef_ attribute; read it from the final regression step.
# These coefficients apply to the standardised features, not the raw units.
print('   Coefficients', pipe_model.named_steps['lrpipe_model'].coef_)
print('   Score', pipe_model.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mse_train))
print('MSE is ', mse_train)
print('R^2    ', r2_score(y,y_pred))

y_test_pred = pipe_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print()
print('Results for pipeline linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mse_test))
print('MSE is ', mse_test)
print('R^2    ', r2_score(y_test,y_test_pred))


Results for pipeline linear regression on training data
   Bias is  [45.87411051]
   Score 0.7055752526651475
MAE is   3.2418379887159525
RMSE is  4.256668029366336
MSE is  18.11922271222949
R^2     0.7055752526651475

Results for pipeline linear regression on test data
MAE is   3.3556244275847584
RMSE is  4.225971657558404
MSE is  17.858836450486923
R^2     0.6938079256709417




### Pipeline #2

In [97]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Pipeline #2: median imputation -> degree-3 polynomial expansion ->
# standardisation -> linear regression.
X = train_set[['horsepower', 'weight']]
y = train_set['mpg']

X_test = test_set[['horsepower', 'weight']]
y_test = test_set['mpg']

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
poly3 = PolynomialFeatures(degree=3, include_bias=False)
scale = StandardScaler()
# NOTE: this rebinds `lr_model`; after this cell the name refers to the
# regression step of this pipeline, not to the earlier two-feature baseline.
lr_model = LinearRegression()

stages = [('imp_median', imp_median),
 ('poly3', poly3),
 ('scale', scale),
 ('lr_model', lr_model),
 ]
pipe_model = Pipeline(stages)

pipe_model.fit(X,y)

y_pred = pipe_model.predict(X)
mse_train = mean_squared_error(y, y_pred)
print('Results for pipeline linear regression on training data')
# The bias in raw feature units is the prediction at horsepower=0, weight=0
# (every polynomial term is zero there because include_bias=False).
# The probe row is a DataFrame with the training column names so the fitted
# steps do not warn about missing feature names (a bare list has none).
origin = pd.DataFrame([[0, 0]], columns=X.columns)
print('   Bias is ', pipe_model.predict(origin))
# A Pipeline has no coef_ attribute; read it from the final regression step.
# These coefficients apply to the standardised polynomial features.
print('   Coefficients', pipe_model.named_steps['lr_model'].coef_)
print('   Score', pipe_model.score(X,y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(mse_train))
print('MSE is ', mse_train)
print('R^2    ', r2_score(y,y_pred))

y_test_pred = pipe_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
print()
print('Results for pipeline linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(mse_test))
print('MSE is ', mse_test)
print('R^2    ', r2_score(y_test,y_test_pred))


Results for pipeline linear regression on training data
   Bias is  [52.16797207]
   Score 0.7641271307122832
MAE is   2.881941587573485
RMSE is  3.8099705709124447
MSE is  14.515875751218902
R^2     0.7641271307122832

Results for pipeline linear regression on test data
MAE is   3.7235264163884794
RMSE is  7.1845967713840775
MSE is  51.618430767382506
R^2     0.11499528907748058


