# Inclass-Lab Exercise - Solution

In [4]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
os.chdir('C:\\Users\\DILIP\\Downloads')

# Any results you write to the current directory are saved as output.

In [8]:
data = pd.read_csv('Automobile.csv')
list(data)

['symboling',
 'normalized_losses',
 'make',
 'fuel_type',
 'aspiration',
 'number_of_doors',
 'body_style',
 'drive_wheels',
 'engine_location',
 'wheel_base',
 'length',
 'width',
 'height',
 'curb_weight',
 'engine_type',
 'number_of_cylinders',
 'engine_size',
 'fuel_system',
 'bore',
 'stroke',
 'compression_ratio',
 'horsepower',
 'peak_rpm',
 'city_mpg',
 'highway_mpg',
 'price']

In [9]:
data.shape

(201, 26)

In [10]:
data['horsepower'] = pd.to_numeric(data['horsepower'], errors = 'coerce')
data['price'] = pd.to_numeric(data['price'], errors = 'coerce')
# data.any().isna()
data.dropna(subset=['price', 'horsepower'], inplace=True)
# type(data['horsepower'][1])

In [11]:
from scipy.stats.stats import pearsonr
pearsonr(data['horsepower'], data['price'])
data['horsepower'].head()

0    111
1    111
2    154
3    102
4    115
Name: horsepower, dtype: int64

In [30]:
from bokeh.io import output_notebook
from bokeh.plotting import ColumnDataSource, figure, show

# enable notebook output
output_notebook()

source = ColumnDataSource(data=dict(
    x=data['horsepower'],
    y=data['price'],
    make=data['make'],
))

tooltips = [
    ('make', '@make'),
    ('horsepower', '$x'),
    ('price', '$y{$0}')
]

p = figure(plot_width=600, plot_height=400, tooltips=tooltips)
p.xaxis.axis_label = 'Horsepower'
p.yaxis.axis_label = 'Price'

# add a square renderer with a size, color, and alpha
p.circle('x', 'y', source=source, size=8, color='blue', alpha=0.5)

# show the results
show(p)

In [13]:
from sklearn.model_selection import train_test_split
train, test=  train_test_split(data, test_size = 0.25)

# **Linear Regression**

In [18]:
from sklearn import linear_model
model = linear_model.LinearRegression()
training_x = np.array(train['horsepower']).reshape(-1,1)
training_y = np.array(train['price'])
model.fit(training_x, training_y)
slope = np.asscalar(np.squeeze(model.coef_))
intercept = model.intercept_
print('slope:', slope, 'intercept:', intercept)


slope: 180.61074173890165 intercept: -5309.058965078328


In [19]:
# Now let's add the line to our graph
from bokeh.models import Slope
best_line = Slope(gradient=slope, y_intercept=intercept, line_color='red', line_width=3)
p.add_layout(best_line)
show(p)

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# function to predict the mean_absolute_error, mean_squared_error and r-squared
def predict_metrics(lr, x, y):
    pred = lr.predict(x)
    mae = mean_absolute_error(y, pred)
    mse = mean_squared_error(y, pred)
    r2 = r2_score(y, pred)
    return mae, mse, r2

training_mae, training_mse, training_r2 = predict_metrics(model, training_x, training_y)

test_x = np.array(test['horsepower']).reshape(-1,1)
test_y = np.array(test['price'])

test_mae, test_mse, test_r2 = predict_metrics(model, test_x, test_y)

print('training mean error:', training_mae, 'training mse:', training_mse, 'training r2:', training_r2)
print('test mean error:', test_mae, 'test mse:', test_mse, 'test r2:', test_r2)

training mean error: 3267.174335061284 training mse: 21988194.538786925 training r2: 0.6496723395377622
test mean error: 3518.2054721092723 test mse: 20630148.755393162 test r2: 0.6686963805880475


In [22]:
#Getting the correlation between other variables/columns

cols = ['horsepower', 'engine_size', 'peak_rpm', 'length', 'width', 'height']
for col in cols:
    data[col] = pd.to_numeric(data[col], errors = 'coerce')
data.dropna(subset = ['price', 'horsepower'], inplace = True)

# Let's see how strongly each column is correlated to price
for col in cols:
    print(col, pearsonr(data[col], data['price']))

horsepower (0.8107950503290707, 3.586874782556137e-48)
engine_size (0.8723351674455185, 9.265491622197996e-64)
peak_rpm (-0.10484596598109105, 0.1385364891159878)
length (0.690628380448364, 8.016477466159053e-30)
width (0.7512653440522673, 9.20033551048166e-38)
height (0.13548630756805974, 0.05514627325101211)


In [23]:
# split train and test data as before
model_cols = ['horsepower', 'engine_size', 'length', 'width']
multi_x = np.column_stack(tuple(data[col] for col in model_cols))
y = data['price']

multi_train_x, multi_test_x, multi_train_y, multi_test_y = train_test_split(multi_x, y, test_size = 0.25)



In [24]:
# fit the model as before
multi_model = linear_model.LinearRegression()
multi_model.fit(multi_train_x, multi_train_y)
multi_model_intercept = multi_model.intercept_
multi_coefficient = dict(zip(model_cols,multi_model.coef_))
print('intercept:', multi_model_intercept)
print('Co-efficients:', multi_coefficient)


intercept: -51456.23164172357
Co-efficients: {'horsepower': 58.67432830807245, 'engine_size': 87.06019583288064, 'length': 20.35537283350099, 'width': 665.6953785395418}


In [26]:
# calculate error metrics
m_train_mae, m_train_mse, m_train_r2 = predict_metrics(multi_model, multi_train_x, multi_train_y)
m_test_mae, m_test_mse, m_test_r2 = predict_metrics(multi_model, multi_test_x, multi_test_y)

print('m_train_mean_error:', m_train_mae, 'm_train_mae:', m_train_mse, 'm_train_r2', m_train_r2 )
print('m_test_mean_error:', m_test_mae, 'm_test_mae:', m_test_mse, 'm_test_r2', m_test_r2 )

m_train_mean_error: 2629.3957120596056 m_train_mae: 12576862.868699118 m_train_r2 0.7884932433281803
m_test_mean_error: 1875.2626466953886 m_test_mae: 9610915.930515248 m_test_r2 0.867173774413863


# **Ridge Regression**

In [27]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 0.05, normalize = True)
ridge.fit(multi_train_x, multi_train_y)

r_train_mae, r_train_mse, r_train_r2 = predict_metrics(ridge, multi_train_x, multi_train_y)
r_test_mae, r_test_mse, r_test_r2 = predict_metrics(ridge, multi_test_x, multi_test_y)

print('r_train_mean_error:', r_train_mae, 'r_train_mae:', r_train_mse, 'r_train_r2', r_train_r2 )
print('r_test_mean_error:', r_test_mae, 'r_test_mae:', r_test_mse, 'r_test_r2', r_test_r2 )


r_train_mean_error: 2619.2433198719605 r_train_mae: 12618436.537701948 r_train_r2 0.7877940934697827
r_test_mean_error: 1913.1216754032603 r_test_mae: 9981013.91788469 r_test_r2 0.8620588905552676


# **Lasso Regression**

In [28]:
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha = 0.05, normalize = True)
lasso_model.fit(multi_train_x, multi_train_y)

l_train_mae, l_train_mse, l_train_r2 = predict_metrics(lasso_model, multi_train_x, multi_train_y)
l_test_mae, l_test_mse, l_test_r2 = predict_metrics(lasso_model, multi_test_x, multi_test_y)

print('train_mean_error:', l_train_mae, 'train_mae:', l_train_mse, 'train_r2', l_train_r2 )
print('test_mean_error:', l_test_mae, 'test_mae:', l_test_mse, 'test_r2', l_test_r2 )

train_mean_error: 2629.3754377711884 train_mae: 12576863.372012146 train_r2 0.7884932348638989
test_mean_error: 1875.4582190867688 test_mae: 9612937.13811865 test_r2 0.8671458406166029


# **ElasticNet Regression**
  In statistics and, in particular, in the fitting of linear or logistic regression models, the elastic net is a regularized regression method that linearly combines the L1 and L2 penalties of the lasso and ridge methods.

In [16]:
from sklearn.linear_model import ElasticNet
enet_model = ElasticNet(alpha=0.01, l1_ratio=0.5, normalize=False)
enet_model.fit(multi_train_x, multi_train_y)

el_train_mae, el_train_mse, el_train_r2 = predict_metrics(enet_model, multi_train_x, multi_train_y)
el_test_mae, el_test_mse, el_test_r2 = predict_metrics(enet_model, multi_test_x, multi_test_y)

print('train_mean_error:', el_train_mae, 'train_mae:', el_train_mse, 'train_r2', el_train_r2 )
print('test_mean_error:', el_test_mae, 'test_mae:', el_test_mse, 'test_r2', el_test_r2 )

train_mean_error: 2572.026718903673 train_mae: 12674442.586477466 train_r2 0.7930843419728106
test_mean_error: 2024.0914080569275 test_mae: 8841648.940255782 test_r2 0.8727540633518162
