In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
# Import pearsonr from the public `scipy.stats` namespace; the old
# `scipy.stats.stats` module path is a deprecated private location.
from scipy.stats import pearsonr
from bokeh.io import output_notebook
from bokeh.models import Slope
from bokeh.plotting import ColumnDataSource, figure, show
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# Input data files are available in the "../input/" directory.
# Listing that directory shows which files can be loaded below.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['Automobile_data.csv']


In [2]:
# Load the automobile dataset from the Kaggle input directory
data = pd.read_csv('../input/Automobile_data.csv')
# Preview three randomly chosen rows
data.sample(3)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
28,-1,110,dodge,gas,std,four,wagon,fwd,front,103.3,174.6,64.6,59.8,2535,ohc,four,122,2bbl,3.34,3.46,8.5,88,5000,24,30,8921
135,2,104,saab,gas,std,four,sedan,fwd,front,99.1,186.6,66.5,56.1,2758,ohc,four,121,mpfi,3.54,3.07,9.3,110,5250,21,28,15510
103,0,108,nissan,gas,std,four,sedan,fwd,front,100.4,184.6,66.5,55.1,3060,ohcv,six,181,mpfi,3.43,3.27,9.0,152,5200,19,25,13499


In [3]:
# Process data: coerce both modelling columns to numbers (entries that cannot be
# parsed become NaN), then discard every row where either value is missing.
for numeric_col in ('horsepower', 'price'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')
data.dropna(subset=['price', 'horsepower'], inplace=True)
# Preview three randomly chosen rows of the cleaned frame
data.sample(3)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
121,1,154,plymouth,gas,std,four,sedan,fwd,front,93.7,167.3,63.8,50.8,1989,ohc,four,90,2bbl,2.97,3.23,9.4,68.0,5500,31,38,6692.0
197,-1,74,volvo,gas,std,four,wagon,rwd,front,104.3,188.8,67.2,57.5,3042,ohc,four,141,mpfi,3.78,3.15,9.5,114.0,5400,24,28,16515.0
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111.0,5000,21,27,13495.0


In [4]:
# Check the correlation between horsepower and price.
# pearsonr returns the Pearson correlation coefficient together with a p-value:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
pearsonr(data['horsepower'], data['price'])

(0.8105330821322063, 1.1891278276946011e-47)

In [5]:
# display data
output_notebook()

# The column names (x, y, make) are what the glyph and the hover tooltips refer to.
source = ColumnDataSource(data=dict(
    x=data['horsepower'],
    y=data['price'],
    make=data['make']
))

# Use '@column' fields so the hover shows the hovered car's own values.
# '$x' / '$y' would report the mouse cursor position in data space instead,
# which does not match the point that '@make' describes.
tooltips = [
    ('make', '@make'),
    ('horsepower', '@x'),
    ('price', '@y{$0}')
]

# `width` / `height` are used instead of `plot_width` / `plot_height`,
# which were removed in Bokeh 3.
p = figure(width=600, height=400, tooltips=tooltips)
p.xaxis.axis_label = 'horsepower'
p.yaxis.axis_label = 'price'

# One translucent blue marker per car
p.circle('x','y', source=source, size=8, color='blue', alpha=0.5)

show(p)

In [6]:
# we split data; a fixed random_state makes the split (and every metric that is
# computed from it) reproducible across kernel restarts
train, test = train_test_split(data, test_size=0.25, random_state=42)
# we reshape data to fit model: one column holding the feature / target values
x_train = np.array(train['horsepower']).reshape(-1,1)
y_train = np.array(train['price']).reshape(-1,1)

In [7]:
# model: scikit-learn linear regression created with its default settings
model = LinearRegression()

In [8]:
# Fit on the training split: horsepower is the single feature, price the target
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [9]:
# `np.asscalar` is deprecated and no longer exists in current NumPy releases;
# `.item()` extracts the single fitted coefficient as a plain Python float.
slope = model.coef_.item()
# Left as returned by the model (a one-element array here, because the target
# was passed as a single-column 2-D array).
intercept = model.intercept_
print("slope: ", slope, "intercept: ", intercept)

slope:  165.0508306776342 intercept:  [-3836.96822507]


In [10]:
# now we add our line to old graph:
# the fitted slope and intercept define a red line drawn over the scatter plot
best_line = Slope(
    gradient=slope,
    y_intercept=intercept[0],
    line_color='red',
    line_width=3,
)
p.add_layout(best_line)
show(p)

In [11]:
# evaluation of model
def predict_metrics(model, x, y):
    """Score a fitted model's predictions for ``x`` against the true values ``y``.

    Parameters
    ----------
    model : object exposing ``predict(x)``
    x : feature values handed to ``model.predict``
    y : true target values the predictions are compared with

    Returns
    -------
    tuple
        ``(mae, mse, r2)``: mean absolute error, mean squared error and the
        R^2 score, in that order.
    """
    predictions = model.predict(x)
    scorers = (mean_absolute_error, mean_squared_error, r2_score)
    return tuple(scorer(y, predictions) for scorer in scorers)

# Held-out feature / target, shaped as single-column arrays like the training data
x_test = test['horsepower'].values.reshape(-1, 1)
y_test = test['price'].values.reshape(-1, 1)

# Score the fitted model on the training split and on the test split
mae_train, mse_train, r2_train = predict_metrics(model, x_train, y_train)
mae_test, mse_test, r2_test = predict_metrics(model, x_test, y_test)

# One line per split: MAE, MSE, R^2 (train first, then test)
for scores in ((mae_train, mse_train, r2_train), (mae_test, mse_test, r2_test)):
    print(*scores)

3269.3008985247693 19934901.425992344 0.670665815555382
3502.896420805657 27357692.2264854 0.6181334127223477


In [12]:
# Candidate predictors: make each one numeric (unparseable entries become NaN),
# drop the rows missing any of them, then report each Pearson correlation with price.
cols = ['horsepower', 'engine-size', 'peak-rpm', 'length', 'width']
for name in cols:
    data[name] = pd.to_numeric(data[name], errors='coerce')
data.dropna(subset=cols, inplace=True)

for col in cols:
    correlation = pearsonr(data[col], data['price'])
    print(col, correlation)

horsepower (0.8105330821322063, 1.1891278276946011e-47)
engine-size (0.8738869517981516, 1.2650674479074428e-63)
peak-rpm (-0.10164886620219901, 0.15311824317199588)
length (0.6939647745646871, 6.39831060305001e-30)
width (0.7538710519013427, 8.679834788813268e-38)


In [13]:
# Predictors used by the multi-feature model
model_cols = ['horsepower', 'engine-size', 'length', 'width']
# Feature matrix with one column per selected predictor
x_multi = np.column_stack(tuple(data[col] for col in model_cols))
# A fixed random_state keeps the split, and therefore the reported metrics,
# reproducible from run to run.
x_multi_train, x_multi_test, y_multi_train, y_multi_test = train_test_split(
    x_multi, data['price'], test_size=0.25, random_state=42
)

In [14]:
# Ridge regression with alpha=1 on the selected features
multi_model = Ridge(alpha=1)
multi_model.fit(x_multi_train, y_multi_train)

# Pair every learned weight with the name of the feature it belongs to
multi_coeffs = {name: weight for name, weight in zip(model_cols, multi_model.coef_)}
multi_intercept = multi_model.intercept_

print("multi_intercept: ", multi_intercept)
print("multi_coeffs: ", multi_coeffs)

multi_intercept:  -66588.52820456207
multi_coeffs:  {'horsepower': 63.06671064138941, 'engine-size': 91.25151226299465, 'length': -20.797643109359, 'width': 990.6755022435268}


In [15]:
# Score the multi-feature model on its own train and test splits (MAE, MSE, R^2)
multi_mae_train, multi_mse_train, multi_r2_train  = predict_metrics(multi_model, x_multi_train, y_multi_train)  
multi_mae_test, multi_mse_test, multi_r2_test  = predict_metrics(multi_model, x_multi_test, y_multi_test) 

In [16]:
# Print MAE, MSE and R^2 for the multi-feature model: training split first, then test split
print(multi_mae_train, multi_mse_train, multi_r2_train)
print(multi_mae_test, multi_mse_test, multi_r2_test)
# Hand-recorded results kept for comparison. Presumably copied from earlier runs of
# this cell in the same print format (train line, then test line) -- TODO confirm.
# NOTE(review): figures from separate runs are only comparable if those runs
# used the same train/test split.
# normal (presumably plain LinearRegression -- confirm)
#2376.4758795516805 10962113.804023953 0.8245020624954748
#2604.3178614633566 14202010.770502798 0.7843551696529536
# lasso
#2464.008345934287 11880255.504195975 0.7931496649137988
#2398.921208586225 11335885.832721867 0.8597886425684198
# ridge
#2421.143908191097 11545393.950756984 0.8296656404339962
#2491.4053863851514 11935785.619589407 0.7611422511134462


2483.4768444469787 11801642.534850808 0.8066668346152439
2391.674846460825 11383812.223262327 0.837798520347339


*  Apply both ridge and lasso regression at the bottom using the example code above. Are you able to improve results?

In [17]:
#lasso regression
#The optimization objective for Lasso is:
#(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

In [18]:
# Lasso regression with alpha=1.
# NOTE: this rebinds `model`, replacing the LinearRegression instance created earlier.
model = Lasso(alpha=1)

In [19]:
# Fit Lasso on the same single-feature training data (horsepower -> price)
model.fit(x_train, y_train)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [20]:
# Rebuild the held-out feature / target as single-column arrays
x_test = test['horsepower'].values.reshape(-1, 1)
y_test = test['price'].values.reshape(-1, 1)

# Score the Lasso model on the training split and on the test split
mae_train, mse_train, r2_train = predict_metrics(model, x_train, y_train)
mae_test, mse_test, r2_test = predict_metrics(model, x_test, y_test)

# One line per split: MAE, MSE, R^2 (train first, then test)
train_scores = (mae_train, mse_train, r2_train)
test_scores = (mae_test, mse_test, r2_test)
print(*train_scores)
print(*test_scores)

3269.300460128114 19934901.42666339 0.6706658155442959
3502.900038686752 27357745.815678798 0.6181326647088738


In [21]:
# ridge

In [22]:
# Ridge regression with alpha=1.
# NOTE: this rebinds `model` again, replacing the Lasso instance fitted above.
model = Ridge(alpha=1)

In [23]:
# Fit Ridge on the same single-feature training data (horsepower -> price)
model.fit(x_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [24]:
# Rebuild the held-out feature / target as single-column arrays
x_test = test['horsepower'].values.reshape(-1, 1)
y_test = test['price'].values.reshape(-1, 1)

# Score the Ridge model on the training split and on the test split
mae_train, mse_train, r2_train = predict_metrics(model, x_train, y_train)
mae_test, mse_test, r2_test = predict_metrics(model, x_test, y_test)

# One line per split: MAE, MSE, R^2 (train first, then test)
for split_scores in [(mae_train, mse_train, r2_train), (mae_test, mse_test, r2_test)]:
    print(*split_scores)

3269.300412904593 19934901.42681575 0.670665815541779
3502.9004284002253 27357751.58829894 0.618132584132971
