In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score

In [2]:
# calculate the adjusted r2
def adjusted_r2(y, yfit, model):
    r2 = r2_score(y, yfit)
    p = len(model.coef_)
    n = len(y)
    return 1-(1-r2)*(n-1)/(n-p-1)

This example uses the [Toyota Corolla](https://www.kaggle.com/klkwak/toyotacorollacsv) data set and some example code of running linear regression models from chapter 6 of [Data Mining for Business Analytics](https://www.dataminingbook.com/book/python-edition)

The dataset contains pricing of used Toyotal Corolla cars.
The goal of this exercise is to predict the price.
On of the challenge with this data is the large number of variables (features).
Thus, we would like to select the subset of the features that serve to make a good prediction of the price of cars.

1. Predict the price of the car using linear regression and the following features: 'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight'

Evaluate the resulting model on the training and the testing sets. How well did it do?

2. Perform a [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) regression on the same data. How did the result change?

3. Perform a Lasso regression using all predictors. How well does it predict car price?

In [3]:
data = pd.read_csv('data/ToyotaCorolla.csv')
data.head(1)

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0


1. Predict the price of the car using linear regression and the following features: 'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight'

In [4]:
predictors = ['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', 
              'Doors', 'Quarterly_Tax', 'Weight']
outcome = 'Price'

# partition data
X = pd.get_dummies(data[predictors], drop_first=True)
y = data[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

car_lm = LinearRegression()
car_lm.fit(train_X, train_y)

# print coefficients
print('intercept ', car_lm.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_}))

intercept  -258.6042161510268
           Predictor  coefficient
0          Age_08_04  -124.110305
1                 KM    -0.016059
2                 HP    75.549218
3          Met_Color    47.715778
4          Automatic   462.441526
5                 CC    -5.027585
6              Doors    58.417871
7      Quarterly_Tax    13.009195
8             Weight    14.156177
9   Fuel_Type_Diesel  4481.088703
10  Fuel_Type_Petrol  2413.063717


In [5]:
pred_y = car_lm.predict(train_X)
r2 = r2_score(train_y, pred_y)
ar2= adjusted_r2(train_y, pred_y, car_lm)
r2, ar2

(0.8740198772919548, 0.872387625996562)

In [6]:
pred_y = car_lm.predict(valid_X)
r2 = r2_score(valid_y, pred_y)
ar2= adjusted_r2(valid_y, pred_y, car_lm)
r2, ar2

(0.10270749215842268, 0.08517602220059439)

2. Perform a [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) regression on the same data. How did the result change?

In [7]:
lasso = Lasso(normalize=True, alpha=1)
lasso.fit(train_X, train_y)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)

In [8]:
pred_y = lasso.predict(train_X)
r2 = r2_score(train_y, pred_y)
ar2= adjusted_r2(train_y, pred_y, lasso)
r2, ar2

(0.8668009390372908, 0.8650751561508482)

In [9]:
pred_y = lasso.predict(valid_X)
r2 = r2_score(valid_y, pred_y)
ar2= adjusted_r2(valid_y, pred_y, lasso)
r2, ar2

(0.8077865250316318, 0.8040310219683067)

In [10]:
print('intercept ', car_lm.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'linear regression': car_lm.coef_,
                    'lasso regression':lasso.coef_}))

intercept  -258.6042161510268
           Predictor  linear regression  lasso regression
0          Age_08_04        -124.110305       -124.898842
1                 KM          -0.016059         -0.017357
2                 HP          75.549218         39.702256
3          Met_Color          47.715778          0.000000
4          Automatic         462.441526        245.978043
5                 CC          -5.027585         -1.368554
6              Doors          58.417871          0.000000
7      Quarterly_Tax          13.009195          8.770124
8             Weight          14.156177         15.737167
9   Fuel_Type_Diesel        4481.088703         37.341459
10  Fuel_Type_Petrol        2413.063717        255.791282


3. Perform a Lasso regression using all predictors. How well does it predict car price?

In [11]:
predictors = set(data.columns) - {'Id', 'Model', 'Price'}
outcome = 'Price'

# partition data
X = pd.get_dummies(data[predictors], drop_first=True)
y = data[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

car_lm = LinearRegression()
car_lm.fit(train_X, train_y)

# print coefficients
print('intercept ', car_lm.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': car_lm.coef_}))

intercept  -28583.86390501009
            Predictor   coefficient
0             Tow_Bar -1.077228e+02
1      Power_Steering -5.404451e+02
2   Parking_Assistant -2.577994e+02
3      Radio_cassette -1.970121e+02
4                 ABS -1.831979e+02
5            Airbag_1  3.690600e+02
6        Metallic_Rim  2.945070e+02
7           Mfg_Month -9.750444e+01
8       Quarterly_Tax  1.328909e+01
9           CD_Player  2.717617e+02
10                 KM -1.695016e-02
11          Age_08_04 -1.178998e+02
12          Mistlamps -2.218154e+01
13                 HP  6.050464e+01
14    Automatic_airco  2.272019e+03
15             Weight  6.952605e+00
16    BOVAG_Guarantee  3.522989e+02
17        Sport_Model  4.137309e+02
18              Doors  1.756881e+02
19           Mfg_Year  1.795035e+01
20   Guarantee_Period  6.499244e+01
21          Automatic  6.645881e+02
22              Airco  1.855080e+02
23      Mfr_Guarantee  1.963081e+02
24      Boardcomputer -3.794442e+02
25              Gears -1.174036e+0

In [12]:
pred_y = car_lm.predict(train_X)
r2 = r2_score(train_y, pred_y)
ar2= adjusted_r2(train_y, pred_y, car_lm)
r2, ar2

(0.9150890286887544, 0.9104006928494832)

In [13]:
pred_y = car_lm.predict(valid_X)
r2 = r2_score(valid_y, pred_y)
ar2= adjusted_r2(valid_y, pred_y, car_lm)
r2, ar2

(0.3902422723281229, 0.33837252233713155)

In [14]:
lasso = Lasso(normalize=True, alpha=1)
lasso.fit(train_X, train_y)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)

In [15]:
pred_y = lasso.predict(train_X)
r2 = r2_score(train_y, pred_y)
ar2= adjusted_r2(train_y, pred_y, lasso)
r2, ar2

(0.9075866964228843, 0.902484121378749)

In [16]:
pred_y = lasso.predict(valid_X)
r2 = r2_score(valid_y, pred_y)
ar2= adjusted_r2(valid_y, pred_y, lasso)
r2, ar2

(0.887336520296831, 0.8777526704165992)

In [17]:
print('intercept ', car_lm.intercept_)
pd.DataFrame({'Predictor': X.columns, 'linear regression': car_lm.coef_,
                    'lasso regression':lasso.coef_})

intercept  -28583.86390501009


Unnamed: 0,Predictor,linear regression,lasso regression
0,Tow_Bar,-107.7228,-93.74542
1,Power_Steering,-540.4451,-147.25071
2,Parking_Assistant,-257.7994,-0.0
3,Radio_cassette,-197.0121,0.0
4,ABS,-183.1979,-115.494242
5,Airbag_1,369.06,22.936827
6,Metallic_Rim,294.507,270.765842
7,Mfg_Month,-97.50444,0.0
8,Quarterly_Tax,13.28909,8.589679
9,CD_Player,271.7617,127.013076
