In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv(r"C:\Users\Herald\Documents\ML_Zoomcamp\02_linear_regression\data.csv")

In [3]:
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


# Data Preparation

In [4]:
req =['Make',
'Model',
'Year',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Vehicle Style',
'highway MPG',
'city mpg',
'MSRP']

In [5]:
data = data[req]

In [6]:
# Clean column names
data.columns = data.columns.str.lower().str.replace(' ', '_')
data.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'msrp'],
      dtype='object')

In [7]:
data.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [8]:
# Fill in the missing values of the selected features with 0.
data.fillna(0, inplace=True)

In [9]:
data.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [10]:
data.rename(columns={'msrp':'price'}, inplace=True)

In [11]:
# Question 1 What is the most frequent observation (mode) for the column transmission_type?

data.transmission_type.value_counts().sort_values(ascending=False)

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   price              11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [13]:
categorical = data.dtypes[data.dtypes == 'object'].index
categorical

Index(['make', 'model', 'transmission_type', 'vehicle_style'], dtype='object')

In [14]:
numerical = data.dtypes[~(data.dtypes == 'object')].index
numerical

Index(['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

In [15]:
# Question 2 Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

df_numeric = data[numerical]
df_numeric.corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


# Make Price Binary
* Now we need to turn the price variable from numeric into a binary format.
* Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [16]:
data['above_average'] = (data.price > data.price.mean()).astype(int)
data['above_average']

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int32

In [17]:
data.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price', 'above_average'],
      dtype='object')

# Split the Data
* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value (above_average) is not in your dataframe.

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [20]:
# Get the target values as an array
y_full_train = df_full_train['above_average'].values
y_train = df_train['above_average'].values
y_val = df_val['above_average'].values
y_test = df_test['above_average'].values

In [21]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [22]:
df_train.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

In [23]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [24]:
# Question 3 
# Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).
# Which of these variables has the lowest mutual information score?

In [25]:
from sklearn.metrics import mutual_info_score

In [26]:
def mutual_info_price_score(s):
    return mutual_info_score(s, df_full_train.above_average)

In [27]:
# Apply mutual info metrics to categorical columns of our data
df_full_train[categorical].apply(mutual_info_price_score).sort_values(ascending=False).round(2)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

# Training the model: Logistic Regression
* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
* To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
* model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [28]:
# Apply one-hot encoding
from sklearn.feature_extraction import DictVectorizer

In [29]:
categorical

Index(['make', 'model', 'transmission_type', 'vehicle_style'], dtype='object')

In [30]:
numerical = numerical[numerical!='price']
numerical

Index(['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg'], dtype='object')

In [31]:
df_train[categorical.tolist() + numerical.tolist()]

Unnamed: 0,make,model,transmission_type,vehicle_style,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
3972,Mitsubishi,Endeavor,AUTOMATIC,4dr SUV,2011,225.0,6.0,19,15
1997,Kia,Borrego,AUTOMATIC,4dr SUV,2009,276.0,6.0,21,17
5216,Lamborghini,Gallardo,MANUAL,Convertible,2012,570.0,10.0,20,12
2805,Chevrolet,Colorado,AUTOMATIC,Crew Cab Pickup,2016,200.0,4.0,27,20
11369,Pontiac,Vibe,AUTOMATIC,4dr Hatchback,2009,158.0,4.0,26,20
...,...,...,...,...,...,...,...,...,...
9232,Toyota,Sienna,AUTOMATIC,Passenger Minivan,2016,266.0,6.0,25,18
5710,Chevrolet,HHR,MANUAL,Wagon,2009,260.0,4.0,29,21
11306,Hyundai,Veracruz,AUTOMATIC,4dr SUV,2012,260.0,6.0,22,17
4414,Mitsubishi,Expo,MANUAL,2dr Hatchback,1993,136.0,4.0,26,19


In [32]:
# Convert to dictionary
train_dicts = df_train[categorical.tolist() + numerical.tolist()].to_dict(orient='records')

In [33]:
#train_dicts

In [34]:
# Create DictVectorizer object

dv = DictVectorizer(sparse=False)

# Fit training data in dictionary form
X_train = dv.fit_transform(train_dicts)

In [36]:
# Train model using Logistic regression
from sklearn.linear_model import LogisticRegression

In [37]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
# Fit/Train model with training dataets X and y
model.fit(X_train, y_train)

In [38]:
# Take a look at our w
#model.coef_[0].round(3)

In [39]:
# Take a look at our w0
model.intercept_[0]

-0.3922024649900234

In [40]:
# Get hard predictions on train dataset

y_train_pred = model.predict(X_train)

In [41]:
y_train_pred

array([0, 0, 1, ..., 0, 0, 0])

In [42]:
# Calculate accuracy on train dataset

train_accuracy = (y_train_pred == y_train).mean()
train_accuracy.round(2)

0.95

In [44]:
df_val[categorical.tolist() + numerical.tolist()]

Unnamed: 0,make,model,transmission_type,vehicle_style,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
1918,Volkswagen,Beetle,MANUAL,2dr Hatchback,2015,210.0,4.0,31,23
9951,Audi,SQ5,AUTOMATIC,4dr SUV,2015,354.0,6.0,24,17
5486,Pontiac,Grand Am,AUTOMATIC,Sedan,2005,140.0,4.0,31,22
292,Nissan,350Z,MANUAL,Convertible,2009,306.0,6.0,24,17
3644,Ford,E-150,AUTOMATIC,Passenger Van,1996,199.0,6.0,15,11
...,...,...,...,...,...,...,...,...,...
4385,Ford,Explorer Sport,AUTOMATIC,2dr SUV,2003,203.0,6.0,19,14
7339,Subaru,Outback,AUTOMATIC,4dr SUV,2016,175.0,4.0,33,25
9806,GMC,Sonoma,MANUAL,Extended Cab Pickup,2003,190.0,6.0,17,12
11162,Aston Martin,V8 Vantage,AUTOMATED_MANUAL,Coupe,2015,430.0,8.0,21,14


In [45]:
# Get hard predictions on validation dataset

val_dicts = df_val[categorical.tolist() + numerical.tolist()].to_dict(orient='records')

X_val = dv.transform(val_dicts)

y_val_pred = model.predict(X_val)

In [46]:
# Calculate accuracy on train dataset

val_accuracy = (y_val_pred == y_val).mean()
val_accuracy.round(2)

0.95

# Feature Elimination

In [58]:
feat_elim = ['year',
            'engine_hp',
            'transmission_type',
            'city_mpg']

In [64]:
# Original train 
elim_train_dict = df_train[feat_elim].to_dict(orient='records')

dv_elim = DictVectorizer(sparse=False)

X_train_elim = dv_elim.fit_transform(elim_train_dict)

model_elim = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model_elim.fit(X_train_elim, y_train)

y_train_elim_pred = model_elim.predict(X_train_elim)

elim_val_dict = df_val[feat_elim].to_dict(orient='records')
X_val_elim = dv_elim.transform(elim_val_dict)
y_val_elim_pred = model_elim.predict(X_val_elim)

print(f'Training accu:{(y_train_elim_pred == y_train).mean().round(3)}')
print(f'Validation accu:{(y_val_elim_pred == y_val).mean().round(3)}')

val_accu = (y_val_elim_pred == y_val).mean().round(3)

Training accu:0.868
Validation accu:0.885


In [66]:
for feat in feat_elim:
    cols = feat_elim.copy()
    cols.remove(feat)
    print(f'Removed {feat}')
    
    feat_dict = df_train[cols].to_dict(orient='records')
    
    dv_feat = DictVectorizer(sparse=False)

    X_train_feat = dv_feat.fit_transform(feat_dict)

    model_feat = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_feat.fit(X_train_feat, y_train)

    y_train_feat_pred = model_feat.predict(X_train_feat)

    feat_val_dict = df_val[cols].to_dict(orient='records')
    X_val_feat = dv_feat.transform(feat_val_dict)
    y_val_feat_pred = model_feat.predict(X_val_feat)

    print(f'Training accu:{(y_train_feat_pred == y_train).mean().round(3)}')
    print(f'Validation accu:{(y_val_feat_pred == y_val).mean().round(3)}')
    print(f'Diff: {abs(val_accu - (y_val_feat_pred == y_val).mean().round(3))}')
    print()
    print()

Removed year
Training accu:0.871
Validation accu:0.885
Diff: 0.0


Removed engine_hp
Training accu:0.748
Validation accu:0.744
Diff: 0.14100000000000001


Removed transmission_type
Training accu:0.87
Validation accu:0.882
Diff: 0.0030000000000000027


Removed city_mpg
Training accu:0.861
Validation accu:0.877
Diff: 0.008000000000000007




# Q6 Ridge Regression

In [67]:
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


In [83]:
ridge_df = data[[col for col in data.columns if col != 'above_average']]

In [85]:
# Apply logarithmic transformation to price
ridge_df['price_log'] = np.log1p(ridge_df.price)

In [87]:
del ridge_df['price']

In [88]:
ridge_df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price_log
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290483
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448744
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.739024
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.945018
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.832122
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.838031


In [89]:
# Ridge Regression Model
from sklearn.linear_model import Ridge

In [90]:
alpha = [0,
0.01,
0.1,
1,
10]

In [94]:
# Prepare the datasets
ridge_full_train, ridge_test = train_test_split(ridge_df, test_size=0.2, random_state=42)
ridge_train, ridge_val = train_test_split(ridge_full_train, test_size=0.25, random_state=42)

In [95]:
y_ridge_full = ridge_full_train['price_log'].values
y_train = ridge_train['price_log'].values
y_val = ridge_val['price_log'].values
y_test = ridge_test['price_log'].values

In [96]:
del ridge_full_train['price_log']
del ridge_train['price_log']
del ridge_val['price_log']
del ridge_test['price_log']

In [104]:
dv_ridge = DictVectorizer(sparse=True)

In [105]:
ridge_train_dicts = ridge_train.to_dict(orient='records')

In [106]:
X_train_ridge = dv_ridge.fit_transform(ridge_train_dicts)

In [108]:
model_ridge0 = Ridge(alpha=0, solver='sag', random_state=42)
model_ridge0.fit(X_train_ridge, y_train) 
y_pred0 = model_ridge0.predict(X_train_ridge)
y_pred0

array([10.33818432, 10.22257577, 12.16841953, ..., 10.52033959,
        7.43492033, 10.46655329])

In [109]:
def rmse(y, y_pred):
    error = y - y_pred
    se = error ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [113]:
rmse0 = rmse(y_train, y_pred0)
rmse0.round(3)

0.228

In [115]:
model_ridge0_1 = Ridge(alpha=0.1, solver='sag', random_state=42)
model_ridge0_1.fit(X_train_ridge, y_train) 
y_pred0_1 = model_ridge0_1.predict(X_train_ridge)
y_pred0_1 

array([10.33788472, 10.21835396, 12.17156837, ..., 10.51691878,
        7.44252748, 10.47136505])

In [116]:
rmse0_1 = rmse(y_train, y_pred0_1)
rmse0_1.round(3)

0.228

In [117]:
model_ridge0_01 = Ridge(alpha=0.01, solver='sag', random_state=42)
model_ridge0_01.fit(X_train_ridge, y_train) 
y_pred0_01 = model_ridge0_01.predict(X_train_ridge)
y_pred0_01 

array([10.320507  , 10.14708934, 12.23147484, ..., 10.54441413,
        7.45101931, 10.39646256])

In [118]:
rmse0_01 = rmse(y_train, y_pred0_01)
rmse0_01.round(3)

0.234

In [119]:
model_ridge1 = Ridge(alpha=1, solver='sag', random_state=42)
model_ridge1.fit(X_train_ridge, y_train) 
y_pred1 = model_ridge1.predict(X_train_ridge)
y_pred1 

array([10.31947281, 10.12478658, 12.24605698, ..., 10.51008022,
        7.51272067, 10.44859   ])

In [120]:
rmse1 = rmse(y_train, y_pred1)
rmse1.round(3)

0.237

In [121]:
model_ridge10 = Ridge(alpha=10, solver='sag', random_state=42)
model_ridge10.fit(X_train_ridge, y_train) 
y_pred10 = model_ridge10.predict(X_train_ridge)
y_pred10

array([10.30771493, 10.07349935, 12.32906546, ..., 10.37619539,
        7.78188588, 10.69367463])

In [122]:
rmse10 = rmse(y_train, y_pred10)
rmse10.round(3)

0.317