In [303]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [304]:
df = pd.read_csv('../data/01_data.csv')

In [305]:
df.shape

(11914, 16)

In [306]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [307]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [308]:
df.head().T

Unnamed: 0,0,1,2,3,4
Make,BMW,BMW,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series,1 Series,1 Series
Year,2011,2011,2011,2011,2011
Engine Fuel Type,premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required)
Engine HP,335.0,300.0,300.0,230.0,230.0
Engine Cylinders,6.0,6.0,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
Driven_Wheels,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive
Number of Doors,2.0,2.0,2.0,2.0,2.0
Market Category,"Factory Tuner,Luxury,High-Performance","Luxury,Performance","Luxury,High-Performance","Luxury,Performance",Luxury


In [309]:
df.columns = df.columns.str.replace(' ','_').str.lower()

In [310]:
selected_columns = ['make','model','year','engine_hp','engine_cylinders','transmission_type','vehicle_style','highway_mpg','city_mpg']

In [311]:
#df[selected_columns].fillna(0,inplace=True)
df.engine_cylinders = df.engine_cylinders.fillna(0)
df.engine_hp = df.engine_hp.fillna(0)

In [312]:
df.isna().sum()

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp               0
engine_cylinders        0
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

In [313]:
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [314]:
df.rename(columns={"msrp": "price"},inplace=True)

In [315]:
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'price'],
      dtype='object')

In [316]:
df['transmission_type'].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

### Answer 1: AUTOMATIC           

In [317]:
df[['year','engine_cylinders','engine_hp']].corrwith(df.engine_hp)

year                0.338714
engine_cylinders    0.774851
engine_hp           1.000000
dtype: float64

In [318]:
df[['engine_cylinders','city_mpg','highway_mpg']].corrwith(df.highway_mpg )

engine_cylinders   -0.614541
city_mpg            0.886829
highway_mpg         1.000000
dtype: float64

### Answer 2: city_mpg, highway_mpg

In [319]:
df['price'].describe().round(3)

count      11914.000
mean       40594.737
std        60109.104
min         2000.000
25%        21000.000
50%        29995.000
75%        42231.250
max      2065902.000
Name: price, dtype: float64

In [320]:
df['price_above_average'] = (df.price>=df.price.mean()).astype(int)

In [321]:
df.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,price,price_above_average
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135,1
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650,1
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350,0
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450,0
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500,0


In [322]:
from sklearn.model_selection import train_test_split

In [359]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [360]:
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=42)

In [361]:
y_train = df_train['price_above_average']
y_val = df_val['price_above_average']
y_test = df_test['price_above_average']

In [362]:
del df_train['price_above_average']
del df_val['price_above_average']
del df_test['price_above_average']

In [363]:
df_train = df_train[selected_columns].copy()
df_val  = df_val[selected_columns].copy()
df_test  = df_val[selected_columns].copy()

In [364]:
df_train.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
2575,Honda,Civic,2016,174.0,4.0,AUTOMATIC,Sedan,42,31
3794,Ford,Edge,2015,245.0,4.0,AUTOMATIC,4dr SUV,30,20
8685,Lexus,RX 400h,2007,268.0,6.0,AUTOMATIC,4dr SUV,25,27
4476,Ford,F-150,2015,325.0,6.0,AUTOMATIC,Extended Cab Pickup,26,19
991,Volvo,940,1994,114.0,4.0,AUTOMATIC,Wagon,24,17


In [365]:
df_train_full['price_above_average'].mean()

0.2767810303221068

In [366]:
from sklearn.metrics import mutual_info_score

In [367]:
def mutual_info_churn_score(serie):
    return mutual_info_score(serie,df_train_full.price_above_average)

In [368]:
mi = df_train_full[selected_columns].apply(mutual_info_churn_score)

In [369]:
mi.round(2).sort_values(ascending=False)

model                0.46
engine_hp            0.36
make                 0.24
engine_cylinders     0.12
vehicle_style        0.08
year                 0.07
city_mpg             0.06
highway_mpg          0.04
transmission_type    0.02
dtype: float64

### Answer 3: model

In [370]:
from sklearn.feature_extraction import DictVectorizer

In [371]:
train_dicts = df_train[selected_columns].to_dict(orient='records')

In [372]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

In [373]:
dv.get_feature_names_out()

array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
       'make=Acura', 'make=Alfa Romeo', 'make=Aston Martin', 'make=Audi',
       'make=BMW', 'make=Bentley', 'make=Bugatti', 'make=Buick',
       'make=Cadillac', 'make=Chevrolet', 'make=Chrysler', 'make=Dodge',
       'make=FIAT', 'make=Ferrari', 'make=Ford', 'make=GMC',
       'make=Genesis', 'make=HUMMER', 'make=Honda', 'make=Hyundai',
       'make=Infiniti', 'make=Kia', 'make=Lamborghini', 'make=Land Rover',
       'make=Lexus', 'make=Lincoln', 'make=Lotus', 'make=Maserati',
       'make=Maybach', 'make=Mazda', 'make=McLaren', 'make=Mercedes-Benz',
       'make=Mitsubishi', 'make=Nissan', 'make=Oldsmobile',
       'make=Plymouth', 'make=Pontiac', 'make=Porsche',
       'make=Rolls-Royce', 'make=Saab', 'make=Scion', 'make=Spyker',
       'make=Subaru', 'make=Suzuki', 'make=Tesla', 'make=Toyota',
       'make=Volkswagen', 'make=Volvo', 'model=1 Series', 'model=100',
       'model=124 Spider', 'model=190-Class', 'model

In [374]:
X_train = dv.transform(train_dicts)

In [375]:
from sklearn.linear_model import LogisticRegression

In [376]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [377]:
val_dicts = df_val[selected_columns].to_dict(orient='records')
#dv.fit(val_dicts)

In [378]:
X_val = dv.transform(val_dicts)

In [379]:
y_val_pred = model.predict(X_val)

In [380]:
model.intercept_[0]

-0.04954342766963281

In [381]:
model.coef_[0].round(3)

array([ 2.700e-02, -8.900e-02,  3.300e-02,  7.800e-02,  8.080e-01,
        3.560e-01,  1.700e-01,  2.372e+00,  1.880e+00,  6.200e-02,
        0.000e+00, -4.720e-01,  1.874e+00, -1.326e+00, -9.680e-01,
       -2.489e+00, -1.660e-01,  1.050e-01, -1.264e+00, -7.010e-01,
        1.160e-01,  2.900e-02, -1.101e+00, -1.635e+00,  1.580e-01,
       -1.137e+00,  2.000e-03,  1.398e+00,  1.217e+00,  1.002e+00,
        1.488e+00,  3.110e-01,  1.000e-03, -7.900e-01,  0.000e+00,
        1.147e+00, -6.270e-01, -1.048e+00, -4.080e-01, -1.500e-02,
       -9.880e-01,  1.073e+00,  2.450e-01,  7.270e-01, -7.800e-02,
        7.600e-02, -1.113e+00, -7.040e-01,  8.960e-01, -5.450e-01,
       -8.410e-01,  8.510e-01, -1.390e-01, -3.700e-02, -1.200e-02,
       -2.000e-03, -2.000e-03, -1.590e-01, -2.490e-01, -1.000e-03,
       -1.000e-03, -1.000e-03, -9.300e-02,  1.880e-01,  3.190e-01,
       -2.390e-01, -6.000e-02, -3.500e-02, -5.700e-02, -3.300e-02,
       -0.000e+00, -2.000e-03, -3.670e-01,  4.900e-02, -1.870e

In [382]:
model.predict_proba(X_val)[:,1]

array([0.00682842, 0.99140752, 0.00288249, ..., 0.09552365, 0.16896004,
       0.09710876])

In [383]:
all_features_accuracy = model.score(X_val,y_val)

In [384]:
#y_val == y_val_pred).astype(int)
y_val.sum(),y_val_pred.sum()

(862, 856)

In [385]:
(y_val == y_val_pred).mean().round(2)

0.92

In [386]:
differences = dict()
total_features = selected_columns
for feature in total_features:
    #print(feature)
    
    df_train_copy = df_train[selected_columns].copy()
    del df_train_copy[feature]
    
    df_val_copy = df_val[selected_columns].copy()
    del df_val_copy[feature]
    
    dv = DictVectorizer(sparse=False)
    train_dict = df_train_copy.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val_copy.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    #print(dv.get_feature_names())
    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model.fit(X_train, y_train)
    
    #y_pred = model.predict_proba(X_val)[:, 1]
    #above_average_decision = (y_pred >= 0.5)
    #no_feature_accuracy = (above_average_val == above_average_decision).mean()
    no_feature_accuracy = model.score(X_val, y_val)
    differences[feature] = round(all_features_accuracy - no_feature_accuracy, 4)
    
for key, value in differences.items():
    print(f"{key}:{value}")

make:-0.0168
model:0.0051
year:-0.0242
engine_hp:0.0013
engine_cylinders:-0.0223
transmission_type:-0.013
vehicle_style:-0.0111
highway_mpg:-0.0159
city_mpg:-0.0156


### Answer 5 engine_hp :0.0013

In [387]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [388]:
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)
y_test = np.log1p(y_test)

In [389]:
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [390]:
from sklearn.linear_model import Ridge

In [391]:
results = dict()
best_rmse = None
best_rmse_arg = None

for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    raw_rmse = rmse(y_val, y_pred)
    if best_rmse is None:
        best_rmse = raw_rmse
        best_rmse_arg = a
    elif raw_rmse < best_rmse:
        best_rmse = raw_rmse
        best_rmse_arg = a
    results[a] = round(raw_rmse, 3)

print(f"[ANSWER-6] Best RMSE is {round(best_rmse, 3)} for r value: {best_rmse_arg}")

[ANSWER-6] Best RMSE is 0.146 for r value: 0.1
