In [1]:
#@ IMPORTING LIBRARIES AND DEPENDENCIES:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

In [2]:
#@ IMPORTING DATASET:
PATH = "./chapter-02-car-price_data.csv"
select_cols = ["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style", 
               "highway MPG", "city mpg", "MSRP"]
df = pd.read_csv(PATH, usecols=select_cols)

In [3]:
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [4]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

In [5]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11845 non-null  float64
 4   engine_cylinders   11884 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   msrp               11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [7]:
df['transmission_type'].mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

In [8]:
df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [9]:
df.fillna('0', inplace = True)

In [10]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [11]:
df.rename(columns = {'msrp':'price'}, inplace = True)

In [12]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [13]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [14]:
numericfeatures_df = df.copy()
numerifeatures_df = df.drop(["make","model","transmission_type","vehicle_style"],axis = 1)


In [15]:
numericfeatures_df.describe()

Unnamed: 0,year,highway_mpg,city_mpg,price
count,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,26.637485,19.733255,40594.74
std,7.57974,8.863001,8.987798,60109.1
min,1990.0,12.0,7.0,2000.0
25%,2007.0,22.0,16.0,21000.0
50%,2015.0,26.0,18.0,29995.0
75%,2016.0,30.0,22.0,42231.25
max,2017.0,354.0,137.0,2065902.0


In [16]:
numericfeatures_df.corr()

  numericfeatures_df.corr()


Unnamed: 0,year,highway_mpg,city_mpg,price
year,1.0,0.25824,0.198171,0.22759
highway_mpg,0.25824,1.0,0.886829,-0.160043
city_mpg,0.198171,0.886829,1.0,-0.157676
price,0.22759,-0.160043,-0.157676,1.0


In [17]:
numericfeatures_df.corr().unstack().sort_values(ascending = False)[:15]

  numericfeatures_df.corr().unstack().sort_values(ascending = False)[:15]


year         year           1.000000
highway_mpg  highway_mpg    1.000000
city_mpg     city_mpg       1.000000
price        price          1.000000
highway_mpg  city_mpg       0.886829
city_mpg     highway_mpg    0.886829
year         highway_mpg    0.258240
highway_mpg  year           0.258240
year         price          0.227590
price        year           0.227590
year         city_mpg       0.198171
city_mpg     year           0.198171
             price         -0.157676
price        city_mpg      -0.157676
highway_mpg  price         -0.160043
dtype: float64

In [18]:
df1 = df.copy()
mean = df1['price'].mean()
df1['above_average'] = np.where(df['price']>=mean,1,0)

In [19]:
df1 = df1.drop('price',axis=1)

In [20]:
df_train_full, df_test = train_test_split(df1, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [21]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [22]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [23]:
catvar = ['make','model','transmission_type','vehicle_style']

In [24]:

def calculate_mi(series):
    return mutual_info_score(series, df_train.above_average)


df_mi = df_train[catvar].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
model,0.462344
make,0.239769
vehicle_style,0.084143
transmission_type,0.020958


In [25]:
df_train = df_train.drop('above_average', axis=1)
df_val = df_val.drop('above_average', axis=1)
df_test = df_test.drop('above_average', axis=1)

In [26]:
numvar = ["year","highway_mpg","city_mpg","engine_hp"]
train_dict = df_train[catvar + numvar].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

In [27]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dict = df_val[catvar + numvar].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict(X_val)

accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(accuracy)

0.94


In [28]:
features = catvar + numvar
features

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'year',
 'highway_mpg',
 'city_mpg',
 'engine_hp']

In [29]:
orig_score = accuracy

for c in features:
    subset = features.copy()
    subset.remove(c)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    print(c, orig_score - score, score)

make 0.014695761644985206 0.9253042383550147
model 0.017633235417540916 0.922366764582459
transmission_type 0.014695761644985206 0.9253042383550147
vehicle_style -0.0012505245488879657 0.9412505245488879
year -0.006286193873269008 0.946286193873269
highway_mpg -0.005027276542173831 0.9450272765421738
city_mpg -0.0041879983214435645 0.9441879983214435
engine_hp 0.03693663449433482 0.9030633655056651


In [30]:
df['price']=np.log1p(df['price'])

In [31]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [32]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [33]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [34]:
del df_train['price']
del df_val['price']
del df_test['price']

In [35]:
train_dict = df_train[catvar + numvar].to_dict(orient='records')

In [43]:
dv = DictVectorizer(sparse=True)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

val_dict = df_val[catvar + numvar].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [44]:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print(a, round(score, 4))

0 0.2563
0.01 0.247
0.1 0.2518
1 0.275
10 0.3403


10 0.3468
