In [128]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [129]:
df = pd.read_csv('./data/data.csv')

In [130]:
columns = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg']

categorical = ['make', 'model', 'transmission_type', 'vehicle_style']
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

# df = df[columns]

In [131]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

# string_columns = list(df.dtypes[df.dtypes == 'object'].index)

# for col in string_columns:
#     df[col] = df[col].str.lower().str.replace(' ', '_')

In [132]:
df.isnull().sum()

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

In [133]:
df = df.fillna(0)

In [134]:
df.rename(columns={'msrp': 'price'}, inplace=True)

# Question 1

In [135]:
df['transmission_type'].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

# Question 2

In [136]:
df[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


Answer: `highway_mpg` and `city_mpg`

# Question 3

In [137]:
mean = df['price'].mean()
df['above_average'] = (df['price'] > mean).astype(int)

In [138]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [139]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [140]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [141]:

def calculate_mi(series):
    return round(mutual_info_score(series, df_train.above_average), 2)

df_mi = df_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


# Question 4

In [142]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
y_train = df_train['above_average']

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_val = df_val['above_average']

y_pred = model.predict(X_val)
accuracy = np.round(accuracy_score(y_val, y_pred), 2)
accuracy

0.93

# Question 5

In [143]:
features = categorical + numerical

for feature in features:
    
    copy = features.copy()
    copy.remove(feature)
    dv = DictVectorizer(sparse=False)
    
    train_dict = df_train[copy].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[copy].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)
    
    new_score = accuracy_score(y_val, y_pred)
    difference =  accuracy - new_score

    print(feature, ": ", difference)

make :  -0.015027276542173729
model :  0.005954678976080596
transmission_type :  -0.015866554762903884
vehicle_style :  -0.0028577423415862002
year :  -0.017545111204364194
engine_hp :  7.973143096939861e-05
engine_cylinders :  -0.015866554762903884
highway_mpg :  -0.01670583298363404
city_mpg :  -0.015866554762903884
