## Imports

In [19]:
from IPython.display import display

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline

### Utilities

In [20]:
def numerical_features(df: pd.DataFrame):
    """Return the names of all columns whose dtype is not ``object``.

    The result is a plain list, in the frame's column order.
    """
    return [name for name, dtype in zip(df.columns, df.dtypes) if dtype != 'object']

def categorical_features(df: pd.DataFrame):
    """Return the names of all columns whose dtype is ``object``.

    The result is a plain list, in the frame's column order.
    """
    return [name for name, dtype in zip(df.columns, df.dtypes) if dtype == 'object']

In [21]:
def validation_testing_training_full_split(dataframe: pd.DataFrame, seed: int = 42, validation: float = 0.2, testing: float = 0.2):
    """Shuffle and split ``dataframe`` into validation, testing, training and full parts.

    The frame is first split into a testing part and a "full" part; the
    "full" part is then split again into training and validation parts.
    Both splits use the same ``seed``.

    Parameters:
        dataframe: the frame to split.
        seed: value passed as ``random_state`` to both splits.
        validation: fraction of the *whole* frame to use for validation.
        testing: fraction of the *whole* frame to use for testing.

    Returns:
        ``(df_validation, df_testing, df_training, df_full)``, each with a
        fresh ``0..n-1`` index. ``df_full`` is the part that was split into
        training and validation.

    Raises:
        AssertionError: if either fraction is not positive or the two
            fractions do not sum to less than 1.
    """
    assert 0 < validation and 0 < testing and 1 > (validation + testing), \
        'validation and testing must be positive fractions that sum to less than 1'

    # The second split operates on what is left after the test rows were removed,
    # so the validation fraction has to be rescaled relative to that remainder.
    validation_of_full = validation / (1 - testing)

    df_full, df_testing = train_test_split(
        dataframe, test_size=testing, random_state=seed, shuffle=True
    )
    df_training, df_validation = train_test_split(
        df_full, test_size=validation_of_full, random_state=seed, shuffle=True
    )

    df_validation = df_validation.reset_index(drop=True)
    df_testing = df_testing.reset_index(drop=True)
    df_training = df_training.reset_index(drop=True)
    df_full = df_full.reset_index(drop=True)

    return df_validation, df_testing, df_training, df_full

In [22]:
def y_split(dataframe: pd.DataFrame, yColumn: str, drop: list[str] = []):
    """Separate the target column from the feature columns.

    Parameters:
        dataframe: source frame; it is not modified.
        yColumn: name of the target column.
        drop: additional columns to leave out of the returned features.
            May include ``yColumn`` itself. The default list is never mutated.

    Returns:
        ``(X, y)`` where ``X`` is a copy of ``dataframe`` without ``yColumn``
        and without the ``drop`` columns, and ``y`` is a copy of the target
        column.

    Raises:
        AssertionError: if ``yColumn`` or any of ``drop`` is not a column of
            ``dataframe``.
    """
    columns = set(dataframe.columns)
    assert yColumn in columns, f'{yColumn} not found in dataframe'
    assert columns.issuperset(drop), f'At least one of {drop} not found in dataframe'

    # A set makes the removal insensitive to duplicates (e.g. yColumn also
    # listed in `drop`) and to the concrete sequence type of `drop`.
    removed = set(drop) | {yColumn}
    kept = [col for col in dataframe.columns if col not in removed]

    # Explicit copies keep the results independent of the caller's frame.
    y = dataframe[yColumn].copy()
    X = dataframe[kept].copy()

    return X, y

In [23]:
def regularize(X, r=0.000000001):
    """Return ``X`` with ``r`` added to every diagonal entry.

    The added term is an identity matrix of size ``X.shape[0]`` scaled by ``r``.
    """
    size = X.shape[0]
    ridge = np.eye(size) * r
    return X + ridge

In [24]:
def display_predictive_features_for_target(dataframe: pd.DataFrame, target: str, categorical = []):
    """Display, per categorical feature, how each group's target mean compares to the global mean.

    For every column name in ``categorical`` the frame is grouped by that
    column and a table is displayed with one row per group and the columns:

    - ``mean``:  mean of ``target`` within the group
    - ``count``: number of non-null ``target`` values in the group
    - ``diff``:  group mean minus the global mean of ``target``
    - ``risk``:  group mean divided by the global mean of ``target``

    Parameters:
        dataframe: frame containing ``target`` and the categorical columns.
        target: name of the target column.
        categorical: names of the columns to group by. The default list is
            never mutated.
    """
    global_target = dataframe[target].mean()
    for c in categorical:
        # A list of aggregations yields a DataFrame with one column per
        # aggregation; passing them as separate positional arguments does not.
        df_group = dataframe.groupby(c)[target].agg(['mean', 'count'])
        # Use item access: attribute access (`df_group.mean`) resolves to the
        # DataFrame.mean method rather than the 'mean' column.
        df_group['diff'] = df_group['mean'] - global_target
        df_group['risk'] = df_group['mean'] / global_target
        display(df_group)

In [25]:
def sigmoid():
    """Placeholder: not implemented yet; takes no arguments and returns ``None``."""
    # TODO: implement, or remove if it stays unused.
    return None

## Data Preparation

In [26]:
# Load the raw data and normalise the header: lower-case, spaces replaced by underscores.
df = pd.read_csv('./car-prices.csv')
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


### Drop excluded features (see homework instructions)

In [27]:
# Columns retained for the homework. `msrp` is the target source and is renamed
# to `price` below; `price` is listed as well so that re-running this cell does
# not discard the already-renamed column.
kept_columns = [
    'make', 'model', 'year', 'engine_hp', 'engine_cylinders',
    'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
    'msrp', 'price',
]

# Select in the frame's own column order, then rename the target.
df = df[[col for col in df.columns if col in kept_columns]].rename(columns={'msrp': 'price'})

df.head().T

Unnamed: 0,0,1,2,3,4
make,BMW,BMW,BMW,BMW,BMW
model,1 Series M,1 Series,1 Series,1 Series,1 Series
year,2011,2011,2011,2011,2011
engine_hp,335.0,300.0,300.0,230.0,230.0
engine_cylinders,6.0,6.0,6.0,6.0,6.0
transmission_type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
vehicle_style,Coupe,Convertible,Coupe,Coupe,Convertible
highway_mpg,26,28,28,28,28
city_mpg,19,19,20,18,18
price,46135,40650,36350,29450,34500


### Fill with zeros (0) (see homework instructions)

In [28]:
# Replace every missing value with 0, then confirm no nulls remain in any column.
df = df.fillna(0)
df.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

### Question 1

In [29]:
# Most frequent value(s) of `transmission_type`.
df['transmission_type'].mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

What is the most frequent observation (mode) for the column `transmission_type`?

- `AUTOMATIC`

## Data Exploration

### Question 2

In [30]:
# Feature pairs whose pairwise correlation is compared for this question.
feature_combinations = [
    ('engine_hp', 'year'),
    ('engine_hp', 'engine_cylinders'),
    ('highway_mpg', 'engine_cylinders'),
    ('highway_mpg', 'city_mpg'),
]

for first, second in feature_combinations:
    print(f'{first} correlates to {second} : ')
    correlation = df[first].corr(df[second])
    display(correlation)


engine_hp correlates to year : 


0.338714184762447

engine_hp correlates to engine_cylinders : 


0.7748509807813191

highway_mpg correlates to engine_cylinders : 


-0.6145414173953333

highway_mpg correlates to city_mpg : 


0.8868294962591363

What are the two features that have the biggest correlation in this dataset?

- `highway_mpg` and `city_mpg`

### Make `price` binary (-> `above_average`)

In [31]:
# Binary target: 1 when the price is at or above the mean price, otherwise 0.
mean_price = df['price'].mean()
df['above_average'] = (df['price'] >= mean_price).astype(int)
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


### Split the data

#### Training, Testing, Validation, & Full (Training + Validation)

In [32]:
df_val, df_test, df_train, df_full = validation_testing_training_full_split(df)

# Row counts of the whole frame and of each part.
nTotal, nVal, nTest, nTrain, nFull = (
    len(part) for part in (df, df_val, df_test, df_train, df_full)
)

# Share of the whole frame held by each part (sanity check of the split sizes).
tuple(round(count / nTotal, 1) for count in (nVal, nTest, nTrain, nFull)) + (round(nTotal / nTotal),)

(0.2, 0.2, 0.6, 0.8, 1)

#### Split out `y` = target feature `above_average`, dropping `price`, from all datasets 

In [33]:
# Separate the target from the features in every part; `price` is dropped as well.
df_val, y_val = y_split(df_val, yColumn='above_average', drop=['price'])
df_test, y_test = y_split(df_test, yColumn='above_average', drop=['price'])
df_train, y_train = y_split(df_train, yColumn='above_average', drop=['price'])
df_full, y_full = y_split(df_full, yColumn='above_average', drop=['price'])

# All feature frames must have the same number of columns.
assert len({part.shape[1] for part in (df_val, df_test, df_train, df_full)}) == 1

In [34]:
# Preview the first rows of the validation features.
df_val.head(n=5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,Volkswagen,Beetle,2015,210.0,4.0,MANUAL,2dr Hatchback,31,23
1,Audi,SQ5,2015,354.0,6.0,AUTOMATIC,4dr SUV,24,17
2,Pontiac,Grand Am,2005,140.0,4.0,AUTOMATIC,Sedan,31,22
3,Nissan,350Z,2009,306.0,6.0,MANUAL,Convertible,24,17
4,Ford,E-150,1996,199.0,6.0,AUTOMATIC,Passenger Van,15,11


### Question 3

In [35]:
# Object-dtype columns of the training features.
categorical_features(df=df_train)

['make', 'model', 'transmission_type', 'vehicle_style']

In [44]:
# Mutual information between each categorical feature and the training target, rounded to 2 decimals.
for feat in categorical_features(df_train):
    score = round(mutual_info_score(df_train[feat], y_train), 2)
    display(f'{feat} correlates with y_train: {score}')

'make correlates with y_train: 0.24'

'model correlates with y_train: 0.46'

'transmission_type correlates with y_train: 0.02'

'vehicle_style correlates with y_train: 0.08'