In [86]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [87]:
# Load the car listings; the 'MSRP' column is exposed as 'price' from here on.
cars_df = (
    pd.read_csv(r'data/data.csv')
    .rename(columns={"MSRP": "price"})
)

# Overview of the columns: dtypes and non-null counts.
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  price              11914 non-null  int64  
dtypes: float64(3), int64(5

In [88]:
# Normalise the column labels (spaces -> underscores, then lower-case) and replace
# every missing value with 0. The frame is rebound rather than mutated with
# inplace=True, so the cell reads as one explicit transformation.
# 'MSRP' was already renamed to 'price' when the data was loaded, so no further
# rename is needed here.
cars_df = (
    cars_df
    .rename(columns=lambda label: label.replace(' ', '_').lower())
    .fillna(0)
)

In [89]:
# Columns kept as model features.
useful_columns = [
    "make",
    'model',
    'year',
    'engine_hp',
    'engine_cylinders',
    'transmission_type',
    'vehicle_style',
    'highway_mpg',
    'city_mpg',
]

# Object-dtype columns are treated as categorical, everything else as numerical.
categorical_columns = [name for name in useful_columns if cars_df[name].dtype == 'object']
numerical_columns = [name for name in useful_columns if name not in categorical_columns]

print(categorical_columns)
print(numerical_columns)

['make', 'model', 'transmission_type', 'vehicle_style']
['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']


In [90]:
# Display the most frequent value of every column.
cars_df.mode()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,price
0,Chevrolet,Silverado 1500,2015,regular unleaded,200.0,4.0,AUTOMATIC,front wheel drive,4.0,0,Compact,Sedan,24,17,1385,2000


In [91]:
def calc_above_average_price(df: pd.DataFrame) -> pd.DataFrame:
    """Flag every row whose price is strictly above the mean price.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``price`` column that supports ``mean()`` and ``>``.

    Returns
    -------
    pd.DataFrame
        A new frame holding all columns of ``df`` plus a boolean
        ``above_average`` column. ``df`` itself is not modified, so calling
        the function never changes a frame that other cells already hold.

    Raises
    ------
    KeyError
        If ``df`` has no ``price`` column.
    """
    # The mean is taken over exactly the rows passed in.
    mean_price = df['price'].mean()
    return df.assign(above_average=df['price'] > mean_price)

# Attach the boolean 'above_average' target column to the frame.
cars_df = cars_df.pipe(calc_above_average_price)

In [92]:
# Re-inspect dtypes and non-null counts after the cleanup and the new target column.
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_fuel_type   11914 non-null  object 
 4   engine_hp          11914 non-null  float64
 5   engine_cylinders   11914 non-null  float64
 6   transmission_type  11914 non-null  object 
 7   driven_wheels      11914 non-null  object 
 8   number_of_doors    11914 non-null  float64
 9   market_category    11914 non-null  object 
 10  vehicle_size       11914 non-null  object 
 11  vehicle_style      11914 non-null  object 
 12  highway_mpg        11914 non-null  int64  
 13  city_mpg           11914 non-null  int64  
 14  popularity         11914 non-null  int64  
 15  price              11914 non-null  int64  
 16  above_average      119

In [93]:
# Pairwise correlations between the numerical feature columns.
correlation_matrix = cars_df.loc[:, numerical_columns].corr()

correlation_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [94]:
# Shuffle every row once; the fixed seed keeps the split reproducible.
shuffled_data = cars_df.sample(frac=1, random_state=42)

# 60 % train / 20 % validation; the test split receives whatever rows remain.
train_size = int(0.6 * len(shuffled_data))
val_size = int(0.2 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_size]
val_data = shuffled_data.iloc[train_size:train_size + val_size]
test_data = shuffled_data.iloc[train_size + val_size:]

# Pull the binary target out of each split as a NumPy array ...
y_train = train_data['above_average'].to_numpy()
y_val = val_data['above_average'].to_numpy()
y_test = test_data['above_average'].to_numpy()

# ... and remove it from the frames the features are taken from.
train_data = train_data.drop(columns='above_average')
val_data = val_data.drop(columns='above_average')
test_data = test_data.drop(columns='above_average')

In [95]:
# Row counts of the train, validation and test splits, in that order.
print(*(len(split) for split in (train_data, val_data, test_data)))

7148 2382 2384


In [96]:
# Shape of the feature sub-frame (categorical + numerical columns) in each split.
print(*(
    split[categorical_columns + numerical_columns].shape
    for split in (train_data, val_data, test_data)
))

(7148, 9) (2382, 9) (2384, 9)


In [97]:
from sklearn.metrics import mutual_info_score

# Mutual information between each categorical feature and the target,
# computed on the training rows only.
mi_scores = {
    name: mutual_info_score(train_data[name], y_train)
    for name in categorical_columns
}

mi_scores

{'make': 0.24107355650518433,
 'model': 0.46060863137526176,
 'transmission_type': 0.01906153802385519,
 'vehicle_style': 0.08209259666769804}

In [98]:
from sklearn.feature_extraction import DictVectorizer

In [99]:
# Turn each split's feature columns into a list of per-row dicts.
train_dict = train_data[categorical_columns + numerical_columns].to_dict(orient='records')
val_dict = val_data[categorical_columns + numerical_columns].to_dict(orient='records')

# The vectorizer learns its feature set from the training rows only; the
# validation rows are merely transformed with that fitted mapping.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [100]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier: liblinear solver, C=10 (inverse regularisation
# strength), at most 1000 iterations, fixed random_state.
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [101]:
# Fit the classifier on the vectorised training features (X_train) and their
# labels (y_train).

model.fit(X_train, y_train)

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [102]:
from sklearn.metrics import accuracy_score

# Predict the validation split with the fitted model and score it.
y_pred = model.predict(X_val)

# Keep the exact accuracy: the feature-elimination cell subtracts other
# accuracies from this value, and a baseline rounded to two decimals would shift
# every one of those differences by up to 0.005. Round only for display.
accuracy = accuracy_score(y_val, y_pred)

print(round(accuracy, 2))


0.92


In [103]:
all_features = categorical_columns + numerical_columns

# Maps each feature to: (accuracy with all features) - (accuracy without that feature).
feature_importance = {}

for c in all_features:
    # Copies of the train, val and test datasets without the feature 'c'
    c_train_data = train_data[all_features].drop(c, axis=1)
    c_val_data = val_data[all_features].drop(c, axis=1)
    c_test_data = test_data[all_features].drop(c, axis=1)

    print(c_train_data.shape, c_val_data.shape, c_test_data.shape)

    # Only the train and validation rows are turned into records: the test split
    # is never fed to a model in this loop, so converting it would be wasted work.
    c_train_dict = c_train_data.to_dict(orient='records')
    c_val_dict = c_val_data.to_dict(orient='records')

    # Fresh vectorizer per feature subset, fitted on the training rows only
    c_dv = DictVectorizer(sparse=False)
    X_train_c = c_dv.fit_transform(c_train_dict)
    X_val_c = c_dv.transform(c_val_dict)

    # Train a model on the reduced feature set
    model_c = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_c.fit(X_train_c, y_train)

    # Validation accuracy without 'c'. The predictions get their own name so the
    # full model's y_pred from the earlier cell is not overwritten.
    y_pred_c = model_c.predict(X_val_c)
    acc = accuracy_score(y_val, y_pred_c)

    # Difference between the accuracy with all features and the accuracy without 'c'
    feature_importance[c] = accuracy - acc



(7148, 8) (2382, 8) (2384, 8)


(7148, 8) (2382, 8) (2384, 8)
(7148, 8) (2382, 8) (2384, 8)
(7148, 8) (2382, 8) (2384, 8)
(7148, 8) (2382, 8) (2384, 8)
(7148, 8) (2382, 8) (2384, 8)
(7148, 8) (2382, 8) (2384, 8)
(7148, 8) (2382, 8) (2384, 8)
(7148, 8) (2382, 8) (2384, 8)


In [104]:
# Feature names ordered by ascending accuracy difference (smallest, possibly
# negative, difference first).
sorted_features = [
    name
    for name, _diff in sorted(feature_importance.items(), key=lambda item: item[1])
]

In [105]:
# The first entry of the ascending ordering is the feature with the smallest
# (signed) accuracy difference.
least_useful_feature = sorted_features[0]
print(least_useful_feature)

year


In [106]:
# Price instead of above_average now: regress on the natural log of the price.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Look the prices up in cars_df through each split's row labels instead of reading
# split.price: the splits lose their 'price' column just below, so reading it from
# them would make this cell fail when it is run a second time.
y_train = np.log(cars_df.loc[train_data.index, 'price'].values)
y_val = np.log(cars_df.loc[val_data.index, 'price'].values)
y_test = np.log(cars_df.loc[test_data.index, 'price'].values)

# Remove the raw price from the split frames; errors='ignore' keeps a re-run
# harmless once the column is already gone.
train_data = train_data.drop(columns='price', errors='ignore')
val_data = val_data.drop(columns='price', errors='ignore')
test_data = test_data.drop(columns='price', errors='ignore')

# Regularisation strengths to compare.
alpha = [0, 0.01, 0.1, 1, 10]

# Maps each alpha to the validation RMSE (in log-price units), rounded to 3 decimals.
rmse_scores = {}

# NOTE(review): the recorded RMSE is identical for every alpha. The 'sag' solver is
# only guaranteed to converge quickly on features of similar scale, and X_train is
# not scaled here, so the fits may be stopping before convergence - confirm by
# checking for a ConvergenceWarning.
for a in alpha:
    model = Ridge(alpha=a, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    rmse_scores[a] = round(rmse, 3)

rmse_scores



{0: 0.485, 0.01: 0.485, 0.1: 0.485, 1: 0.485, 10: 0.485}