In [4]:
import pandas as pd


# Read the raw dataset from the CSV file next to the notebook
df = pd.read_csv('data.csv')

# Print the column labels as loaded, before any renaming
print(df.columns)

# Preview the first five rows
df.head(5)

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


## Preprocess

In [5]:
# Columns used in the rest of the notebook; every other column is dropped.
cols_to_keep = ['Make', 'Model', 'Year', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Vehicle Style',
       'highway MPG', 'city mpg', 'MSRP']

# Take an explicit copy so that `df` owns its data. Without it, the later
# in-place fill and the new-column assignment operate on a selection of the
# original frame, which can raise pandas' SettingWithCopyWarning.
df = df[cols_to_keep].copy()

In [8]:
# Normalise column labels: lower-case them and replace spaces with underscores
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [10]:
# Replace every missing value with 0. Rebinding `df` instead of using
# `inplace=True` avoids mutating a frame that was derived by column selection
# (a SettingWithCopyWarning source) and gives `df` a fresh, independent frame.
df = df.fillna(0)

In [14]:
# Turn the price into a binary target:
# 1 when a car's MSRP is strictly greater than the mean MSRP, 0 otherwise
average_price = df['msrp'].mean()
df['above_average'] = df['msrp'].gt(average_price).astype(int)

df.head()


Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


## Split the data

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the remaining 80% is "train + validation"
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Take 25% of that 80% as validation, giving a 60/20/20 train/validation/test split
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=1)

# Report the size of each partition
for label, subset in (("Train:", df_train), ("Validation:", df_val), ("Test:", df_test)):
    print(label, len(subset))


Train: 7148
Validation: 2383
Test: 2383


### Question 1: ROC AUC feature importance

In [16]:
from sklearn.metrics import roc_auc_score

# Numerical features whose raw values are used directly as prediction scores
numerical_vars = ["engine_hp", "engine_cylinders", "highway_mpg", "city_mpg"]

# Maps feature name -> ROC AUC measured on the training split
auc_scores = {}

train_target = df_train['above_average']
for feature in numerical_vars:
    raw_auc = roc_auc_score(train_target, df_train[feature])

    # An AUC under 0.5 means the feature ranks the classes in reverse,
    # so it is scored again with its sign flipped
    auc_scores[feature] = (
        roc_auc_score(train_target, -df_train[feature]) if raw_auc < 0.5 else raw_auc
    )

# Pick the feature with the largest AUC (the first one wins on ties)
max_var = max(auc_scores.items(), key=lambda item: item[1])[0]

print(auc_scores)
print(f"The variable with the highest AUC is: {max_var}")


{'engine_hp': 0.9171031265539011, 'engine_cylinders': 0.766116490165669, 'highway_mpg': 0.6330587871772013, 'city_mpg': 0.6734244643245233}
The variable with the highest AUC is: engine_hp
