# HW3
use car price dataset and do a classificiation model

### Data Preparation

In [139]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [140]:
selected_cols =[
'Make',
'Model',
'Year',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Vehicle Style',
'highway MPG',
'city mpg',
'MSRP'
]

In [141]:
df = pd.read_csv('data.csv')
df = df[selected_cols]
df.columns = df.columns.str.replace(' ', '_').str.lower()
df = df.fillna(0)
df.rename(columns={'msrp': 'price'}, inplace=True)
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


### Question 1

What is the most frequent observation (mode) for the column `transmission_type`?

In [142]:
df.transmission_type.value_counts().idxmax()

'AUTOMATIC'

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [143]:
numeric_cols = df.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()
corr_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [144]:
np.fill_diagonal(corr_matrix.values, np.nan)
max_corr = corr_matrix.unstack().idxmax()
max_corr_features = max_corr[0], max_corr[1]
correlation = corr_matrix[max_corr[0]][max_corr[1]]

print("Pair with the highest correlation:")
print(max_corr_features)
print("Correlation value:")
print(correlation)

Pair with the highest correlation:
('highway_mpg', 'city_mpg')
Correlation value:
0.8868294962591425


### Make price binary
* Let's create a variable `above_average` which is `1` if the `price` is above its mean value and `0` otherwise.

In [145]:
df2 = df.copy()
mean_price = df.price.mean()
df['above_average'] = np.where(df.price > mean_price, 1, 0)
df = df.drop('price', axis=1)

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value (`above_average`) is not in your dataframe.

In [146]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [147]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.2, random_state=42)
y_train = df_train.above_average.values
y_val = df_val.above_average.values
del df_train['above_average']
del df_val['above_average']

### Question 3

* Calculate the mutual information score between `above_average` and other categorical variables in our dataset. 
  Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the lowest mutual information score?
  
- `make`
- `model`
- `transmission_type`
- `vehicle_style`


In [149]:

def calculate_mi(series):
    return mutual_info_score(series, df_train_full.above_average)

categorical_cols = df_train.select_dtypes(include='object').columns
df_mi = df_train_full[categorical_cols].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=True).to_frame(name='Mutual Information')

display(df_mi.head())

Unnamed: 0,Mutual Information
transmission_type,0.020884
vehicle_style,0.08339
make,0.238724
model,0.460994


### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?


In [150]:
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)


In [151]:
X_train = dv.transform(train_dict)

In [152]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [153]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict_proba(X_val)[:, 1]

In [154]:
above_avg = y_pred > 0.5
acc = (y_val == above_avg).mean()
print(f"Accuracy: {round(acc,2)}" )

Accuracy: 0.95


### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

In [155]:
def feature_elimination(col_excluded):
    df_train_exclude = df_train.drop([col_excluded], axis=1)
    train_dict = df_train_exclude.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    X_train = dv.transform(train_dict)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    df_val_exclude = df_val.drop([col_excluded], axis=1)
    val_dict = df_val_exclude.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict_proba(X_val)[:, 1]
    above_avg = y_pred > 0.5
    acc = (y_val == above_avg).mean()
    acc_diff = abs(0.95 - acc)

    print(f"{col_excluded} feature excluded and accuracy difference is {acc_diff}")
    return acc_diff

In [156]:
col_list = ['year', 'engine_hp', 'transmission_type', 'city_mpg']
res = []
for col in col_list:
    acc = feature_elimination(col)
    res.append([col, acc])

year feature excluded and accuracy difference is 0.005060304142632366
engine_hp feature excluded and accuracy difference is 0.021316203460933325
transmission_type feature excluded and accuracy difference is 0.005060304142632366
city_mpg feature excluded and accuracy difference is 0.006633455689564749


In [157]:
sorted_data = sorted(res, key=lambda x: x[1])
print(sorted_data[0])

['year', 0.005060304142632366]



### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn.
* We'll need to use the original column `price`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data with a solver `'sag'`. Set the seed to `42`.
* This model also has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`.
* Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?


In [161]:
df = df2.copy()
df['price'] = np.log1p(df['price'])
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290483
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448744


In [182]:
categorical_cols = df.select_dtypes(include=['object']).columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('price')

In [184]:



X_train_num = df_train[numeric_cols].values
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_cat = ohe.fit_transform(df_train[categorical_cols].values)
X_train = np.column_stack([X_train_num, X_train_cat])

model.fit(X_train, y_train)



In [160]:

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.2, random_state=42)
y_train = df_train.price.values
y_val = df_val.price.values
del df_train['price']
del df_val['price']

In [164]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.discriminant_analysis import StandardScaler
model = Ridge(solver='sag', random_state=42)

In [185]:
alpha_values = [0, 0.01, 0.1, 1, 10]
for alpha in alpha_values:
    model.set_params(alpha=alpha)

    # train_dict = df_train.to_dict(orient='records')
    # dv = DictVectorizer(sparse=False)
    # dv.fit(train_dict)
    # X_train = dv.transform(train_dict)
    # model.fit(X_train, y_train)
    # X_train_num = df_train[numeric_cols].values
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(X_train_num)

    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_train_cat = ohe.fit_transform(df_train[categorical_cols].values)
    X_train = np.column_stack([X_train_num, X_train_cat])

    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    rmse_rounded = round(rmse, 3)
    print(f"RMSE score for alpha={alpha}: {rmse_rounded}")



RMSE score for alpha=0: 0.19




RMSE score for alpha=0.01: 0.191




RMSE score for alpha=0.1: 0.191




RMSE score for alpha=1: 0.204
RMSE score for alpha=10: 0.303
