In [1]:
from pathlib import Path

import requests

# Remote CSV with the car-price data and the local path it is cached at.
url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"
data_path = Path("data/car-price.csv")

# A timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Create the target directory first: on a fresh checkout "data/" may not
    # exist yet, and writing the file would then fail with FileNotFoundError.
    data_path.parent.mkdir(parents=True, exist_ok=True)
    data_path.write_bytes(response.content)
else:
    print("Failed to download the file.")

In [2]:
import pandas as pd 

In [3]:
# Read the downloaded CSV into a DataFrame and preview the first rows.
car_price_csv = 'data/car-price.csv'
df = pd.read_csv(car_price_csv)
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
# Columns kept for the analysis; every other column of the raw file is dropped.
selected_columns = [
    'Make', 'Model', 'Year',
    'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style',
    'highway MPG', 'city mpg',
    'MSRP',
]

# Narrow the frame to exactly those columns, in the order listed above.
df = df.loc[:, selected_columns]
df.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [5]:
# One pass over the frame:
#   1. column labels: spaces -> underscores, then lowercase
#   2. missing values filled with 0
#   3. the (now lowercase) 'msrp' column renamed to 'price'
df = (
    df.rename(columns=lambda label: label.replace(' ', '_').lower())
      .fillna(0)
      .rename(columns={'msrp': 'price'})
)

df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [6]:
# Cast the model year to string.
df['year'] = df['year'].astype(str)

# Pin the column order explicitly.
column_order = [
    'make', 'model', 'year',
    'engine_hp', 'engine_cylinders',
    'transmission_type', 'vehicle_style',
    'highway_mpg', 'city_mpg',
    'price',
]
df = df[column_order]

# Columns whose dtype is 'object' are the ones treated as categorical.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Give the text values the same shape as the column names:
# lowercase, with underscores instead of spaces.
for column in categorical_columns:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

# Missing-value count per column.
df.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [7]:
# Most frequent value of 'transmission_type': take the first entry returned by mode().
transmission_modes = df['transmission_type'].mode()
mode_transmission = transmission_modes.iloc[0]

print("The most frequent Transmission Type is:", mode_transmission)

The most frequent Transmission Type is: automatic


In [8]:
# Pairwise correlation between the numeric columns of df.
numerical_features = df.select_dtypes(include=['number'])
correlation_matrix = numerical_features.corr()

# Flatten the matrix into (feature_a, feature_b) -> |correlation|, strongest first.
max_corr = correlation_matrix.abs().unstack().sort_values(ascending=False)

# Drop self-correlations by comparing the two index labels rather than testing
# the value against 1: a float equality test would keep a diagonal entry that
# comes out as 0.999..., and would discard two *different* features that happen
# to be perfectly correlated.
is_cross_pair = [first != second for first, second in max_corr.index]

# The flattened matrix holds every pair twice (a/b and b/a), so the top two
# rows are normally the same pair seen from both sides.
top_correlation = max_corr[is_cross_pair].head(2)

# Print the two features with the highest correlation and their correlation coefficient
print("Two features with the highest correlation:")
print(top_correlation)

Two features with the highest correlation:
city_mpg     highway_mpg    0.886829
highway_mpg  city_mpg       0.886829
dtype: float64


In [9]:
# Average price over the whole dataset; the binary target built later in the
# notebook compares each car's price against this value.
mean_price = df.price.mean()

df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500


In [10]:
# Feature name groups. `numerical` is the list iterated by the
# leave-one-feature-out loop later in the notebook.
# NOTE(review): 'year' is listed as numerical here, but an earlier cell casts it
# with astype(str) — confirm which treatment is intended.
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [11]:
from sklearn.model_selection import train_test_split

# Features: every column except the price. Target: 1 when the price is above
# the dataset mean, 0 otherwise.
X = df.drop(columns=['price']).copy()
y = df['price'].gt(mean_price).astype(int)

# 60% train / 20% validation / 20% test: hold out 40% first, then cut that
# hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [12]:
# Shapes of the feature matrix and target vector for train, validation and test.
for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features.shape, target.shape)

(7148, 9) (7148,)
(2383, 9) (2383,)
(2383, 9) (2383,)


In [13]:
from sklearn.metrics import mutual_info_score

def mutual_info_price_score(series, target):
    """Return the mutual information between `series` and `target`, rounded to 2 decimals.

    Both arguments are passed straight to `mutual_info_score`.
    """
    score = mutual_info_score(series, target)
    return round(score, 2)

# Score each categorical feature of the training set against the target,
# keeping (score, feature) pairs in a fixed order.
scored_features = [
    (mutual_info_price_score(X_train[feature], y_train), feature)
    for feature in ('make', 'model', 'transmission_type', 'vehicle_style')
]
make_score, model_score, transmission_score, vehicle_style_score = (
    score for score, _name in scored_features
)

# Pair with the lowest score; on a tie the first pair in the list wins.
lowest_score_feature = min(scored_features, key=lambda pair: pair[0])

# Report every score, then the weakest feature.
labels = ("Make", "Model", "Transmission type", "Vehicle style")
for label, (score, _name) in zip(labels, scored_features):
    print(f"{label} score: {score}")
print(f"Feature with the lowest mutual information score: {lowest_score_feature[1]}")

Make score: 0.24
Model score: 0.46
Transmission type score: 0.02
Vehicle style score: 0.08
Feature with the lowest mutual information score: transmission_type


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One-hot encode the categorical columns (first level of each dropped).
X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Hold out 20% as the test set, then take 25% of the remainder as validation
# (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

# Fit the baseline classifier and score it on the validation set.
model = LogisticRegression(
    solver='liblinear', C=10, max_iter=1000, random_state=42
).fit(X_train, y_train)
y_pred = model.predict(X_val)

# Baseline accuracy, rounded to 4 decimals; used as the reference value by the
# feature-elimination cell further down.
original_accuracy = round(accuracy_score(y_val, y_pred), 4)
print(f"Accuracy on the validation dataset: {original_accuracy}")

Accuracy on the validation dataset: 0.9442


In [15]:
# Show the column labels currently held in the feature frame X.
X.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg'],
      dtype='object')

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Leave-one-feature-out check: for each feature in `numerical`, retrain the
# baseline logistic regression without that feature and record how far the
# validation accuracy moves from `original_accuracy`.
#
# Everything created inside the loop uses loop-specific names, so the
# notebook-level `categorical_columns`, the train/val/test split, `model` and
# `y_pred` keep the values earlier cells gave them instead of being silently
# replaced by whatever the last iteration produced.
err = {}  # feature name -> |baseline accuracy - accuracy without the feature|

for feature in numerical:
    reduced = X.drop(columns=[feature])
    reduced_categorical = list(reduced.dtypes[reduced.dtypes == 'object'].index)
    reduced_encoded = pd.get_dummies(reduced, columns=reduced_categorical, drop_first=True)

    # Same 60/20/20 split recipe and seed as the baseline cell; the test part
    # of the first split is not needed here.
    first_split = train_test_split(reduced_encoded, y, test_size=0.2, random_state=42)
    full_train_X, full_train_y = first_split[0], first_split[2]
    train_X, val_X, train_y, val_y = train_test_split(
        full_train_X, full_train_y, test_size=0.25, random_state=42
    )

    reduced_model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    reduced_model.fit(train_X, train_y)
    reduced_accuracy = round(accuracy_score(val_y, reduced_model.predict(val_X)), 4)

    err[feature] = np.abs(original_accuracy - reduced_accuracy)

# Differences ordered from smallest to largest.
sorted_dict = dict(sorted(err.items(), key=lambda item: item[1]))

# Find the feature with the smallest difference
min_diff_feature = min(sorted_dict, key=sorted_dict.get)
min_diff_accuracy = sorted_dict[min_diff_feature]

print(f"The feature with the smallest difference is '{min_diff_feature}' with a difference of {min_diff_accuracy:.4f}")

The feature with the smallest difference is 'city_mpg' with a difference of 0.0000


In [17]:
def rmse(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`.

    Both inputs are converted to float NumPy arrays first, so plain lists and
    tuples are accepted, and two pandas Series are compared position by
    position instead of being aligned on their index labels.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must be broadcastable against `y`.

    Returns
    -------
    numpy.float64
        Square root of the mean of the squared differences.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    error = y - y_pred
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [18]:
from sklearn.linear_model import Ridge
import numpy as np
from sklearn.metrics import mean_squared_error

# Regression set-up: predict log1p(price) from the numeric columns only.
# The columns are selected by dtype straight from df, so this cell does not
# depend on the notebook-level `categorical_columns` list, which an earlier
# cell reassigns inside a loop.
X = df.select_dtypes(include='number').drop(columns=['price']).copy()  # Features
y = np.log1p(df['price'])  # Target

# 60/20/20 split: hold out 20% for test, then 25% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

alpha_values = [0, 0.01, 0.1, 1, 10]
rmse_scores = {}  # alpha -> validation RMSE rounded to 3 decimals

for alpha in alpha_values:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Stored as `val_rmse`, not `rmse`, so the rmse() helper defined earlier in
    # the notebook is not rebound to a float and stays callable.
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores[alpha] = round(val_rmse, 3)

# Find the alpha with the best RMSE. min() returns the first minimum in
# insertion order, so ties go to the smallest alpha (alpha_values is ascending).
best_alpha = min(rmse_scores, key=rmse_scores.get)
best_rmse = rmse_scores[best_alpha]

print(f"The best alpha for Ridge Regression is {best_alpha} with RMSE = {best_rmse}")

The best alpha for Ridge Regression is 0 with RMSE = 0.759
