# Set Up

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from constants import models
from datetime import datetime
import pickle
import gzip

In [None]:
# Read vehicles.csv from the working directory into a DataFrame.
raw_data = pd.read_csv('vehicles.csv')

In [None]:
# Keep only the columns used downstream and discard any row that has a
# missing value in one of them.
columns = [
    'price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
    'fuel', 'odometer', 'title_status', 'transmission', 'drive',
]
df = raw_data[columns].dropna()

# Cleaning Data

One of the biggest challenges with this dataset was identifying the correct make and model of each car. These features will be very important. However, because the model field is free text, the same car appears under many spellings (e.g. "f150" vs. "f-150", or "rav4" vs. "rav4 se"), which made this hard. I decided to go through the data manually, create a list of the most common models, and save it in constants.py.

In [None]:
def clean_model(text):
    """Normalize a free-text model name so spelling variants compare equal.

    Missing values become an empty string. Otherwise the text is lower-cased,
    every character other than a-z, 0-9 and space is removed, runs of spaces
    are collapsed to a single space, and leading/trailing spaces are trimmed.
    """
    if pd.isnull(text):
        return ""
    kept = re.sub(r'[^a-z0-9 ]', '', text.lower())
    # Only letters, digits and spaces remain, so split/join collapses the spaces.
    return ' '.join(kept.split())
df['cleaned_model'] = df['model'].apply(clean_model)

# Label each row with a known base model; rows matching nothing stay 'other'.
# When several names match the same row, the one listed last in `models` wins
# because each pass overwrites earlier assignments.
df['base_model'] = 'other'
# The loop variable is deliberately not called `model`: that name is reused for
# the fitted estimator further down, and re-running this cell would clobber it.
for known_model in models:
    mask = df['cleaned_model'].str.contains(known_model, case=False, na=False)
    df.loc[mask, 'base_model'] = known_model

is_other = df['base_model'] == 'other'
print(f"Number of rows with 'other' base_model: {is_other.sum()}")
print(f"Dropping these rows. That's {100 * is_other.sum() / df.shape[0]:.2f}% rows.")

# .copy() makes the filtered frame independent, so adding a column below does
# not trigger pandas' SettingWithCopyWarning.
df = df[~is_other].copy()
df['car_name'] = df['manufacturer'] + ' ' + df['base_model']
# Reassign instead of inplace=True so the cell reads as one explicit lineage.
df = df.drop(columns=['manufacturer', 'model', 'cleaned_model', 'base_model'])

In [None]:
# Calculating the median and IQR for price and then mileage
median_price = df['price'].median()
Q1_PRICE, Q3_PRICE = (df['price'].quantile(q) for q in (0.25, 0.75))
IQR_PRICE = Q3_PRICE - Q1_PRICE

median_mileage = df['odometer'].median()
Q1_MILEAGE, Q3_MILEAGE = (df['odometer'].quantile(q) for q in (0.25, 0.75))
IQR_MILEAGE = Q3_MILEAGE - Q1_MILEAGE

threshold = 2.5

# A value is an outlier when it lies more than `threshold` IQRs outside the quartiles.
outliers_price = (
    df['price'].lt(Q1_PRICE - threshold * IQR_PRICE)
    | df['price'].gt(Q3_PRICE + threshold * IQR_PRICE)
)
outliers_mileage = (
    df['odometer'].lt(Q1_MILEAGE - threshold * IQR_MILEAGE)
    | df['odometer'].gt(Q3_MILEAGE + threshold * IQR_MILEAGE)
)
outliers = outliers_price | outliers_mileage

# Since both lower bounds are also negative numbers, we will also manually remove all zeros
positive_values = df['price'].gt(0) & df['odometer'].gt(0)

# As discussed in exploration notebook, we will manually remove all cars before 2000
recent_enough = df['year'].ge(2000)

df_no_outliers = df[~outliers & positive_values & recent_enough]

removed = df.shape[0] - df_no_outliers.shape[0]
print(f'Number of outliers removed: {removed}')
print(f'That is {100 * removed / df.shape[0]:.2f}% of the remaining data')

# Feature Engineering

In [None]:
# Since cylinders is an ordinal feature, we will convert it to a numerical value
# About 100 have "other" so we will just remove them
# .copy() gives an independent frame so the column assignments below do not
# trigger pandas' SettingWithCopyWarning on a filtered slice.
df_no_outliers = df_no_outliers[df_no_outliers['cylinders'] != 'other'].copy()
# Raw-string pattern (plain '\d' is an invalid escape sequence) and
# expand=False so a Series, not a one-column DataFrame, is assigned back.
df_no_outliers['cylinders'] = (
    df_no_outliers['cylinders'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Mapping conditions to numerical values. I checked the value counts and it seems like a good mapping
# NOTE(review): 'like new' is ranked below 'excellent' here - confirm this ordering is intended.
condition_mapping = {'salvage': 1, 'fair': 2, 'good': 3, 'like new': 4, 'excellent': 5, 'new': 6}
df_no_outliers['condition'] = df_no_outliers['condition'].map(condition_mapping)

# Having a clean title is more important than the exact title status, so will just make this a boolean
df_no_outliers['clean_title'] = (df_no_outliers['title_status'] == 'clean').astype(int)
df_no_outliers = df_no_outliers.drop(columns=['title_status'])

# Year is a bad feature, so will turn it into "Age"
reference_year = 2023
df_no_outliers['age'] = reference_year - df_no_outliers['year']
df_no_outliers = df_no_outliers.drop(columns=['year'])

# One hot encoding all categorical variables
# The trailing astype(int) casts every column of the frame to integers.
encoded_columns = ['car_name','fuel','transmission','drive']
df_no_outliers = pd.get_dummies(df_no_outliers, columns=encoded_columns, drop_first=True).astype(int)
df_no_outliers.head(3)

# Modeling

Note: I tried using log_price as the target, but this actually gave me worse results. I also repeatedly added and dropped features and tried different models. Most of that isn't shown here; I only kept the end product.

In [None]:
# Separate the target (price) from the predictors, then hold out 20% of the
# rows for testing with a fixed seed.
y = df_no_outliers['price']
X = df_no_outliers.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Note I early on decided linear regression wasn't the best model for this task
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the training split and score it on the held-out split.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse, r2 = np.sqrt(mse), r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

In [None]:
# Note: I decided not to drop cylinders since Random Forests handle multicollinearity well
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest with a fixed seed and score it on the held-out split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse, r2 = np.sqrt(mse), r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse:,.2f}")
print(f"Root Mean Squared Error: {rmse:,.2f}")
print(f"R-squared: {r2:.4f}")

Note: I tried hyperparameter tuning and it gave me worse results, so I am going to stick with my original model, as it is pretty solid.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values from which the randomized search below samples combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='neg_mean_squared_error',  # scorers are maximized, so MSE is negated
    n_jobs=-1,
    random_state=42,  # fix which 20 combinations are sampled so reruns are reproducible
)

random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_

print("Best parameters:", random_search.best_params_)
# best_score_ is the negated mean CV MSE; negate it back and take the root to get an RMSE.
print("Best CV score:", np.sqrt(-random_search.best_score_))

# Score the best estimator found by the search on the held-out split.
y_pred_tuned = best_rf.predict(X_test)
rmse_tuned = np.sqrt(mean_squared_error(y_test, y_pred_tuned))
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"Tuned RMSE: ${rmse_tuned:,.2f}")
print(f"Tuned R²: {r2_tuned:.4f}")

In [None]:
from constants import car_names
# Representative cylinders label for each known car name. The raw text label is
# kept because preprocess_data extracts the digits from it.
# Rows labelled 'other' are excluded (they carry no cylinder count and would
# break the digit extraction), and the most common label per car is used
# rather than whichever row happens to come first.
car_cylinders_mapping = (
    df[df['car_name'].isin(car_names) & (df['cylinders'] != 'other')]
    .groupby('car_name')['cylinders']
    .agg(lambda labels: labels.mode().iloc[0])
    .to_dict()
)

In [None]:
# Bundle the fitted estimator with the lookups and settings needed to rebuild
# its input columns at prediction time.
model_package = dict(
    model=model,
    feature_names=X_train.columns.tolist(),
    car_cylinders_mapping=car_cylinders_mapping,
    condition_mapping=condition_mapping,
    reference_year=reference_year,
    encoded_columns=encoded_columns,
    version='1.0',
    created_date=datetime.now().isoformat(),
)

# Save using gzip compression
#with gzip.open('car_price_model_complete.pkl.gz', 'wb') as file:
#    pickle.dump(model_package, file)

import joblib
import bz2

# Write the package through a bz2 stream so the file on disk is compressed.
with bz2.open("car_price_model_complete.joblib.bz2", "wb") as out_file:
    joblib.dump(model_package, out_file)

In [None]:
# Read the package back from the compressed file. joblib unpickles its input,
# so only load files from a trusted source.
with bz2.open('car_price_model_complete.joblib.bz2', 'rb') as packaged:
    model_package = joblib.load(packaged)

In [None]:
def preprocess_data(inputs, model_package):
    """Turn one raw listing into the single-row feature frame the model expects.

    Parameters
    ----------
    inputs : dict
        Raw values for one car. Must contain 'car_name', 'condition',
        'title_status', 'year' and every column named in
        model_package['encoded_columns']. Other keys are kept only if they
        appear in model_package['feature_names'].
    model_package : dict
        Must provide 'car_cylinders_mapping', 'condition_mapping',
        'reference_year', 'encoded_columns' and 'feature_names'.

    Returns
    -------
    pandas.DataFrame
        One row of integers whose columns are exactly
        model_package['feature_names'], in that order.

    Raises
    ------
    ValueError
        If the car name has no usable cylinders entry or the condition is
        not covered by the condition mapping.
    """
    df = pd.DataFrame([inputs])

    cylinder_labels = df['car_name'].map(model_package['car_cylinders_mapping'])
    # astype(str) lets .str work even when the lookup produced NaN or a bare number.
    cylinders = cylinder_labels.astype(str).str.extract(r'(\d+)', expand=False)
    if cylinders.isna().any():
        raise ValueError(
            f"No cylinder count known for car_name {df['car_name'].iloc[0]!r}"
        )
    df['cylinders'] = cylinders.astype(int)

    raw_condition = df['condition'].iloc[0]
    df['condition'] = df['condition'].map(model_package['condition_mapping'])
    if df['condition'].isna().any():
        raise ValueError(f"Unknown condition {raw_condition!r}")

    df['clean_title'] = (df['title_status'] == 'clean').astype(int)
    df['age'] = model_package['reference_year'] - df['year']
    df = df.drop(columns=['title_status', 'year'])

    # No drop_first here: with a single row every categorical column has exactly
    # one level, so drop_first=True would discard it and encode every input as
    # the training baseline. Levels that were dropped during training are
    # absent from feature_names and are removed by the reindex below.
    df = pd.get_dummies(df, columns=model_package['encoded_columns'])

    # Align to the training layout: dummy columns not produced by this row
    # become 0, and columns the model never saw are dropped.
    df = df.reindex(columns=model_package['feature_names'], fill_value=0)

    return df.astype(int)


In [None]:
# Example listing used to exercise the preprocessing and prediction cells.
test_input = dict(
    odometer=150000,
    car_name='dodge charger',
    condition='good',
    title_status='clean',
    year=2010,
    fuel='gas',
    transmission='automatic',
    drive='fwd',
)

In [None]:
# Build the single-row model input for the example listing.
processed_data = preprocess_data(inputs=test_input, model_package=model_package)

In [None]:
# One prediction per tree in the forest for the single processed row.
tree_predictions = np.array([tree.predict(processed_data.values)[0] for tree in model_package['model'].estimators_])
point_estimate = np.mean(tree_predictions)

# Use the central 95% of the individual tree predictions as the range.
# A mean +/- 1.96 * std / sqrt(n_trees) band shrinks as trees are added, so it
# only reflects how stable the forest average is, not how uncertain the price is.
lower_bound, upper_bound = np.percentile(tree_predictions, [2.5, 97.5])

In [None]:
# Report the estimate and its bounds as dollar amounts (the stray ")" that
# used to trail the two bound lines has been removed).
print(f"Point Estimate: ${point_estimate:,.2f}")
print(f"Upper Bound: ${upper_bound:,.2f}")
print(f"Lower Bound: ${lower_bound:,.2f}")

# Investigation

In [None]:
# List the distinct values found in the transmission column of df.
df['transmission'].unique()

In [None]:
# Show which car names have an entry in the packaged cylinders mapping.
model_package['car_cylinders_mapping'].keys()

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Histogram of non-zero prices for 2014 'ford f150' rows in df.
df.loc[
    (df['car_name'] == 'ford f150') & (df['year'] == 2014) & (df['price'] != 0),
    'price',
].hist()