In [1]:
import pandas as pd
# Read the vehicle listings CSV and print the frame's (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="data/vehicles.csv")
print(df.shape)

(426880, 26)


 ## Data Cleaning:
 ### Removed unnecessary columns (url, VIN, image_url, etc.)
 ### Filtered edge cases: kept prices \$1K–\$80K, car age 0–30 years, odometer ≤ 1M miles

In [2]:
# Discard the listed columns; any name that is not present in the frame is
# skipped (errors='ignore') instead of raising.
unused_columns = [
    'url', 'region_url', 'VIN', 'image_url', 'description',
    'county', 'size', 'lat', 'long', 'title_status',
]
df = df.drop(columns=unused_columns, errors='ignore')

In [3]:
# Parse posting_date into UTC datetimes; values that cannot be parsed become
# NaT (errors='coerce') instead of raising.
posting_date_utc = pd.to_datetime(df['posting_date'], utc=True, errors='coerce')
df = df.assign(posting_date=posting_date_utc)

In [4]:
# car_age = year of the posting date minus the listing's `year` column.
# Rows missing either value end up with a missing car_age.
df['car_age'] = df['posting_date'].dt.year.sub(df['year'])

In [5]:
# Count missing values per column once, then report them from most to least
# missing together with the overall total.
missing_per_column = df.isnull().sum()
print("Remaining missing values by column:")
print(missing_per_column.sort_values(ascending=False))
print(f"\nTotal missing: {missing_per_column.sum()}")

Remaining missing values by column:
cylinders       177678
condition       174104
drive           130567
paint_color     130203
type             92858
manufacturer     17646
model             5277
odometer          4400
fuel              3013
transmission      2556
year              1205
car_age           1205
posting_date        68
id                   0
region               0
price                0
state                0
dtype: int64

Total missing: 740780


In [7]:
# Number of distinct values in the two location columns.
num_region, num_state = (df[col].nunique() for col in ('region', 'state'))
print(f"region unique values: {num_region}")
print(f"state unique values:  {num_state}")

region unique values: 404
state unique values:  51


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Two quick histograms of the unfiltered data: price on a log(price + 1)
# scale, and car age. Each spec is (values, bins, title, x-axis label).
histogram_specs = [
    (np.log1p(df['price']), 50, 'Log-Scaled Price Distribution', 'log(Price + 1)'),
    (df['car_age'], 30, 'Car Age Distribution', 'Car Age'),
]
for values, n_bins, title, x_label in histogram_specs:
    fig, ax = plt.subplots()
    ax.hist(values, bins=n_bins)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Keep only rows with price in [1000, 80000], car_age in [0, 30] and
# odometer <= 1,000,000. A missing value in any of these columns fails its
# comparison, so such rows are removed as well.
price_in_range = df['price'].between(1000, 80000)
age_in_range = df['car_age'].between(0, 30, inclusive='both')
odometer_in_range = df['odometer'].le(1000000)
df = df[price_in_range & age_in_range & odometer_in_range]

##  Feature Engineering:
### Created car_age column, filled missing values with 'unknown'
### applied one-hot encoding to categorical features

In [None]:
# Drop rows that are missing a value in any of the required columns.
required_cols = ['price', 'year', 'manufacturer', 'model', 'odometer']
df = df.dropna(subset=required_cols)

In [None]:
# Replace missing values in the categorical columns with the label 'unknown'.
# Columns from the list that are not present in the frame are skipped.
fill_unknown_cols = ['condition', 'cylinders', 'fuel', 'drive', 'transmission', 'paint_color', 'type']
unknown_fill = {col: 'unknown' for col in fill_unknown_cols if col in df.columns}
df = df.fillna(value=unknown_fill)

In [None]:
# Per-column missing counts, largest first (displayed as the cell output).
df.isna().sum().sort_values(ascending=False)

In [None]:
import seaborn as sns

sns.set(style="whitegrid")

# Distributions of price and car age.
# Each spec: (column, bins, display label, optional x-axis limits).
distribution_specs = [
    ('price', 50, 'Price', (0, 80000)),
    ('car_age', 30, 'Car Age', None),
]
for column, n_bins, label, x_limits in distribution_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[column], bins=n_bins, kde=True, ax=ax)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Count')
    if x_limits is not None:
        ax.set_xlim(*x_limits)
    plt.show()

# Price against car age and against odometer.
# Each spec: (x column, name used in the title, x-axis label).
scatter_specs = [
    ('car_age', 'Car Age', 'Car Age'),
    ('odometer', 'Odometer', 'Odometer (miles)'),
]
for column, title_name, x_label in scatter_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=column, y='price', data=df, alpha=0.3, ax=ax)
    ax.set_title(f'Price vs. {title_name}')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Price')
    ax.set_ylim(0, 80000)
    plt.show()

## Category features EDA

In [None]:
import seaborn as sns

sns.set(style="whitegrid")


def plot_avg_price_by_category(col_name, top_n=10):
    """Bar-plot price for the `top_n` categories of `col_name` with the highest mean price.

    Reads the notebook-level DataFrame `df`. Categories are ranked by their
    mean `price`; only rows belonging to the top `top_n` categories are passed
    to seaborn, and the bars are drawn in that ranked order.

    Parameters
    ----------
    col_name : str
        Name of the column in `df` to group by.
    top_n : int, default 10
        Number of highest-mean-price categories to show.
    """
    mean_price = df.groupby(col_name)['price'].mean()
    top_categories = mean_price.sort_values(ascending=False).head(top_n).index
    in_top_categories = df[col_name].isin(top_categories)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=col_name, y='price', data=df[in_top_categories],
                order=top_categories, ax=ax)

    display_name = col_name.capitalize()
    ax.set_title(f'Average Price by {display_name}')
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.set_ylabel('Average Price')
    ax.set_xlabel(display_name)
    fig.tight_layout()
    plt.show()

# Plot the highest-average-price categories (default: top 10) for each
# categorical column, one figure per column.
for category_col in ('manufacturer', 'model', 'fuel', 'transmission', 'drive', 'type'):
    plot_avg_price_by_category(category_col)

### apply one-hot encoding to categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd

# Numeric features, re-indexed from 0 so they line up row-by-row with the
# encoded frames built below (which also get a fresh 0..n-1 index).
num_cols = ['year', 'car_age', 'odometer']
X_num = df[num_cols].reset_index(drop=True)

# One-hot encode these categorical columns (only those still present in df).
low_cats = [
    col
    for col in ['manufacturer', 'condition', 'cylinders', 'fuel',
                'transmission', 'drive', 'type', 'paint_color']
    if col in df.columns
]
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_matrix = encoder.fit_transform(df[low_cats])
cat_names = encoder.get_feature_names_out(low_cats)
X_cat = pd.DataFrame(onehot_matrix, columns=cat_names)

# Integer-code these columns with LabelEncoder; each becomes one `<col>_enc` column.
high_cats = [col for col in ['model', 'state', 'region'] if col in df.columns]
X_high = pd.DataFrame({
    f'{col}_enc': LabelEncoder().fit_transform(df[col].astype(str))
    for col in high_cats
})

# Final baseline matrix: numeric + one-hot + label-encoded columns side by side.
X_baseline = pd.concat([X_num, X_cat, X_high], axis=1)

print(f"Shape: {X_baseline.shape}")
print(f"Numeric: {len(num_cols)}, One-hot: {X_cat.shape[1]}, Label: {X_high.shape[1]}")

## XGBoost Baseline:
### Feature Importance Analysis
### Use XGBoost feature importance to identify top 5 impactful features
### Test-set R² of 0.8778 (about 88% of price variance explained; R² is not a classification accuracy)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import subprocess

# Pick the XGBoost device by probing for an NVIDIA GPU with `nvidia-smi`.
# Fall back to CPU when the tool is missing (OSError, e.g. FileNotFoundError)
# or exits with a non-zero status (CalledProcessError). Only these failures
# are caught, so KeyboardInterrupt/SystemExit are no longer swallowed as they
# were by the previous bare `except:`.
try:
    subprocess.check_output('nvidia-smi')
    device = 'cuda'
    print("Using GPU")
except (OSError, subprocess.CalledProcessError):
    device = 'cpu'
    print("Using CPU")

# Features and target for the XGBoost baseline; 20% of rows are held out for
# evaluation with a fixed seed.
X = X_baseline
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the device detected earlier in this cell ('cuda' or 'cpu').
# Previously this was hard-coded to 'cuda', which ignored the CPU fallback.
model = XGBRegressor(
   n_estimators=200,
   learning_rate=0.1,
   max_depth=6,
   tree_method='hist',
   device=device,
   random_state=42
)

model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # square root of MSE
r2 = r2_score(y_test, y_pred)

print("XGBoost Model Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# Rank features by the fitted model's importances and keep the 20 highest
# (used by the plot below); print the first 10 of those.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
feat_imp = importance_table.sort_values('importance', ascending=False).head(20)

print("\nTop 10 important features:")
top_ten = feat_imp.head(10)
for feature_name, importance in zip(top_ten['feature'], top_ten['importance']):
    print(f"{feature_name}: {importance:.4f}")

# Three diagnostic panels side by side: feature importance, predicted vs
# actual price, and residuals.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_importance, ax_fit, ax_residual = axes

# Panel 1: horizontal bars, most important feature at the top.
ax_importance.barh(feat_imp['feature'], feat_imp['importance'])
ax_importance.set_xlabel("Importance")
ax_importance.set_title("Top 20 Features (XGBoost)")
ax_importance.invert_yaxis()

# Panel 2: predictions against actual prices, with the y = x reference line.
price_span = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_pred, alpha=0.3)
ax_fit.plot(price_span, price_span, 'r--')
ax_fit.set_xlabel("Actual Price")
ax_fit.set_ylabel("Predicted Price")
ax_fit.set_title("Predicted vs Actual")

# Panel 3: residuals (actual minus predicted) against actual price.
residuals = y_test - y_pred
ax_residual.scatter(y_test, residuals, alpha=0.3)
ax_residual.axhline(0, color='red', linestyle='--')
ax_residual.set_xlabel("Actual Price")
ax_residual.set_ylabel("Residuals")
ax_residual.set_title("Residual Plot")

fig.tight_layout()
plt.show()

## Data Preprocessing for Deep Learning:
### Apply StandardScaler to numeric features and RareCategoryCombiner for categorical features

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise car_age and odometer, then put them in front of every other
# column of X_baseline (those columns are left as they are).
numeric_features = ['car_age', 'odometer']
scaler = StandardScaler()
X_numeric_scaled = pd.DataFrame(
    scaler.fit_transform(df[numeric_features]),
    columns=numeric_features,
)

# Everything in the baseline matrix except the two scaled columns.
X_categorical = X_baseline.drop(columns=numeric_features).reset_index(drop=True)

# Both parts carry a 0..n-1 index, so the column-wise concat aligns row by row.
scaled_and_rest = [X_numeric_scaled.reset_index(drop=True), X_categorical]
X_full_scaled = pd.concat(scaled_and_rest, axis=1)

print(X_full_scaled.shape)
X_full_scaled.head()

## MLP Model (DL)
### Test-set R² of 0.9257 (about 93% of price variance explained)

In [None]:
import tensorflow as tf
# Show which TensorFlow version the notebook is running against.
print(tf.version.VERSION)

In [None]:
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder

# Work on a copy of df without rows that are missing `model` or `region`.
df_clean = df.dropna(subset=['model', 'region']).copy()

# NOTE(review): the scaler and encoders in this cell are fit on all rows of
# df_clean, i.e. before the train/test split in the next cell — confirm this
# is intended.

# Numeric features, standardised with StandardScaler.
num_cols = ['year', 'car_age', 'odometer']
X_num = df_clean[num_cols].copy()
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

# Columns fed to embedding layers: integer-coded with one LabelEncoder each.
# The fitted encoders are kept so the vocabulary sizes can be read later.
high_card_cols = ['model', 'region']
encoders = {}
X_high_encoded = {}
for col in high_card_cols:
    col_as_text = df_clean[col].astype(str)
    encoders[col] = LabelEncoder().fit(col_as_text)
    X_high_encoded[col] = encoders[col].transform(col_as_text)
    print(f"{col}: {len(encoders[col].classes_)} unique categories")

# Remaining categorical columns: one-hot encoded into a dense array.
low_card_cols = [
    name
    for name in ['manufacturer', 'fuel', 'transmission', 'drive', 'type']
    if name in df_clean.columns
]
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_low_encoded = ohe.fit_transform(df_clean[low_card_cols])

# Regression target as a NumPy array.
y = df_clean['price'].to_numpy()

In [None]:
# Split row positions once (80/20, fixed seed) and reuse the same positions
# for every feature array and the target so they stay aligned.
indices = range(len(y))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

# Numeric and one-hot feature arrays
X_num_train = X_num_scaled[train_idx]
X_num_test = X_num_scaled[test_idx]
X_low_train = X_low_encoded[train_idx]
X_low_test = X_low_encoded[test_idx]

# Target
y_train = y[train_idx]
y_test = y[test_idx]

# Integer codes for the embedding inputs, keyed by column name
X_high_train = {col: X_high_encoded[col][train_idx] for col in high_card_cols}
X_high_test = {col: X_high_encoded[col][test_idx] for col in high_card_cols}

In [None]:
# Numeric branch: scaled numeric features -> Dense(64).
num_input = Input(shape=(len(num_cols),), name='numeric')
num_dense = Dense(64, activation='relu')(num_input)

# One embedding branch per label-encoded column. The embedding width is half
# the vocabulary size, capped at 50. Each entry is (input layer, flattened embedding).
embedding_layers = []
for col in high_card_cols:
    n_categories = len(encoders[col].classes_)
    embedding_width = min(50, n_categories // 2)

    col_input = Input(shape=(1,), name=f'{col}_input')
    col_embedding = Embedding(n_categories, embedding_width, name=f'{col}_embed')(col_input)
    embedding_layers.append((col_input, Flatten()(col_embedding)))

# One-hot branch: dense one-hot features -> Dense(64).
low_input = Input(shape=(X_low_encoded.shape[1],), name='categorical')
low_dense = Dense(64, activation='relu')(low_input)

# Model inputs and branch outputs, in the order: numeric, one-hot, embeddings.
all_inputs = [num_input, low_input, *[branch[0] for branch in embedding_layers]]
all_features = [num_dense, low_dense, *[branch[1] for branch in embedding_layers]]

concat = Concatenate()(all_features)

# Trunk: two Dense -> BatchNorm -> Dropout(0.3) stages, then Dense(64) and a
# single linear output unit for the price.
x = concat
for units in (256, 128):
    x = Dense(units, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)

x = Dense(64, activation='relu')(x)
output = Dense(1, name='price')(x)

model = Model(inputs=all_inputs, outputs=output)

model.compile(
    loss='mse',
    metrics=['mae'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

print("Model summary:")
print(f"Total parameters: {model.count_params():,}")

In [None]:
# Assemble the model inputs in the order the model was built with:
# numeric, one-hot, then one (n, 1) column of integer codes per embedding.
train_data = [X_num_train, X_low_train]
test_data = [X_num_test, X_low_test]
for col in high_card_cols:
    train_data.append(X_high_train[col].reshape(-1, 1))
    test_data.append(X_high_test[col].reshape(-1, 1))

# Train for 20 epochs; Keras holds out 20% of the training data for validation.
history = model.fit(
    train_data,
    y_train,
    batch_size=256,
    epochs=20,
    validation_split=0.2,
    verbose=1,
)

# Predictions come back as an (n, 1) array; flatten to 1-D for the metrics.
y_pred = model.predict(test_data).flatten()

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("\nMulti-Input Neural Network Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

In [None]:
# Training vs validation curves per epoch: loss on the left, MAE on the right.
# Each spec is (history key, display label).
fig_history, history_axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, (metric_key, metric_label) in zip(history_axes, [('loss', 'Loss'), ('mae', 'MAE')]):
    ax.plot(history.history[metric_key], label=f'Train {metric_label}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Val {metric_label}')
    ax.set_title(f'Model {metric_label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()

fig_history.tight_layout()
plt.show()

## Embedding Model
### Implemented embedding layers for the 'model' and 'manufacturer' features and used a deep MLP, reaching a test-set R² of 0.9251 (about 93% of price variance explained)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, BatchNormalization, Add
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Work on a copy of df restricted to rows that have a `model` value.
df_clean = df.dropna(subset=['model']).copy()
y = df_clean['price'].to_numpy()

# Columns that get their own embedding: integer-coded with one LabelEncoder
# each. The fitted encoders are kept so vocabulary sizes can be read later.
embed_cols = ['model', 'manufacturer']
encoders = {}
X_embed = {}
for col in embed_cols:
    col_as_text = df_clean[col].astype(str)
    encoders[col] = LabelEncoder().fit(col_as_text)
    X_embed[col] = encoders[col].transform(col_as_text)

# All remaining features go through a single ColumnTransformer:
# numeric columns are standardised, categorical columns are one-hot encoded.
num_cols = ['year', 'car_age', 'odometer']
cat_cols = ['fuel', 'transmission', 'drive', 'type']

numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

X_other = preprocessor.fit_transform(df_clean)

# Split row positions once (80/20, fixed seed) and apply the same positions
# to the dense features, the target and every embedding input.
indices = range(len(y))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

X_other_train = X_other[train_idx]
X_other_test = X_other[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

X_embed_train = {col: X_embed[col][train_idx] for col in embed_cols}
X_embed_test = {col: X_embed[col][test_idx] for col in embed_cols}

# Build one (input, flattened embedding) pair per embedded column. The
# embedding width is a quarter of the vocabulary size, capped at 16.
inputs = []
embeddings = []
for col in embed_cols:
    n_categories = len(encoders[col].classes_)
    embedding_width = min(16, n_categories // 4)

    col_input = Input(shape=(1,), name=f'{col}_input')
    col_embedding = Embedding(n_categories, embedding_width, name=f'{col}_embed')(col_input)
    inputs.append(col_input)
    embeddings.append(Flatten()(col_embedding))

# Dense input carrying the scaled numeric and one-hot features.
input_other = Input(shape=(X_other_train.shape[1],), name='other_features')
inputs.append(input_other)

# Join the embeddings with the dense features (or use the dense features
# alone if there are no embeddings).
x = Concatenate()(embeddings + [input_other]) if embeddings else input_other

# First stage: Dense(256) -> BatchNorm -> Dropout(0.4).
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)

# Two stacked 128-unit layers form the main path ...
x1 = Dropout(0.3)(BatchNormalization()(Dense(128, activation='relu')(x)))
x2 = BatchNormalization()(Dense(128, activation='relu')(x1))

# ... and a linear Dense(128) projection of the first-stage output is added
# to it as a skip connection.
x_skip = Dense(128)(x)
x = Add()([x2, x_skip])
x = Dropout(0.3)(x)

# Final stage and single linear output unit for the price.
x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

output = Dense(1, name='price')(x)

model = Model(inputs=inputs, outputs=output)
model.compile(
    loss='mse',
    metrics=['mae'],
    optimizer=tf.keras.optimizers.Adam(0.001),
)

# Stop once validation stops improving for 5 epochs (restoring the best
# weights), and halve the learning rate after 3 stagnant epochs, with a
# floor of 1e-6.
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
lr_schedule = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=1e-6)
callbacks = [early_stopping, lr_schedule]

# Inputs in the order the model was built with: one (n, 1) column of integer
# codes per embedded column, followed by the dense feature matrix.
train_data = []
test_data = []
for col in embed_cols:
    train_data.append(X_embed_train[col].reshape(-1, 1))
    test_data.append(X_embed_test[col].reshape(-1, 1))
train_data.append(X_other_train)
test_data.append(X_other_test)

# Up to 50 epochs; Keras holds out 20% of the training data for validation.
history = model.fit(
    train_data,
    y_train,
    batch_size=128,
    epochs=50,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1,
)

# Predictions come back as an (n, 1) array; flatten to 1-D for the metrics.
y_pred = model.predict(test_data).flatten()
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("\nAdvanced MLP with Multiple Embeddings Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")