In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
data = pd.read_csv('cleaned_data.csv')

In [None]:
# select features
features = ['brand', 'model', 'year', 'mileage', 'price', 'transmission_type', 'city']
df = data[features]
df.head()

Unnamed: 0,brand,model,year,mileage,price,transmission_type,city
0,Audi,Q3,2021,65000,2250000,أتوماتيك,6 أكتوبر
1,Mercedes,C 180,2022,38000,2650000,أتوماتيك,6 أكتوبر
2,Hyundai,Accent RB,2019,27000,705000,أتوماتيك,القاهرة
3,Kia,Sportage,2021,68,1400000,أتوماتيك,الجيزة
4,Hyundai,Tucson GDI,2020,146000,1250000,أتوماتيك,الزقازيق


In [None]:
# Handle outliers (cap at 1st and 99th percentiles)
df.loc[:, 'price'] = df['price'].clip(lower=df['price'].quantile(0.01), upper=df['price'].quantile(0.99)).astype(float)
df.loc[:, 'mileage'] = df['mileage'].clip(lower=df['mileage'].quantile(0.01), upper=df['mileage'].quantile(0.99)).astype(float)

  df.loc[:, 'mileage'] = df['mileage'].clip(lower=df['mileage'].quantile(0.01), upper=df['mileage'].quantile(0.99)).astype(float)


In [None]:
# add age
current_year = 2025
df.loc[:, 'age'] = current_year - df['year']
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'age'] = current_year - df['year']


Unnamed: 0,brand,model,year,mileage,price,transmission_type,city,age
0,Audi,Q3,2021,65000,2250000,أتوماتيك,6 أكتوبر,4
1,Mercedes,C 180,2022,38000,2650000,أتوماتيك,6 أكتوبر,3
2,Hyundai,Accent RB,2019,27000,705000,أتوماتيك,القاهرة,6
3,Kia,Sportage,2021,1000,1400000,أتوماتيك,الجيزة,4
4,Hyundai,Tucson GDI,2020,146000,1250000,أتوماتيك,الزقازيق,5


In [None]:
# get rare models (< 5 occurences)
model_counts = df['model'].value_counts()
rare_models = model_counts[model_counts < 5].index
df.loc[:, 'model'] = df['model'].apply(lambda x: 'Rare_Model' if x in rare_models else x)
df.head()

Unnamed: 0,brand,model,year,mileage,price,transmission_type,city,age
0,Audi,Q3,2021,65000,2250000,أتوماتيك,6 أكتوبر,4
1,Mercedes,C 180,2022,38000,2650000,أتوماتيك,6 أكتوبر,3
2,Hyundai,Accent RB,2019,27000,705000,أتوماتيك,القاهرة,6
3,Kia,Sportage,2021,1000,1400000,أتوماتيك,الجيزة,4
4,Hyundai,Tucson GDI,2020,146000,1250000,أتوماتيك,الزقازيق,5


In [None]:
# Encode transmission_type (handle unexpected values)
valid_transmissions = {'أتوماتيك': 1, 'مانيوال': 0}
df.loc[:, 'transmission_type'] = df['transmission_type'].map(valid_transmissions)
# If there are NaNs due to unmapped values, fill with mode (most common transmission)
if df['transmission_type'].isna().any():
    print("Warning: Unmapped transmission_type values found. Filling with mode.")
    df.loc[:, 'transmission_type'] = df['transmission_type'].fillna(df['transmission_type'].mode()[0])



  df.loc[:, 'transmission_type'] = df['transmission_type'].fillna(df['transmission_type'].mode()[0])


In [None]:
df.head()

Unnamed: 0,brand,model,year,mileage,price,transmission_type,city,age
0,Audi,Q3,2021,65000,2250000,1.0,6 أكتوبر,4
1,Mercedes,C 180,2022,38000,2650000,1.0,6 أكتوبر,3
2,Hyundai,Accent RB,2019,27000,705000,1.0,القاهرة,6
3,Kia,Sportage,2021,1000,1400000,1.0,الجيزة,4
4,Hyundai,Tucson GDI,2020,146000,1250000,1.0,الزقازيق,5


In [None]:
# Target encoding for brand, model, and city
brand_mean = df.groupby('brand')['price'].mean()
model_mean = df.groupby('model')['price'].mean()
city_mean = df.groupby('city')['price'].mean()
df.loc[:, 'brand_encoded'] = df['brand'].map(brand_mean)
df.loc[:, 'model_encoded'] = df['model'].map(model_mean)
df.loc[:, 'city_encoded'] = df['city'].map(city_mean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'brand_encoded'] = df['brand'].map(brand_mean)


In [None]:
# Log-transform mileage
df.loc[:, 'mileage_log'] = np.log1p(df['mileage'])

In [None]:
# Drop original columns
df = df.drop(['brand', 'model', 'year', 'mileage', 'city'], axis=1)

In [None]:
df.head()

Unnamed: 0,price,transmission_type,age,brand_encoded,model_encoded,city_encoded,mileage_log
0,2250000,1.0,4,1755635.0,2068333.0,778968.671679,11.082158
1,2650000,1.0,3,2111376.0,1949072.0,778968.671679,10.545368
2,705000,1.0,6,711138.8,677479.2,819582.689039,10.203629
3,1400000,1.0,4,883287.6,1503322.0,603773.807143,6.908755
4,1250000,1.0,5,711138.8,1324333.0,735269.651685,11.891369


In [None]:
# Features and target
X = df.drop('price', axis=1)
y = df['price']

# Ensure all features are numeric
X = X.astype(float)


In [None]:
# Check for multicollinearity
print("Correlation Matrix:")
print(X.corr())

Correlation Matrix:
                   transmission_type       age  brand_encoded  model_encoded  \
transmission_type           1.000000 -0.425470       0.276424       0.367816   
age                        -0.425470  1.000000      -0.068453      -0.343803   
brand_encoded               0.276424 -0.068453       1.000000       0.660800   
model_encoded               0.367816 -0.343803       0.660800       1.000000   
city_encoded                0.204422 -0.208651       0.203271       0.239623   
mileage_log                -0.185931  0.486367      -0.147847      -0.317113   

                   city_encoded  mileage_log  
transmission_type      0.204422    -0.185931  
age                   -0.208651     0.486367  
brand_encoded          0.203271    -0.147847  
model_encoded          0.239623    -0.317113  
city_encoded           1.000000    -0.131985  
mileage_log           -0.131985     1.000000  


In [None]:
# lets check the var infalation values for each coeff (defines multicolinearity), helps you to decide wether to remove a featur or not
try:
    vif_data = pd.DataFrame()
    vif_data['Feature'] = X.columns
    vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    print("\nVariance Inflation Factor (VIF):")
    print(vif_data)
except Exception as e:
    print(f"Warning: VIF calculation failed: {e}")
    print("Proceeding without VIF.")



Variance Inflation Factor (VIF):
             Feature        VIF
0  transmission_type   6.197620
1                age   4.695791
2      brand_encoded   9.034560
3      model_encoded   6.971000
4       city_encoded  20.029486
5        mileage_log  28.433216


In [None]:
# Scale all features by mean
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
# Add polynomial features (interaction terms only)
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_poly = poly.fit_transform(X)
poly_feature_names = poly.get_feature_names_out(X.columns)

In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

In [None]:
# Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

alphas = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
best_r2 = 0
best_alpha = 0
best_mae = float('inf')
best_model = None

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"\nRidge Regression (alpha={alpha}):")
    print(f"R-squared (actual): {r2}")
    print(f"Mean Absolute Error: {mae:.2f}")

    if r2 > best_r2 or (abs(r2 - best_r2) < 1e-2 and mae < best_mae):
        best_r2 = r2
        best_alpha = alpha
        best_mae = mae
        best_model = ridge_model


Ridge Regression (alpha=0.01):
R-squared (actual): 0.8506513566710707
Mean Absolute Error: 142228.69

Ridge Regression (alpha=0.1):
R-squared (actual): 0.8506513365037163
Mean Absolute Error: 142227.84

Ridge Regression (alpha=1.0):
R-squared (actual): 0.8506511276853592
Mean Absolute Error: 142219.27

Ridge Regression (alpha=10.0):
R-squared (actual): 0.8506483302221471
Mean Absolute Error: 142134.60

Ridge Regression (alpha=100.0):
R-squared (actual): 0.8505542013348517
Mean Absolute Error: 141346.82

Ridge Regression (alpha=1000.0):
R-squared (actual): 0.8456273611266789
Mean Absolute Error: 138798.70

Ridge Regression (alpha=10000.0):
R-squared (actual): 0.7565368692107245
Mean Absolute Error: 199045.30


In [None]:
print(f"\nBest Ridge Model (alpha={best_alpha}):")
print(f"R-squared: {best_r2:.2f}")
print(f"Mean Absolute Error: {best_mae:.2f}")


Best Ridge Model (alpha=1000.0):
R-squared: 0.85
Mean Absolute Error: 138798.70


In [None]:
# Cross-validation for best model
cv_scores = cross_val_score(Ridge(alpha=best_alpha), X_poly, y, cv=10, scoring='r2')
print(f"\nCross-Validation R-squared (alpha={best_alpha}):")
print(cv_scores)
print(f"Mean R-squared: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")


Cross-Validation R-squared (alpha=1000.0):
[0.8383129  0.85746647 0.81786752 0.85590969 0.83110162 0.86500318
 0.86300301 0.85926646 0.84201428 0.85839541]
Mean R-squared: 0.85 (+/- 0.03)


In [None]:

# Coefficients for best model
coefficients = pd.DataFrame({
    'Feature': poly_feature_names,
    'Coefficient': best_model.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Coefficients (Best Ridge Model):")
print(coefficients)


Feature Coefficients (Best Ridge Model):
                            Feature    Coefficient
3                     model_encoded  246652.997833
2                     brand_encoded  157781.291385
0                 transmission_type   58289.401329
16       brand_encoded city_encoded   27615.012128
4                      city_encoded   26763.251741
8   transmission_type model_encoded   26632.841661
7   transmission_type brand_encoded   10387.786004
14                  age mileage_log    6484.482354
6             transmission_type age   -1121.175761
18       model_encoded city_encoded   -3836.566560
9    transmission_type city_encoded   -6394.985070
13                 age city_encoded  -10884.307757
20         city_encoded mileage_log  -11107.033644
10    transmission_type mileage_log  -12568.352395
15      brand_encoded model_encoded  -15507.432070
17        brand_encoded mileage_log  -21656.569223
19        model_encoded mileage_log  -25465.579614
11                age brand_encoded  -42

In [None]:
# lets try another model
from xgboost import XGBRegressor # type: ignore
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print(f"XGBoost R-squared: {r2_score(y_test, y_pred_xgb):.2f}")
print(f"XGBoost MAE: {mean_absolute_error(y_test, y_pred_xgb):.2f}")

XGBoost R-squared: 0.91
XGBoost MAE: 97472.49
