In [22]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

In [23]:
# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [24]:
def load_and_explore_dataset(filepath):
    """Read the car-listing CSV and print a quick exploratory summary.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame exactly as loaded; no cleaning happens here.

    Raises
    ------
    KeyError
        If the file has no ``condition`` or ``transmission`` column.
    """
    print("=" * 60)
    print("LOAD AND  EXPLORE DATASET")
    print("=" * 60)

    df = pd.read_csv(filepath)

    print("Shape of the dataset")
    print(df.shape)
    print("\nCheck for missing values")
    print(df.isnull().sum())
    print("\nFirst five rows:")
    print(df.head())
    print("\nDescriptive stats")
    print(df.describe())
    print("\nDataset Info")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print("\nCondition Distribution:")
    print(df["condition"].value_counts())
    print("\nTransmission Distribution:")
    print(df["transmission"].value_counts())

    return df
    

In [25]:
def preprocessing_data(df):
    """Drop incomplete rows and label-encode the categorical columns.

    For each of ``make``, ``model``, ``condition`` and ``transmission`` a
    ``LabelEncoder`` is fitted and a new ``<column>_encoded`` column is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings; must contain the four categorical columns above.

    Returns
    -------
    tuple
        ``(df_processed, label_encoder)`` where ``df_processed`` is a new
        frame (the input is not modified) and ``label_encoder`` maps each
        categorical column name to its fitted ``LabelEncoder``.

    Raises
    ------
    KeyError
        If one of the categorical columns is missing from ``df``.
    """
    print("\n" + "=" * 60)
    print("PREPROCESSING LOADED DATA")
    print("=" * 60)

    # Work on an independent copy of the cleaned rows so the new columns are
    # never written into the caller's frame.
    df_processed = df.dropna().copy()

    label_encoder = {}
    # A list (rather than a set) keeps the encoding, the printed report and the
    # order of the added columns identical from one run to the next.
    categorical_columns = ["make", "model", "condition", "transmission"]
    for col in categorical_columns:
        le = LabelEncoder()
        df_processed[col + "_encoded"] = le.fit_transform(df_processed[col])
        label_encoder[col] = le
        print(f"\n{col} encoded")
        for i, label in enumerate(le.classes_):
            print(f" {label} : {i}")
    print("preprocessed Dataset shape", df_processed.shape)

    return df_processed, label_encoder

In [26]:
def features_data(df_processed):
    """Separate the model inputs from the target column.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame holding ``year``, the four ``*_encoded`` columns and ``price``.

    Returns
    -------
    tuple
        ``(X, y, feature_columns)``: the feature frame, the ``price`` series
        and the list of feature column names in the order used for ``X``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("FEATURES DATA")
    print(rule)

    feature_columns = [
        "year",
        "make_encoded",
        "model_encoded",
        "condition_encoded",
        "transmission_encoded",
    ]

    X = df_processed[feature_columns]
    y = df_processed["price"]

    # Report the dimensions of what is handed on to the split step.
    print("\nFeatures Shape", X.shape)
    print("\nTarget Shape", y.shape)
    print("\nFeatures", feature_columns)

    return X, y, feature_columns

In [27]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    X, y
        Features and target, forwarded to ``train_test_split``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("SPLITING DATA")
    print(rule)

    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = parts

    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    print("\nTraining set size", n_train)
    print("Testing set size", n_test)

    # Show the target range of each partition.
    train_low, train_high = float(y_train.min()), float(y_train.max())
    print("\nTraining set price range {:.2f} - {:.2f}".format(train_low, train_high))
    test_low, test_high = float(y_test.min()), float(y_test.max())
    print("\nTesting set price range {:.2f} - {:.2f}".format(test_low, test_high))

    return X_train, X_test, y_train, y_test
    

In [28]:
def scale_features(X_train, X_test):
    """Standardise the features using statistics from the training split only.

    Parameters
    ----------
    X_train, X_test
        Training and test feature matrices with the same columns.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, scaler)`` where ``scaler`` is the
        ``StandardScaler`` fitted on ``X_train``.
    """
    print("\n" + "=" * 60)
    print("SCALE FEATURES DATA")
    print("=" * 60)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # Only transform the test split. Calling fit_transform here would refit the
    # scaler on test data, which leaks test statistics into the evaluation and
    # leaves the returned scaler fitted on the wrong split.
    X_test_scaled = scaler.transform(X_test)

    print("\nFeatures scaled successfully")
    print("Training scaled set", X_train_scaled.shape)
    print("Testing scaled set", X_test_scaled.shape)

    return X_train_scaled, X_test_scaled, scaler

In [29]:
def train_model(X_train_scaled, y_train, feature_columns):
    """Fit a ``LinearRegression`` and print its coefficients and intercept.

    Parameters
    ----------
    X_train_scaled
        Training feature matrix.
    y_train
        Training target values.
    feature_columns : list of str
        Names paired, in order, with the fitted coefficients for the report.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TRAINING MODEL")
    print(rule)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    print("\nModel train succesfully")
    print("\nmodel coefficient")
    # Flatten so the pairing works whether coef_ is 1-D or 2-D.
    flat_weights = model.coef_.ravel()
    for name, weight in zip(feature_columns, flat_weights):
        print(f" {name}: {weight:.2f}")
    intercept = float(model.intercept_)
    print(f"\nModel intercept: {intercept:.2f}")

    return model

In [30]:
def train_random_forest_model(X_train_scaled, y_train, feature_columns):
    """Fit a ``RandomForestRegressor`` and print its feature importances.

    Parameters
    ----------
    X_train_scaled
        Training feature matrix.
    y_train
        Training target values.
    feature_columns : list of str
        Names paired, in order, with ``feature_importances_`` for the report.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    print("\n" + "=" * 60)
    print("TRAINING RANDOM FOREST MODEL")
    print("=" * 60)

    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=1
    )
    rf_model.fit(X_train_scaled, y_train)
    # Nothing is persisted here; saving is done by save_rf_model_artifact.
    print("Random forest trained successfully")
    # Report the tree count. The `estimator` attribute is the base-tree
    # template, not a hyperparameter, and is absent before scikit-learn 1.2.
    print("Model n_estimators:", rf_model.n_estimators)
    print("Model max depth:", rf_model.max_depth)

    # Most influential features first.
    feature_importance = sorted(
        zip(feature_columns, rf_model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("\nFeature importance:")
    for feature, importance in feature_importance:
        print(f"  {feature}: {float(importance):.4f}")

    return rf_model

In [31]:
def evaluate_rfmodel(rf_model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Score the fitted random forest on both splits and cross-validate it.

    Parameters
    ----------
    rf_model
        Fitted estimator exposing ``predict``.
    X_train_scaled, X_test_scaled
        Feature matrices for the training and test splits.
    y_train, y_test
        True target values for the two splits.

    Returns
    -------
    dict
        Keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_scores`` (the per-fold R2 array from 5-fold cross-validation).
    """
    print("\n" + "=" * 60)
    print("RANDOM FOREST MODEL PERFORMANCE")
    print("=" * 60)

    # make prediction
    y_train_pred = rf_model.predict(X_train_scaled)
    y_test_pred = rf_model.predict(X_test_scaled)

    # evaluate
    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so
    # passing the predictions first would report a different, wrong score.
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE")
    print("=" * 60)
    # Fixed-point specs: a bare ".2" means two significant digits and would
    # render large errors in scientific notation.
    print("\nTraining set")
    print(f" R2 score: {train_r2:.4f}")
    print(f" RMSE: {train_rmse:,.2f}")
    print(f" MAE: {train_mae:,.2f}")
    print("\nTesting set")
    print(f" R2 score: {test_r2:.4f}")
    print(f" RMSE: {test_rmse:,.2f}")
    print(f" MAE: {test_mae:,.2f}")

    # cross validation
    # NOTE(review): the folds reuse the already-scaled training matrix, so each
    # validation fold was scaled with statistics that include it — confirm
    # whether a Pipeline is wanted here.
    cv_scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=5, scoring="r2")
    print("\nCross Validation (5-folds)")
    print(f" R2 scores: {cv_scores}")
    print(f" Cross validation mean: {cv_scores.mean():.4f}")
    print(f" Cross validation STD: {cv_scores.std():.4f}")

    metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        "cv_scores": cv_scores
    }

    return metrics

In [32]:
def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Score a fitted regressor on both splits and cross-validate it.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train_scaled, X_test_scaled
        Feature matrices for the training and test splits.
    y_train, y_test
        True target values for the two splits.

    Returns
    -------
    dict
        Keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_scores`` (the per-fold R2 array from 5-fold cross-validation).
    """
    print("\n" + "=" * 60)
    print("EVALUATING MODEL")
    print("=" * 60)

    # make prediction
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # evaluate
    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so
    # passing the predictions first would report a different, wrong score.
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print("\n" + "=" * 60)
    print("MODEL PERFORMANCE")
    print("=" * 60)
    # Fixed-point specs: a bare ".2" means two significant digits and would
    # render large errors in scientific notation.
    print("\nTraining set")
    print(f" R2 score: {train_r2:.4f}")
    print(f" RMSE: {train_rmse:,.2f}")
    print(f" MAE: {train_mae:,.2f}")
    print("\nTesting set")
    print(f" R2 score: {test_r2:.4f}")
    print(f" RMSE: {test_rmse:,.2f}")
    print(f" MAE: {test_mae:,.2f}")

    # cross validation
    # NOTE(review): the folds reuse the already-scaled training matrix, so each
    # validation fold was scaled with statistics that include it — confirm
    # whether a Pipeline is wanted here.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="r2")
    print("\nCross Validation (5-folds)")
    print(f" R2 scores: {cv_scores}")
    print(f" Cross validation mean: {cv_scores.mean():.4f}")
    print(f" Cross validation STD: {cv_scores.std():.4f}")

    metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        "cv_scores": cv_scores
    }

    return metrics

In [33]:
def save_model_artifact(model, scaler, label_encoders, feature_columns):
    """Persist the linear model and its preprocessing objects with joblib.

    Four files are written to the current working directory:
    ``car_price_prediction.pkl``, ``scaler_features.pkl``,
    ``label_encoders.pkl`` and ``feature_columns.pkl``.

    Parameters
    ----------
    model
        Fitted regressor to save.
    scaler
        Fitted feature scaler to save.
    label_encoders
        Encoders for the categorical columns.
    feature_columns
        Ordered feature names used for training.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("SAVING MODEL ARTIFACT")
    print(rule)

    # (object, target file, confirmation message), written in this order.
    artifacts = (
        (model, "car_price_prediction.pkl", "Car price prediction model saved successfully"),
        (scaler, "scaler_features.pkl", "Scaler features saved successfully"),
        (label_encoders, "label_encoders.pkl", "Label encoders saved successfully"),
        (feature_columns, "feature_columns.pkl", "Feature columns saved successfully"),
    )
    for payload, filename, confirmation in artifacts:
        joblib.dump(payload, filename)
        print(confirmation)

    print("\n" + rule)
    print("ALL MODEL ARTIFACT SAVED SUCCESSFULLY")
    print(rule)

In [34]:
def save_rf_model_artifact(rf_model):
    """Write the random forest model to ``rf_model_prediction.pkl`` via joblib.

    Parameters
    ----------
    rf_model
        Fitted random forest estimator to persist.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("SAVING RANDOM FOREST MODEL ARTIFACT")
    print(rule)

    target_file = "rf_model_prediction.pkl"
    joblib.dump(rf_model, target_file)
    print("Car price prediction model saved successfully")

    print("\n" + rule)
    print("RANDOM FOREST MODEL ARTIFACT SAVED SUCCESSFULLY")
    print(rule)

In [35]:
def predict_car_price(year, make, model_name, condition, transmission):
    """Predict a price for one car with the saved linear model.

    Loads ``car_price_prediction.pkl``, ``scaler_features.pkl``,
    ``label_encoders.pkl`` and ``feature_columns.pkl`` from the working
    directory, encodes the categorical inputs, scales the row and predicts.

    Parameters
    ----------
    year
        Model year of the car.
    make, model_name, condition, transmission : str
        Categorical values; each must be a label the matching encoder knows.

    Returns
    -------
    float-like or str
        The predicted price, or the string ``"Unknown category - ..."`` when
        an input label is not known to its encoder.

    Raises
    ------
    KeyError
        If the saved feature list names a column this function does not build.
    """
    # joblib files are pickles: only load artifacts produced by this pipeline.
    model = joblib.load("car_price_prediction.pkl")
    scaler = joblib.load("scaler_features.pkl")
    label_encoders = joblib.load("label_encoders.pkl")
    feature_columns = joblib.load("feature_columns.pkl")

    try:
        make_encoded = label_encoders["make"].transform([make])[0]
        model_encoded = label_encoders["model"].transform([model_name])[0]
        condition_encoded = label_encoders["condition"].transform([condition])[0]
        transmission_encoded = label_encoders["transmission"].transform([transmission])[0]
    except ValueError as e:
        return f"Unknown category - {e}"

    features_dict = {
        "year": year,
        "make_encoded": make_encoded,
        "model_encoded": model_encoded,
        "condition_encoded": condition_encoded,
        "transmission_encoded": transmission_encoded
    }

    # Build the row in the persisted training column order, so it cannot drift
    # from what the scaler and model were fitted on if the dict is reordered.
    features = np.array([[features_dict[col] for col in feature_columns]])

    features_scale = scaler.transform(features)

    predicted_price = model.predict(features_scale)[0]

    return predicted_price
        
    

In [36]:
def rf_model_predict_car_price(year, make, model_name, condition, transmission):
    """Predict a price for one car with the saved random forest model.

    Loads ``rf_model_prediction.pkl``, ``scaler_features.pkl``,
    ``label_encoders.pkl`` and ``feature_columns.pkl`` from the working
    directory, encodes the categorical inputs, scales the row and predicts.

    Parameters
    ----------
    year
        Model year of the car.
    make, model_name, condition, transmission : str
        Categorical values; each must be a label the matching encoder knows.

    Returns
    -------
    float-like or str
        The predicted price, or the string ``"Unknown category - ..."`` when
        an input label is not known to its encoder.

    Raises
    ------
    KeyError
        If the saved feature list names a column this function does not build.
    """
    # joblib files are pickles: only load artifacts produced by this pipeline.
    rf_model = joblib.load("rf_model_prediction.pkl")
    scaler = joblib.load("scaler_features.pkl")
    label_encoders = joblib.load("label_encoders.pkl")
    feature_columns = joblib.load("feature_columns.pkl")

    try:
        make_encoded = label_encoders["make"].transform([make])[0]
        model_encoded = label_encoders["model"].transform([model_name])[0]
        condition_encoded = label_encoders["condition"].transform([condition])[0]
        transmission_encoded = label_encoders["transmission"].transform([transmission])[0]
    except ValueError as e:
        return f"Unknown category - {e}"

    features_dict = {
        "year": year,
        "make_encoded": make_encoded,
        "model_encoded": model_encoded,
        "condition_encoded": condition_encoded,
        "transmission_encoded": transmission_encoded
    }

    # Build the row in the persisted training column order, so it cannot drift
    # from what the scaler and model were fitted on if the dict is reordered.
    features = np.array([[features_dict[col] for col in feature_columns]])

    features_scale = scaler.transform(features)

    predicted_price = rf_model.predict(features_scale)[0]

    return predicted_price
        
    

In [40]:
def test_prediction():
    """Print linear-model price predictions for three sample cars.

    Each heading is built from the same values that are passed to
    ``predict_car_price``, so the printed description always matches the
    car that was actually priced.
    """
    print("\n" + "=" * 60)
    print("TESTING PREDICTION")
    print("=" * 60)

    examples = [
        (2015, "Toyota", "Camry", "Local used", "Automatic"),
        (2010, "Honda", "Accord", "Local used", "Automatic"),
        (2012, "Lexus", "RX 350", "Local used", "Automatic"),
    ]

    for year, make, model_name, condition, transmission in examples:
        price = predict_car_price(year, make, model_name, condition, transmission)
        print(f"\n {year} {make} {model_name} ({condition} {transmission})")
        if isinstance(price, str):
            # The predictor returns an explanatory string for unknown
            # categories; show it instead of crashing in float().
            print(f" {price}")
        else:
            print(f" ₦{float(price):,.2f}")

    print("=" * 60)

In [45]:
def test_rf_model_prediction():
    """Print random-forest price predictions for three sample cars.

    Each heading is built from the same values that are passed to
    ``rf_model_predict_car_price``, so the printed description always
    matches the car that was actually priced.
    """
    print("\n" + "=" * 60)
    print("TESTING PREDICTION")
    print("=" * 60)

    examples = [
        (2015, "Toyota", "Camry", "Local used", "Automatic"),
        (2010, "Honda", "Accord", "Local used", "Automatic"),
        (2012, "Lexus", "RX 350", "Local used", "Automatic"),
    ]

    for year, make, model_name, condition, transmission in examples:
        price = rf_model_predict_car_price(year, make, model_name, condition, transmission)
        print(f"\n {year} {make} {model_name} ({condition} {transmission})")
        if isinstance(price, str):
            # The predictor returns an explanatory string for unknown
            # categories; show it instead of crashing in float().
            print(f" {price}")
        else:
            print(f" ₦{float(price):,.2f}")

    print("=" * 60)

In [39]:
def main():
    """Run the whole pipeline: load, preprocess, train, evaluate, save, demo.

    Reads ``cleaned_jiji_car_dataset.csv`` from the working directory, trains
    a linear regression and a random forest on the same scaled split,
    evaluates both, writes their artifacts and prints sample predictions.
    """
    filepath = "cleaned_jiji_car_dataset.csv"

    df = load_and_explore_dataset(filepath)

    df_processed, label_encoders = preprocessing_data(df)

    X, y, feature_columns = features_data(df_processed)

    X_train, X_test, y_train, y_test = split_data(X, y)

    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

    # Linear regression: train, evaluate, persist, then demo.
    model = train_model(X_train_scaled, y_train, feature_columns)

    metrics = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)

    # This also writes the scaler, encoders and feature list, which the random
    # forest predictor loads too, so it must run before the random forest demo.
    save_model_artifact(model, scaler, label_encoders, feature_columns)

    test_prediction()

    # Random forest: trained and evaluated once. Repeating these steps gives
    # the same model (fixed random_state) and only repeats the 5-fold CV.
    rf_model = train_random_forest_model(X_train_scaled, y_train, feature_columns)

    rf_metrics = evaluate_rfmodel(rf_model, X_train_scaled, X_test_scaled, y_train, y_test)

    save_rf_model_artifact(rf_model)

    test_rf_model_prediction()
    


In [46]:
 # Entry point: run the full pipeline when this module is the main program.
 if __name__ == "__main__":
     main()

LOAD AND  EXPLORE DATASET
Shape of the dataset
(1755, 7)

Check for missing values
title           0
make            0
model           0
year            0
condition       0
transmission    0
price           0
dtype: int64

First five rows:
                                               title           make  \
0                            Lexus RX 350 2009 White          Lexus   
1  Hyundai Sonata Limited w/Brown Leather 4dr Sed...        Hyundai   
2                Toyota Highlander Limited 2012 Gray         Toyota   
3  Mercedes-Benz C300 Base AWD 4Matic Sedan (2.0L...  Mercedes-Benz   
4                        Hyundai Elantra 2014 Silver        Hyundai   

                model  year     condition transmission         price  
0              RX 350  2009  Foreign used    Automatic 12,850,000.00  
1      Sonata Limited  2015  Foreign used    Automatic 15,450,000.00  
2  Highlander Limited  2012    Local used    Automatic 14,500,000.00  
3           C300 Base  2015    Local used    Auto