In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [18]:
df = pd.read_csv('cleaned_data.csv')
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)

In [19]:
df.shape

(78410, 9)

# Train Test Split


In [20]:
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=["price"], axis=1), df["price"], test_size=0.3, random_state=12
)

In [21]:
x_train.head(2)

Unnamed: 0,model_year,mileage,registered_in,color,brand,vehicle,transmission,engine_type
37490,2006,101000,Karachi,Black,Toyota,Prado,Automatic,Petrol
39918,2020,9500,Islamabad,White,KIA,Sportage,Automatic,Petrol


# Linear regression


In [22]:
tf1_lr = ColumnTransformer(
    [("OHE", OneHotEncoder(sparse_output=False, drop="first"), [6, 7])],
    remainder="passthrough",
)
tf2_lr = ColumnTransformer(
    [
        (
            "Ordinal Encoding",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
            [4, 5, 6, 7, 8],
        )
    ],
    remainder="passthrough",
)
tf3_lr = ColumnTransformer(
    [
        (
            "Nan Imputer",
            SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
            slice(0, 9),
        )
    ]
)
tf4_lr = ColumnTransformer([("Standard Scaler", StandardScaler(), slice(0, 9))])

m_lr =  LinearRegression()


pipe_lr = Pipeline([('One Hot Encoder',tf1_lr),('Ordinal Encoding',tf2_lr),('Nan Imputer',tf3_lr),('StandardScaler',tf4_lr),('Linear Regression',m_lr)])


pipe_lr.fit(x_train,y_train)


y_predict_lr = pipe_lr.predict(x_test)

print("Linear Regression Model Evaluation Metrics")
print("MAE = ",mean_absolute_error(y_test,y_predict_lr))
print("MSE = ",mean_squared_error(y_test,y_predict_lr))
print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_predict_lr)))
print('r2 Score = ',r2_score(y_test,y_predict_lr))


set_config(display='diagram')
pipe_lr

Linear Regression Model Evaluation Metrics
MAE =  12.760292075299427
MSE =  730.0211946325977
RMSE =  27.018904393638866
r2 Score =  0.3186763406919827


# Random Forest Regressor


In [23]:
tf1_rfr = ColumnTransformer(
    [("OHE", OneHotEncoder(sparse_output=False, drop="first"), [6, 7])],
    remainder="passthrough",
)
tf2_rfr = ColumnTransformer(
    [
        (
            "Ordinal Encoding",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
            [4, 5, 6, 7, 8],
        )
    ],
    remainder="passthrough",
)
tf3_rfr = ColumnTransformer(
    [
        (
            "Nan Imputer",
            SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
            slice(0, 9),
        )
    ]
)
tf4_rfr = ColumnTransformer([("Standard Scaler", StandardScaler(), slice(0, 9))])
m_rfr = RandomForestRegressor()

model = Pipeline(
    [
        ("One Hot Encoder", tf1_rfr),
        ("Ordinal Encoding", tf2_rfr),
        ("Nan Imputer", tf3_rfr),
        ("StandardScaler", tf4_rfr),
        ("Random Forest Regressor", m_rfr),
    ]
)

model.fit(x_train, y_train)


y_predict_rfr = model.predict(x_test)

print("Random Forest Regressor Model Evaluation Metrics")
print("MAE = ", mean_absolute_error(y_test, y_predict_rfr))
print("MSE = ", mean_squared_error(y_test, y_predict_rfr))
print("RMSE = ", np.sqrt(mean_squared_error(y_test, y_predict_rfr)))
print("r2 Score = ", r2_score(y_test, y_predict_rfr))


set_config(display="diagram")
model

Random Forest Regressor Model Evaluation Metrics
MAE =  2.703906561441815
MSE =  43.48159654707264
RMSE =  6.594057669377228
r2 Score =  0.959418930998413


# XG Boost Regressor

In [24]:
# Create an XGBoost Regressor model
xgb_model = XGBRegressor()

# Replace the 'Random Forest Regressor' in your pipeline with 'XGBoost Regressor'
model = Pipeline([
    ("One Hot Encoder", tf1_rfr),
    ("Ordinal Encoding", tf2_rfr),
    ("Nan Imputer", tf3_rfr),
    ("StandardScaler", tf4_rfr),
    ("XGBoost Regressor", xgb_model),
])

# Fit the model
model.fit(x_train, y_train)

# Predictions
y_predict_xgb = model.predict(x_test)

# Evaluation
print("XG Boost Regressor Model Evaluation Metrics")
print("MAE = ", mean_absolute_error(y_test, y_predict_xgb))
print("MSE = ", mean_squared_error(y_test, y_predict_xgb))
print("RMSE = ", np.sqrt(mean_squared_error(y_test, y_predict_xgb)))
print("r2 Score = ", r2_score(y_test, y_predict_xgb))


XG Boost Regressor Model Evaluation Metrics
MAE =  2.8241034750588825
MSE =  37.66799108159461
RMSE =  6.137425444076254
r2 Score =  0.96484472819257
