## Importing Libraries


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import pickle as pk
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import FeatureHasher
from sklearn import set_config
import math
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.model_selection import GridSearchCV

# from tensorflow import keras
# from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

## Read Dataset


In [None]:
df = pd.read_csv("pakwheels.csv")

## --------------------------------------Understand the data---------------------------------------------------------------

## 1) Data preprocessing

## 2) Data Exploration

## 3) Data cleaning


In [None]:
df.head(3)

# -----------------------------------------------------------------------------------------------------------------


In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df["ad_last_updated"] = pd.to_datetime(df["ad_last_updated"])

In [None]:
df["ad_last_updated"].max()

### The dataset covers vehicles posted on PakWheels up to 10 October 2022


# ---------------------------------------------------------------------------------------------------------------


# Data Cleaning


## 1) Drop irrelevant columns


In [None]:
# Remove the columns that the rest of this notebook does not use.
df = df.drop(columns=["Unnamed: 0", "ad_url", "description", "car_features"])

In [None]:
df.head(1)

In [None]:
df.shape

In [None]:
print("We have data of", df.shape[0], "vehicles")

## Create individual columns for brand, vehicle name, and variant from the title column


In [None]:
df["title"].str.split(" ")[0][0]

In [None]:
# The brand is the first space-separated token of the listing title.
df["brand"] = df["title"].map(lambda title: title.split(" ", 1)[0])

In [None]:
df.brand.nunique()

In [None]:
# The vehicle name is the second space-separated token of the listing title.
df["vehicle"] = [title.split(" ")[1] for title in df["title"]]

In [None]:
df.head(1)

In [None]:
print("NO OF UNIQUE VEHICLES ARE = ", df.vehicle.nunique())

In [None]:
# Keep the tokens after brand and vehicle, leaving out the final token, as a
# list per row (the next cell joins them back into a single string).
df["variant"] = df["title"].map(lambda title: title.split(" ")[2:-1])

In [None]:
df.variant = df.variant.str.join(" ")

In [None]:
df.head(1)

In [None]:
pd.set_option("display.max_rows", None)

In [None]:
df.brand.value_counts().nlargest(30).plot(kind="bar")

## Dropping brands that have fewer than 1000 listed vehicles


In [None]:
# Keep only the most frequently listed brands. `brand` is the first
# space-separated token of the title, so the names below must not carry any
# leading/trailing whitespace (a padded value such as " Honda" never matches
# and would silently remove every Honda listing).
major_brands = [
    "Toyota",
    "Suzuki",
    "Honda",
    "KIA",
    "Daihatsu",
    "Hyundai",
    "Changan",
    "Nissan",
    "Mitsubishi",
]
# Alternative, wider selection (kept for reference):
# major_brands += ["Mercedes", "MG", "DFSK", "Audi", "FAW", "Proton", "Prince", "BMW", "Mazda"]

# .copy() yields an independent frame so the in-place edits in later cells do
# not operate on a slice of the original data (SettingWithCopyWarning).
df = df[df["brand"].isin(major_brands)].copy()

In [None]:
df.shape

In [None]:
df.variant.nunique()

In [None]:
df.head(1)

## Unique values in every column


In [None]:
# Report how many distinct values each column holds.
columns = df.columns
print("No of unique items in each columns")
print("--------------------------------------------------")
for column, distinct_count in df.nunique().items():
    print("No of unique items in ", column, " = ", distinct_count)
print("--------------------------------------------------")

In [None]:
df.info()

In [None]:
df.drop(["location", "assembly", "body_type"], axis=1, inplace=True)

In [None]:
df.price.value_counts()

In [None]:
df.price.value_counts().nlargest(30).plot(kind="bar")

## Remove rows where the price is not provided, then parse the price text and convert it to lacs


In [None]:
# Listings without a numeric price cannot be used as training targets.
# .copy() makes the result an independent frame: later cells add columns and
# drop rows in place, which would otherwise act on a slice of the previous
# frame and raise SettingWithCopyWarning.
df = df[df["price"] != "Call for price"].copy()

In [None]:
df.head(1)

In [None]:
df.info()

In [None]:
df.dropna(inplace=True)

In [None]:
def _crore_figure(price_text):
    """Return the second space-separated token of a "crore" price, else 0."""
    tokens = price_text.split(" ")
    return tokens[1] if "crore" in tokens else 0


# Amounts quoted in crore go to their own column; other rows get 0.
df["crore"] = df["price"].map(_crore_figure)

In [None]:
df.head(1)

In [None]:
df.crore = df.crore.astype(float)

In [None]:
df.crore = df.crore * 100

In [None]:
df.price.value_counts()

In [None]:
# Keep the numeric token of prices quoted in lacs; every other row becomes 0
# (crore amounts are added back from df.crore in a later cell).
df["price"] = [
    tokens[1] if "lacs" in tokens else 0
    for tokens in (price_text.split(" ") for price_text in df["price"])
]

In [None]:
df.price.value_counts()

In [None]:
df.price = df.price.astype(float)

In [None]:
df.price = df.price + df.crore

In [None]:
df.price.value_counts()

In [None]:
df.head(1)

In [None]:
df.drop("crore", axis=1, inplace=True)

In [None]:
df.head(1)

In [None]:
df.info()

In [None]:
df.model_year = df.model_year.astype(int)

In [None]:
df.mileage = df.mileage.apply(lambda x: (x.split(" ")[0]))

In [None]:
# Strip the thousands separators from the mileage text.
# NOTE(review): the column still holds strings after this step (no numeric
# cast is applied in the visible cells) — confirm whether that is intended.
df["mileage"] = df["mileage"].map(lambda reading: "".join(reading.split(",")))

In [None]:
df.head(1)

In [None]:
df.drop("title", axis=1, inplace=True)

In [None]:
# Report how many distinct values each remaining column holds.
columns = df.columns
print("No of unique items in each columns")
print("--------------------------------------------------")
for column, distinct_count in df.nunique().items():
    print("No of unique items in ", column, " = ", distinct_count)
print("--------------------------------------------------")

In [None]:
df.engine_capacity = df.engine_capacity.apply(lambda x: (x.split(" ")[0]))

In [None]:
df.head(1)

In [None]:
df.info()

In [None]:
df.engine_capacity = df.engine_capacity.astype(int)

In [None]:
df.transmission.value_counts()

In [None]:
df.transmission.value_counts().plot(kind="bar")

In [None]:
df.engine_type.value_counts()

In [None]:
df.engine_type.value_counts().plot(kind="bar")

In [None]:
df.sample(5)

In [None]:
df.drop("ad_last_updated", axis=1, inplace=True)

In [None]:
# Collect, for every vehicle, the array of distinct variants listed under it.
# NOTE(review): df1 is not referenced again in the visible part of this
# notebook — confirm whether it is still needed.
group = df.groupby("vehicle")
df1 = group.apply(lambda x: x["variant"].unique())

In [None]:
z = df["vehicle"].value_counts()

In [None]:
y = z[z < 30]

In [None]:
a = y.index

In [None]:
a = list(a)

# ------------------------------------------------------------------------------------------------------------


In [None]:
df.head(2)

In [None]:
# Reorder the columns so that `price`, which the train/test split below uses
# as the target, comes last.
column_order = [
    "model_year", "mileage", "registered_in", "color", "engine_capacity",
    "brand", "vehicle", "variant", "transmission", "engine_type",
    "price",
]
df = df.loc[:, column_order]

In [None]:
df.head(2)

## Convert all data to lowercase


In [None]:
# lowercase_if_string = lambda x: x.lower() if isinstance(x, str) else x
# df = df.applymap(lowercase_if_string)

In [None]:
df.head(10)

In [None]:
print("We have data of", df.shape[0], "vehicles")

In [None]:
# Renumber the rows from 0 and write the cleaned data to disk.
# NOTE(review): index=True also writes the row index as an unnamed first
# column — confirm that readers of this file expect it.
df = df.reset_index(drop=True)
df.to_csv("cleaned_data.csv", index=True)

In [None]:
df.color.value_counts().nlargest(30).plot(kind="bar")

In [None]:
# Keep only listings whose colour is one of the named colours below.
kept_colors = [
    "Grey",
    "Black",
    "Silver",
    "White",
    "Blue",
    "Green",
    "Maroon",
    "Red",
    "Burgundy",
    "Turquoise",
    "Gold",
    "Beige",
    "Brown",
    "Navy",
    "Yellow",
]
# isin() replaces the long chain of equality tests; .copy() makes the result
# an independent frame so the in-place drop in a later cell does not act on a
# slice (SettingWithCopyWarning).
df = df[df["color"].isin(kept_colors)].copy()

In [None]:
df.shape

In [None]:
# Leave `variant` and `engine_capacity` out of the modelling data set.
df = df.drop(columns=["variant", "engine_capacity"])

In [None]:
df.shape

In [None]:
# Renumber the rows from 0 and write the reduced data set to disk.
# NOTE(review): index=True also writes the row index as an unnamed first
# column — confirm that readers of this file expect it.
df = df.reset_index(drop=True)
df.to_csv("New_cleaned_data.csv", index=True)

# ---------------------------------------------------------------------------------------------------------------


# -------------------------Try different models and select one of them----------------------------


# Train Test Split


In [None]:
# Hold out 30% of the rows for evaluation; `price` is the target and every
# other column is a feature. The fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns="price"),
    df["price"],
    test_size=0.3,
    random_state=12,
)

In [None]:
x_train.head(2)

# Let's try a Random Forest Regressor


In [None]:
# Random-forest baseline: four chained ColumnTransformers feed a
# RandomForestRegressor; the fitted pipeline is then scored on the held-out
# split. All column selections below are positional, not by name.

# Step 1: one-hot encode the columns at positions 6 and 7 of x_train
# (drop="first" removes one dummy column per feature). With
# remainder="passthrough" the untouched columns are appended after the
# encoded ones, so column positions change from this step onwards.
tf1_rfr = ColumnTransformer(
    [("OHE", OneHotEncoder(sparse_output=False, drop="first"), [6, 7])],
    remainder="passthrough",
)
# Step 2: ordinal-encode positions 4-8 of step 1's OUTPUT (not of x_train).
# Categories unseen during fit are encoded as NaN.
# NOTE(review): which features sit at positions 4-8 depends on how many dummy
# columns step 1 emits — confirm against the fitted encoder.
tf2_rfr = ColumnTransformer(
    [
        (
            "Ordinal Encoding",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
            [4, 5, 6, 7, 8],
        )
    ],
    remainder="passthrough",
)
# Step 3: replace NaNs in the first nine columns with each column's most
# frequent value. No remainder is specified, so the ColumnTransformer default
# (remainder="drop") discards any column beyond position 8.
tf3_rfr = ColumnTransformer(
    [
        (
            "Nan Imputer",
            SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
            slice(0, 9),
        )
    ]
)
# Step 4: standardise the same nine columns.
tf4_rfr = ColumnTransformer([("Standard Scaler", StandardScaler(), slice(0, 9))])
# Default hyper-parameters; no random_state is set, so results can differ
# between runs.
m_rfr = RandomForestRegressor()

pipe_rfr = Pipeline(
    [
        ("One Hot Encoder", tf1_rfr),
        ("Ordinal Encoding", tf2_rfr),
        ("Nan Imputer", tf3_rfr),
        ("StandardScaler", tf4_rfr),
        ("Random Forest Regressor", m_rfr),
    ]
)

# Fit every step on the training split only.
pipe_rfr.fit(x_train, y_train)


# Predict prices for the held-out rows.
y_predict_rfr = pipe_rfr.predict(x_test)


# Error metrics are in the same unit as `price` (squared unit for MSE).
print("MAE = ", mean_absolute_error(y_test, y_predict_rfr))
print("MSE = ", mean_squared_error(y_test, y_predict_rfr))
print("RMSE = ", np.sqrt(mean_squared_error(y_test, y_predict_rfr)))
print("r2 Score = ", r2_score(y_test, y_predict_rfr))


# Show the pipeline as a diagram; the bare expression below is the cell output.
set_config(display="diagram")
pipe_rfr

In [None]:
import pickle

# Write the fitted pipeline to disk as a pickle file.
model_filename = "rfr_model.pkl"
with open(model_filename, mode="wb") as model_file:
    model_file.write(pickle.dumps(pipe_rfr))

print(f"Random Forest Regressor Model saved as {model_filename}")

In [None]:
# import joblib

# # Save the trained pipeline to a file using joblib
# model_file = "rfr_model.joblib"
# joblib.dump(pipe_rfr, model_file)

# print(f"Random Forest Regressor Model saved as {model_file}")

In [None]:
# import joblib
# import os

# # Define the chunk size (e.g., 85MB)
# chunk_size = 85 * 1024 * 1024  # 85 MB

# # Save the trained pipeline to a file using joblib
# model_file = "rfr_model.joblib"
# output_dir = "model_chunks/"

# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# with open(model_file, "rb") as f_in:
#     chunk_number = 0
#     while True:
#         chunk = f_in.read(chunk_size)
#         if not chunk:
#             break
#         chunk_filename = os.path.join(output_dir, f"chunk_{chunk_number}.joblib")
#         joblib.dump(chunk, chunk_filename)
#         chunk_number += 1

# print(f"Random Forest Regressor Model saved as split chunks in {output_dir}")


# Let's try ensemble learning with 2 Random Forest Regressors


In [None]:
# tf1 = ColumnTransformer(
#     [("OHE", OneHotEncoder(sparse=False, drop="first"), [6, 7])],
#     remainder="passthrough",
# )
# tf2 = ColumnTransformer(
#     [
#         (
#             "Ordinal Encoding",
#             OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
#             [4, 5, 6, 7, 8],
#         )
#     ],
#     remainder="passthrough",
# )
# tf3 = ColumnTransformer(
#     [
#         (
#             "Nan Imputer",
#             SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
#             slice(0, 9),
#         )
#     ]
# )
# tf4 = ColumnTransformer([("Standard Scaler", StandardScaler(), slice(0, 9))])
# m1 = RandomForestRegressor()
# m2 = RandomForestRegressor()
# ensemble_model = VotingRegressor(estimators=[("model1", m1), ("model2", m2)])


# pipe_2rfr = Pipeline(
#     [
#         ("One Hot Encoder", tf1),
#         ("Ordinal Encoding", tf2),
#         ("Nan Imputer", tf3),
#         ("StandardScaler", tf4),
#         ("EnsembleModel", ensemble_model),
#     ]
# )


# pipe_2rfr.fit(x_train, y_train)


# y_predict = pipe_2rfr.predict(x_test)


# print("MAE = ", mean_absolute_error(y_test, y_predict))
# print("MSE = ", mean_squared_error(y_test, y_predict))
# print("RMSE = ", np.sqrt(mean_squared_error(y_test, y_predict)))
# print("r2 Score = ", r2_score(y_test, y_predict))


# set_config(display="diagram")
# pipe_2rfr

# Let's try ensemble learning with 3 Random Forest Regressors


In [None]:
# tf1 = ColumnTransformer(
#     [("OHE", OneHotEncoder(sparse=False, drop="first"), [6, 7])],
#     remainder="passthrough",
# )
# tf2 = ColumnTransformer(
#     [
#         (
#             "Ordinal Encoding",
#             OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
#             [4, 5, 6, 7, 8],
#         )
#     ],
#     remainder="passthrough",
# )
# tf3 = ColumnTransformer(
#     [
#         (
#             "Nan Imputer",
#             SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
#             slice(0, 9),
#         )
#     ]
# )
# tf4 = ColumnTransformer([("Standard Scaler", StandardScaler(), slice(0, 9))])
# m1 = RandomForestRegressor()
# m2 = RandomForestRegressor()
# m3 = RandomForestRegressor()
# ensemble_model = VotingRegressor(
#     estimators=[("model1", m1), ("model2", m2), ("model3", m3)]
# )


# pipe_3rfr = Pipeline(
#     [
#         ("One Hot Encoder", tf1),
#         ("Ordinal Encoding", tf2),
#         ("Nan Imputer", tf3),
#         ("StandardScaler", tf4),
#         ("EnsembleModel", ensemble_model),
#     ]
# )


# pipe_3rfr.fit(x_train, y_train)


# y_predict = pipe_3rfr.predict(x_test)


# print("MAE = ", mean_absolute_error(y_test, y_predict))
# print("MSE = ", mean_squared_error(y_test, y_predict))
# print("RMSE = ", np.sqrt(mean_squared_error(y_test, y_predict)))
# print("r2 Score = ", r2_score(y_test, y_predict))


# set_config(display="diagram")
# pipe_3rfr

# Best performance is from the ensemble of 3 Random Forest Regressors, with a 96% r2 score


In [None]:
# # Input [['model_year','mileage','registered city','colour','engine_capacity','brand','vehicle','variant','transmission','engine_type']]
# sample = [['2015','135000','karachi','black','1300','Honda','City','ivtec','automatic','petrol']]
# print('Predicted Price is Rs: ',round(pipe_3rfr.predict(sample)[0],3),'Lakhs.')