<a href="https://colab.research.google.com/github/hussainqadiim/-/blob/main/Fork_of_Saudi_cars_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'saudi-arabia-used-cars-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1430609%2F2395181%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240215%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240215T145254Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3472fee4f31abbe9773f9749fefc94c8c5da915bd434b518af45aa1a433847f87f1b37bf310daff9b651bffaf4001099c939dd959a221687e205e404494b706168d0f8c22f807eeada43a3d611c0a24fc363b74ad8321116cbbbf3ec03a047f2d7614abedd4ce925985b86804c55e19e9ad0a00ac1ba7d6ad28be9df97629217880fff6d88c7ef06bf2d20d8536fb2b8da93d7c603c5629cf06a949c8334cb299e16b36a10f08c921532947662082a9fcd1f456a28d2ae163dc0842081e32ce069af6d7eb338fdbf6226c898eea021ad7049a79cb079075033b9cd7a11c52805a216e8a91d5f429a8f427ba6d0d71eb0afee856a407f6c84994cd1229ec9bb67'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory, then make sure both Kaggle-style
# directories exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working; a link left over from a
# previous run is kept as-is.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'),
                               (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING and unpack it below KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the entry still unpacks into two parts
    # even if the URL portion ever contains a literal colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent (e.g. chunked responses) or zero; in that
            # case the byte counter is still shown but the bar stays empty.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch re-opens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the module and break the next tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must be handled before OSError, which HTTPError derives from.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


### **Importing Modules and Predefined Functions**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from yellowbrick.regressor import PredictionError
from yellowbrick.features import RadViz
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

%matplotlib inline

# Default size (width, height) applied to matplotlib figures created after this point
plt.rcParams["figure.figsize"] = (7,4)
import warnings
# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')
# Allow pandas to show up to 1000 rows and 1000 columns before truncating output
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
# Optional display tweaks, currently disabled:
#pd.set_option('display.width', 1000)
#pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
#connecting to Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Load the used-cars CSV from the Kaggle input directory and preview ten rows
csv_path = '/kaggle/input/saudi-arabia-used-cars-dataset/UsedCarsSA_Clean_EN.csv'
df = pd.read_csv(csv_path)
df.head(10)

In [None]:
df.info()

In [None]:
# Checking for missing values in the DataFrame 'df' and calculating the count of null values per column
df.isnull().sum()

## **Data Cleaning & Exploratory Data Analysis**



In [None]:
# Collect the 'Type' labels that occur in at most 50 rows of 'df'.
# The frequency table is built once and reused for the threshold test.
type_counts = df.Type.value_counts()
drop_model = type_counts[type_counts <= 50].index
drop_model


In [None]:
# Remove every row whose 'Type' is listed in 'drop_model'.
# One membership mask replaces a separate filter-and-drop pass per label.
rare_rows = df.index[df['Type'].isin(drop_model)]
df.drop(index=rare_rows, inplace=True)

# Renumber the remaining rows from 0 and discard the old index
df.reset_index(drop=True, inplace=True)


In [None]:
df.Type.value_counts()

In [None]:
# Select columns with data type 'object' from the DataFrame 'df' and retrieve the first few rows

df_object = df.select_dtypes(include ="object").head()
df_object

In [None]:
# For each object-typed column, report its number of distinct values in the
# full DataFrame 'df' ('df_object' only supplies the column names).
for column_name in df_object.columns:
    distinct_count = df[column_name].nunique()
    print(f"{column_name:<30}:", distinct_count)

In [None]:
print(df.columns)

In [None]:
# Columns removed from the working DataFrame before modelling
unused_columns = [
    "Make",
    "Origin",
    "Color",
    "Engine_Size",
    "Gear_Type",
    "Fuel_Type",
    "Region",
    "Negotiable",
]
df.drop(columns=unused_columns, inplace=True)
df.head()

In [None]:
df.shape

In [None]:
#filter the DataFrame 'df' to show rows where the 'Price' column has a value of 0

df[df.Price == 0]

In [None]:
# Keep only listings whose 'Price' is not 0
has_price = df['Price'] != 0
df = df[has_price]
df.head(10)

In [None]:
df.shape

In [None]:
# Order 'df' in place from the cheapest to the most expensive listing
df.sort_values('Price', ascending=True, inplace=True)
df.head(20)

In [None]:
# Keep listings priced strictly above 5000 (rows at 5000 or less are dropped)
above_price_floor = df['Price'] > 5000
df = df[above_price_floor]
df.head(10)

In [None]:
len(df[df.Price > 170000])

In [None]:
# Keep listings priced strictly below 170000 (rows at 170000 or more are dropped)
below_price_cap = df.Price < 170000
df = df[below_price_cap]
df.head(10)

In [None]:
# Keep listings with mileage strictly below 700000 (rows at 700000 or more are dropped)
below_mileage_cap = df['Mileage'] < 700000
df = df[below_mileage_cap]
df.head(10)

In [None]:
df.describe().T

## **Data Visualization**

In [None]:
# Pairwise correlation of the numeric columns, drawn on a fixed -1..1 colour scale
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, vmin=-1, vmax=1, cmap="coolwarm")
plt.show()

In [None]:
# Bar chart of the 35 most frequent 'Type' values, each bar annotated with
# its row count.
top_type_counts = df.Type.value_counts().head(35)
ax = top_type_counts.plot(kind="bar", figsize=(20, 5))
ax.bar_label(ax.containers[0]);

In [None]:
# Vehicle age in years, measured against a fixed reference year (2023),
# not against the date the notebook is run.
REFERENCE_YEAR = 2023
df["vehicle_age"] = REFERENCE_YEAR - df["Year"]

In [None]:
df.head()

In [None]:
# 'Year' is replaced by 'vehicle_age', so remove the original column in place
df.drop(columns="Year", inplace=True)

In [None]:
df.head()

In [None]:
df.select_dtypes("object").head()

In [None]:
# Print each object-typed column together with its number of distinct values
for column_name in df.select_dtypes("object").columns:
    print(column_name, df[column_name].nunique())

# Split Data

In [None]:
# Target is the listing price; every other remaining column is a feature
y = df["Price"]
X = df.drop(columns="Price")

In [None]:

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

# Report the shape of each of the four partitions
for caption, partition in (
    ("Train features shape : ", X_train),
    ("Train target shape   : ", y_train),
    ("Test features shape  : ", X_test),
    ("Test target shape    : ", y_test),
):
    print(caption, partition.shape)

In [None]:
cat = X.select_dtypes("object").columns
cat

In [None]:

# One-hot encode the object-typed columns listed in 'cat' (categories unseen at
# fit time are ignored at transform time) and min-max scale every other column.
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
column_trans = make_column_transformer(
    (categorical_encoder, cat),
    remainder=MinMaxScaler(),
    verbose_feature_names_out=False,
)

In [None]:
sns.histplot(df.Price, bins=100, kde=True);

In [None]:
# Per-model price outlier report using the 1.5 * IQR rule.
total_outliers = []

for model in df.Type.unique():
    # Prices of the current model only
    car_prices = df.loc[df["Type"] == model, "Price"]

    # Fences sit 1.5 interquartile ranges outside the first and third quartiles
    first_quartile = car_prices.quantile(0.25)
    third_quartile = car_prices.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_lim = first_quartile - 1.5 * spread
    upper_lim = third_quartile + 1.5 * spread

    beyond_fences = (car_prices < lower_lim) | (car_prices > upper_lim)
    count_of_outliers = car_prices[beyond_fences].count()
    total_outliers.append(count_of_outliers)

    # Share of this model's rows that fall outside the fences
    outlier_rate = (count_of_outliers / len(car_prices)).round(3)
    print(
        f" The count of outlier for {model:<24} : {count_of_outliers:<2},"
        f"    The rate of outliers : {outlier_rate}"
    )

print()
outlier_total = sum(total_outliers)
print("Total_outliers : ", outlier_total, "The rate of total outliers :", (outlier_total / len(df)).round(3))

In [None]:
df

In [None]:
print('Duplicated Rows : ', df.duplicated().sum())

In [None]:
df = df.drop_duplicates()


In [None]:
print('Remaining Duplicated Rows:', df.duplicated().sum())

In [None]:
print('Missing Value   : ', df.isna().sum().sum())

In [None]:


# Box plot of the 'Price' column of 'df'
sns.boxplot(x=df['Price'])

# Render the figure
plt.show()

## Data Pre-Processing

In [None]:
# Column to be one-hot encoded
cat_onehot = ['Type']

# Column to be ordinal encoded, with the category order handed to the encoder.
# NOTE(review): the order presumably runs from least to most equipped —
# confirm against the values printed by df['Options'].unique().
cat_ordinal = ['Options']
cat_for_Options = ["Standard" , "Semi Full" ,"Full"]

In [None]:
df['Options'].unique()

In [None]:


# Preprocessing for the model pipelines. Uses 'cat_onehot', 'cat_ordinal' and
# 'cat_for_Options' from the configuration cell above.

# Scaler for the numeric columns (maps each one to the 0..1 range)
scaler = MinMaxScaler()

column_trans = make_column_transformer(
    # 'sparse_output' is the current name of the dense/sparse switch; the old
    # 'sparse' keyword is no longer accepted by recent scikit-learn releases.
    (OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_onehot),
    (OrdinalEncoder(categories=[cat_for_Options]), cat_ordinal),
    (scaler, ['Mileage', 'vehicle_age']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Return pandas DataFrames (with feature names) instead of NumPy arrays
column_trans = column_trans.set_output(transform="pandas")


In [None]:
column_trans.fit_transform(X_train).head()

In [None]:
# Fit the transformer on the training features only, then apply that same
# fitted transformer to the test features.
X_train_trans = column_trans.fit_transform(X_train)
X_test_trans = column_trans.transform(X_test)

In [None]:
X_train_trans.shape, X_test_trans.shape

In [None]:
X_train_trans.join(y_train).corr()

# Model Building

In [None]:
def train_val(model, X_train, y_train, X_test, y_test):
    """Score an already-fitted regressor on the train and test sets.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target.

    Returns
    -------
    pandas.DataFrame
        Rows ``R2``, ``mae``, ``mse`` and ``rmse``; columns ``train`` and
        ``test``.
    """

    def _regression_scores(y_true, y_hat):
        # RMSE is taken as the square root of the MSE rather than through the
        # `squared=False` flag, which newer scikit-learn releases no longer
        # accept. For a single target the two are the same value.
        mse = mean_squared_error(y_true, y_hat)
        return {"R2": r2_score(y_true, y_hat),
                "mae": mean_absolute_error(y_true, y_hat),
                "mse": mse,
                "rmse": mse ** 0.5}

    scores = {"train": _regression_scores(y_train, model.predict(X_train)),
              "test": _regression_scores(y_test, model.predict(X_test))}

    return pd.DataFrame(scores)

In [None]:


# Linear-regression baseline: preprocessing followed by ordinary least squares.
# The estimator created here is the one placed in the pipeline (previously a
# second, separate instance was fitted and this one was left unused).
LR_model = LinearRegression()
operations = [("preprocess", column_trans),
              ("Linear", LR_model)]

LR_pipeline = Pipeline(steps=operations)
LR_pipeline.fit(X_train, y_train)

# Test-set predictions of the fitted pipeline
y_pred = LR_pipeline.predict(X_test)

In [None]:
train_val(LR_pipeline, X_train, y_train, X_test, y_test)

In [None]:



# NOTE(review): this RadViz instance is overwritten two lines below and never
# used. It may be kept only for a side effect of `size` on the current
# matplotlib figure — confirm before removing it.
visualizer = RadViz(size=(720, 600))

# Alias for the fitted linear pipeline (not referenced again in this cell)
LR_model = LR_pipeline
# Prediction-error plot for the linear pipeline
visualizer = PredictionError(LR_pipeline)
visualizer.fit(X_train, y_train)  # Fit the visualizer on the training data
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show();

In [None]:
# Lasso regression (default settings) behind the shared preprocessing step
operations = [
    ("preprocess", column_trans),
    ("Lasso", Lasso()),
]
lasso_model = Pipeline(steps=operations)
lasso_model.fit(X_train, y_train)

In [None]:
train_val(lasso_model, X_train, y_train, X_test, y_test)

In [None]:
# NOTE(review): xgboost is already imported in the imports cell near the top of
# this notebook, so on a fresh session this install runs too late to help that
# import. Consider moving it before the imports.
pip install xgboost

In [None]:


# Gradient-boosted trees (XGBoost defaults) behind the shared preprocessing step
xgb_model = XGBRegressor()
operations = [
    ("preprocess", column_trans),
    ('xgb_model', xgb_model),
]
XGB_pipeline = Pipeline(steps=operations)
XGB_pipeline.fit(X_train, y_train)





In [None]:
train_val(XGB_pipeline, X_train, y_train, X_test, y_test)

In [None]:


# Random forest behind the shared preprocessing step.
# The seeded estimator created here is the one placed in the pipeline
# (previously an unseeded instance was created and left unused), matching how
# the XGBoost cell passes 'xgb_model' into its pipeline.
RF_model = RandomForestRegressor(random_state=101)

operations = [("preprocess", column_trans), ("RF_model", RF_model)]

RF_pipeline = Pipeline(steps=operations)

RF_pipeline.fit(X_train, y_train)

In [None]:
train_val(RF_pipeline, X_train, y_train, X_test, y_test)

In [None]:


# NOTE(review): this RadViz instance is overwritten two lines below and never
# used. It may be kept only for a side effect of `size` on the current
# matplotlib figure — confirm before removing it.
visualizer = RadViz(size=(720, 600))

# From here on RF_model refers to the fitted random-forest pipeline
RF_model = RF_pipeline
# Prediction-error plot for the random-forest pipeline
visualizer = PredictionError(RF_model)
visualizer.fit(X_train, y_train)  # Fit the visualizer on the training data
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
# NOTE(review): the original remark here said prices above 300000 affect the
# predictions, but 'df' was restricted to Price < 170000 earlier — confirm
# whether that remark still applies.
visualizer.show();

# Prediction

In [None]:
# Keep rows priced below 200000.
# NOTE(review): 'df' was already limited to Price < 170000 earlier, so this
# filter presumably removes nothing — confirm it is still needed.
df_new = df[df.Price < 200000]
df_new.head()

In [None]:
X = df_new.drop(columns="Price")
y = df_new.Price

In [None]:
# Split again with the same 80/20 ratio and seed as the earlier split.
# NOTE(review): none of the visible cells after this one refits a model on the
# new split; the pipeline pickled below was trained on the earlier one.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
import pickle

# Persist the fitted linear-regression pipeline (preprocessing + model) to disk.
# The file name gets its own variable so the 'LR_model' name is not overwritten
# with a string.
model_path = 'LR_model.pkl'

with open(model_path, 'wb') as file:
    pickle.dump(LR_pipeline, file)


In [None]:
# Three hand-written listings used to try out the saved model,
# given as one tuple per car.
sample_rows = [
    ('Camry', 'Full', 125000, 4),
    ('Tahoe', 'Standard', 81833, 4),
    ('Hilux', 'Full', 190000, 3),
]

new_test_data = pd.DataFrame(
    sample_rows,
    columns=['Type', 'Options', 'Mileage', 'vehicle_age'],
)

In [None]:
# Linear-regression model: reload the pipeline saved earlier in this notebook

import pickle

# The file name gets its own variable so the 'LR_model' name is not overwritten
# with a string.
model_path = 'LR_model.pkl'

# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# wrote itself, never a pickle from an untrusted source.
with open(model_path, 'rb') as file:
    loaded_model = pickle.load(file)


In [None]:
# Run the reloaded pipeline on the hand-written listings
predictions = loaded_model.predict(new_test_data)

# One predicted price per line, in the order of the input rows
print("Predicted Prices:")
for predicted_price in predictions:
    print(predicted_price)