# **Import Library**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import scipy.stats as st
import math
import datetime
import missingno as msno
from scipy.stats import norm, skew
from sklearn import metrics
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from sklearn import model_selection
from sklearn.pipeline import make_pipeline

from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, precision_score, r2_score, mean_absolute_error, mean_squared_error, log_loss
import matplotlib.pyplot as plt
from termcolor import colored
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/seaborn/sklearn.
import warnings
warnings.filterwarnings("ignore")

# Make estimator reprs list all hyperparameters, not only the non-default ones
from sklearn import set_config
set_config(print_changed_only = False)

# Display at most 15 columns before pandas truncates a wide frame
pd.set_option('display.max_columns', 15)

# **Load Dataset**

In [None]:
# Read the raw listings; `car` is kept untouched while `df` is the working
# copy that the following cells clean and transform.
car = pd.read_csv("../input/vehicle-dataset-from-cardekho/Car details v3.csv")
df = car.copy()
df.head()

In [None]:
# Report the dimensions of the working frame
print(f"Rows: {df.shape[0]}, columns: {df.shape[1]}")

In [None]:
df.info()

There are null values in 'mileage' (221), 'engine' (221), 'max_power' (215), 'torque' (222), 'seats' (221) variables.

The dataset consists of 8128 rows and 13 columns.

Of the variables, 9 are of object type, 3 are integer and 1 is float.

# **Convert values of the columns and change dtypes**

In [None]:
# Cast the low-cardinality text columns to the 'category' dtype in one pass

categoric_columns = ["fuel", "transmission", "owner", "seller_type"]
df = df.astype({name: "category" for name in categoric_columns})

In [None]:
# Create the 'car_brand_name' feature: the first whitespace-delimited token of 'name'.
# The pattern is a raw string (a plain '\s' is an invalid escape sequence) and
# expand=False makes extract() return a Series instead of a one-column DataFrame.

df["car_brand_name"] = (
    df["name"].str.extract(r"(\S+)", expand = False).astype("category")
)

In [None]:
# Keep only the leading numeric token of 'engine' and 'mileage' (the unit suffix is dropped).
# Raw-string pattern avoids the invalid '\s' escape; expand=False yields a Series.

df["engine"] = df["engine"].str.extract(r"(\S+)", expand = False).astype("float")
df["mileage"] = df["mileage"].str.extract(r"(\S+)", expand = False).astype("float")

In [None]:
# Keep only the leading token of 'max_power' and convert it to a number.
# Some rows carry only the unit ("bhp") with no value; errors="coerce" turns
# those (and any other non-numeric token) into NaN so they are imputed later
# instead of breaking the float conversion.

df["max_power"] = pd.to_numeric(
    df["max_power"].str.extract(r"(\S+)", expand = False),
    errors = "coerce",
).astype("float")

In [None]:
#create 'car_age' feature from 'year' column
# NOTE(review): the age is measured against the year in which the notebook is
# executed, so the feature (and every number derived from it) shifts when the
# notebook is re-run in a later year — consider a fixed reference year.

df["car_age"] = (datetime.datetime.now().year) - (df["year"])

In [None]:
# 'name' and 'year' have been replaced by derived features; 'torque' is not used
df = df.drop(columns = ["name", "year", "torque"])

In [None]:
df.head()

In [None]:
df.info()

# **Get additional information about the dataset**

In [None]:
# check whether there are null values in the dataset
df.isnull().sum()

In [None]:
df.describe().T

The oldest car was produced in 1983 (age = 39), and the newest car was produced in 2020 (age = 2)   
Minimum selling price is 29999 USD, maximum price is 10000000 USD, and average selling price is 638271 USD    
The driving distance of the least driven car is 1 km, the most driven car's driving distance is 2360457 km,
average driving distance is 69819 km
The number of seats ranges from 2 to 14
Minimum mileage is 0, maximum mileage is 42, average mileage is 19.4  
Engine volume changes from 624 to 3604, average is 1458

In [None]:
df.describe(include = "category").T

About categoric variables

● Car brand name with highest frequency: Maruti (freq = 2448)

● Fuel kind with highest frequency: Diesel (freq = 4402)    
    
● Seller type with highest frequency: Individual (freq = 6766)  

● Transmission type with highest frequency: Manual (freq = 7078)                                                                         
    
● Owner type with highest frequency: First owner (freq = 5289)

# **Visualize missing values and fill them**

In [None]:
msno.matrix(df);

In [None]:
#fill null values with median (numeric) and frequent values (categoric)

numeric_data = [column for column in df.select_dtypes(["int", "float"])]
categoric_data = [column for column in df.select_dtypes(exclude = ["int", "float"])]

# The filled column is assigned back explicitly. Calling fillna(inplace=True)
# on the df[col] selection is a chained in-place operation: it is deprecated
# and, with pandas Copy-on-Write enabled, leaves `df` unchanged.
for col in numeric_data:
    df[col] = df[col].fillna(df[col].median())

#replace missing values in each categorical column with the most frequent value
for col in categoric_data:
    df[col] = df[col].fillna(df[col].value_counts().index[0])

In [None]:
#check null values again

df.isnull().sum().sum()

 About filling nulls
    
    
    
We filled null values with medians of numeric variables and the most frequent values of categoric variables


In [None]:
# Print how often each class occurs for a few categorical variables

for feature in ("transmission", "seller_type", "owner", "car_brand_name"):
    print(f"Class frequencies of '{feature}' variable: \n\n", df[feature].value_counts())

In [None]:
#check correlation between the numeric variables of the dataset
# The category columns are excluded explicitly: recent pandas versions no longer
# drop non-numeric columns silently in corr() and raise instead.

df.select_dtypes("number").corr()

# **Visualizations and Exploratory Data Analysis**

### Boxplot

In [None]:
# One horizontal boxplot per variable, side by side
fig, axes = plt.subplots(1, 3, figsize = (30, 7))

for axis, feature in zip(axes, ["selling_price", "km_driven", "mileage"]):
    sns.boxplot(ax = axis, x = feature, data = df, width = 0.5, fliersize = 3, linewidth = 1);

In [None]:
# 2x2 grid of boxplots for the remaining numeric variables
fig, axes = plt.subplots(2, 2, figsize = (30, 20))
axes = axes.flatten()

for axis, feature in zip(axes, ["engine", "max_power", "seats", "car_age"]):
    sns.boxplot(ax = axis, x = feature, data = df, width = 0.5, fliersize = 3, linewidth = 1);

#### Histogram

In [None]:
df.hist(figsize = (30, 25), bins = 30, legend = False)
plt.show()

As can be seen from the graphs above, the distributions of the variables (except the 'mileage' variable) are not normal.

#### Catplot

In [None]:
sns.catplot(x = "seller_type",
            y = "selling_price",
            kind = "boxen",
            height = 7,
            aspect = 1,
            color = "#671A76",
            data = df).set(title = "Sale prices of the cars by seller type");

In [None]:
sns.catplot(x = "fuel",
            y = "selling_price",
            kind = "strip",
            hue = "seller_type",
            height = 7,
            aspect = 1.4,
            color = "#661E1D",
            data = df).set(title = "Sale prices of the cars by fuel type");

In [None]:
sns.catplot(x = "owner",
            y = "selling_price",
            kind = "boxen",
            height = 7,
            aspect = 1.37,
            color = "#F0F312",
            data = df).set(title = "Sale prices of the cars by owner");

#### Boxplot

In [None]:
# Selling-price distribution per car brand
price = df["selling_price"]
brand = df["car_brand_name"]
dff = pd.concat([price, brand], axis = 1)
f, ax = plt.subplots(figsize = (50, 30))
# x/y are passed by keyword (positional data arguments are rejected by
# seaborn >= 0.12) and the plot is bound to the axes created above.
fig = sns.boxplot(x = "car_brand_name", y = "selling_price", data = dff, ax = ax);

In [None]:
# Class-frequency bar chart for every categorical variable, one panel each
fig, axes = plt.subplots(1, 5, figsize = (50, 10))

for axis, feature in zip(axes, ["owner", "seller_type", "fuel", "transmission", "car_brand_name"]):
    sns.barplot(ax = axis, x = df[feature].value_counts().index, y = df[feature].value_counts(),
                saturation = 1).set(title = f"Frequency of classes of the '{feature}' variable");

In [None]:
plt.figure(figsize = (15, 8))
sns.barplot(x = "transmission", y = "selling_price", hue = "owner", data = df, saturation = 1);

In [None]:
plt.figure(figsize = (15, 8))
sns.barplot(x = "fuel", y = "selling_price", hue = "seller_type", data = df, saturation = 1);

In [None]:
plt.figure(figsize = [8, 8], clear = True, facecolor = "#FFFFFF")
df["fuel"].value_counts().plot.pie(explode = [0.1, 0.1, 0.2, 0.3], autopct='%1.3f%%', shadow = True);

In [None]:
plt.figure(figsize = [8, 8], clear = True, facecolor = "#FFFFFF")
df["owner"].value_counts().plot.pie(explode = [0.1, 0.1, 0.1, 0.1, 0.1], autopct='%1.3f%%', shadow = True);

In [None]:
sns.displot(data = df, x = "selling_price", hue = "owner", kind = "kde", height = 6,
            aspect = 1.3, clip=(0, None), palette="ch:rot=-.25, hue = 2, light=.20"
).set(title = "density of the classes of 'owner' variable by 'selling price' ");

In [None]:
sns.displot(
    data = df, x = "selling_price", hue = "owner",
    kind = "ecdf", height = 5, aspect = 1.8).set(title =  "density relationship between 'selling_price' and 'owner' variables");

sns.displot(
    data = df, x = "car_age", hue = "fuel",
    kind = "kde", height = 5, aspect = 1.8, multiple="fill").set(title = "density relationship between 'car_age' and 'fuel' variables");

sns.displot(
    data = df, x = "km_driven", hue = "owner",
    kind = "kde", height = 5, aspect = 1.8, multiple="fill").set(title = "density relationship between 'km_driven' and 'owner' variables");

#### Correlation and heatmap

In [None]:
plt.figure(figsize = [40, 20], facecolor = "#F7F4F4")
# Correlate numeric columns only: corr() raises on the category columns in recent pandas versions
sns.heatmap(df.select_dtypes("number").corr(), annot = True, linewidths = 2, linecolor = "white", cmap = "viridis");

In [None]:
# Numeric columns only: corr() raises on the category columns in recent pandas versions
df.select_dtypes("number").corr().style.background_gradient(cmap = "binary")

As seen there is high correlation between some of the variables:
* between 'mileage' and 'engine' variables - -0.57
* between 'max_power' and 'engine' variables - 0.70
* between 'seats' and 'engine' variables - 0.61
* between 'max_power' and 'selling_price' variables - 0.74
    
There is a moderate level of correlation between some of the other variables too.

## **Fix skewness of the variables**

### Target variable

In [None]:
print("Basic descriptive statistics of the target variable - 'selling_price': \n\n",
      df["selling_price"].describe())

## **Basic statistics of the target variable - 'selling_price'**
* average of the target variable is 638271 USD

* standard deviation of the target variable is 806253 USD

* minimum of the target variable is 29999 USD

* maximum of the target variable is 10000000 USD

* median of the target variable is 450000 USD

# Skew and Kurt

In [None]:
print("Skewness of target variable: ", df["selling_price"].skew())
print("Kurtosis of target variable: ", df["selling_price"].kurt())

 **Positive skewness and high kurtosis**
    
* Positive skewness, more weight is on the left side of the distribution.
    
* Kurtosis is greater than 3. It is leptokurtic.

In [None]:
sns.set(rc = {"figure.figsize" : (12, 7)})
sns.distplot(df["selling_price"], bins = 100, color = "red");

 What does the graph show?
    
The graph above shows positive skewness: more weight is on the left side of the distribution. We will try to fix it using the "log1p" function of numpy.

In [None]:
# Replace the target with log(1 + price) to reduce its right skew.
# NOTE(review): this overwrites the column in place, so re-running the cell
# (or any later blanket transform over numeric columns) logs the target again.
df["selling_price"] = np.log1p(df["selling_price"])
df["selling_price"].head(n = 10)

In [None]:
sns.distplot(df["selling_price"], fit = norm, color = "green");

 What does the graph show?
    
As seen from the graph above, the log transform reduced the skewness, and the distribution of the target variable is now close to normal.

In [None]:
# get skewness of other numeric variables

numeric_data = [column for column in df.select_dtypes(["int", "float"])]
for col in numeric_data:
    print("Skewness of", col, "variable is:", df[col].skew())

In [None]:
# fix skewness of the remaining numeric variables with 'log1p' function
# 'selling_price' is skipped: it was already log1p-transformed above and is
# still listed in numeric_data, so transforming it here would log the target twice.

for c in numeric_data:
    if c == "selling_price":
        continue
    df[c] = np.log1p(df[c])

**Skewness of the other variables reduced**
    
We applied the `log1p` transform to the other numeric variables to reduce their skewness. So let's continue...

In [None]:
df.head()

# **Split and encode the dataset**

In [None]:
# select dependent variable (label)

y = df["selling_price"]

In [None]:
# select independent variable (estimator)
x = df.drop("selling_price", axis = 1)

#encode the variables of the dataset
x = pd.get_dummies(x, drop_first = True)

In [None]:
y.shape, x.shape

In [None]:
x.head()

In [None]:
# Split the dataset into train / validation / test sets:
# 80% goes to training, and the held-out 20% is then halved into
# validation and test parts (10% of the data each).

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=1)
print(x_train.shape)
print(x_test.shape)
print(x_val.shape)

#### Robust Scaling

In [None]:
# Fit the scaler on the training split only, then apply it to every split
rob_scaler = RobustScaler().fit(x_train)
x_train_scaled, x_test_scaled, x_val_scaled = (
    rob_scaler.transform(split) for split in (x_train, x_test, x_val)
)

In [None]:
# Wrap the scaled arrays back into labelled frames
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train.columns)
x_val_scaled_df = pd.DataFrame(x_val_scaled, columns=x_val.columns)
x_test_scaled_df = pd.DataFrame(x_test_scaled, columns=x_test.columns)

# Attach the target; its index is reset so it lines up with the fresh 0..n-1 index
train_data = x_train_scaled_df.assign(selling_price=y_train.reset_index(drop=True))
val_data = x_val_scaled_df.assign(selling_price=y_val.reset_index(drop=True))
test_data = x_test_scaled_df.assign(selling_price=y_test.reset_index(drop=True))

# Persist each split next to the notebook
for split_frame, filename in (
    (train_data, 'train_data.csv'),
    (val_data, 'val_data.csv'),
    (test_data, 'test_data.csv'),
):
    split_frame.to_csv(filename, index=False)

# **Predicting with machine learning models**

### Define cross validation metrics and setup kfold

In [None]:
k_fold = KFold(n_splits = 10, random_state = 11, shuffle = True)

In [None]:
def cv_rmse(model, X = x_train):
    """Return the per-fold cross-validated RMSE of `model`.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X : feature matrix to evaluate on (defaults to `x_train`). Its rows must
        line up with the global `y_train`, which is always used as the target.

    Returns
    -------
    Array with one RMSE value per fold of the global `k_fold` splitter.
    """
    # Use the X argument; previously it was ignored and the global x_train was
    # always scored, so e.g. a scaled matrix passed by the caller had no effect.
    rmse = np.sqrt(-cross_val_score(model, X, y_train, scoring = "neg_mean_squared_error", cv = k_fold))
    return rmse

def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between `y` and `y_pred`.

    The root is taken exactly once here. Combining `squared = False` with
    `np.sqrt` produced the fourth root of the MSLE, and the `squared` argument
    is no longer accepted by recent scikit-learn releases.
    """
    return np.sqrt(mean_squared_log_error(y, y_pred))

### Building ML models

In [None]:
from sklearn.ensemble import StackingRegressor
# Base learners that are evaluated individually and also combined in the stack
knn = KNeighborsRegressor(n_neighbors=5)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 1)
lg = LinearRegression()
# LassoCV selects its alpha from this candidate grid using the shared k_fold splitter
lasso = LassoCV(alphas = [1e-10, 1e-8, 1e-7, 1e-5, 1e-2, 9e-4, 9e-3, 5e-4, 3e-4, 1e-4, 1e-3, 1e-2, 0.1, 0.3, 0.6, 1, 3, 5, 7, 14, 18, 25, 30, 45, 50, 70, 90], n_jobs = -1, cv = k_fold)
# (name, estimator) pairs in the format StackingRegressor expects
estimators = [
    ('knn', knn),
    ('rf', rf),
    ('lg', lg),
    ('lasso', lasso)
]
# The base models' predictions are blended by a linear regression meta-model
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())


In [None]:
def mean_biased_error(y_true, y_pred):
    """Return the mean bias error: the average of (prediction - truth).

    A positive value means the model over-predicts on average, a negative
    value that it under-predicts.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and two pandas Series are compared position by position rather
    than being re-aligned on their index labels.
    """
    predicted = np.asarray(y_pred, dtype = float)
    actual = np.asarray(y_true, dtype = float)
    return np.mean(predicted - actual)

In [None]:
lg.fit(x_train_scaled, y_train)
y_pred_lg = lg.predict(x_test_scaled)

# Report the test-set metrics of the linear regression model
mse_lg = mean_squared_error(y_test, y_pred_lg)
for label, value in [
    ('Mean Biased Error:', mean_biased_error(y_test, y_pred_lg)),
    ('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_lg)),
    ('Mean Squared Error:', mse_lg),
    ('Root Mean Squared Error:', np.sqrt(mse_lg)),
    ('R-squared:', r2_score(y_test, y_pred_lg)),
]:
    print(label, value)

In [None]:
rf = rf.fit(x_train_scaled, y_train)
y_pred_rf = rf.predict(x_test_scaled)

# Report the test-set metrics of the random forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
for label, value in [
    ('Mean Biased Error:', mean_biased_error(y_test, y_pred_rf)),
    ('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_rf)),
    ('Mean Squared Error:', mse_rf),
    ('Root Mean Squared Error:', np.sqrt(mse_rf)),
    ('R-squared:', r2_score(y_test, y_pred_rf)),
]:
    print(label, value)

In [None]:
knn = knn.fit(x_train_scaled, y_train)
y_pred_knn = knn.predict(x_test_scaled)

# Report the test-set metrics of the k-nearest-neighbours model
mse_knn = mean_squared_error(y_test, y_pred_knn)
for label, value in [
    ('Mean Biased Error:', mean_biased_error(y_test, y_pred_knn)),
    ('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_knn)),
    ('Mean Squared Error:', mse_knn),
    ('Root Mean Squared Error:', np.sqrt(mse_knn)),
    ('R-squared:', r2_score(y_test, y_pred_knn)),
]:
    print(label, value)

In [None]:
lasso = lasso.fit(x_train_scaled, y_train)
y_pred_lasso = lasso.predict(x_test_scaled)

# Report the test-set metrics of the lasso model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
for label, value in [
    ('Mean Biased Error:', mean_biased_error(y_test, y_pred_lasso)),
    ('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_lasso)),
    ('Mean Squared Error:', mse_lasso),
    ('Root Mean Squared Error:', np.sqrt(mse_lasso)),
    ('R-squared:', r2_score(y_test, y_pred_lasso)),
]:
    print(label, value)

In [None]:
stacking_regressor = stacking_regressor.fit(x_train_scaled, y_train)
y_pred_stacking = stacking_regressor.predict(x_test_scaled)

# Report the test-set metrics of the stacked ensemble
mse_stacking = mean_squared_error(y_test, y_pred_stacking)
for label, value in [
    ('Mean Biased Error:', mean_biased_error(y_test, y_pred_stacking)),
    ('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_stacking)),
    ('Mean Squared Error:', mse_stacking),
    ('Root Mean Squared Error:', np.sqrt(mse_stacking)),
    ('R-squared:', r2_score(y_test, y_pred_stacking)),
]:
    print(label, value)

### Make prediction on test dataset

In [None]:
# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Score one set of predictions and return the tuple (R^2, MAE, MSE, MBE)."""
    scorers = (r2_score, mean_absolute_error, mean_squared_error, mean_biased_error)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

# Score every model on the test split
metrics_lasso = calculate_metrics(y_test, y_pred_lasso)
metrics_linear = calculate_metrics(y_test, y_pred_lg)
metrics_rf = calculate_metrics(y_test, y_pred_rf)
metrics_knn = calculate_metrics(y_test, y_pred_knn)
metrics_stacking = calculate_metrics(y_test, y_pred_stacking)

# Display names and metric labels, in the same order as the tuples above
models = ['Lasso', 'Linear Regression', 'Random Forest', 'KNN', 'Ensemble']
metrics_names = ['R^2 Score', 'MAE', 'MSE', 'MBE']
all_metrics = [metrics_lasso, metrics_linear, metrics_rf, metrics_knn, metrics_stacking]

# Print the raw metric tuple of each model
for model_name, model_metrics in zip(models, all_metrics):
    print(f'{model_name} Metrics: {model_metrics}')

# One row per model, one column per metric, with the model name in front
metrics_df = pd.DataFrame(all_metrics, columns=metrics_names)
metrics_df.insert(0, 'Model', models)

# One bar chart per metric
for metric in metrics_names:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(metrics_df['Model'], metrics_df[metric])
    ax.set_xlabel('Models')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} for Different Models')
    plt.show()