# Car Price Prediction
![](https://cdn.dribbble.com/users/2374064/screenshots/4732016/car-jump.gif)

**In this notebook we work with the car price data set: we clean the data, explore it, and build ML models that predict car prices from it.
Hope you enjoy it!**

# Import libs

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

[Image source: Car Jump GIF animation on Dribbble](http://www.google.fr/url?sa=i&url=https%3A%2F%2Fdribbble.com%2Fshots%2F4732016-Car-Jump-GIF-animation&psig=AOvVaw0y8Jb3aa7GZgVA2p5LkOjj&ust=1667523195660000&source=images&cd=vfe&ved=0CAoQjRxqFwoTCLimz6fmkPsCFQAAAAAdAAAAABAF)

# Data discovery

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the raw car-price table and preview its first five rows (head() defaults to 5).
data_set = pd.read_csv('/kaggle/input/car-price-prediction-challenge/car_price_prediction.csv')
data_set.head()

In [None]:
# Print the (rows, columns) shape of the raw data frame
print('the shape of the car_price_prediction table is : ', data_set.shape)

In [None]:
# List the column names of the raw table
data_set.columns

In [None]:
# Per-column dtype and non-null count
data_set.info()

In [None]:
# Pie chart of how many columns there are of each dtype
data_set.dtypes.value_counts().plot.pie()

In [None]:
# Summary statistics for a hand-picked set of columns from the raw table
num_col = ['Price', 'Prod. year', 'Cylinders', 'Airbags']
data_set[num_col].describe()

# Data Cleaning/ Pre-Processing/ Exploratory Data Analysis

In [None]:
# Work on a copy so that data_set keeps the raw values while `data` is cleaned
data = data_set.copy()
data.columns

In [None]:
# The ID field is not used as a feature, so it is dropped
data = data.drop(columns=['ID'])
data.head(3)

In [None]:
# Replace spaces and dots in column names with underscores so they are easier to work with
data = data.rename(columns={
    'Prod. year': 'Prod_year',
    'Leather interior': 'Leather_interior',
    'Fuel type': 'Fuel_type',
    'Engine volume': 'Engine_volume',
    'Gear box type': 'Gear_box_type',
    'Drive wheels': 'Drive_wheels',
})

In [None]:
# Remove duplicated rows, if there are any
data = data.drop_duplicates()
data.shape  # the new shape recorded by the author is (15725, 17)

In [None]:
# Count the missing values in each column of the working frame
missing = data.isna().sum()
missing

In [None]:
# Preview the first rows of the working frame
data.head()

In [None]:
# The preview above shows empty cells in the 'Levy' column. The author checked the other
# columns one by one and found that 'Levy' is the only column with empty cells.
data['Levy'].value_counts()
# The author counted 5819 such rows: too many to delete, so they are handled by imputation instead.

**Levy column**

In [None]:
# Turn the '-' placeholder into a missing value and store Levy as float.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on data['Levy']: the in-place form mutates an
# intermediate Series and is not guaranteed to update `data`
# (it has no effect under pandas Copy-on-Write).
data['Levy'] = data['Levy'].replace({'-': np.nan}).astype('float64')

In [None]:
# Heat map of the missing-value mask of `data` (no colour bar)
fig, ax = plt.subplots(dpi=120)
sns.heatmap(data.isna(), cbar=False, ax=ax)

**Price column**

In [None]:
# Many cars are listed at $500 or less, which is implausible,
# so only rows with a price above $500 are kept.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
data = data[data['Price'] > 500].copy()
data.shape  # the author recorded (14799, 17)

**Engine_volume column**

In [None]:
# Engine_volume holds numbers stored as text, some of them with a "Turbo" marker.
# Split it into a boolean Turbo_engine flag and a float volume.
engine_text = data['Engine_volume']
data['Turbo_engine'] = engine_text.str.contains('Turbo')
# Extract the leading number whatever its length. Slicing the first three
# characters would silently truncate longer values (e.g. "1.25" -> 1.2).
data['Engine_volume'] = (
    engine_text.str.extract(r'^\s*(\d*\.?\d+)', expand=False).astype('float64')
)

**Mileage column**

In [None]:
# Rename Mileage to Mileage/km, strip the 'k'/'m' unit characters from the text
# and store the remaining number as an integer
data = data.rename(columns={'Mileage': 'Mileage/km'})
data['Mileage/km'] = data['Mileage/km'].str.strip('km').astype('int64')

**Cylinders column**

In [None]:
# Cylinder counts are whole numbers, so store the column as integers instead of floats
data = data.astype({'Cylinders': 'int64'})

**Doors column**

In [None]:
# Doors contains mangled values such as '04-May' and '02-Mar' (and '>5'):
# map them to plain door counts and store the column as integers.
# The result is assigned back instead of using replace(..., inplace=True) on
# data['Doors'], which mutates an intermediate Series and is not guaranteed
# to update `data` (it has no effect under pandas Copy-on-Write).
data['Doors'] = (
    data['Doors']
    .replace({'04-May': 4, '02-Mar': 2, '>5': 5})
    .astype('int64')
)

**Create the categorical variables set**

In [None]:
# Sub-frame holding the object/boolean typed columns, with a two-row preview
categorical_col = data.select_dtypes(include=['object', 'boolean'])
categorical_col.head(2)

**Create the numerical variable set**

In [None]:
# Sub-frame holding the integer/float typed columns, with a two-row preview
numerical_col = data.select_dtypes(include=['int', 'float'])
numerical_col.head(2)

In [None]:
# One bar chart per categorical column: Price aggregated per category
# (seaborn's barplot uses its default estimator)
for col in categorical_col.columns:
    fig, ax = plt.subplots(figsize=(20, 6))
    sns.barplot(x=data[col], y=data['Price'], ax=ax)

In [None]:
# One scatter plot per numerical column, with Price on the y-axis
for col in numerical_col.columns:
    fig, ax = plt.subplots(figsize=(20, 6))
    sns.scatterplot(x=data[col], y=data['Price'], ax=ax)

# Pipeline

In [None]:
# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)

# Hold out 20% of the rows as a validation subset (fixed seed for reproducibility)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Categorical features: text or boolean columns with fewer than 10 distinct
# values, i.e. few enough to one-hot encode.
# The dtype is tested with pandas' helpers: a NumPy bool dtype never compares
# equal to the string 'boolean', so that comparison silently excluded the
# boolean Turbo_engine column from the features.
categorical_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10
    and (pd.api.types.is_object_dtype(X_train_full[cname])
         or pd.api.types.is_bool_dtype(X_train_full[cname]))
]

# Numerical features: every int64 / float64 column
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only the selected columns (high-cardinality text columns are left out)
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [None]:
# Show the selected categorical feature names
categorical_cols  # the author's run listed: ['Leather_interior', 'Fuel_type', 'Gear_box_type', 'Drive_wheels', 'Wheel']

In [None]:
# Show the selected numerical feature names
numerical_cols  # the author's run listed: ['Levy','Prod_year', 'Engine_volume','Mileage/km','Cylinders','Doors','Airbags']

In [None]:
# Numerical columns: fill missing values with a constant
# (no fill_value is given, so SimpleImputer's default is used)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps unseen categories from raising
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
# Baseline model: a random forest with 100 trees and a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [None]:
# Chain the preprocessing and the model into a single pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', model)])

# Fit the preprocessing and the model on the training split
my_pipeline.fit(X_train, y_train)

# Preprocess the validation split and predict its prices
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('The mean absolute error is:', score) 

# Predict once more from the full validation frame (the model is not refitted,
# and this is the validation split, not a separate test set).
# NOTE(review): the preprocessor is given its columns by name, so this presumably
# reproduces `preds` — confirm.
preds_test = my_pipeline.predict(X_valid_full) 

In [None]:
# Side-by-side table: 'Price' holds the predictions, 'real_price' the actual validation prices
output = pd.DataFrame({'Price': preds_test, 'real_price': y_valid})
output

# Evaluate many RandomForest models

In [None]:
# Candidate random forests that differ in number of trees, split criterion,
# minimum split size and maximum depth (all with the same seed)
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Collected in a list so they can be scored in a loop
models = [model_1, model_2, model_3, model_4, model_5]


def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit `model` behind the shared preprocessor and return its validation MAE.

    Parameters
    ----------
    model : estimator used as the final 'model' step of the pipeline.
    X_t, y_t : training features and target. The defaults are the notebook's
        training split as it exists when this function is defined.
    X_v, y_v : validation features and target, with the same kind of default.

    Returns
    -------
    The mean absolute error of the predictions for `X_v` against `y_v`.
    """
    candidate_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    # Fit and predict with the arguments (not the global split), so that a
    # caller-supplied data set is actually used.
    candidate_pipeline.fit(X_t, y_t)
    predictions = candidate_pipeline.predict(X_v)
    return mean_absolute_error(y_v, predictions)

# Score every candidate in turn; the %d format prints the MAE as a whole number
for position, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("The Model %d has a MAE: %d" % (position, mae))

In [None]:
# Build a second pipeline around model_3
my_pipeline_2 = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model_3)])

# Fit the preprocessing and model_3 on the training split
my_pipeline_2.fit(X_train, y_train)

# Predict the validation prices with my_pipeline_2 (the pipeline fitted just
# above), not with my_pipeline, which wraps the first model.
preds_2 = my_pipeline_2.predict(X_valid)

In [None]:
# Evaluate the second model on the validation split
score_2 = mean_absolute_error(y_valid, preds_2)
print('MAE:', score_2)
# Predict from the full validation frame with my_pipeline_2 (the model_3
# pipeline); my_pipeline would return the first model's predictions instead.
preds_test_2 = my_pipeline_2.predict(X_valid_full)

In [None]:
# Side-by-side table: 'Price' holds preds_test_2, 'real_price' the actual validation prices
output_2 = pd.DataFrame({'Price': preds_test_2, 'real_price': y_valid})
output_2

# XGBoost

In [None]:
# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)

# Hold out 20% of the rows as a validation subset (done once, fixed seed)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Text columns with fewer than 10 distinct values: few enough to one-hot encode
low_cardinality_cols = [cname for cname in X_train_full.columns
                        if X_train_full[cname].nunique() < 10
                        and X_train_full[cname].dtype == "object"]

# Numeric and boolean columns are used as they are. Boolean columns are
# detected with is_bool_dtype: a NumPy bool dtype never compares equal to the
# string 'boolean', so that comparison silently dropped Turbo_engine.
numeric_cols = [cname for cname in X_train_full.columns
                if X_train_full[cname].dtype in ['int64', 'float64']
                or pd.api.types.is_bool_dtype(X_train_full[cname])]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# One-hot encode both splits, then align the validation columns to the
# training columns (left join on the column axis)
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)


In [None]:
# First model: XGBoost regressor with its default hyper-parameters
Price_model_1 = XGBRegressor(random_state=0)
Price_model_1.fit(X_train, y_train)

# Predict the validation prices and report the mean absolute error
predictions_1 = Price_model_1.predict(X_valid)
mae_1 = mean_absolute_error(y_true=y_valid, y_pred=predictions_1)
print("Mean Absolute Error:" , mae_1)

In [None]:
# Second model: 100 boosting rounds with a learning rate of 0.1
Price_model_2 = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0)
Price_model_2.fit(X_train, y_train)

# Predict the validation prices and report the mean absolute error
predictions_2 = Price_model_2.predict(X_valid)
mae_2 = mean_absolute_error(y_true=y_valid, y_pred=predictions_2)
print("Mean Absolute Error:" , mae_2)

In [None]:
# Third model: 87 boosting rounds with a learning rate of 0.01
Price_model_3 = XGBRegressor(n_estimators=87, learning_rate=0.01, random_state=0)
Price_model_3.fit(X_train, y_train)

# Predict the validation prices and report the mean absolute error
predictions_3 = Price_model_3.predict(X_valid)
mae_3 = mean_absolute_error(y_true=y_valid, y_pred=predictions_3)
print("Mean Absolute Error:" , mae_3)