# Authenticating with Kaggle using kaggle.json

Navigate to https://www.kaggle.com. 
Then go to the [Account tab of your user profile](https://www.kaggle.com/me/account) and select Create API Token. 
This will trigger the download of `kaggle.json`, a file containing your API credentials.

Run the following cell, or just copy from the `kaggle.json` file and paste the username and key to the environment variables below.

In [1]:
#import os
#from pathlib import Path

#def kaggle_key(path: str) -> None:
    #path = Path(path).expanduser()
    
    # Does the file exist?
    #if Path.exists(path):
       #print(f"Path: {path}")
        
        # Open the JSON file
        #with open(path) as json_file:
           # data = json.load(json_file)

        # Extract the username and key
        #print(f"Username: {data['username']}")
       # print(f"Key: {data['key'][:16]}...")

        # Create environmental variables
       # os.environ['KAGGLE_USERNAME'] = data['username']
        ##os.environ['KAGGLE_KEY'] = data['key']
    
#kaggle_key("~/downloads/kaggle.json")
#kaggle_key("../kaggle.json")

In [2]:
import os

os.environ['KAGGLE_USERNAME'] = "derek560"
os.environ['KAGGLE_KEY'] = "cf1dc42c6c72669d7f1ec0a9bbad91e1"

# Used Car Price Prediction

This [data](https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data) contains 
most all relevant information that Craigslist provides on car sales including 
columns like price, condition, manufacturer, latitude/longitude, and 18 other categories.

This notebook was built from the following:

* https://www.kaggle.com/code/maciejautuch/car-price-prediction
* https://www.kaggle.com/code/hemprakashprasanna/used-car-price-prediction

# Download the dataset

In [3]:
import kaggle

!kaggle --version

Kaggle API 1.5.16


In [4]:
# Kaggle URL dataset
# https://www.kaggle.com/datasets/derek560/craigslist-carstrucks-data
#DATASET = 'derek560/craigslist-carstrucks-data'
#PATH = '/Users/jrdegbe/Desktop/DataMasters/Data_Science_Track/week-ten/day_one/mlops-project/vehicles.csv'
#FILE_NAME = '/Users/jrdegbe/Desktop/DataMasters/Data_Science_Track/week-ten/day_one/mlops-project/archive.zip'

#!kaggle datasets download $DATASET -p $PATH

In [5]:
import pandas as pd

df = pd.read_csv('/Users/jrdegbe/Desktop/DataMasters/Data_Science_Track/week-ten/day_one/mlops-project/vehicles.csv')

# Feature engineering

In [6]:
df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'description', 'county', 'state', 'lat', 'long',
       'posting_date'],
      dtype='object')

In [7]:
df

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,


## Missing Values

In [8]:
# Calculate the percentage of nulls in each of the above features.
nulls_perc = df.isna().sum()/len(df)*100
nulls_perc[nulls_perc.values>0].sort_values(ascending=False)

county          100.000000
size             71.767476
cylinders        41.622470
condition        40.785232
VIN              37.725356
drive            30.586347
paint_color      30.501078
type             21.752717
manufacturer      4.133714
title_status      1.930753
long              1.534155
lat               1.534155
model             1.236179
odometer          1.030735
fuel              0.705819
transmission      0.598763
year              0.282281
description       0.016398
image_url         0.015930
posting_date      0.015930
dtype: float64

In [9]:
# Drop 'county' and 'size' features which have more than 50% of their data missing
df.drop(['county','size'], axis = 'columns', inplace = True)

In [10]:
# Get the features which have less than 5% of their data missing
lst = nulls_perc[(nulls_perc.values>0) & (nulls_perc.values<5)].sort_values(ascending=False).index
lst

Index(['manufacturer', 'title_status', 'lat', 'long', 'model', 'odometer',
       'fuel', 'transmission', 'year', 'description', 'image_url',
       'posting_date'],
      dtype='object')

In [11]:
# Drop the rows in the above features that have missing values
for features in lst:
    df.dropna(subset=[features], inplace=True, axis='index')

In [12]:
# Re-Calculate the percentage of nulls in each of the above features
nulls_perc = df.isna().sum()/len(df)*100
nulls_perc[nulls_perc.values>0].sort_values(ascending=False)

cylinders      41.843400
condition      39.661318
VIN            36.963183
drive          29.768936
paint_color    29.260705
type           21.485474
dtype: float64

In [13]:
# Others columns with missing.
def value_counts(column: str) -> None:
    print(f"Column name: {column}")
    print(df[column].value_counts(dropna=False))
    print(" ")
    
value_counts('cylinders')
value_counts('condition')
value_counts('VIN')
value_counts('drive')
value_counts('paint_color')
value_counts('type')

Column name: cylinders
cylinders
NaN             160464
6 cylinders      84745
4 cylinders      69547
8 cylinders      64506
5 cylinders       1522
10 cylinders      1291
other              799
3 cylinders        505
12 cylinders       108
Name: count, dtype: int64
 
Column name: condition
condition
NaN          152096
good         114699
excellent     89212
like new      19946
fair           5910
new            1093
salvage         531
Name: count, dtype: int64
 
Column name: VIN
VIN
NaN                  141749
1FMJU1JT1HEA52352       261
3C6JR6DT3KG560649       235
1FTER1EH1LLA36301       231
5TFTX4CN3EX042751       227
                      ...  
1FAHP2KT2AG117594         1
1GKKVTKD4EJ233137         1
2GNAXJEV8K6267690         1
2C3CCAKG2JH160208         1
SAJGX2749VCOO8376         1
Name: count, Length: 107440, dtype: int64
 
Column name: drive
drive
4wd    120840
NaN    114160
fwd     97789
rwd     50698
Name: count, dtype: int64
 
Column name: paint_color
paint_color
NaN       11

In [14]:
# Stripping the word 'cylinders' from the 'cylinders' feature.
df['cylinders'] = df['cylinders'].replace('cylinders','',regex=True)
df['cylinders'] = df['cylinders'].str.strip()
df['cylinders'].value_counts(dropna=False)

cylinders
NaN      160464
6         84745
4         69547
8         64506
5          1522
10         1291
other       799
3           505
12          108
Name: count, dtype: int64

In [15]:
# Drop 'VIN' column which is useless.
df.drop(['VIN'], axis = 'columns', inplace = True)

In [16]:
# Fill missing values in the ratio of non-null values in the feature.
import numpy as np

def fill_missing(column: str) -> None:
    counts = df[column].value_counts(normalize=True)
    df[column] = df[column].fillna(
        pd.Series(np.random.choice(
            list(counts.index), p=list(counts.values), size=len(df)
        ))
    )
    value_counts('paint_color')

fill_missing('paint_color')

Column name: paint_color
paint_color
white     97329
black     80443
silver    54725
blue      39622
red       37981
grey      30831
NaN       11196
green      8999
custom     8407
brown      8368
orange     2422
yellow     2347
purple      817
Name: count, dtype: int64
 


In [17]:
# Dropping all the rows that contain missing values
df.dropna(axis='index', inplace=True)
df.shape

(125377, 23)

## Outliers

In [18]:
# understanding the data structure and looking at car price distribution
lower_limit = np.percentile(df[['price']], 5)
upper_limit = np.percentile(df[['price']], 95)
print(lower_limit, upper_limit)

247.80000000000018 39990.0


In [19]:
# Removing outliers
df = df[(df['price'] >= lower_limit) & (df['price'] <= upper_limit)]  

## Create new features

In [20]:
df['posting_year'] = df['posting_date'].str[0:4].astype('int64')
df['years_used'] = df['posting_year'] - df['year']

# Changing year for a smaller number.
df['year'] = df['year'].astype('int64') - 1900

## Label encoder

In [21]:
# Categorical data encoding - label enncoding
df['title_status'].unique()

array(['clean', 'rebuilt', 'salvage', 'missing', 'lien', 'parts only'],
      dtype=object)

In [22]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df['title_status'] = label_encoder.fit_transform(df['title_status'])

## Data Cleaning

In [23]:
# Checking
df['years_used'].unique()

array([  8.,   9.,   5.,   2.,  10.,   4.,   3.,  17.,  20.,   7.,   1.,
        18.,  13.,  14.,  15.,  16.,  11.,  12.,  19.,  26.,  47.,  25.,
        66.,  49.,  27.,  33.,   6.,  21.,  24.,  42.,  37.,  23.,  22.,
        43.,  45.,  30.,  68.,  58.,  53.,  28.,  31.,  39.,  73.,  35.,
        70.,  85.,  55.,  32.,  54.,  40.,  74.,  29.,  59.,  48.,  36.,
        46.,  63.,  57.,  88.,  44.,  50.,  75.,   0.,  34.,  91.,  52.,
        41.,  62.,  56.,  93.,  51.,  92.,  89.,  65.,  97.,  94.,  69.,
        82.,  38.,  79.,  64.,  98.,  87.,  81.,  60., 103.,  61.,  71.,
        83.,  72., 121.,  90.,  95.,  84.,  67.,  -1.,  86.,  80., 116.,
        96., 108.,  77.])

In [24]:
# There is a -1 value in the years_used feature. This may have happened due to some error during listing.
df = df[df.years_used > -1]

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113284 entries, 31 to 426878
Data columns (total 25 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            113284 non-null  int64  
 1   url           113284 non-null  object 
 2   region        113284 non-null  object 
 3   region_url    113284 non-null  object 
 4   price         113284 non-null  int64  
 5   year          113284 non-null  int64  
 6   manufacturer  113284 non-null  object 
 7   model         113284 non-null  object 
 8   condition     113284 non-null  object 
 9   cylinders     113284 non-null  object 
 10  fuel          113284 non-null  object 
 11  odometer      113284 non-null  float64
 12  title_status  113284 non-null  int64  
 13  transmission  113284 non-null  object 
 14  drive         113284 non-null  object 
 15  type          113284 non-null  object 
 16  paint_color   113284 non-null  object 
 17  image_url     113284 non-null  object 
 18  descript

In [26]:
# Remove others columns that will not be used.
df.drop(columns=['id', 'url', 'region', 'region_url', 
                 'image_url', 'description',
                 'lat', 'long', 'posting_date'], axis=1, inplace=True)
df.shape

(113284, 16)

In [27]:
df.describe()

Unnamed: 0,price,year,odometer,title_status,posting_year,years_used
count,113284.0,113284.0,113284.0,113284.0,113284.0,113284.0
mean,14576.234517,109.191448,114123.2,0.202041,2021.0,11.808552
std,10558.208033,9.441432,191523.0,0.89609,0.0,9.441432
min,248.0,0.0,0.0,0.0,2021.0,0.0
25%,6000.0,106.0,60935.0,0.0,2021.0,6.0
50%,10999.0,111.0,106250.0,0.0,2021.0,10.0
75%,21500.0,115.0,150000.0,0.0,2021.0,15.0
max,39990.0,121.0,10000000.0,5.0,2021.0,121.0


In [28]:
df.head(3)

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,posting_year,years_used
31,15000,113,ford,f-150 xlt,excellent,6,gas,128000.0,0,automatic,rwd,truck,black,al,2021,8.0
32,27990,112,gmc,sierra 2500 hd extended cab,good,8,gas,68696.0,0,other,4wd,pickup,black,al,2021,9.0
33,34590,116,chevrolet,silverado 1500 double,good,6,gas,29499.0,0,other,4wd,pickup,silver,al,2021,5.0


# Model fit

In [29]:
# Split features and label.
x = df.drop(columns=['price','model','state']) 
y = df[['price']] 

In [30]:
# Categorical data encoding.
x = pd.get_dummies(x)
x.shape

(113284, 95)

In [31]:
# Split the data into train and test.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42) 

In [32]:
print(f"x_train {x_train.shape}")
print(f"x_test {x_test.shape}")
print(f"y_train {y_train.shape}")
print(f"y_test {y_test.shape}")

x_train (84963, 95)
x_test (28321, 95)
y_train (84963, 1)
y_test (28321, 1)


In [33]:
# Model 1.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(x_train, y_train)
lm.score(x_train,y_train), lm.score(x_test,y_test)

(0.5851382228144646, 0.5947596615926427)

In [34]:
# Model 2.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train.values.ravel())
gnb.score(x_train,y_train), gnb.score(x_test,y_test)

(0.05620093452444005, 0.0371102715299601)

In [35]:
# Model 3.
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor()
neigh.fit(x_train, y_train)
neigh.score(x_train,y_train), neigh.score(x_test,y_test)

(0.7771969747641241, 0.6450001165369083)

In [36]:
# Model 4.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly.fit(x_train, x_test)

x_train_poly = poly.transform(x_train)
x_test_poly = poly.transform(x_test)

lm.fit(x_train_poly, y_train)
lm.score(x_train_poly,y_train), lm.score(x_test_poly, y_test)

(0.8209400327908088, 0.7479459142015579)

In [37]:
# Model 5.
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=0, max_depth=1000, 
                            min_samples_split = 18, min_impurity_decrease = 1.4)
dtr.fit(x_train, y_train.values.ravel())
dtr.score(x_train,y_train), dtr.score(x_test,y_test)

(0.9476909618253074, 0.8633044121438587)

In [38]:
# Model 6.
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(n_estimators = 250, max_features = 'sqrt', n_jobs = 20)
random_forest.fit(x_train, y_train.values.ravel())
print(random_forest.score(x_train, y_train), random_forest.score(x_test, y_test))

0.9879733235408105 0.9142391969378678


In [39]:
# Model 7.
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor(n_estimators = 200, oob_score = True, n_jobs = 10)
bagging.fit(x_train, y_train.values.ravel())
bagging.score(x_train,y_train), bagging.score(x_test,y_test)

(0.9874294045917419, 0.910042848233898)

In [40]:
# Model 8.
from sklearn.ensemble import ExtraTreesRegressor
etr = ExtraTreesRegressor(random_state=0, n_estimators = 250, max_features = None, min_samples_split = 6)
etr.fit(x_train, y_train.values.ravel())
etr.score(x_train,y_train), etr.score(x_test,y_test)

(0.9884429035360699, 0.9111410496642998)

# Choosing the next best algorithm

In [41]:
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

def create_models():
    models = []
    models.append(('Linear Regression', LinearRegression()))
    models.append(('Decision Tree Regressor', DecisionTreeRegressor()))
    models.append(('ElasticNet_Regressor', ElasticNet()))
    models.append(('Lasso_Regressor', Lasso()))
    models.append(('Ridge_Regressor', Ridge()))
    models.append(('RandomForest_Regressor', RandomForestRegressor()))
    return models

# creating a list with all the algorithms we are going to assess
models = create_models()

In [42]:
from sklearn.metrics import r2_score, mean_squared_error

for name, model in models:
    print(" ")
    print(name)
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    print('Train R2 :', r2_score(y_train, y_pred_train))
    print('Test R2 :', r2_score(y_test, y_pred_test))
    print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
    print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))


 
Linear Regression
Train R2 : 0.5851382228144646
Test R2 : 0.5947596615926427
Train RMSE : 6803.4107060806855
Test RMSE : 6712.383370179657
 
Decision Tree Regressor
Train R2 : 0.9998524550408125
Test R2 : 0.8407411662928879
Train RMSE : 128.30310471248401
Test RMSE : 4207.965626662792
 
ElasticNet_Regressor


  model = cd_fast.enet_coordinate_descent(


Train R2 : 0.4317774163811997
Test R2 : 0.4333864935791243
Train RMSE : 7962.220906210727
Test RMSE : 7937.13705183911
 
Lasso_Regressor
Train R2 : 0.58487082561573
Test R2 : 0.594320762185068
Train RMSE : 6805.602906010673
Test RMSE : 6716.017341859232
 
Ridge_Regressor
Train R2 : 0.5851354496917871
Test R2 : 0.5947299948924978
Train RMSE : 6803.4334445709765
Test RMSE : 6712.629064649607
 
RandomForest_Regressor


  return fit_method(estimator, *args, **kwargs)


Train R2 : 0.9871355617655366
Test R2 : 0.9094180178440874
Train RMSE : 1198.0374399978962
Test RMSE : 3173.520645835682


# Hyperparameter tuning for Random Forest Regressor

In [43]:
from warnings import filterwarnings
filterwarnings('ignore')

In [44]:
from sklearn.model_selection import RandomizedSearchCV

grid_parameters = {'n_estimators': [80, 90, 100, 110],'max_depth': [5, 6],
                   'max_features': [None, 'auto'], 'min_samples_split': [2, 3]}

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_distributions=grid_parameters,
    cv=5,n_iter=10,n_jobs=-1)

random_search.fit(x_train, y_train)
print(random_search.best_params_)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


{'n_estimators': 90, 'min_samples_split': 3, 'max_features': None, 'max_depth': 6}


In [45]:
model = RandomForestRegressor(n_estimators=random_search.best_params_.get('n_estimators'),
                            max_depth=random_search.best_params_.get('max_depth'),
                            min_samples_split=random_search.best_params_.get('min_samples_split'),
                            max_features=random_search.best_params_.get('max_features'),
                            random_state=1)

model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('Train R2 :', r2_score(y_train, y_pred_train))
print('Test R2 :', r2_score(y_test, y_pred_test))
print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))

Train R2 : 0.7675306116893746
Test R2 : 0.7719458835063739
Train RMSE : 5092.814570718696
Test RMSE : 5035.462615794339


# XGBoost Algorithm

In [46]:
from xgboost import XGBRegressor

model = XGBRegressor(random_state=1).fit(x_train,y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('Train R2 :', r2_score(y_train, y_pred_train))
print('Test R2 :', r2_score(y_test, y_pred_test))
print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))

Train R2 : 0.9049943622498345
Test R2 : 0.887458861425053
Train RMSE : 3255.739631814906
Test RMSE : 3537.3340158135916


# Hyperparameter Tuning for XGBoost Algorithm

In [47]:
tuning_params = {'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6],
                 'max_depth':range(3,10),
                 'gamma':[0,1,2,3,4]}
xgb_search = RandomizedSearchCV(
    estimator=XGBRegressor(),
    param_distributions=tuning_params,
    cv=5,n_iter=10,n_jobs=1)

xgb_search.fit(x_train, y_train)
print(xgb_search.best_params_)

In [None]:
model = XGBRegressor(learning_rate=0.6,max_depth=9,gamma=4).fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('Train R2 :', r2_score(y_train, y_pred_train))
print('Test R2 :', r2_score(y_test, y_pred_test))
print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))

# Save Dataframe

In [None]:
df.to_parquet('/Users/jrdegbe/Desktop/DataMasters/Data_Science_Track/week-ten/day_one/mlops-project/models', index=False)