In [93]:
import pandas as pd
import math
import psycopg2 as ps
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [157]:
cars_data = pd.read_csv('carsdata_preprocessed.csv')

In [158]:
cars_data.shape

(34007, 20)

In [159]:
cars_data.sort_values('price_ars')

Unnamed: 0.1,Unnamed: 0,id,brand,model,colour,fuel,doors,engine,location,price,year,transmision,km,type,url,currency,price_ars,neighborhood,city,province
2637,2782,2782,BMW,X6,,Nafta,4,,Tigre - Bs.As. G.B.A. Norte,155000,2018,,38000,,https://auto.mercadolibre.com.ar/MLA-113385607...,USD,155000,Tigre,Bs.As. G.B.A. Norte,Bs.As. G.B.A. Norte
25644,26519,26519,Mercedes Benz,Clase C,Blanco,Nafta,4,3.0,Nordelta Bahia Grande - Tigre - Bs.As. G.B.A. ...,155000,2021,Automatica,2000,Sedan,https://auto.mercadolibre.com.ar/MLA-113668318...,USD,155000,Nordelta Bahia Grande,Tigre,Bs.As. G.B.A. Norte
24888,25672,25672,Land Rover,Range Rover Velar,Gris,Nafta,5,V6 3.0 380HP,Belgrano - Capital Federal - Capital Federal,155000,2018,Automatica,9500,SUV,https://auto.mercadolibre.com.ar/MLA-112980493...,USD,155000,Belgrano,Capital Federal,Capital Federal
10604,11005,11005,Land Rover,VELAR,Plateado,Nafta,5,V6 3.0 380HP,Vicente Lopez - Vicente Lopez - Bs.As. G.B.A. ...,157000,2018,Automatica,8900,SUV,https://auto.mercadolibre.com.ar/MLA-112429370...,USD,157000,Vicente Lopez,Vicente Lopez,Bs.As. G.B.A. Norte
270,280,280,Audi,Q7,Blanco,Nafta,5,3.0,Capital Federal - Capital Federal,158000,2020,Automatica,3000,SUV,https://auto.mercadolibre.com.ar/MLA-113171437...,USD,158000,Capital Federal,Capital Federal,Capital Federal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,2364,2364,BMW,X4,Negro,Nafta,5,2.0,Cordoba - Cordoba,75000,2019,Automatica,50000,SUV,https://auto.mercadolibre.com.ar/MLA-113109897...,USD,15000000,Cordoba,Cordoba,Cordoba
687,717,717,BMW,Serie 3,Gris,Nafta,5,258 cv - Linea nueva Full,La Plata - La Plata - Bs.As. G.B.A. Sur,75000,2021,Automatica secuencial,7900,Sedan,https://auto.mercadolibre.com.ar/MLA-113063716...,USD,15000000,La Plata,La Plata,Bs.As. G.B.A. Sur
9801,10151,10151,Jeep,Grand Cherokee,Gris,Nafta,5,3.6,Pilar - Pilar - Bs.As. G.B.A. Norte,75000,2018,Automatica,70000,SUV,https://auto.mercadolibre.com.ar/MLA-113636141...,USD,15000000,Pilar,Pilar,Bs.As. G.B.A. Norte
18257,18829,18829,Dodge,Challenger SRT8,Rojo,Nafta,2,6.2,Capital Federal - Capital Federal,75000,2012,Automatica,50000,Coupe,https://auto.mercadolibre.com.ar/MLA-113131624...,USD,15000000,Capital Federal,Capital Federal,Capital Federal


In [160]:
cars_data.columns

Index(['Unnamed: 0', 'id', 'brand', 'model', 'colour', 'fuel', 'doors',
       'engine', 'location', 'price', 'year', 'transmision', 'km', 'type',
       'url', 'currency', 'price_ars', 'neighborhood', 'city', 'province'],
      dtype='object')

In [161]:
features = ['brand', 'model', 'price_ars', 'colour', 'fuel',
       'engine', 'year', 'transmision', 'km', 'type',
           'city']

In [162]:
X = cars_data[features].dropna()
y = X.pop('price_ars') #we want to predict price (in ARS)

In [163]:
X.describe() #not all the features are numeric (later we will check if a encoding is usefull)

Unnamed: 0,year,km
count,18402.0,18402.0
mean,2014.867406,86620.470384
std,4.928291,62498.531334
min,1991.0,1.0
25%,2013.0,43303.5
50%,2016.0,78000.0
75%,2018.0,118623.75
max,2023.0,604000.0


In [164]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 0)


In [165]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['brand', 'model', 'colour', 'fuel', 'engine', 'transmision', 'type', 'city']


In [166]:
cars_data[object_cols]
missing_values_count = cars_data[object_cols].isnull().sum()
missing_values_count #we have many empty values, we need to clean before encoding

brand              0
model              0
colour         14710
fuel               0
engine          2228
transmision     1966
type            2155
city               0
dtype: int64

We face  a hard decision here, since color is an important feature, but half of the data does not have this value. We can drop the column for the model and make 2 separate models, once with color and once without.

## Note: The model works better if we use the color column and drop the rows that do not have that value. 
From 500K to 427K in MAE

###### As there is no ordinal values in this data, we proceed with one hot encoding

In [167]:
def encoder(X_train, X_valid):  
    # Apply one-hot encoder to each column with categorical data
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
    OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

    # One-hot encoding removed index; put it back
    OH_cols_train.index = X_train.index
    OH_cols_valid.index = X_valid.index

    # Remove categorical columns (will replace with one-hot encoding)
    num_X_train = X_train.drop(object_cols, axis=1)
    num_X_valid = X_valid.drop(object_cols, axis=1)

    # Add one-hot encoded columns to numerical features
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
    return OH_X_train, OH_X_valid

# print("MAE (One-Hot Encoding):") 
# print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

In [168]:
OH_X_train, OH_X_valid = encoder(X_train, X_valid)
model = RandomForestRegressor(random_state=1)
model.fit(OH_X_train, y_train)

RandomForestRegressor(random_state=1)

In [169]:
print("Making predictions for the following 5 cars:")
print(X[500:505])
print("The predictions are")
print(model.predict(OH_X_valid[500:505]))

val_predictions = model.predict(OH_X_valid)
print(mean_absolute_error(y_valid, val_predictions))

Making predictions for the following 5 cars:
    brand model    colour   fuel engine  year transmision     km       type  \
914  Audi    S3      Gris  Nafta    2.0  2016  Automatica  55000  Hatchback   
915  Audi    A4      Gris  Nafta    2.0  2018  Automatica  44500      Sedan   
916  Audi    Q3      Gris  Nafta    2.0  2014      Manual  66200        SUV   
917  Audi    Q2      Gris  Nafta    1.0  2020  Automatica  11000        SUV   
918  Audi    A4  Plateado  Nafta    2.0  2018  Automatica  44600      Sedan   

                  city  
914        Hurlingham   
915   Capital Federal   
916             Tigre   
917   Capital Federal   
918   Capital Federal   
The predictions are
[2994700.   1655700.   5478872.   2310649.66 2970389.98]
429989.90161338093


429K ARS is a bad model. We can try to differentiate expensive and cheap cars, or normalize the price

In [170]:
cars_data.price.describe()

count    3.400700e+04
mean     2.614607e+06
std      2.270250e+06
min      1.000000e+03
25%      9.925000e+05
50%      2.300000e+06
75%      3.700000e+06
max      1.500000e+07
Name: price, dtype: float64

In [171]:
cars_data['brand_avg']=cars_data.groupby('brand')['price_ars'].transform('mean')

In [172]:
cars_data[['brand', 'price_ars', 'brand_avg']]

Unnamed: 0,brand,price_ars,brand_avg
0,Alfa Romeo,3500000,3.166129e+06
1,Alfa Romeo,2940000,3.166129e+06
2,Alfa Romeo,2880000,3.166129e+06
3,Alfa Romeo,2660000,3.166129e+06
4,Alfa Romeo,849000,3.166129e+06
...,...,...,...
34002,Volvo,800000,3.519973e+06
34003,Volvo,900000,3.519973e+06
34004,Volvo,2800000,3.519973e+06
34005,Volvo,2400000,3.519973e+06


#### One-Hot encoding doesnt poerform well if we have many categories, in this case we got over 32, we can try using only the most common

In [173]:
cars_data.groupby('brand').size().count()

32

In [174]:
most_common_cars = ((cars_data.groupby('brand').size()>600)[(cars_data.groupby('brand').size()>600)].index)

In [175]:
cars_data_common = cars_data[cars_data['brand'].isin(list(most_common_cars))]

### now we repeat the process

In [176]:
X = cars_data_common[features].dropna()

In [177]:
y = X.pop('price_ars')

In [178]:
X.describe() #not all the features are numeric (later we will check if a encoding is usefull)

Unnamed: 0,year,km
count,16813.0,16813.0
mean,2015.18111,84215.988818
std,4.618142,60377.625875
min,1991.0,1.0
25%,2013.0,42000.0
50%,2016.0,75500.0
75%,2018.0,115000.0
max,2023.0,604000.0


In [179]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 0)


In [180]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['brand', 'model', 'colour', 'fuel', 'engine', 'transmision', 'type', 'city']


In [181]:
cars_data[object_cols]
missing_values_count = cars_data[object_cols].isnull().sum()
missing_values_count #we have many empty values, we need to clean before encoding

brand              0
model              0
colour         14710
fuel               0
engine          2228
transmision     1966
type            2155
city               0
dtype: int64

###### As there is no ordinal values in this data, we proceed with one hot encoding

In [182]:
OH_X_train, OH_X_valid = encoder(X_train, X_valid)
model = RandomForestRegressor(random_state=1)
model.fit(OH_X_train, y_train)

RandomForestRegressor(random_state=1)

In [183]:
print("Making predictions for the following 5 cars:")
print(X.head())
print("The predictions are")
print(model.predict(OH_X_valid.head()))

val_predictions = model.predict(OH_X_valid)
print(mean_absolute_error(y_valid, val_predictions))

Making predictions for the following 5 cars:
    brand         model colour   fuel engine  year transmision      km  \
220  Audi            TT  Negro  Nafta    1.8  2011      Manual  125000   
221  Audi            A1   Gris  Nafta    1.4  2015      Manual  117000   
222  Audi  A5 Sportback   Gris  Nafta    2.0  2011  Automatica  129000   
223  Audi  A1 Sportback   Rojo  Nafta    1.0  2021  Automatica    1000   
225  Audi            Q3   Gris  Nafta    1.4  2018  Automatica   82229   

          type                  city  
220      Coupe      Capital Federal   
221  Hatchback   Bs.As. G.B.A. Oeste  
222  Hatchback           Avellaneda   
223  Hatchback      Capital Federal   
225        SUV   Bs.As. G.B.A. Oeste  
The predictions are
[ 6864758.          2311768.         12136900.          4770190.
  2001804.06833333]
427757.00480239297


This model is better

##### Finally we will try to see if partitioning the price data gets a better model

In [227]:
X = cars_data_common[features].dropna()

X = X[(X['price_ars']>1000000)]# we dont lose much data and get rid of some noise

y = X.pop('price_ars')

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 0)

# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

OH_X_train, OH_X_valid = encoder(X_train, X_valid)
model = RandomForestRegressor(random_state=1)
model.fit(OH_X_train, y_train)

print("Making predictions for the following 5 cars:")
print(X.head())
print("The predictions are")
print(model.predict(OH_X_valid.head()))

val_predictions = model.predict(OH_X_valid)
print(mean_absolute_error(y_valid, val_predictions))

Categorical variables:
['brand', 'model', 'colour', 'fuel', 'engine', 'transmision', 'type', 'city']
Making predictions for the following 5 cars:
    brand         model colour   fuel engine  year transmision      km  \
220  Audi            TT  Negro  Nafta    1.8  2011      Manual  125000   
221  Audi            A1   Gris  Nafta    1.4  2015      Manual  117000   
222  Audi  A5 Sportback   Gris  Nafta    2.0  2011  Automatica  129000   
223  Audi  A1 Sportback   Rojo  Nafta    1.0  2021  Automatica    1000   
225  Audi            Q3   Gris  Nafta    1.4  2018  Automatica   82229   

          type                  city  
220      Coupe      Capital Federal   
221  Hatchback   Bs.As. G.B.A. Oeste  
222  Hatchback           Avellaneda   
223  Hatchback      Capital Federal   
225        SUV   Bs.As. G.B.A. Oeste  
The predictions are
[10288196.     2138233.     1666919.975  2949424.     7604325.   ]
417507.1016829879


In [228]:
1-mean_absolute_error(y_valid, val_predictions)/y.mean() #---> 90% accuaracy

0.8962272686835152

In [155]:
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
model.fit(OH_X_train, y_train,
         early_stopping_rounds=5, 
             eval_set=[(OH_X_valid, y_valid)], 
             verbose=False)




XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=4, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [156]:
val_predictions = model.predict(OH_X_valid)
print(mean_absolute_error(y_valid, val_predictions))

572962.5910320674
