In [101]:
import pandas as pd
import numpy as np
import re
import time
import requests as rq
import bs4 as bs4
from utils import *

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

pd.set_option('max_colwidth', None)
pd.set_option('display.max_rows', None)

pd.set_option('max_columns', 28)

In [72]:
df = pd.read_csv('car_information.csv',index_col=0)
# Select labeled rows
df = df[df['Y'].notnull()]
df.head(1)

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra,Y
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_ford-ka-hatch-2018-unico-dono-completao-gnv-gratis-ent-8mil-48x-751-00-fixas-no-cdc-686667352,ford,31900,hatch,ka,manual,2015,39869,1,flex,elétrica,preto,sim,ka 1.0 se se plus tivct flex 5p,4 portas,ipva pago,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré",1.0


In [73]:
df = clean_df(df)
df.head(1)

Unnamed: 0,link,price,regdate,mileage,version,doors,Y,vidro elétrico,air bag,trava elétrica,ar condicionado,direção hidráulica,alarme,som,...,carcolor_branco,carcolor_prata,carcolor_preto,carcolor_0,carcolor_vermelho,carcolor_cinza,carcolor_azul,carcolor_outra,carcolor_laranja,carcolor_amarelo,carcolor_verde,exchange_sim,exchange_não,exchange_0
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_ford-ka-hatch-2018-unico-dono-completao-gnv-gratis-ent-8mil-48x-751-00-fixas-no-cdc-686667352,31900,2015,39869,ka 1.0 se se plus tivct flex 5p,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [76]:
cols = ['price','regdate','version', 'Y']

In [77]:
df = df[cols]
df.head(2)

Unnamed: 0,price,regdate,version,Y
0,31900,2015,ka 1.0 se se plus tivct flex 5p,1.0
1,26000,2013,gol 1.6 mi plus total flex 8v 4p,1.0


### Split df

In [78]:
# shuffle dataframe
df = shuffle(df).reset_index(drop=True)

In [83]:
Y = df['Y']
X = df.loc[:, 'price':'version']

In [84]:
# Split train and validation data
Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size=0.5, random_state=42)
Xtrain.shape, Xval.shape, Ytrain.shape, Yval.shape

((2159, 3), (2160, 3), (2159,), (2160,))

### version 

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [133]:
version_train = Xtrain['version']
version_val = Xval['version']


version_vec = TfidfVectorizer(min_df= 4)

version_bow_train = version_vec.fit_transform(version_train)
version_bow_val = version_vec.transform(version_val)

In [134]:
version_bow_train.shape

(2159, 113)

In [135]:
version_bow_train

<2159x113 sparse matrix of type '<class 'numpy.float64'>'
	with 10537 stored elements in Compressed Sparse Row format>

In [136]:
Xtrain.shape

(2159, 3)

In [137]:
Xtrain_noversion = Xtrain.loc[:,['price','regdate']]
xval_noversion = Xval.loc[:,['price','regdate']]

The TfidfVectorizer may give us a sparse matrix. That way, we have to account for this issue and add the sparse matrix to our dataframe. 

In [138]:
from scipy.sparse import hstack, vstack

Xtrain_wversion = hstack([Xtrain_noversion, version_bow_train])
Xval_wversion = hstack([xval_noversion,version_bow_val])

In [139]:
Xtrain_wversion.shape, Xval_wversion.shape

((2159, 115), (2160, 115))

In [140]:
model = RandomForestClassifier(n_estimators= 1000 ,
                              random_state=42 ,
                              class_weight= 'balanced',
                              n_jobs = -1)

model.fit(Xtrain_wversion,y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [141]:
pred = model.predict_proba(Xval_wversion)[:,1]

In [142]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [143]:
print('average_precision_score :', np.round(average_precision_score(y_val, pred),4))
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),4))

average_precision_score : 0.1377
roc_auc_score : 0.5094


In [None]:
tests:

min_df = 1  - > ap =  0.1386  -  auc = 0.513
min_df = 2  - > ap =  0.1384  -  auc = 0.5104
min_df = 3  - > ap =  0.1385  -  auc = 0.5117
min_df = 4  - > ap =  0.1377  -  auc = 0.5094

As we can see, adding the version column does not help us. This model is not better than our base model.