In [1]:
import pandas as pd
import numpy as np
import re
import time
import requests as rq
import bs4 as bs4
from utils import *

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, average_precision_score


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

pd.set_option('max_colwidth', None)
pd.set_option('display.max_rows', None)

pd.set_option('max_columns', 28)

# Decision Tree

In [2]:
df = pd.read_csv('car_information.csv',index_col=0)
# Select labeled rows
df = df[df['Y'].notnull()]
df.head(1)

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra,Y
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_ford-ka-hatch-2018-unico-dono-completao-gnv-gratis-ent-8mil-48x-751-00-fixas-no-cdc-686667352,ford,31900,hatch,ka,manual,2015,39869,1,flex,elétrica,preto,sim,ka 1.0 se se plus tivct flex 5p,4 portas,ipva pago,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré",1.0


In [3]:
df = clean_df(df)
df.head(1)

Unnamed: 0,link,price,regdate,mileage,version,doors,Y,vidro elétrico,air bag,trava elétrica,ar condicionado,direção hidráulica,alarme,som,...,carcolor_branco,carcolor_prata,carcolor_preto,carcolor_0,carcolor_vermelho,carcolor_cinza,carcolor_azul,carcolor_outra,carcolor_laranja,carcolor_amarelo,carcolor_verde,exchange_sim,exchange_não,exchange_0
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_ford-ka-hatch-2018-unico-dono-completao-gnv-gratis-ent-8mil-48x-751-00-fixas-no-cdc-686667352,31900,2015,39869,ka 1.0 se se plus tivct flex 5p,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [4]:
df.columns = [ 'link', 'price', 'regdate', 'mileage', 'version', 'doors', 'Y',
       'vidro_eletrico', 'air_bag', 'trava_eletrica', 'ar_condicionado',
       'direcao_hidraulica', 'alarme', 'som', 'sensor_de_re', 'blindado',
       'camera_de_re', 'financial_ipva_pago', 'financial_0',
       'financial_financiado', 'financial_de_leilao', 'financial_com_multas',
       'brand_ford', 'brand_vwvolkswagen', 'cartype_hatch', 'cartype_passeio',
       'cartype_seda', 'cartype_0', 'cartype_suv', 'cartype_pickup',
       'model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual',
       'gearbox_automatico', 'gearbox_semiautomatico', 'gearbox_0',
       'motorpower_1.6', 'motorpower_1', 'motorpower_0', 'motorpower_2.02.9',
       'motorpower_1.5', 'motorpower_1.4', 'fuel_flex',
       'car_steering_hidraulica', 'car_steering_eletrica', 'car_steering_0',
       'car_steering_mecnica', 'carcolor_branco', 'carcolor_prata',
       'carcolor_preto', 'carcolor_0', 'carcolor_vermelho', 'carcolor_cinza',
       'carcolor_azul', 'carcolor_outra', 'carcolor_laranja', 'carcolor_verde',
       'carcolor_amarelo', 'exchange_sim', 'exchange_não', 'exchange_0']

In [5]:
cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automatico', 'gearbox_semiautomatico', 'gearbox_0', 'financial_ipva_pago', 'financial_0',
       'financial_financiado', 'financial_de_leilao', 'financial_com_multas' ,'Y']




In [6]:
#cols = df.columns

In [7]:
# shuffle dataframe
df = shuffle(df).reset_index(drop=True)

In [8]:
simple_df = df[cols]
simple_df.head(1)

Unnamed: 0,price,regdate,mileage,model_ka,model_ecosport,model_fiesta,model_fox,model_focus,model_gol,model_voyage,model_up,model_saveiro,model_crossfox,model_jetta,...,model_fusion,model_0,model_ranger,gearbox_manual,gearbox_manual.1,gearbox_automatico,gearbox_semiautomatico,gearbox_0,financial_ipva_pago,financial_0,financial_financiado,financial_de_leilao,financial_com_multas,Y
0,20990,2013,97000,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0,0,0,0,0.0


In [9]:
Y = simple_df['Y']
X = simple_df.loc[:, 'price':'financial_com_multas']

In [10]:
# Split train and validation data
Xtrain, Xval, y_train, y_val = train_test_split(X, Y, test_size=0.5, random_state=42)
Xtrain.shape, Xval.shape, y_train.shape, y_val.shape

((2159, 29), (2160, 29), (2159,), (2160,))

### - Model 

- we have to account for inbalance data 

In [None]:
model = DecisionTreeClassifier(random_state= 42, max_depth = 2 , class_weight= 'balanced')
model.fit(Xtrain,y_train)

In [None]:
pred = model.predict_log_proba(Xval)[:,1]

### Metrics

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

print('average_precision_score :', np.round(average_precision_score(y_val, pred),4))
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),4))

In [None]:
1) cols = ['price', 'regdate', 'mileage', 'Y']
-  average_precision_score : 0.4124 roc_auc_score : 0.8816
        
2) cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'Y']

- average_precision_score : 0.4342  roc_auc_score : 0.8882
        
        
3) cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'Y']

- average_precision_score : 0.4101   roc_auc_score : 0.8789
 

4) cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'financial_ipva pago', 'financial_0',
       'financial_financiado', 'financial_de leilão', 'financial_com multas' ,'Y']

- average_precision_score : 0.451    roc_auc_score : 0.8859
        
5) cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'financial_ipva pago', 'financial_0',
       'financial_financiado', 'financial_de leilão', 'financial_com multas' ,'car_steering_hidráulica', 'car_steering_elétrica', 'car_steering_0',
       'car_steering_mecnica', 'Y']

- average_precision_score : 0.4347   roc_auc_score : 0.8915

6) All columns :
    
------ average_precision_score : 0.457  roc_auc_score : 0.8929

## Selected features - Decision Tree

In [None]:
4) cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'financial_ipva pago', 'financial_0',
       'financial_financiado', 'financial_de leilão', 'financial_com multas' ,'Y']

- average_precision_score : 0.451    roc_auc_score : 0.8859

# random Forest

In [None]:
model = RandomForestClassifier(n_estimators= 1000 ,
                              random_state=42 ,
                              class_weight= 'balanced',
                              n_jobs = -1)

model.fit(Xtrain,y_train)

In [None]:
pred = model.predict_proba(Xval)[:,1]

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

print('average_precision_score :', np.round(average_precision_score(y_val, pred),4))
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),4))

In [None]:
1) cols = ['price', 'regdate', 'mileage', 'Y']

------  average_precision_score : 0.5531  roc_auc_score : 0.9044
        
2)  cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'Y']   

------ average_precision_score : 0.6412   roc_auc_score : 0.9271
        
        
3)  cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'Y']

------  average_precision_score : 0.6326   roc_auc_score : 0.9279
        
4) cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'financial_ipva pago', 'financial_0',
       'financial_financiado', 'financial_de leilão', 'financial_com multas' ,'Y']


------ average_precision_score : 0.6414   roc_auc_score : 0.9312
        
5)  cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'financial_ipva pago', 'financial_0',
       'financial_financiado', 'financial_de leilão', 'financial_com multas' ,'car_steering_hidráulica', 'car_steering_elétrica', 'car_steering_0',
       'car_steering_mecnica', 'Y']

------ average_precision_score : 0.64   roc_auc_score : 0.9304
        
6) All columns :
    
------ average_precision_score : 0.6319  roc_auc_score : 0.929

# Selected features

In [None]:
4) cols = ['price', 'regdate', 'mileage','model_ka', 'model_ecosport', 'model_fiesta', 'model_fox',
       'model_focus', 'model_gol', 'model_voyage', 'model_up', 'model_saveiro',
       'model_crossfox', 'model_jetta', 'model_golf', 'model_polo',
       'model_fusion', 'model_0', 'model_ranger', 'gearbox_manual', 'gearbox_manual',
       'gearbox_automático', 'gearbox_semiautomático', 'gearbox_0', 'financial_ipva pago', 'financial_0',
       'financial_financiado', 'financial_de leilão', 'financial_com multas' ,'Y']


------ average_precision_score : 0.6414   roc_auc_score : 0.9312

### Random Forest Tuning

In [77]:
model = RandomForestClassifier(n_estimators= 1000 ,
                              random_state=42 ,
                               min_samples_leaf= 2,
                              class_weight= 'balanced',
                              n_jobs = -1)

model.fit(Xtrain,y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=2,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [78]:
pred = model.predict_proba(Xval)[:,1]

In [79]:
from sklearn.metrics import roc_auc_score, average_precision_score

print('average_precision_score :', np.round(average_precision_score(y_val, pred),4))
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),4))

average_precision_score : 0.6875
roc_auc_score : 0.9452


In [80]:
min_sample_leaf = 2 , n_estimator = 1000 --- > average_precision_score : 0.7303  roc_auc_score : 0.9479
min_sample_leaf = 1 , n_estimator = 100 --- > average_precision_score : 0.7319  roc_auc_score : 0.9472

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

print('average_precision_score :', np.round(average_precision_score(y_val, pred),4))
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),4))

## Ok ! 


# LightGBM

In [None]:
Xtrain.head(1)

In [11]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from skopt import forest_minimize

In [12]:
model = LGBMClassifier(random_state = 42, class_weight='balanced', n_jobs= -1)
model.fit(Xtrain,y_train)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [13]:
pred = model.predict_proba(Xval)[:,1]

In [14]:
print('average_precision_score :', np.round(average_precision_score(y_val, pred),4))
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),4))

average_precision_score : 0.6953
roc_auc_score : 0.9448



# Bayesian Optimization

In [16]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    num_leaves = params[1]
    max_depth = params[2]
    min_child_samples = params[3]
    subsample =params[4]
    colsample_bytree = params[5]
    n_estimators = params[6]
    min_df = params[7]
    
    
    model = LGBMClassifier(learning_rate= lr,
                           num_leaves= num_leaves,
                           max_depth= max_depth,
                           min_child_samples= min_child_samples,
                           subsample= subsample,
                           colsample_bytree=colsample_bytree,
                           bagging_freq = 1,
                           n_estimators = n_estimators,
                           random_state= 42,
                           class_weight= 'balanced',
                           n_jobs= -1)
    
    model.fit(Xtrain,y_train)
    
    pred = model.predict_proba(Xval)[:,1]
    
    print()
    print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),6))
    print('average_precision_score :', np.round(average_precision_score(y_val, pred),6))
    print()
    
    
    return - average_precision_score(y_val, pred)

space = [(1e-3,1e-1, 'log-uniform'), #lr
         (10,100),
         (1,20), # max depth
         (1,20), # min_child_samples
         (0.05,1.), # subsample 
         (0.05,1.), # colsample_bytree
         (100,1000), # n_estimators
         (1,5)] # min_df
        
         

res = forest_minimize(tune_lgbm,
                     space,
                     random_state = 42,
                     n_random_starts = 20,
                     n_calls = 50,
                     verbose = 1)





                   
         

Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 24, 11, 8, 0.6187255599871848, 0.19821770842031472, 566, 3]

roc_auc_score : 0.938762
average_precision_score : 0.670834

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.6622
Function value obtained: -0.6708
Current minimum: -0.6708
Iteration No: 2 started. Evaluating function at random point.
[0.008288916866885144, 33, 3, 2, 0.7358988336534836, 0.9416250735649627, 485, 4]

roc_auc_score : 0.946563
average_precision_score : 0.688315

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.4257
Function value obtained: -0.6883
Current minimum: -0.6883
Iteration No: 3 started. Evaluating function at random point.
[0.00232706770838378, 85, 12, 17, 0.5485359272454697, 0.42986792312949273, 575, 4]

roc_auc_score : 0.932891
average_precision_score : 0.641514

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.9768
Function value obtained: -0.6415
Current 

In [17]:
res.x

[0.015392517420880682,
 95,
 8,
 10,
 0.8798195541789631,
 0.9684426598014575,
 152,
 5]

In [None]:
roc_auc_score : 0.94885  average_precision_score : 0.712226

# Logistic Regression

In [20]:
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

In [21]:
Xtrain.loc[:, ['price','regdate','mileage']].head()

Unnamed: 0,price,regdate,mileage
3969,25000,2015,80000
1681,36800,2016,49000
1443,24000,2013,70000
2542,27890,2014,117000
1671,28900,2015,35000


In [36]:
Xtrain2 = csr_matrix(Xtrain.copy())
Xval2 = csr_matrix(Xval.copy())



#scaler =StandardScaler()
scaler = MaxAbsScaler()

 
#Xtrain2[:,:3] = scaler.fit_transform( Xtrain2[:,:3].todense())
#Xval2[:,:3] = scaler.transform( Xval2[:,:3].todense())


Xtrain2 = scaler.fit_transform(Xtrain2)
Xval2 = scaler.transform(Xval2)

In [40]:
model = LogisticRegression(C =0.5, n_jobs= -1, random_state = 42)
model.fit(Xtrain2,y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=42,
                   solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [41]:
pred = model.predict_proba(Xval2)[:,1]

In [42]:
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred),6))
print('average_precision_score :', np.round(average_precision_score(y_val, pred),6))


roc_auc_score : 0.822115
average_precision_score : 0.441704


In [None]:
No tuning MaxAbsScaler     -- >  and roc_auc_score : 0.830213  average_precision_score : 0.454579 
MaxAbsScaler C = 10        -- >  and roc_auc_score : 0.87962  average_precision_score : 0.507722
MaxAbsScaler C = 0.5       -- >  and roc_auc_score : 0.822115  average_precision_score : 0.441704
No tuning StandardScaler   -- >  and roc_auc_score : 0.858561  average_precision_score : 0.453326 
        
        

# Ensemble 

## 1 - RF

In [59]:
model_rf = RandomForestClassifier(n_estimators= 1000 ,
                              random_state=42 ,
                               min_samples_leaf= 2,
                              class_weight= 'balanced',
                              n_jobs = -1)

model_rf.fit(Xtrain,y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=2,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [60]:
pred_rf = model_rf.predict_proba(Xval)[:,1]

In [61]:


print('average_precision_score :', np.round(average_precision_score(y_val, pred_rf),4))
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred_rf),4))

average_precision_score : 0.6875
roc_auc_score : 0.9452


## 2) LGBM

In [19]:
params = [0.015392517420880682, 95, 8, 10, 0.8798195541789631, 0.9684426598014575, 152, 5]
lr = params[0]
num_leaves = params[1]
max_depth = params[2]
min_child_samples = params[3]
subsample =params[4]
colsample_bytree = params[5]
n_estimators = params[6]
min_df = params[7]


model_lgbm = LGBMClassifier(learning_rate= lr,
                       num_leaves= num_leaves,
                       max_depth= max_depth,
                       min_child_samples= min_child_samples,
                       subsample= subsample,
                       colsample_bytree=colsample_bytree,
                       bagging_freq = 1,
                       n_estimators = n_estimators,
                       random_state= 42,
                       class_weight= 'balanced',
                       n_jobs= -1)

model_lgbm.fit(Xtrain,y_train)

pred_lgbm = model.predict_proba(Xval)[:,1]

print()
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred_lgbm),6))
print('average_precision_score :', np.round(average_precision_score(y_val, pred_lgbm),6))
print()




roc_auc_score : 0.944835
average_precision_score : 0.695332



## Logistic Regression 

In [50]:
from sklearn.pipeline import make_pipeline

In [54]:
Xtrain2 = csr_matrix(Xtrain.copy())
Xval2 = csr_matrix(Xval.copy())

lr_pipeline = make_pipeline(MaxAbsScaler(), LogisticRegression(C = 10, n_jobs= -1, random_state = 42))
lr_pipeline.fit(Xtrain2,y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Pipeline(memory=None,
         steps=[('maxabsscaler', MaxAbsScaler(copy=True)),
                ('logisticregression',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=-1, penalty='l2',
                                    random_state=42, solver='warn', tol=0.0001,
                                    verbose=0, warm_start=False))],
         verbose=False)

In [57]:
pred_lr = lr_pipeline.predict_proba(Xval2)[:,1]

In [58]:
print()
print('roc_auc_score :', np.round(roc_auc_score(y_val, pred_lr),6))
print('average_precision_score :', np.round(average_precision_score(y_val, pred_lr),6))
print()


roc_auc_score : 0.87962
average_precision_score : 0.507722



In [None]:
RF:   roc_auc_score : 0.9452     average_precision_score : 0.6875
LGBM: roc_auc_score : 0.944835   average_precision_score : 0.695332
LR :  roc_auc_score : 0.87962    average_precision_score : 0.507722

### LR +RF +LGBM

In [81]:
## Models mean 
p = (pred_rf + pred_lgbm + pred_lr)/ 3
print('roc_auc_score :', np.round(roc_auc_score(y_val, p),6))
print('average_precision_score :', np.round(average_precision_score(y_val, p),6))

roc_auc_score : 0.942239
average_precision_score : 0.68399


Let's check the correlation between the models to see how the predictions are correlated. 

In [83]:
pd.DataFrame({'LR':pred_lr, 'RF': pred_rf, 'LGBM':pred_lgbm}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.801701,0.741891
RF,0.801701,1.0,0.940105
LGBM,0.741891,0.940105,1.0


###  RF + LGBM

In [89]:
p = 0.1*pred_rf + 0.9*pred_lgbm
print('roc_auc_score :', np.round(roc_auc_score(y_val, p),6))
print('average_precision_score :', np.round(average_precision_score(y_val, p),6))

roc_auc_score : 0.945775
average_precision_score : 0.699181


In [None]:
0.5/0.5  --->  roc_auc_score : 0.946991  average_precision_score : 0.699105
0.4/0.6  --->  roc_auc_score : 0.946985  average_precision_score : 0.699817
0.3/0.7  --->  roc_auc_score : 0.94669  average_precision_score :  0.70086
0.2/0.8  --->  roc_auc_score : 0.94629  average_precision_score :  0.70082        
0.1/0.9  --->  roc_auc_score : 0.945775  average_precision_score : 0.699181      
        
        
        

In [90]:
p = 0.3*pred_rf + 0.7*pred_lgbm
print('roc_auc_score :', np.round(roc_auc_score(y_val, p),6))
print('average_precision_score :', np.round(average_precision_score(y_val, p),6))

roc_auc_score : 0.94669
average_precision_score : 0.70086


### LR +LGBM 

In [95]:
p = 0.1*pred_lr + 0.9*pred_lgbm
print('roc_auc_score :', np.round(roc_auc_score(y_val, p),6))
print('average_precision_score :', np.round(average_precision_score(y_val, p),6))

roc_auc_score : 0.940931
average_precision_score : 0.691972


In [None]:
0.5/0.5  --->  roc_auc_score : 0.934415  average_precision_score :  0.666274
0.4/0.6  --->  roc_auc_score : 0.936601  average_precision_score :  0.678067
0.3/0.7  --->  roc_auc_score : 0.938444   average_precision_score : 0.684441
0.2/0.8  --->  roc_auc_score : 0.939632   average_precision_score : 0.687614        
0.1/0.9  --->  roc_auc_score : 0.940931  average_precision_score :  0.691972 

### RF +LR

In [101]:
p = 0.1*pred_lr + 0.9*pred_rf
print('roc_auc_score :', np.round(roc_auc_score(y_val, p),6))
print('average_precision_score :', np.round(average_precision_score(y_val, p),6))

roc_auc_score : 0.9446
average_precision_score : 0.684468


In [None]:
0.5/0.5  --->  roc_auc_score : 0.935796  average_precision_score :  0.659216
0.4/0.6  --->  roc_auc_score : 0.939404  average_precision_score :  0.669158
0.3/0.7  --->  roc_auc_score : 0.942084  average_precision_score : 0.676274
0.2/0.8  --->  roc_auc_score : 0.943604  average_precision_score : 0.680321        
0.1/0.9  --->  roc_auc_score : 0.9446    average_precision_score :  0.684468

We can notice that the best result we get by combining the models is 'using' 30% of the random forest with 70% of the lgbm. Although, this combination is not significant when we compare the result of the lgbm itself. So, in order to deploy the simplest solution, I would choose only the lgbm. But... I want to see how the combination of those tho models will perform in production. Based on that, I will deploy both, the random forest and the lgbm models combine. 

# Save Models

In [102]:
import joblib as jb

In [106]:
jb.dump(model_lgbm,"lgbm_car.pk;.z")
jb.dump(model_rf, "random_forest_car.pk.z")

['random_forest_car.pk.z']