# 0.0 IMPORTS

In [1]:
import re
import warnings
import sweetviz
import IPython
import pickle
import math

import pandas            as pd
import numpy             as np
import lightgbm          as lgb
import xgboost           as xgb
import seaborn           as sns

from sklearn.linear_model    import LogisticRegression
from category_encoders       import TargetEncoder
from sklearn.preprocessing   import OneHotEncoder
from sklearn.ensemble        import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier, HistGradientBoostingClassifier
from scipy                   import stats
from unidecode               import unidecode
from catboost                import CatBoostClassifier
from skopt                   import gp_minimize
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing   import RobustScaler, MinMaxScaler
from imblearn                import over_sampling, under_sampling
from sklearn.neural_network  import MLPClassifier
from sklearn.naive_bayes     import GaussianNB
from sklearn                 import svm
from sklearn.impute          import KNNImputer
from sklearn.neighbors       import KNeighborsClassifier

from sklearn                 import model_selection   as ms
from sklearn                 import metrics           as m
from matplotlib              import pyplot            as plt
from imblearn                import combine           as co

  from .autonotebook import tqdm as notebook_tqdm


## 0.1 AUX FUNCTIONS

In [2]:
# Notebook-wide settings: silence all warnings and let pandas show up to
# 100 rows / 100 columns before truncating a displayed frame.
warnings.filterwarnings( action='ignore' )
for display_option in ( 'display.max_columns', 'display.max_rows' ):
    pd.set_option( display_option, 100 )

def ml_error( model_name, ytest, yhat ):
    """Summarise a classifier's predictions as a one-row DataFrame.

    Parameters
    ----------
    model_name : value stored in the 'Model name' column.
    ytest : true labels, passed to the sklearn.metrics scorers.
    yhat : predicted labels, passed to the same scorers.

    Returns
    -------
    pandas.DataFrame with a single row (index 0) and the columns
    'Model name', 'F1', 'Precision' and 'Recall'.
    """
    scores = {
        'Model name': model_name,
        'F1': m.f1_score( ytest, yhat ),
        'Precision': m.precision_score( ytest, yhat ),
        'Recall': m.recall_score( ytest, yhat ),
    }
    return pd.DataFrame( scores, index=[0] )

## 0.2 READ DATASETS

In [3]:
# Load the user dataset and the question -> subject lookup table.
df_users = pd.read_csv( "Dataset_model.csv" )
df_questions = pd.read_csv( "subjects_questions.csv" )

# Keep only letters, digits and underscores in the user-dataset column names.
df_users = df_users.rename( columns=lambda name: re.sub( '[^A-Za-z0-9_]+', '', name ) )

# Collapse the lookup table to one row per question: rows with missing values
# are dropped and the subject ids of a question are joined into one
# comma-separated string.
df_questions = (
    df_questions
    .dropna()
    .astype( {'subject_id': str} )
    .groupby( ['novo_question_id'] )
    .agg( {'subject_id': ', '.join} )
    .reset_index()
)

# Left join, so every user row is kept even when its question has no subject.
df_users = df_users.merge(
    df_questions,
    how='left',
    left_on='novo_question_id',
    right_on='novo_question_id',
)

In [4]:
# Pre-computed hit-rate tables (later joined on novo_user_id plus difficulty /
# knowledge_area_id respectively).
df_taxa_acerto, df_taxa_acerto_knowledge = (
    pd.read_csv( feature_path )
    for feature_path in ( 'features/taxa_acertos.csv', 'features/taxa_acerto_knowledge.csv' )
)

In [5]:
# Public reference tables: state-level data, municipality-level data,
# municipal PIB and the IBGE indicators.
df_dados_estado = pd.read_csv( "Dados Estado Simplificado.csv" )
df_dados_municipio = pd.read_excel( "Dados Município Simplificado.xlsx", sheet_name="Dados" )
df_pib = pd.read_excel( "PIB Municípios.xlsx" )
df_ibge = pd.read_excel( "IBGE.xlsx", sheet_name="Planilha1" )


def _ascii_upper( value ):
    """Transliterate value to ASCII and upper-case it; float values pass through unchanged."""
    if type(value) is float:
        return value
    return unidecode( value ).upper()


df_users['region'] = df_users['region'].apply( _ascii_upper )

# Give the state and municipality tables distinct, ASCII column names so the
# two sets of indicators can live side by side after both are merged.
state_column_names = {
    'População': 'populacao_estado',
    'Matrículas': 'matriculas_estado',
    'Categoria do Quadrante': 'categoria_quadrante_estado',
    'Insumos do Ioeb': 'insumos_ioeb_estado',
    'Ioeb': 'ioeb_estado',
    'Resultados do Ioeb': 'resultados_ioeb_estado',
}
df_dados_estado = df_dados_estado.rename( columns=state_column_names )

city_column_names = {
    'População': 'populacao_cidade',
    'Total de Matrículas': 'matriculas_cidade',
    'Categoria do Quadrante': 'categoria_quadrante_cidade',
    'Insumos do Ioeb': 'insumos_ioeb_cidade',
    'Ioeb': 'ioeb_cidade',
    'Resultados do Ioeb': 'resultados_ioeb_cidade',
    'Região': 'regiao',
}
df_dados_municipio = df_dados_municipio.rename( columns=city_column_names )

def _strip_number_separators( frame ):
    """Remove every ',' and '.' from the string cells of frame, in place.

    Columns whose dtype is int64 or float64 are skipped; in the remaining
    columns only `str` values are rewritten, anything else (e.g. NaN) is kept
    as is.

    NOTE(review): both characters are removed, so a decimal mark is dropped
    together with the thousands separators -- confirm the source values carry
    no fractional part.
    """
    for col in frame.columns:
        # The original municipality loop compared type(Series) with "float64",
        # which is always unequal; the dtype itself is what has to be checked.
        if frame[col].dtypes != "int64" and frame[col].dtypes != "float64":
            frame[col] = frame[col].apply(
                lambda x: x.replace( ",", "" ).replace( ".", "" ) if isinstance( x, str ) else x )


_strip_number_separators( df_dados_estado )
_strip_number_separators( df_dados_municipio )

# Merge df_users with the state-level data.
# Both sides of the join key are transliterated to ASCII and upper-cased first;
# float values (e.g. NaN) in df_users are left untouched.
df_dados_estado['Estado'] = df_dados_estado['Estado'].apply( lambda name: unidecode(name).upper() )
for place_column in ( 'region', 'city' ):
    df_users[place_column] = df_users[place_column].apply(
        lambda name: name if type(name) is float else unidecode(name).upper() )
df_users['city'] = df_users['city'].apply(
    lambda name: name if type(name) is float else name.replace( "'", "" ) )

# Rows whose city is 'FEDERAL DISTRICT' are relabelled 'BRASILIA', then every
# 'BRASILIA' row gets the region 'DISTRITO FEDERAL'.
is_federal_district = df_users.city == 'FEDERAL DISTRICT'
df_users.loc[is_federal_district, 'city'] = 'BRASILIA'
df_users.loc[df_users.city == 'BRASILIA', 'region'] = 'DISTRITO FEDERAL'

df_users = df_users.merge( df_dados_estado, how='left', left_on='region', right_on='Estado' )
df_users = df_users.drop(
    columns=['Estado','Ano do Ioeb','Código da UF','Unidade da Federação','Região'] )

# Merge df_users with the municipality-level data.
# The municipality and state names are normalised the same way as the
# df_users keys (ASCII, upper case) before joining on (city, region).
for key_column in ( 'Nome do Município', 'estado' ):
    df_dados_municipio[key_column] = df_dados_municipio[key_column].apply(
        lambda name: unidecode(name).upper() )

df_users = df_users.merge(
    df_dados_municipio,
    how='left',
    left_on=['city','region'],
    right_on=['Nome do Município','estado'],
)
df_users = df_users.drop(
    columns=['Código do Município','Código da UF','Unidade da Federação','Nome do Município','Ano do Ioeb','estado'] )

# These indicators had their ',' / '.' separators stripped as strings earlier
# in this cell; cast them to float now.
features_to_float = [
    'ioeb_estado', 'insumos_ioeb_estado', 'resultados_ioeb_estado', 'populacao_estado',
    'ioeb_cidade', 'insumos_ioeb_cidade', 'resultados_ioeb_cidade', 'matriculas_cidade', 'populacao_cidade',
]
df_users[features_to_float] = df_users[features_to_float].astype(float)

# Normalise the PIB join keys: keep the municipality name up to the first '(',
# transliterate to ASCII, upper-case, trim and remove apostrophes.
df_pib['Município'] = df_pib['Município'].apply(
    lambda name: unidecode( name.split('(')[0] ).upper().strip().replace( "'", "" ) )
df_pib['estado'] = df_pib['estado'].apply( lambda name: unidecode(name).upper().strip() )

df_users = df_users.merge(
    df_pib,
    how='left',
    left_on=['city','region'],
    right_on=['Município','estado'],
)
df_users = df_users.drop( columns=['Município','estado'] )

# Keep only the IBGE indicators of interest plus the two join keys.
# .copy() makes the selection an independent frame, so the column assignments
# below do not write into a derived slice (the pattern pandas reports with
# SettingWithCopyWarning, which this notebook silences globally).
ibge_columns = [
    'Município','estado','IDHM','IDHM_E','IDHM_L','IDHM_R','I_ESCOLARIDADE','I_FREQ_PROP','TRABPUB','RENOCUP','GINI','T_FLSUPER',
    'T_ANALF18M','E_ANOSESTUDO','ESPVIDA','T_FBMED', 'T_FBSUPER', 'RDPC','PESO1824','T_MED18A24','T_FREQFUND1824',
]
df_ibge = df_ibge[ibge_columns].copy()

# Normalise the join keys: ASCII, upper case, trimmed; apostrophes removed
# from the municipality name.
df_ibge['Município'] = df_ibge['Município'].apply(
    lambda x: unidecode(x).upper().strip().replace( "'", "" ) )
df_ibge['estado'] = df_ibge['estado'].apply( lambda x: unidecode(x).upper().strip() )

df_users = df_users.merge( df_ibge, how='left', left_on=['city','region'], right_on=['Município','estado'] )
df_users.drop( columns=['Município','estado'], inplace=True )

In [6]:
# Columns carried forward into the analysis; everything else produced by the
# merges with the public datasets is discarded here.
selected_columns = [
    'novo_user_id', 'city', 'country', 'device', 'device_type', 'os',
    'platform', 'region', 'gpcarrers', 'gpcollegetype', 'gpdegreecourse',
    'gppreviousexperience', 'gpschooltype', 'gpsegment', 'gpsource_project',
    'acertou', 'created_at', 'row', 'commented_by_professor', 'difficulty',
    'discipline_id', 'examining_board_id', 'institute_id',
    'knowledge_area_id', 'modality_id', 'nullified', 'outdated',
    'product_id', 'publication_year', 'right_answer', 'scholarity_id',
    'novo_question_id', 'subject_id', 'PIB', 'TRABPUB',
]
df_users = df_users[selected_columns]

In [7]:
# print('{}'.format(len(df_users['discipline_id'].unique())))
# print('{}'.format(len(df_users['examining_board_id'].unique())))
# print('{}'.format(len(df_users['institute_id'].unique())))
# print('{}'.format(len(df_users['knowledge_area_id'].unique())))
# print('{}'.format(len(df_users['subject_id'].unique())))

# 1.0 DATA DESCRIPTION

## 1.1 Shape

In [8]:
# Report the dimensions of the assembled dataset.
n_rows, n_columns = df_users.shape
print( f"df users rows: {n_rows}" )
print( f"df users columns: {n_columns}" )

df users rows: 2000000
df users columns: 35


## 1.2 Check NA

In [9]:
# df_users['os'] = df_users['os'].fillna( df_users['os'].mode()[0] )
# df_users['gpcarrers'] = df_users['gpcarrers'].fillna( df_users['gpcarrers'].mode()[0] )
# df_users['discipline_id'] = df_users['discipline_id'].fillna( df_users['discipline_id'].mode()[0] )
# df_users['examining_board_id'] = df_users['examining_board_id'].fillna( df_users['examining_board_id'].mode()[0] )
# df_users['institute_id'] = df_users['institute_id'].fillna( df_users['institute_id'].mode()[0] )
# df_users['knowledge_area_id'] = df_users['knowledge_area_id'].fillna( df_users['knowledge_area_id'].mode()[0] )
# df_users['publication_year'] = df_users['publication_year'].fillna( df_users['publication_year'].mode()[0] )
# df_users['PIB'] = df_users['PIB'].fillna( df_users['PIB'].mean() )
# df_users['TRABPUB'] = df_users['TRABPUB'].fillna( df_users['TRABPUB'].mean() )
# df_users['subject_id'] = df_users['subject_id'].fillna( df_users['subject_id'].mode()[0] )

## 1.3 Data Types

In [10]:
# Parse created_at into datetimes so the .dt accessors can be used later on.
created_at_parsed = pd.to_datetime( df_users['created_at'] )
df_users['created_at'] = created_at_parsed

# 2.0 FEATURE ENGINEERING

In [11]:
# Work on an independent (deep) copy so the feature engineering below leaves
# df_users untouched.
df2_users = df_users.copy( deep=True )

In [12]:
# Attach the pre-computed hit-rate tables: one keyed by user and difficulty,
# the other by user and knowledge area. Left joins keep every answer row.
difficulty_keys = ['novo_user_id','difficulty']
knowledge_keys = ['novo_user_id','knowledge_area_id']

df2_users = df2_users.merge(
    df_taxa_acerto, how='left', left_on=difficulty_keys, right_on=difficulty_keys )
df2_users = df2_users.merge(
    df_taxa_acerto_knowledge, how='left', left_on=knowledge_keys, right_on=knowledge_keys )

In [13]:
# list_users = df2_users['novo_user_id'].unique()
# difficulty_list = df2_users['difficulty'].unique()
# df2_users['taxa_acerto'] = math.nan

# for user in list_users:
#     df_user = df2_users.loc[df2_users['novo_user_id'] == user ]
#     print(user)
    
#     for difficulty in difficulty_list:
#         taxa_acerto = df_user.loc[df_user['difficulty'] == difficulty, 'acertou'].mean()
#         df2_users.loc[(df2_users['novo_user_id'] == user) & (df2_users['difficulty'] == difficulty), 'taxa_acerto'] = taxa_acerto
#         print(taxa_acerto)

In [14]:
# df_taxa_acertos = df2_users[['novo_user_id','difficulty','taxa_acerto']]
# df_taxa_acertos.drop_duplicates(subset=['novo_user_id','difficulty'], inplace=True )
# df_taxa_acertos.reset_index(drop=True, inplace=True)
# df_taxa_acertos.to_csv('features/taxa_acertos.csv', index=False)

In [15]:
# list_users = df2_users['novo_user_id'].unique()
# knowledge_list = df2_users['knowledge_area_id'].unique()
# df2_users['taxa_acerto_knowledge'] = math.nan

# for user in list_users:
#     df_user = df2_users.loc[df2_users['novo_user_id'] == user ]
#     print(user)
    
#     for knowledge in knowledge_list:
#         taxa_acerto_knowledge = df_user.loc[df_user['knowledge_area_id'] == knowledge, 'acertou'].mean()
#         df2_users.loc[(df2_users['novo_user_id'] == user) & (df2_users['knowledge_area_id'] == knowledge), 'taxa_acerto_knowledge'] = taxa_acerto_knowledge
#         if not pd.isnull(taxa_acerto_knowledge):
#             print(taxa_acerto_knowledge)

In [16]:
# df_taxa_acertos_knowledge = df2_users[['novo_user_id','knowledge_area_id','taxa_acerto_knowledge']]
# df_taxa_acertos_knowledge.drop_duplicates(subset=['novo_user_id','knowledge_area_id'], inplace=True )
# df_taxa_acertos_knowledge.reset_index(drop=True, inplace=True)
# df_taxa_acertos_knowledge.to_csv('features/taxa_acerto_knowledge.csv', index=False)

In [17]:
# Calendar features derived from the created_at timestamp.
timestamp_parts = df2_users['created_at'].dt

# year, month, day
for part in ( 'year', 'month', 'day' ):
    df2_users[part] = getattr( timestamp_parts, part )

# ISO week of the year, stored as a plain int
df2_users['week_of_year'] = timestamp_parts.isocalendar().week.astype(int)

# day of the week, as returned by .dt.weekday
df2_users['day_of_week'] = timestamp_parts.weekday

# hour, minute, second
for part in ( 'hour', 'minute', 'second' ):
    df2_users[part] = getattr( timestamp_parts, part )

In [18]:
def _concentration_bucket( row_number ):
    """Map `row` to 'pomodoro1'..'pomodoro10' in steps of ten.

    Values <= 10 give 'pomodoro1', (10, 20] gives 'pomodoro2', and so on up to
    (80, 90] -> 'pomodoro9'; anything else (including NaN, for which every
    comparison is False) gives 'pomodoro10'.
    """
    for bucket, upper_bound in enumerate( range( 10, 100, 10 ), start=1 ):
        if row_number <= upper_bound:
            return f'pomodoro{bucket}'
    return 'pomodoro10'


def _difficulty_level( difficulty ):
    """Map difficulty to 'Baixa' (<= 1), 'Média' (1 < d <= 3) or 'Alta' (> 3); 'NA' when no comparison holds."""
    if difficulty <= 1:
        return 'Baixa'
    if difficulty <= 3:
        return 'Média'
    if difficulty > 3:
        return 'Alta'
    return 'NA'


# concentration level, bucketed from the `row` column
df2_users['concentracao'] = df2_users['row'].apply( _concentration_bucket )

# recurring letters: flag whether the right answer is one of the letters below
letras_recorrentes = ['A','E']
df2_users['letras_recorrentes'] = df2_users['right_answer'].apply(
    lambda answer: 'recorrente' if answer in letras_recorrentes else 'nao recorrente' )

# difficulty level: low / medium / high bands of `difficulty`
df2_users['nivel_dificuldade'] = df2_users['difficulty'].apply( _difficulty_level )

In [19]:
# Make the subject key scholarity-specific: "<subject_id> - <scholarity_id>".
# Built column-wise with str() applied per element; the row-wise
# DataFrame.apply( axis=1 ) it replaces materialises one Series per row,
# which is needlessly slow on a frame of this size.
df2_users['subject_id'] = df2_users['subject_id'].map( str ) + ' - ' + df2_users['scholarity_id'].map( str )

# 3.0 DATA FILTERING

# 4.0 EDA

## 4.1 Análise Bivariada

# 5.0 DATA PREPARATION

In [20]:
# Independent (deep) copy for the data-preparation stage; df2_users keeps the
# unscaled, unencoded features.
df5_users = df2_users.copy( deep=True )

In [21]:
# Rescaling
# NOTE(review): every scaler is fitted on the complete frame, i.e. before the
# train/validation split done in section 7.0 -- confirm that is intended.
mms = MinMaxScaler()
rs = RobustScaler()


def _fit_scale_and_save( scaler, column ):
    """Fit `scaler` on df5_users[column], overwrite the column and pickle the scaler.

    The fitted scaler is written to 'encoders/<column>_scaler'. The file is
    opened in a `with` block so the handle is closed as soon as the dump ends.
    The same scaler object is refitted on every call, so each pickle holds the
    fit of the column it is named after.
    """
    df5_users[column] = scaler.fit_transform( df5_users[[column]].values )
    with open( f'encoders/{column}_scaler', 'wb' ) as scaler_file:
        pickle.dump( scaler, scaler_file )


# id-like columns -> RobustScaler
for column in ['discipline_id', 'examining_board_id', 'institute_id', 'knowledge_area_id']:
    _fit_scale_and_save( rs, column )

# remaining numeric columns -> MinMaxScaler
# (month, day, week_of_year, hour, minute and second are not rescaled here)
for column in ['PIB', 'TRABPUB', 'year', 'day_of_week', 'publication_year']:
    _fit_scale_and_save( mms, column )

In [22]:
# target encoder
# NOTE(review): each encoder is fitted against the target 'acertou' on the
# complete frame, before the train/validation split of section 7.0, so the
# validation rows contribute to their own encodings -- confirm this is acceptable.
te = TargetEncoder()


def _target_encode_and_save( column ):
    """Refit `te` on df5_users[column] against 'acertou', overwrite the column and pickle `te`.

    The fitted encoder is written to 'encoders/<column>_encoding'. The file is
    opened in a `with` block so the handle is closed as soon as the dump ends.
    """
    df5_users[column] = te.fit_transform( df5_users[column], df5_users['acertou'] )
    with open( f'encoders/{column}_encoding', 'wb' ) as encoder_file:
        pickle.dump( te, encoder_file )


# country, device, device_type, region, gpcollegetype, gpdegreecourse,
# gppreviousexperience, gpschooltype, gpsegment and gpsource_project are not
# encoded here (their encoder lines were disabled in the original cell).
for column in ['city', 'os', 'gpcarrers', 'subject_id', 'platform', 'right_answer',
               'letras_recorrentes', 'concentracao', 'nivel_dificuldade']:
    _target_encode_and_save( column )

# created_at is encoded as a string category.
# NOTE(review): if the timestamps are close to unique, this encoding is built
# from very few rows per category -- verify it does not simply mirror the target.
df5_users['created_at'] = df5_users['created_at'].astype(str)
_target_encode_and_save( 'created_at' )

# identifiers are not model features
df5_users.drop( columns=['novo_user_id','novo_question_id'], inplace=True )

# 6.0 FEATURE SELECTION

In [23]:
# Independent (deep) copy for the feature-selection / modelling stage.
df6_users = df5_users.copy( deep=True )

# # model
# forest = ExtraTreesClassifier( n_jobs=-1 )

# # training
# x_train_fselection = df6_users.drop(columns=['acertou'])
# y_train_fselection = df6_users['acertou'].values

# forest.fit( x_train_fselection, y_train_fselection )

In [24]:
# importances = forest.feature_importances_
# std = np.std( [tree.feature_importances_ for tree in forest.estimators_], axis=0 )
# indices = np.argsort( importances )[::-1]

# # print the feature ranking
# df = pd.DataFrame()

# print( 'Feature Ranking:\n' )
# for i, j in zip( x_train_fselection,forest.feature_importances_ ):
#     aux = pd.DataFrame( {'feature': i, 'importance': j}, index=[0] )
#     df = pd.concat( [df, aux], axis=0 )
    
# print( df.sort_values( 'importance', ascending=False ) ) 

# # plot the impurity-based feature importances of the forest
# plt.figure(figsize=(10,5))
# plt.title( 'Feature importances' )
# plt.bar( range( x_train_fselection.shape[1] ), importances[indices], color='r', yerr=std[indices], align='center' )
# plt.xticks( range(x_train_fselection.shape[1]), indices )
# plt.xlim( [-1, x_train_fselection.shape[1]] )
# plt.show()

# 7.0 MACHINE LEARNING

In [25]:
# Columns removed from the model matrix: the target 'acertou' plus every
# feature that is not used for training.
cols = [
    'city', 'acertou', 'row', 'region', 'device_type', 'device', 'country',
    'gpschooltype', 'gpcollegetype', 'gpdegreecourse', 'nullified', 'outdated',
    'gpsegment', 'product_id', 'modality_id', 'commented_by_professor',
    'gpsource_project', 'scholarity_id', 'gppreviousexperience', 'difficulty',
    'second', 'minute', 'gpcarrers', 'letras_recorrentes', 'month', 'day',
    'week_of_year', 'hour', 'concentracao', 'discipline_id', 'taxa_acerto',
    'taxa_acerto_knowledge',
]
X = df6_users.drop( columns=cols )
Y = df6_users['acertou'].copy()

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = ms.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

## 7.1 LightGBM

In [26]:
# model: LightGBM with fixed hyper-parameters, trained on the training split
lgbm_params = dict(
    n_estimators=420,
    max_depth=10,
    learning_rate=0.0021326894421670327,
    num_leaves=120,
    min_child_samples=33,
    subsample=0.3969072433479774,
    subsample_freq=1,
    colsample_bytree=0.9971483355092672,
    n_jobs=-1,
    random_state=42,
)
model_lgb = lgb.LGBMClassifier( **lgbm_params )
model_lgb.fit( X_train, y_train )
# model_lgb = lgb.LGBMClassifier().fit( X_train, y_train )

# prediction on the held-out validation split
yhat_lgb = model_lgb.predict( X_val )

# performance: F1 / precision / recall as a one-row frame
model_lgb_results = ml_error( 'LightGBM',  y_val, yhat_lgb )
model_lgb_results

Unnamed: 0,Model name,F1,Precision,Recall
0,LightGBM,0.79949,0.679576,0.970791


In [27]:
# Refit the same LightGBM configuration on the full dataset (X, Y) and persist it.
model_lgb = lgb.LGBMClassifier( n_estimators=420, max_depth=10, learning_rate=0.0021326894421670327, num_leaves=120, min_child_samples=33, subsample=0.3969072433479774, 
                                colsample_bytree=0.9971483355092672, n_jobs=-1, random_state=42, subsample_freq=1 ).fit( X, Y )

# `with` closes the file handle once the dump finishes; the bare
# open(...) it replaces was never closed.
with open( 'model/model_lgb.pkl', 'wb' ) as model_file:
    pickle.dump( model_lgb, model_file )

In [28]:
# cross validation: 5 shuffled folds, F1 score, failing loudly on any fold error
kfold = KFold( n_splits=5, shuffle=True, random_state=1 )
cv = cross_val_score(
    model_lgb,
    X,
    Y,
    scoring='f1',
    cv=kfold,
    n_jobs=-1,
    error_score='raise',
)
cv_mean, cv_std = np.mean( cv ), np.std( cv )
print( f'{cv_mean} +/- {cv_std}' )

0.7999300368574012 +/- 0.0004896090824439788


## 7.2 HistGradientBoostingClassifier

In [29]:
# # model
# model_hist = HistGradientBoostingClassifier().fit( X_train, y_train )

# # prediction
# yhat_hist = model_hist.predict( X_val )

# # performance
# model_hist_results = ml_error( 'Hist',  y_val, yhat_hist )
# model_hist_results

## 7.3 XGBoost

In [30]:
# # model
# model_xgb = xgb.XGBClassifier( n_jobs=-1 ).fit( X_train, y_train )

# # prediction
# yhat_xgb = model_xgb.predict( X_val )

# # performance
# model_xgb_results = ml_error( 'XGBoost',  y_val, yhat_xgb )
# model_xgb_results

## 7.4 CatBoost

In [32]:
# # model
# model_cb = CatBoostClassifier( verbose=False ).fit( X_train, y_train )

# # prediction
# yhat_cb = model_cb.predict( X_val )

# # performance
# model_cb_results = ml_error( 'CatBoost',  y_val, yhat_cb)
# model_cb_results

## 7.5 MLP

In [33]:
# # Criando a rede neural
# mlp_model = MLPClassifier().fit(X_train, y_train)

# # Fit vai fazer o ajuste dos pesos (treinamento)
# yhat_mlp = mlp_model.predict( X_val )

# # Função que faz a contagem dos acertos e dos erros
# model_mlp_results = ml_error( 'MLP',  y_val, yhat_mlp )
# model_mlp_results

In [34]:
# mlp_model = MLPClassifier().fit( X, Y )

# pickle.dump( mlp_model, open( 'model/mlp_model.pkl', 'wb' ) )

## 7.6 Extra Trees

In [35]:
# # model
# model_et = ExtraTreesClassifier().fit( X_train, y_train )

# # prediction
# yhat_et = model_et.predict( X_val )

# # performance
# model_et_results = ml_error( 'Extra Trees',  y_val, yhat_et)
# model_et_results

## 7.7 KNN

In [36]:
# # model
# model_knn = KNeighborsClassifier().fit( X_train, y_train )

# # prediction
# yhat_knn = model_knn.predict( X_val )

# # performance
# model_knn_results = ml_error( 'KNN',  y_val, yhat_knn)
# model_knn_results

## 7.8 Logistic Regression

In [37]:
# # model
# model_lr = LogisticRegression().fit( X_train, y_train )

# # prediction
# yhat_lr = model_lr.predict( X_val )

# # performance
# model_lr_results = ml_error( 'Logistic Regression',  y_val, yhat_lr)
# model_lr_results

## 7.9 Naive Bayes

In [38]:
# # model
# model_gaussian = GaussianNB().fit( X_train, y_train )

# # prediction
# yhat_gaussian = model_gaussian.predict( X_val )

# # performance
# model_gaussian_results = ml_error( 'Naive Bayes',  y_val, yhat_gaussian )
# model_gaussian_results

## 7.10 SVM

In [39]:
# # model
# model_svm = svm.SVC().fit( X_train, y_train )

# # prediction
# yhat_svm = model_svm.predict( X_val )

# # performance
# model_svm_results = ml_error( 'SVM',  y_val, yhat_svm )
# model_svm_results

## 7.11 Stacking Model

In [40]:
# estimators_list = [ ('lgbm', model_lgb), ('xgboost', model_xgb) ]

# # Build stack model
# stack_model = StackingClassifier( estimators = estimators_list, final_estimator=LogisticRegression(), n_jobs=-1, verbose=True ).fit( X_train, y_train )
# pickle.dump( stack_model, open( 'model/model_stack.pkl', 'wb' ) )

# # prediction
# yhat_stack = stack_model.predict( X_val )

# # performance
# stack_model_results = ml_error( 'Stacking Model', y_val, yhat_stack )
# stack_model_results

# 8.0 Bayesian Optimization

## 8.1 LGBM

In [41]:
# search_space = [(100, 1500), #name = 'n_estimators'), 
#                 (1, 20), #name = 'max_depth'), 
#                 (0.001, 0.1, 'log-uniform'),#, name = 'learning_rate'),
#                 (2, 128), #name = 'num_leaves'),
#                 (1, 100), #name = 'min_child_samples'),
#                 (0.05, 1.0), #name = 'subsample'),
#                 (0.15, 1.0) #name = 'colsample_bytree')]
# ]

# def treinar_modelo( params ):
#     n_estimators = params[0]
#     max_depth = params[1]
#     learning_rate = params[2]
#     num_leaves = params[3]
#     min_child_samples = params[4]
#     subsample = params[5]
#     colsample_bytree = params[6]

#     print(params, '\n')

#     lgbm_model = lgb.LGBMClassifier(learning_rate = learning_rate, num_leaves=num_leaves, n_estimators=n_estimators, max_depth=max_depth, min_child_samples=min_child_samples, subsample=subsample, colsample_bytree=colsample_bytree, n_jobs=-1, random_state=42, subsample_freq=1)
#     lgbm_model.fit( X_train, y_train )

#     yhat_lgb = lgbm_model.predict( X_val )

#     return -m.f1_score( y_val, yhat_lgb )

# result = gp_minimize( treinar_modelo, search_space, n_calls = 200, n_initial_points = 10, verbose=True, n_jobs=-1, random_state= 42 )