### Imports

In [51]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, IsolationForest
from sklearn.metrics import  f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import seaborn as sns
import plotly.express as px
import sweetviz as sv
import pickle
from boruta import BorutaPy
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVR

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### Helper Functions

In [58]:
alvo='limite_adicional'

def simple_model_test(model, nome):
    #model definition

    # model fit
    model.fit(X_train, y_train)

    #model predict
    y_hat = model.predict(X_test)
    
    # evaluate
    f1 = np.round(f1_score( y_test_, y_hat_ ), 2)
    
    print('{}\n f1: {}'.format(model_name, f1))
    return f1

def send_model(model):
     # model fit
    model.fit(X_train[cols_selected], y_train)
    pickle.dump(model, open('../parameters/model.pkl', 'wb'))
    print('Model submited')
    return None

def target_encoding(df, column):
    target = df.groupby(column)[alvo].mean()
    df[column] = df[column].map(target)
    pickle.dump(target, open(f'../parameters/{column}_encode.pkl', 'wb'))
    print(f'Target Encode to {column}')
    
def frequency_encoding(df, column):
    frequency = df.groupby(column)[alvo].count() / len(df)
    df[column] = df[column].map(frequency)
    pickle.dump(target, open(f'../parameters/{column}_encode.pkl', 'wb'))
    print(f'Frequency Encode to {column}')
    

In [6]:
def data_preparation(df, teste=True):
    num_cols = df.select_dtypes('number').drop('id_cliente', axis=1).columns
    mms = pp.MinMaxScaler()

    df[num_cols] = mms.fit_transform(df[num_cols])

    return df

In [64]:
def encodes_obrigatorios(df, teste=True):
    num_cols = df.select_dtypes('number').drop('id_cliente', axis=1).columns


    map_bool = {'Sim': 1, 'Não': 0}
    map_alvo = {'Conceder': 1, 'Negar': 0}
    string_cols = ['investe_exterior', 'pessoa_polit_exp']

    df['investe_exterior'] = df['investe_exterior'].map(map_bool)
    df['pessoa_polit_exp'] = df['pessoa_polit_exp'].map(map_bool)
    
    if teste==False:
        df[alvo] = df[alvo].map(map_alvo)
    return df

### Load Data 

In [67]:
path = '../data/train.csv'

df_raw = pd.read_csv(path)
df_raw=encodes_obrigatorios(df_raw, teste=False)

df_test = pd.read_csv('../data/test.csv')
df_test=encodes_obrigatorios(df_test, teste=True)

### Data Description

In [68]:
df1 = df_raw.copy()

map_alvo = {'Conceder': 1, 'Negar': 0}
df1['limite_adicional'] = df1['limite_adicional'].map(map_alvo)

#### Nulls

In [69]:
df1.isna().sum()

id_cliente                    0
idade                         0
saldo_atual                   0
divida_atual                  0
renda_anual                   0
valor_em_investimentos        0
taxa_utilizacao_credito       0
num_emprestimos               0
num_contas_bancarias          0
num_cartoes_credito           0
dias_atraso_dt_venc           0
num_pgtos_atrasados           0
num_consultas_credito         0
taxa_juros                    0
investe_exterior              0
pessoa_polit_exp              0
limite_adicional           9500
dtype: int64

#### Dtypes

In [70]:
df1.dtypes

id_cliente                   int64
idade                        int64
saldo_atual                float64
divida_atual               float64
renda_anual                float64
valor_em_investimentos     float64
taxa_utilizacao_credito    float64
num_emprestimos              int64
num_contas_bancarias         int64
num_cartoes_credito          int64
dias_atraso_dt_venc          int64
num_pgtos_atrasados          int64
num_consultas_credito        int64
taxa_juros                   int64
investe_exterior             int64
pessoa_polit_exp             int64
limite_adicional           float64
dtype: object

### Outliers inspection

In [49]:
dashboard=sv.analyze(df1, target_feat='limite_adicional')

                                             |          | [  0%]   00:00 -> (? left)

In [75]:
dashboard.show_html()

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


### Data Filtering

In [77]:
df2 = df1.copy()

### Feature Engineering

In [78]:
df3 = df2.copy()

### Colum Selection

In [79]:
df4 = df3.copy()

### Data Preparation

In [84]:
df5 = df4.copy()

In [85]:
df5.columns

Index(['id_cliente', 'idade', 'saldo_atual', 'divida_atual', 'renda_anual',
       'valor_em_investimentos', 'taxa_utilizacao_credito', 'num_emprestimos',
       'num_contas_bancarias', 'num_cartoes_credito', 'dias_atraso_dt_venc',
       'num_pgtos_atrasados', 'num_consultas_credito', 'taxa_juros',
       'investe_exterior', 'pessoa_polit_exp', 'limite_adicional'],
      dtype='object')

In [86]:
rs_idade=pp.RobustScaler(quantile_range=(0, 95.0))
variables=['idade','taxa_juros','num_consultas_credito',"num_pgtos_atrasados","num_cartoes_credito","num_contas_bancarias",]
df5[variables]=rs_idade.fit_transform(df5[variables])
df5.head()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,-0.4,278.17,2577.05,24196.9,104.31,31.04,6,-0.1,0.11,21,0.0,0.42,0.03,0,0,
1,11920,0.05,268.87,2465.39,19227.38,69.86,36.92,5,0.2,-0.11,40,0.39,0.5,0.13,0,0,
2,8910,-0.05,446.64,1055.29,42822.28,134.2,34.56,0,-0.3,0.0,26,-0.04,-0.08,0.03,1,0,
3,4964,0.48,321.14,703.05,51786.83,297.35,31.49,0,-0.3,0.11,12,-0.3,-0.17,-0.42,1,0,
4,10100,-0.07,428.72,891.29,44626.85,134.2,28.03,2,0.2,0.11,24,-0.17,0.33,0.19,1,0,


In [None]:
variables_min=['dias_atraso_dt_venc']

In [16]:
num_cols = df5.select_dtypes('number').drop('id_cliente', axis=1).columns
mms = pp.MinMaxScaler()

df5[num_cols] = mms.fit_transform(df5[num_cols])


map_bool = {'Sim': 1, 'Não': 0}
map_alvo = {'Conceder': 1, 'Negar': 0}
string_cols = ['investe_exterior', 'pessoa_polit_exp']

df5['investe_exterior'] = df5['investe_exterior'].map(map_bool)
df5['pessoa_polit_exp'] = df5['pessoa_polit_exp'].map(map_bool)
df5[alvo] = df5[alvo].map(map_alvo)

### Model Train

In [17]:
X = df5.drop(['id_cliente', alvo], axis=1)
y = df5[alvo]

In [18]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)


100%|██████████| 29/29 [00:13<00:00,  2.15it/s]


In [76]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.68,0.75,0.75,0.72,0.04
GaussianNB,0.72,0.74,0.74,0.75,0.03
QuadraticDiscriminantAnalysis,0.73,0.73,0.73,0.76,0.03
LGBMClassifier,0.87,0.72,0.72,0.86,0.31
RandomForestClassifier,0.88,0.7,0.7,0.87,0.93
XGBClassifier,0.87,0.7,0.7,0.86,1.22
BaggingClassifier,0.87,0.7,0.7,0.86,0.31
DecisionTreeClassifier,0.83,0.7,0.7,0.84,0.09
AdaBoostClassifier,0.87,0.69,0.69,0.86,0.48
Perceptron,0.79,0.65,0.65,0.8,0.03


In [19]:
rf = RandomForestClassifier()


In [20]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [21]:
pipe = clf.models['RandomForestClassifier']

pred = pipe.predict(X_test)

f1_score(y_test, pred)

0.5389507154213037

In [22]:
pred = pipe.predict(X_test)
f1_score(y_test, pred)

0.5389507154213037

In [23]:
types_f1 = ['micro', 'macro', 'samples', 'weighted', 'binary']

In [24]:
for t in types_f1:
    try:
        print(t,f1_score(y_test, pred, average=t))
    except ValueError:
        print(f'Not possible {t}')

micro 0.8778947368421053
macro 0.7342897231559322
Not possible samples
weighted 0.8673669924252495
binary 0.5389507154213037


### Teste

In [25]:
df_test = pd.read_csv('../data/test.csv')
df_test = data_preparation(df_test, teste=True)

In [26]:
X_submission = df_test.drop('id_cliente', axis =1)
ids = df_test['id_cliente']

In [27]:
pred = pipe.predict(X_submission)

In [28]:
submission = pd.DataFrame()

In [29]:
submission['id_cliente'] = ids
submission['limite_adicional'] = pred

In [30]:
map_resposta = {0: 'Negar', 1: 'Conceder'}

In [31]:
submission['limite_adicional'] = submission['limite_adicional'].map(map_resposta)

In [32]:
submission.to_csv('../data/submissao.csv', index=False)

In [33]:
pd.read_csv('../data/submissao.csv')

Unnamed: 0,id_cliente,limite_adicional
0,2,Conceder
1,5,Negar
2,6,Negar
3,8,Negar
4,10,Conceder
...,...,...
2995,12484,Negar
2996,12487,Negar
2997,12489,Negar
2998,12495,Conceder
