### Imports

In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import sweetviz as sv
from boruta import BorutaPy
from lazypredict.Supervised import LazyClassifier
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn import preprocessing as pp
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.svm import SVR
from xgboost import XGBClassifier

# Let pandas render up to 100 rows and 100 columns before truncating output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


### Helper Functions

In [None]:
def simple_model_test(model, nome):
    """Fit ``model`` on the training split and report its F1 on the test split.

    Relies on the notebook globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` being defined before the call.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    nome : label printed next to the score.

    Returns
    -------
    The F1 score (default ``f1_score`` settings) rounded to two decimals.
    """
    # Train on the training half of the split.
    model.fit(X_train, y_train)

    # Predict on the held-out half.
    y_hat = model.predict(X_test)

    # Score the held-out predictions against the held-out labels.
    f1 = np.round(f1_score(y_test, y_hat), 2)

    print('{}\n f1: {}'.format(nome, f1))
    return f1

def send_model(model):
    """Fit ``model`` on the selected training columns and pickle it to disk.

    Relies on the notebook globals ``X_train``, ``y_train`` and
    ``cols_selected``; the latter is not defined in the visible cells and must
    exist before this is called.

    The fitted model is written to ``../parameters/model.pkl``.

    Returns
    -------
    None
    """
    # Train only on the columns chosen by feature selection.
    model.fit(X_train[cols_selected], y_train)

    # Context manager guarantees the file is flushed and closed.
    with open('../parameters/model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    print('Model submited')
    return None

def target_encoding(df, column, target_col=None, output_dir='../parameters'):
    """Replace ``column`` in place with the per-category mean of the target.

    The category -> mean mapping is also pickled to
    ``{output_dir}/{column}_encode.pkl``.

    Parameters
    ----------
    df : DataFrame modified in place.
    column : name of the categorical column to encode.
    target_col : name of the target column. When ``None`` the notebook global
        ``alvo`` is used, which must be defined before the call.
    output_dir : directory that receives the pickled mapping.

    Returns
    -------
    None
    """
    if target_col is None:
        target_col = alvo

    # Mean of the target for every distinct value of ``column``.
    target = df.groupby(column)[target_col].mean()
    df[column] = df[column].map(target)

    # Context manager guarantees the file is flushed and closed.
    with open(f'{output_dir}/{column}_encode.pkl', 'wb') as encode_file:
        pickle.dump(target, encode_file)

    print(f'Target Encode to {column}')
    
def frequency_encoding(df, column, target_col=None, output_dir='../parameters'):
    """Replace ``column`` in place with the relative frequency of each category.

    The frequency is the number of non-null target values in the category
    divided by the total number of rows. The category -> frequency mapping is
    also pickled to ``{output_dir}/{column}_encode.pkl``.

    Parameters
    ----------
    df : DataFrame modified in place.
    column : name of the categorical column to encode.
    target_col : name of the column that is counted per category. When ``None``
        the notebook global ``alvo`` is used, which must be defined before the
        call.
    output_dir : directory that receives the pickled mapping.

    Returns
    -------
    None
    """
    if target_col is None:
        target_col = alvo

    frequency = df.groupby(column)[target_col].count() / len(df)
    df[column] = df[column].map(frequency)

    # Persist the frequency mapping itself; the context manager closes the file.
    with open(f'{output_dir}/{column}_encode.pkl', 'wb') as encode_file:
        pickle.dump(frequency, encode_file)

    print(f'Frequency Encode to {column}')
    

In [53]:
def data_preparation(df, teste=True, scaler=None):
    """Return a scaled and encoded copy of ``df`` ready for modelling.

    Steps:
      1. Min-max scale every numeric column except ``id_cliente``.
      2. Map the 'Sim'/'Não' columns ``investe_exterior`` and
         ``pessoa_polit_exp`` to 1/0.
      3. When ``teste`` is falsy, map the target column (notebook global
         ``alvo``) from 'Conceder'/'Negar' to 1/0.

    Parameters
    ----------
    df : input DataFrame; it is not modified.
    teste : ``True`` when ``df`` has no target column (test/submission data).
    scaler : optional already-fitted scaler. When given, it is applied with
        ``transform`` so the data is scaled with the statistics it was fitted
        on. When ``None`` a new ``MinMaxScaler`` is fitted on ``df`` itself.

    Returns
    -------
    The prepared DataFrame.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    num_cols = df.select_dtypes('number').drop('id_cliente', axis=1).columns
    if scaler is None:
        scaler = pp.MinMaxScaler()
        df[num_cols] = scaler.fit_transform(df[num_cols])
    else:
        df[num_cols] = scaler.transform(df[num_cols])

    map_bool = {'Sim': 1, 'Não': 0}
    map_alvo = {'Conceder': 1, 'Negar': 0}

    for bool_col in ('investe_exterior', 'pessoa_polit_exp'):
        df[bool_col] = df[bool_col].map(map_bool)

    # Only labelled data carries the target column.
    if not teste:
        df[alvo] = df[alvo].map(map_alvo)
    return df
    

### Load Data 

In [51]:
# Location of the training CSV.
path = '../data/train.csv'

# Read the training and test CSV files from ../data.
df_raw = pd.read_csv(path)
df_test = pd.read_csv('../data/test.csv')

### Data Description

In [4]:
# Work on a copy so df_raw stays as loaded from disk.
df1 = df_raw.copy()

In [6]:
# Name of the target column; the helper functions and the preparation cells
# read this global.
alvo = 'limite_adicional'

#### Nulls

In [None]:
# Number of missing values per column.
df1.isna().sum()

#### Dtypes

In [None]:
# Data type of each column.
df1.dtypes

### Data Filtering

In [7]:
# Checkpoint copy for the filtering stage; this cell applies no filter.
df2 = df1.copy()

### Feature Engineering

In [8]:
# Checkpoint copy for the feature-engineering stage; this cell adds no feature.
df3 = df2.copy()

### Column Selection

In [9]:
# Checkpoint copy for the column-selection stage; this cell drops no column.
df4 = df3.copy()

### Data Preparation

In [11]:
# Checkpoint copy that the preparation cell below scales and encodes.
df5 = df4.copy()

In [18]:
# Rebuild df5 from df4 so this cell is idempotent: re-running it on an already
# prepared df5 would map the encoded 0/1 values to NaN.
df5 = df4.copy()

# Min-max scale every numeric column except the id.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split below, so the held-out rows influence the scaling.
num_cols = df5.select_dtypes('number').drop('id_cliente', axis=1).columns
mms = pp.MinMaxScaler()
df5[num_cols] = mms.fit_transform(df5[num_cols])

# Encodings for the yes/no columns and for the target labels.
map_bool = {'Sim': 1, 'Não': 0}
map_alvo = {'Conceder': 1, 'Negar': 0}
string_cols = ['investe_exterior', 'pessoa_polit_exp']

df5[string_cols] = df5[string_cols].apply(lambda series: series.map(map_bool))
df5[alvo] = df5[alvo].map(map_alvo)

### Model Train

In [21]:
# Target vector, then the feature matrix without the id and target columns.
y = df5[alvo]
X = df5.drop(columns=['id_cliente', alvo])

In [22]:
# Hold out half of the rows, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.5, random_state=123
)

# Fit the LazyClassifier model suite on the training half and score every
# model on the held-out half.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:11<00:00,  2.56it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.68,0.75,0.75,0.72,0.03
GaussianNB,0.72,0.74,0.74,0.75,0.02
QuadraticDiscriminantAnalysis,0.73,0.73,0.73,0.76,0.04
LGBMClassifier,0.87,0.72,0.72,0.86,0.24
RandomForestClassifier,0.88,0.7,0.7,0.87,0.82
XGBClassifier,0.87,0.7,0.7,0.86,0.53
BaggingClassifier,0.87,0.7,0.7,0.86,0.27
DecisionTreeClassifier,0.83,0.7,0.7,0.84,0.08
AdaBoostClassifier,0.87,0.69,0.69,0.86,0.43
Perceptron,0.79,0.65,0.65,0.8,0.03


In [27]:
# Seed the forest (same seed as the train/test split) so a re-run of the
# notebook produces the same model.
rf = RandomForestClassifier(random_state=123)


In [28]:
# Fit the standalone forest on the training half. The evaluation cells below
# use the LazyClassifier pipeline (`pipe`), not `rf`.
rf.fit(X_train, y_train)

In [36]:
# Take the random-forest entry that LazyClassifier stored in clf.models.
pipe = clf.models['RandomForestClassifier']

# Predict on the held-out half.
pred = pipe.predict(X_test)

# F1 with f1_score's default settings.
f1_score(y_test, pred)

In [76]:
# Same prediction and scoring as the previous cell, run again.
pred = pipe.predict(X_test)
f1_score(y_test, pred)

0.5389507154213037

In [77]:
# Values for f1_score's `average` argument that the next cell iterates over.
types_f1 = ['micro', 'macro', 'samples', 'weighted', 'binary']

In [80]:
# Print the F1 score under each averaging mode; a mode that f1_score rejects
# with ValueError is reported instead of aborting the loop.
for avg_mode in types_f1:
    try:
        mode_score = f1_score(y_test, pred, average=avg_mode)
    except ValueError:
        print(f'Not possible {avg_mode}')
    else:
        print(avg_mode, mode_score)

micro 0.8778947368421053
macro 0.7342897231559322
Not possible samples
weighted 0.8673669924252495
binary 0.5389507154213037


### Test

In [57]:
# Reload the test file from disk, then scale and encode it with
# data_preparation (teste=True: no target column to map).
# NOTE(review): called this way, data_preparation fits a new MinMaxScaler on
# the test file instead of reusing `mms`, the scaler fitted on the training
# data — confirm this is intended.
df_test = pd.read_csv('../data/test.csv')
df_test = data_preparation(df_test, teste=True)

In [62]:
# Keep the ids for the submission file; the model sees every other column.
ids = df_test['id_cliente']
X_submission = df_test.drop(columns='id_cliente')

In [64]:
# Predict on the prepared test features. This overwrites the `pred` that held
# the hold-out predictions from the evaluation cells.
pred = pipe.predict(X_submission)

In [65]:
# Empty frame that the next cells fill with the submission columns.
submission = pd.DataFrame()

In [66]:
# One row per test client: the id and the model's raw prediction.
submission['id_cliente'] = ids
submission['limite_adicional'] = pred

In [69]:
# Inverse of the target encoding: numeric class back to the original label.
map_resposta = {0: 'Negar', 1: 'Conceder'}

In [71]:
# Build the labels from the raw predictions rather than from the column
# itself, so re-running this cell does not turn translated labels into NaN.
submission['limite_adicional'] = pd.Series(pred, index=submission.index).map(map_resposta)

In [73]:
# Write the submission without the DataFrame index.
submission.to_csv('../data/submissao.csv', index=False)

In [74]:
# Read the file back to inspect what was written.
pd.read_csv('../data/submissao.csv')

Unnamed: 0,id_cliente,limite_adicional
0,2,Conceder
1,5,Negar
2,6,Negar
3,8,Negar
4,10,Conceder
...,...,...
2995,12484,Negar
2996,12487,Negar
2997,12489,Negar
2998,12495,Conceder
