### Imports

In [96]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import  f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import seaborn as sns
import plotly.express as px
import sweetviz as sv
import pickle
from boruta import BorutaPy
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVR

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### Helper Functions

In [9]:
def simple_model_test(model, nome):
    #model definition

    # model fit
    model.fit(X_train, y_train)

    #model predict
    y_hat = model.predict(X_test)
    
    # evaluate
    f1 = np.round(f1_score( y_test_, y_hat_ ), 2)
    
    print('{}\n f1: {}'.format(model_name, f1))
    return f1

def send_model(model):
     # model fit
    model.fit(X_train[cols_selected], y_train)
    pickle.dump(model, open('../parameters/model.pkl', 'wb'))
    print('Model submited')
    return None

def target_encoding(df, column):
    target = df.groupby(column)[alvo].mean()
    df[column] = df[column].map(target)
    pickle.dump(target, open(f'../parameters/{column}_encode.pkl', 'wb'))
    print(f'Target Encode to {column}')
    
def frequency_encoding(df, column):
    frequency = df.groupby(column)[alvo].count() / len(df)
    df[column] = df[column].map(frequency)
    pickle.dump(target, open(f'../parameters/{column}_encode.pkl', 'wb'))
    print(f'Frequency Encode to {column}')

### Load Data 

In [92]:
path = '../data/train.csv'

df_raw = pd.read_csv(path)
test = pd.read_csv('../data/test.csv')

### Data Description

In [11]:
df1 = df_raw.copy()

In [12]:
df1.shape

(9500, 17)

In [34]:
df1.sample().T

Unnamed: 0,8161
id_cliente,4750
idade,24
saldo_atual,283.093834
divida_atual,2918.12
renda_anual,19834.94968
valor_em_investimentos,70.811855
taxa_utilizacao_credito,36.54257
num_emprestimos,6
num_contas_bancarias,6
num_cartoes_credito,7


In [35]:
# set target
alvo = 'limite_adicional'

#### Nulls

In [36]:
df1.isna().sum()

id_cliente                 0
idade                      0
saldo_atual                0
divida_atual               0
renda_anual                0
valor_em_investimentos     0
taxa_utilizacao_credito    0
num_emprestimos            0
num_contas_bancarias       0
num_cartoes_credito        0
dias_atraso_dt_venc        0
num_pgtos_atrasados        0
num_consultas_credito      0
taxa_juros                 0
investe_exterior           0
pessoa_polit_exp           0
limite_adicional           0
dtype: int64

#### Dtypes

In [37]:
df1.dtypes

id_cliente                   int64
idade                        int64
saldo_atual                float64
divida_atual               float64
renda_anual                float64
valor_em_investimentos     float64
taxa_utilizacao_credito    float64
num_emprestimos              int64
num_contas_bancarias         int64
num_cartoes_credito          int64
dias_atraso_dt_venc          int64
num_pgtos_atrasados          int64
num_consultas_credito        int64
taxa_juros                   int64
investe_exterior            object
pessoa_polit_exp            object
limite_adicional            object
dtype: object

### Data Filtering

In [38]:
df2 = df1.copy()

### Feature Engineering

In [39]:
df3 = df2.copy()

### Colum Selection

In [40]:
df4 = df3.copy()

### Data Preparation

In [86]:
df5 = df4.copy()

In [87]:
num_columns = df5.select_dtypes('number').drop('id_cliente', axis=1).columns

In [88]:
mms = pp.MinMaxScaler()

df5[num_columns] = mms.fit_transform(df5[num_columns])

bool_map = {'Sim': 1, 'Não': 0}
target_map = {'Negar': 0, 'Conceder': 1}

df5['investe_exterior'] = df5['investe_exterior'].map(bool_map)
df5['pessoa_polit_exp'] = df5['pessoa_polit_exp'].map(bool_map)
df5[alvo] = df5[alvo].map(target_map)

In [95]:
df5['limite_adicional'].value_counts()

0    7995
1    1505
Name: limite_adicional, dtype: int64

### Model Train

In [102]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

NameError: name 'X' is not defined

In [100]:
knn = KNeighborsClassifier()
lgbm = LGBMClassifier()
rf = RandomForestClassifier()
lg = LogisticRegression()
xgb = XGBClassifier()
et = ExtraTreesClassifier()

models = [knn, lgbm, rf, lg, xgb, et]

In [101]:
for model in models:
    simple_model_test(model, 'model')

NameError: name 'X_train' is not defined

In [None]:
X = df5.drop(alvo, axis =1)
y = df5[alvo]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [None]:
lgbm = LGBMRegressor