In [1]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import RandomOverSampler
import warnings
# Install a global "ignore" filter for Python warnings.
# NOTE(review): this also hides warnings that may matter while developing — confirm it is intended.
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the training and test sets from the CSV files in the working directory
treino, teste = (pd.read_csv(arquivo) for arquivo in ("train.csv", "test.csv"))

# Cell output: (rows, columns) of each frame
treino.shape, teste.shape

((76020, 371), (75818, 370))

In [None]:
# Preview the first rows of the training set, then of the test set
display(treino.head(), teste.head())

In [3]:
# Keep the test IDs for the submission file, then remove the ID column
# from both frames so it is not used as a feature
id_submit = teste["ID"]
treino.drop(columns="ID", inplace=True)
teste.drop(columns="ID", inplace=True)

# Cell output: shapes after dropping the ID column
treino.shape, teste.shape

((76020, 370), (75818, 369))

In [4]:
# Separate the dependent variable (TARGET) from the independent variables
y = treino["TARGET"]
X = treino.drop(columns="TARGET")

In [5]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible
X_treino, X_valid, y_treino, y_valid = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Cell output: shapes of the four resulting pieces
X_treino.shape, X_valid.shape, y_treino.shape, y_valid.shape

((53214, 369), (22806, 369), (53214,), (22806,))

In [6]:
# Identify zero-variance (constant) features, fitting on the training split only
var_const = VarianceThreshold(threshold=0)
var_const.fit(X_treino)

# get_support() is a boolean mask of the features that are KEPT, so the
# inverted mask selects the constant columns directly (no per-column lookup)
cols_const = X_treino.columns[~var_const.get_support()].tolist()
print(len(cols_const))
print()
print(cols_const)

# Remove the constant columns from every dataset. The frames are rebound
# instead of dropped with inplace=True: X_treino / X_valid come out of
# train_test_split, and mutating such derived frames in place can raise
# SettingWithCopyWarning (which the global warnings filter would hide).
X_treino = X_treino.drop(columns=cols_const)
X_valid = X_valid.drop(columns=cols_const)
teste = teste.drop(columns=cols_const)

# Cell output: shapes after the removal
X_treino.shape, X_valid.shape, teste.shape

46

['ind_var2_0', 'ind_var2', 'ind_var18_0', 'ind_var18', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var41', 'ind_var46_0', 'ind_var46', 'num_var18_0', 'num_var18', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var18', 'saldo_var28', 'saldo_var27', 'saldo_var41', 'saldo_var46', 'delta_imp_amort_var18_1y3', 'delta_imp_reemb_var33_1y3', 'delta_num_reemb_var33_1y3', 'imp_amort_var18_hace3', 'imp_amort_var18_ult1', 'imp_amort_var34_hace3', 'imp_reemb_var13_hace3', 'imp_reemb_var33_hace3', 'imp_reemb_var33_ult1', 'imp_trasp_var17_out_hace3', 'imp_trasp_var33_out_hace3', 'num_var2_0_ult1', 'num_var2_ult1', 'num_reemb_var13_hace3', 'num_reemb_var33_hace3', 'num_reemb_var33_ult1', 'num_trasp_var17_out_hace3', 'num_trasp_var33_out_hace3', 'saldo_var2_ult1', 'saldo_medio_var13_medio_hace3', 'saldo_medio_var29_hace3']


((53214, 323), (22806, 323), (75818, 323))

In [7]:
# Identify duplicated features: after transposing, every original column is a
# row, so duplicated() flags each column whose values repeat an earlier column
X_treino_T = X_treino.T

# Compute the mask once; it is an expensive comparison over the whole
# transposed training frame and is needed for both the count and the names
mascara_dupli = X_treino_T.duplicated()
cols_dupli = X_treino_T.index[mascara_dupli.to_numpy()].tolist()

print(len(cols_dupli))
print()
print(cols_dupli)

# Remove the duplicated columns from every dataset. The frames are rebound
# instead of dropped with inplace=True so that frames derived from
# train_test_split are not mutated in place.
X_treino = X_treino.drop(columns=cols_dupli)
X_valid = X_valid.drop(columns=cols_dupli)
teste = teste.drop(columns=cols_dupli)

# Cell output: shapes after the removal
X_treino.shape, X_valid.shape, teste.shape

26

['ind_var13_medio', 'ind_var26', 'ind_var25', 'ind_var29_0', 'ind_var29', 'ind_var32', 'ind_var34', 'ind_var37', 'ind_var39', 'num_var13_medio', 'num_var26', 'num_var25', 'num_var29_0', 'num_var29', 'num_var32', 'num_var34', 'num_var37', 'num_var39', 'saldo_var29', 'delta_num_reemb_var13_1y3', 'delta_num_reemb_var17_1y3', 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3', 'saldo_medio_var13_medio_ult1']


((53214, 297), (22806, 297), (75818, 297))

In [8]:
# Class balance of the target variable in the training split
y_treino.value_counts()

0    51121
1     2093
Name: TARGET, dtype: int64

In [9]:
# Oversampling: resample the training split only (validation and test are
# left untouched); the fixed random_state keeps the resampling reproducible
sampler = RandomOverSampler(random_state=1)
X_treino_sampler, y_treino_sampler = sampler.fit_resample(X_treino, y_treino)

In [10]:
# Class balance of the target after oversampling
y_treino_sampler.value_counts()

0    51121
1    51121
Name: TARGET, dtype: int64

In [11]:
# Build the model used for evaluation
clf = xgb.XGBClassifier(
    objective="binary:logistic",
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
)

# Fit on the oversampled training data while tracking AUC on the untouched
# validation split; early stopping is configured with a patience of 10 rounds
clf.fit(
    X_treino_sampler,
    y_treino_sampler,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    early_stopping_rounds=10,
)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-auc:0.79342
[1]	validation_0-auc:0.80037
[2]	validation_0-auc:0.80401
[3]	validation_0-auc:0.80518
[4]	validation_0-auc:0.80368
[5]	validation_0-auc:0.80521
[6]	validation_0-auc:0.80664
[7]	validation_0-auc:0.80812
[8]	validation_0-auc:0.80940
[9]	validation_0-auc:0.81008
[10]	validation_0-auc:0.81112
[11]	validation_0-auc:0.81236
[12]	validation_0-auc:0.81293
[13]	validation_0-auc:0.81353
[14]	validation_0-auc:0.81442
[15]	validation_0-auc:0.81445
[16]	validation_0-auc:0.81460
[17]	validation_0-auc:0.81476
[18]	validation_0-auc:0.81539
[19]	validation_0-auc:0.81657
[20]	validation_0-auc:0.81759
[21]	validation_0-auc:0.81767
[22]	validation_0-auc:0.81779
[23]	validation_0-auc:0.81761
[24]	validation_0-auc:0.81834
[25]	validation_0-auc:0.81835
[26]	validation_0-auc:0.81880
[27]	validation_0-auc:0.81871
[28]	validation_0-auc:0.81903
[29]	validation_0-auc:0.82010
[30]	validation_0-auc:0.82028
[31]	validation_0-auc:0.82019
[32]	validation_0-auc:0.82022
[33]	validation_0-au

In [12]:
# xgb.DMatrix is the internal data structure used by XGBoost's native API,
# optimized for both memory efficiency and training speed. It can be built
# from several kinds of sources; here it wraps the pandas frames directly.
treino_xgb = xgb.DMatrix(data=X_treino_sampler, label=y_treino_sampler)
teste_xgb = xgb.DMatrix(data=teste)

# Booster configuration for the native training call.
# NOTE(review): this dict does not carry the max_depth / learning_rate used by
# the classifier fitted for evaluation, so the submitted model is not the one
# that was validated — confirm this is intended.
params = {"objective": "binary:logistic", "booster": "gbtree", "eval_metric":"auc"}

# Train for 20 boosting rounds, then score the test set
gbm = xgb.train(params, treino_xgb, num_boost_round=20)
y_pred = gbm.predict(teste_xgb)

In [13]:
# Assemble the submission table: the saved test IDs next to the model's
# predictions for the test set.
# NOTE(review): the column names are lower-case here; verify against the
# competition's sample submission whether upper-case headers are required.
submit = pd.DataFrame({"id": id_submit, "target": y_pred})
# Cell output: first rows of the submission table
submit.head()

Unnamed: 0,id,target
0,2,0.445065
1,5,0.488602
2,6,0.014676
3,7,0.141842
4,9,0.022934


In [14]:
# Write the submission file to the working directory, without the row index
submit.to_csv('submission.csv',index=False)