In [45]:
import functools
from math import sqrt

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import scipy.stats as sct
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_blobs, make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

import warnings

In [46]:

# Algumas configurações para o matplotlib.
%matplotlib inline

from IPython.core.pylabtools import figsize


figsize(12, 12)

sns.set()

pd.options.display.max_columns = 1000
warnings.filterwarnings('ignore')

In [47]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [48]:
df_train = df_train[['NU_INSCRICAO', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO',
       'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO', 'IN_TREINEIRO', 'TP_DEPENDENCIA_ADM_ESC',
       'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_SURDEZ', 'IN_DISLEXIA',
       'IN_DISCALCULIA', 'IN_SABATISTA', 'IN_GESTANTE', 'IN_IDOSO',
       'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC', 'TP_PRESENCA_MT',
       'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'TP_LINGUA',
       'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
       'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q001', 'Q002',
       'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047']]

In [49]:
df_train.drop(['NU_INSCRICAO'], axis=1, inplace=True)
df_test.drop(['NU_INSCRICAO'], axis=1, inplace=True)

In [50]:
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [51]:
df_train.head()

Unnamed: 0,CO_UF_RESIDENCIA,NU_IDADE,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,IN_TREINEIRO,TP_DEPENDENCIA_ADM_ESC,IN_BAIXA_VISAO,IN_CEGUEIRA,IN_SURDEZ,IN_DISLEXIA,IN_DISCALCULIA,IN_SABATISTA,IN_GESTANTE,IN_IDOSO,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_PRESENCA_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,TP_LINGUA,TP_STATUS_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,SG_UF_RESIDENCIA_AC,SG_UF_RESIDENCIA_AL,SG_UF_RESIDENCIA_AM,SG_UF_RESIDENCIA_AP,SG_UF_RESIDENCIA_BA,SG_UF_RESIDENCIA_CE,SG_UF_RESIDENCIA_DF,SG_UF_RESIDENCIA_ES,SG_UF_RESIDENCIA_GO,SG_UF_RESIDENCIA_MA,SG_UF_RESIDENCIA_MG,SG_UF_RESIDENCIA_MS,SG_UF_RESIDENCIA_MT,SG_UF_RESIDENCIA_PA,SG_UF_RESIDENCIA_PB,SG_UF_RESIDENCIA_PE,SG_UF_RESIDENCIA_PI,SG_UF_RESIDENCIA_PR,SG_UF_RESIDENCIA_RJ,SG_UF_RESIDENCIA_RN,SG_UF_RESIDENCIA_RO,SG_UF_RESIDENCIA_RR,SG_UF_RESIDENCIA_RS,SG_UF_RESIDENCIA_SC,SG_UF_RESIDENCIA_SE,SG_UF_RESIDENCIA_SP,SG_UF_RESIDENCIA_TO,TP_SEXO_F,TP_SEXO_M,Q001_A,Q001_B,Q001_C,Q001_D,Q001_E,Q001_F,Q001_G,Q001_H,Q002_A,Q002_B,Q002_C,Q002_D,Q002_E,Q002_F,Q002_G,Q002_H,Q006_A,Q006_B,Q006_C,Q006_D,Q006_E,Q006_F,Q006_G,Q006_H,Q006_I,Q006_J,Q006_K,Q006_L,Q006_M,Q006_N,Q006_O,Q006_P,Q006_Q,Q024_A,Q024_B,Q024_C,Q024_D,Q024_E,Q025_A,Q025_B,Q026_A,Q026_B,Q026_C,Q027_A,Q027_B,Q027_C,Q027_D,Q027_E,Q027_F,Q027_G,Q027_H,Q027_I,Q027_J,Q027_K,Q027_L,Q027_M,Q047_A,Q047_B,Q047_C,Q047_D,Q047_E
0,43,24,1,1,1,4,1,,0,,0,0,0,0,0,0,0,0,1,1,1,1,436.3,495.4,581.2,1,1.0,120.0,120.0,120.0,80.0,80.0,520.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1,23,17,3,1,2,0,2,1.0,0,2.0,0,0,0,0,0,0,0,0,1,1,1,1,474.5,544.1,599.0,1,1.0,140.0,120.0,120.0,120.0,80.0,580.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,23,21,3,1,3,0,1,,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,,1,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,33,25,0,1,1,9,1,,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
4,13,28,2,1,1,4,1,,0,,0,0,0,0,0,0,0,0,0,0,0,0,,,,1,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0


In [52]:
df_train['TP_DEPENDENCIA_ADM_ESC'].fillna(-100, inplace=True)
df_train['TP_ENSINO'].fillna(-100, inplace=True)
df_train['TP_STATUS_REDACAO'].fillna(-100, inplace=True)
df_train['NU_NOTA_COMP2'].fillna(df_train['NU_NOTA_COMP2'].kurtosis(), inplace=True)
df_train['NU_NOTA_LC'].fillna(df_train['NU_NOTA_LC'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP1'].fillna(df_train['NU_NOTA_COMP1'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP3'].fillna(df_train['NU_NOTA_COMP3'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP5'].fillna(df_train['NU_NOTA_COMP5'].kurtosis(), inplace=True)
df_train['NU_NOTA_REDACAO'].fillna(df_train['NU_NOTA_REDACAO'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP4'].fillna(df_train['NU_NOTA_COMP4'].kurtosis(), inplace=True)
df_train['NU_NOTA_CH'].fillna(df_train['NU_NOTA_CH'].kurtosis(), inplace=True)
df_train['NU_NOTA_CN'].fillna(df_train['NU_NOTA_CN'].kurtosis(), inplace=True)

In [53]:
df_test['TP_DEPENDENCIA_ADM_ESC'].fillna(-100, inplace=True)
df_test['TP_ENSINO'].fillna(-100, inplace=True)
df_test['TP_STATUS_REDACAO'].fillna(-100, inplace=True)
df_test['NU_NOTA_COMP2'].fillna(df_test['NU_NOTA_COMP2'].kurtosis(), inplace=True)
df_test['NU_NOTA_LC'].fillna(df_test['NU_NOTA_LC'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP1'].fillna(df_test['NU_NOTA_COMP1'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP3'].fillna(df_test['NU_NOTA_COMP3'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP5'].fillna(df_test['NU_NOTA_COMP5'].kurtosis(), inplace=True)
df_test['NU_NOTA_REDACAO'].fillna(df_test['NU_NOTA_REDACAO'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP4'].fillna(df_test['NU_NOTA_COMP4'].kurtosis(), inplace=True)
df_test['NU_NOTA_CH'].fillna(df_test['NU_NOTA_CH'].kurtosis(), inplace=True)
df_test['NU_NOTA_CN'].fillna(df_test['NU_NOTA_CN'].kurtosis(), inplace=True)

In [54]:
features_train = df_train.drop(['IN_TREINEIRO'], axis=1)
target_train = df_train['IN_TREINEIRO']

In [55]:
features_test = df_test

In [56]:
bagging = BaggingClassifier()
bagging.fit(features_train, target_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)

In [57]:
predicted = bagging.predict(features_test)

In [58]:
acuracy = bagging.score(features_train, target_train)

In [59]:
acuracy

0.9994173343044428

In [39]:
test = pd.read_csv('test.csv')
num_inscricao = test['NU_INSCRICAO']
resposta = pd.DataFrame(columns=['NU_INSCRICAO', 'IN_TREINEIRO'])
resposta['NU_INSCRICAO'] = num_inscricao
resposta['IN_TREINEIRO'] = predicted
resposta.set_index(['NU_INSCRICAO'], inplace=True)
resposta

Unnamed: 0_level_0,IN_TREINEIRO
NU_INSCRICAO,Unnamed: 1_level_1
ba0cc30ba34e7a46764c09dfc38ed83d15828897,0
177f281c68fa032aedbd842a745da68490926cd2,0
6cf0d8b97597d7625cdedc7bdb6c0f052286c334,1
5c356d810fa57671402502cd0933e5601a2ebf1e,0
df47c07bd881c2db3f38c6048bf77c132ad0ceb3,0
3f28749fb79fb059caf5aed79625a5addfd7a91a,0
bb2a0edddf3c59181a1496390aaaee7f32624d9d,1
cc7cab347fe5455aae983f3701ca40f84dc01949,0
95e9338f1da02f7bfa0e3194130afdccc0fb5457,1
155f84f2ee5b34e658f2adcc70f2ec83e37040cb,0


In [40]:
resposta.to_csv('answer.csv')