## Competição DSA 2019/06
<h3>Prever Índice de Lealdade</h3>

In [1]:
# Importar as bibliotecas necessárias para este notebook
import pandas as pd
import numpy  as np
from datetime import datetime as dt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics         import accuracy_score
from sklearn.metrics         import mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost                 import XGBRegressor
import xgboost
from xgboost import plot_importance
from collections import OrderedDict

In [2]:
# Retomando o trabalho:
df_treino  = pd.read_csv('df_treino_ETL.csv')
df_teste   = pd.read_csv('df_teste_ETL.csv')

In [3]:
# Recriar os campos de tipo data:
df_treino['date_Card_Activation'] = pd.to_datetime(df_treino['date_Card_Activation'], format = "%Y-%m-%d")
df_treino['oldest_Date'] = pd.to_datetime(df_treino['oldest_Date'], format = "%Y-%m-%d")
df_treino['latest_Date'] = pd.to_datetime(df_treino['latest_Date'], format = "%Y-%m-%d")
df_teste['date_Card_Activation'] = pd.to_datetime(df_teste['date_Card_Activation'], format = "%Y-%m-%d")
df_teste['oldest_Date'] = pd.to_datetime(df_teste['oldest_Date'], format = "%Y-%m-%d")
df_teste['latest_Date'] = pd.to_datetime(df_teste['latest_Date'], format = "%Y-%m-%d")

In [4]:
# Criar colunas com resultado de cálculos com data de ativação
df_treino['oldest_Activation'] = (df_treino['oldest_Date'] - df_treino['date_Card_Activation']).dt.days
df_treino['latest_Activation'] = (df_treino['latest_Date'] - df_treino['date_Card_Activation']).dt.days
df_teste['oldest_Activation'] = (df_teste['oldest_Date'] - df_teste['date_Card_Activation']).dt.days
df_teste['latest_Activation'] = (df_teste['latest_Date'] - df_teste['date_Card_Activation']).dt.days

In [5]:
# Excluir as colunas de data:
df_treino.drop(['date_Card_Activation', 'oldest_Date', 'latest_Date'], axis=1, inplace = True)
df_teste.drop(['date_Card_Activation', 'oldest_Date', 'latest_Date'], axis=1, inplace = True)

In [6]:
# Verificar a presença de valores NA:
print(df_treino.isna().sum())
print(df_teste.isna().sum())

card_id              0
target               0
feature_1_1          0
feature_1_2          0
feature_1_3          0
feature_1_4          0
feature_1_5          0
feature_2_1          0
feature_2_2          0
feature_2_3          0
feature_3_0          0
feature_3_1          0
denied_purchase?     0
min_installments     0
max_installments     0
min_month_lag        0
max_month_lag        0
oldest_Activation    0
latest_Activation    0
dtype: int64
card_id              0
feature_1_1          0
feature_1_2          0
feature_1_3          0
feature_1_4          0
feature_1_5          0
feature_2_1          0
feature_2_2          0
feature_2_3          0
feature_3_0          0
feature_3_1          0
denied_purchase?     0
min_installments     0
max_installments     0
min_month_lag        0
max_month_lag        0
oldest_Activation    0
latest_Activation    0
dtype: int64


In [7]:
# características dos dados de treino:
df_treino.describe()

Unnamed: 0,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1,denied_purchase?,min_installments,max_installments,min_month_lag,max_month_lag,oldest_Activation,latest_Activation
count,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0,201917.0
mean,-0.393636,0.059614,0.276336,0.364372,0.098481,0.201197,0.441974,0.370642,0.187384,0.434431,0.565569,1.0,-0.036609,4.401734,-7.877777,1.591015,118.748159,412.984192
std,3.8505,0.23677,0.447186,0.481255,0.297965,0.400896,0.496623,0.482978,0.39022,0.495683,0.495683,0.0,0.719849,23.057988,3.843313,0.778242,224.135979,284.760641
min,-33.219281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,-13.0,-11.0,-331.0,1.0
25%,-0.88311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,-12.0,1.0,10.0,210.0
50%,-0.023437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,-8.0,2.0,22.0,327.0
75%,0.765453,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,6.0,-4.0,2.0,123.0,515.0
max,17.965068,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0,999.0,-1.0,2.0,1995.0,2370.0


In [8]:
df_teste.describe()

Unnamed: 0,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_3_0,feature_3_1,denied_purchase?,min_installments,max_installments,min_month_lag,max_month_lag,oldest_Activation,latest_Activation
count,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0,123623.0
mean,0.059908,0.27596,0.361737,0.099755,0.20264,0.443081,0.372042,0.184877,0.435623,0.564377,1.0,-0.039653,4.495474,-7.887812,1.588119,117.984914,412.441957
std,0.237317,0.446999,0.480505,0.299674,0.401968,0.496752,0.483352,0.388199,0.49584,0.49584,0.0,0.718909,24.847743,3.840508,0.784334,222.63425,283.659779
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,-13.0,-10.0,-329.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,-12.0,1.0,10.0,210.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,-8.0,2.0,22.0,328.0
75%,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,6.0,-4.0,2.0,123.0,515.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0,999.0,-1.0,2.0,2151.0,2353.0


In [9]:
# Incoerências detectadas:
#   denied_purchased por possuir somente valores iguais a 1
#
# Excluir colunas
df_treino.drop(['denied_purchase?'], axis=1, inplace = True)
df_teste.drop(['denied_purchase?'],  axis=1, inplace = True)

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics         import accuracy_score
from sklearn.metrics         import mean_squared_error
from sklearn.pipeline        import Pipeline
from sklearn.linear_model    import LinearRegression
import warnings
warnings.filterwarnings("ignore")

In [13]:
var_mais_importantes = ['latest_Activation', 'max_installments', 'oldest_Activation', 'max_month_lag', 
                        'min_installments', 'min_month_lag']

In [23]:
# Separando o array em componentes de input e output
X = df_treino[var_mais_importantes]
Y = df_treino['target']

# Definindo os valores para os folds
num_folds = 10
seed      = 7

# Separando os dados em folds
kfold = KFold(num_folds, True, random_state = seed)

# Criando o modelo
modelo    = LinearRegression()
resultado = cross_val_score(modelo, X, Y, cv = kfold)

# Usamos a média e o desvio padrão
print("Acurácia Final: %.3f%%" % (resultado.mean() * 100.0))

# Salvar arquivo:
filename = 'reg_comFS.csv'
pd.DataFrame({'card_id': df_teste.card_id, 'target': test_dataset.Prediction}).to_csv(filename, index=False)

Acurácia Final: 0.256%


In [31]:
# Separando o array em componentes de input e output
X = df_treino.iloc[:, 2:19]
Y = df_treino['target']

# Definindo os valores para os folds
num_folds = 10
seed      = 7

# Separando os dados em folds
kfold = KFold(num_folds, True, random_state = seed)

# Criando o modelo
modelo    = LinearRegression()
resultado = cross_val_score(modelo, X, Y, cv = kfold)

# Usamos a média e o desvio padrão
print("Acurácia Final: %.3f%%" % (resultado.mean() * 100.0))

Acurácia Final: 0.284%


In [40]:
from sklearn.decomposition import PCA
from sklearn.pipeline      import Pipeline
from sklearn.linear_model  import LinearRegression

In [41]:
pca = PCA(n_components = 8)

In [42]:
# aplicar o objeto PCA aso dataFrames
novo_treino = pca.fit_transform(df_treino.drop(['card_id', 'target'], axis = 1))
novo_teste  = pca.fit_transform(df_teste.drop('card_id', axis  = 1))

In [43]:
# Criar Pandas Data Frame com o resultado anterior
features_treino = pd.DataFrame(novo_treino)
features_teste  = pd.DataFrame(novo_teste)

In [44]:
# Criar modelo de regressão linear
regre_lin = LinearRegression()

In [45]:
# Usar o pipeline para encadear PCA e Regressão Linear, onde o resultado do PCA é entrada para
# regressão linear
pipe = Pipeline([('pca', pca), ('linear', regre_lin)])
pipe.fit(features_treino, df_treino['target'])

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('linear', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [46]:
# Fazer previsões no modelo treinado
predictions = pipe.predict(features_teste)