# Sumário
[](http://) 
1. [Coleta de Dados](#Coleta-de-Dados)  
1. [Análise e Processamento de Dados](#Análise-e-Processamento-de-Dados)  
1. [Criação de Modelos de Machine Learning](#Criação-de-Modelos-de-Machine-Learning)  
1. [Apresentação dos Resultados](#Apresentação-dos-Resultados)  



# Coleta de Dados

In [None]:
# Importando bibliotecas de código

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset from the Kaggle input directory.
dados = pd.read_csv(
    '/kaggle/input/online-shoppers-intention/online_shoppers_intention.csv'
)

# Group the column names by role so later cells can select them as a unit.
colunas_numericas = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
]
colunas_categoricas = [
    'Month',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'VisitorType',
    'Weekend',
]
variavel_resposta = ['Revenue']

# Análise e Processamento de Dados

In [None]:
# Percentage of missing values in every column.
total_linhas = dados.shape[0]
percent_nulos = dados.isnull().sum() * 100 / total_linhas

# Percentage of negative values in every column. Only the numeric columns
# are tested; the remaining columns are reported as 0.
negativos_por_coluna = (dados[colunas_numericas] < 0).sum().reindex(dados.columns, fill_value=0)
percent_negativos = negativos_por_coluna * 100 / total_linhas

pd.DataFrame({'% Valores Negativos': percent_negativos, '% Valores Nulos': percent_nulos})

In [None]:
# Handle missing and negative values in the numeric columns.
# Missing entries are replaced by the column median.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# Negative entries are turned into NaN first so the imputer replaces them too.
numericas_sem_negativos = dados[colunas_numericas].mask(dados[colunas_numericas] < 0)
numericas_imputadas = pd.DataFrame(
    imputer.fit_transform(numericas_sem_negativos),
    columns=colunas_numericas,
)

# Reassemble the frame: imputed numeric columns, categorical columns, target.
dados = pd.concat(
    [numericas_imputadas, dados[colunas_categoricas], dados[variavel_resposta]],
    axis=1,
)

# Working copy that the transformation cells below modify; `dados` stays as is.
dados_transf = dados.copy()


In [None]:
# Summary statistics for every column, one row per variable.
dados.describe(include='all').T

In [None]:
# Re-run the check on the current `dados`: percentage of missing values per column.
total_linhas = dados.shape[0]
percent_nulos = dados.isnull().sum() * 100 / total_linhas

# Percentage of negative values per column. Only the numeric columns are
# tested; the remaining columns are reported as 0.
negativos_por_coluna = (dados[colunas_numericas] < 0).sum().reindex(dados.columns, fill_value=0)
percent_negativos = negativos_por_coluna * 100 / total_linhas

pd.DataFrame({'% Valores Negativos': percent_negativos, '% Valores Nulos': percent_nulos})

In [None]:
# Outlier handling
# Inspect the distribution of every numeric column with a boxen plot.
numericas = dados[colunas_numericas]
sns.catplot(data=numericas, kind="boxen", aspect=1.2)
plt.xticks(rotation=90)
plt.title('Investigando Outliers')
# Log-scaled y axis so columns with very different ranges fit in one plot.
plt.yscale("log")

In [None]:
# Identify outliers with IsolationForest.
from sklearn.ensemble import IsolationForest

# 100 trees; the seed is fixed so the same rows are flagged on every run.
clf = IsolationForest(n_estimators=100, random_state=0)
Out = clf.fit_predict(dados_transf[colunas_numericas])

# Attach the prediction as column 'Sel' and keep the rows labelled -1
# (the outlier label) in `dp`.
dt = dados_transf[colunas_numericas + colunas_categoricas + variavel_resposta].assign(Sel=Out)
dp = dt[dt['Sel'] == -1]

In [None]:
plt.title('Outliers identificados pelo IsolationForest')
# The column is passed as `x=`: newer seaborn releases no longer interpret a
# positional Series as the x variable.
sns.countplot(x=dp['Revenue'])
# Print the row count on top of each bar.
ax = plt.gca()
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%i' % p.get_height(), ha='center', va='bottom')

In [None]:
# Logarithmic transformation
# log10(1 + x) is applied to the whole column. Applying log10 only to values
# above 1 is not order-preserving (1 stays 1 while 2 becomes 0.30, and 1 and
# 10 collide), whereas log10(1 + x) is monotonic and maps 0 to 0.
# Assumes these columns are non-negative at this point (negatives were
# replaced during imputation).
colunas_log = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'ExitRates',
    'PageValues',
    'BounceRates',
]
dados_transf[colunas_log] = np.log10(1 + dados_transf[colunas_log])

In [None]:
# Boxen plot of the numeric columns after the log transformation.
numericas_transf = dados_transf[colunas_numericas]
sns.catplot(data=numericas_transf, kind="boxen", aspect=1.2)
plt.title('Reultado Transf. Log.')
plt.xticks(rotation=90)


In [None]:
# Rescale every numeric column to the [0, 1] interval.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(dados_transf[colunas_numericas])
dados_transf[colunas_numericas] = scaler.transform(dados_transf[colunas_numericas])

# Hand-written equivalent, kept for reference: (x - min(x)) / (max(x) - min(x)) per column.

In [None]:
# Range (min / max) of each numeric column in the untransformed data.
dados.describe(include='all').T.loc[colunas_numericas, ['min', 'max']]


In [None]:
# Range (min / max) of each numeric column after the transformations.
dados_transf.describe(include='all').T.loc[colunas_numericas, ['min', 'max']]

In [None]:
# Create dummy (one-hot) variables for the categorical columns with get_dummies.
# Every categorical column is cast to str on a fresh frame. By default
# get_dummies only expands object/category columns, so an integer-coded column
# such as TrafficType would otherwise pass through as a raw, unscaled number.
# Building a new frame also avoids the `.loc[:, cols] = ...astype(str)`
# pattern, an in-place dtype-changing assignment that newer pandas warns about.
categoricas_str = dados_transf[colunas_categoricas].astype(str)

dados_transf = pd.concat(
    [dados_transf[colunas_numericas], pd.get_dummies(categoricas_str), dados_transf[variavel_resposta]],
    axis=1,
)

In [None]:
# List the columns of the transformed frame.
pd.DataFrame({'Variáveis': dados_transf.columns})

In [None]:
# First five rows of the cleaned data.
dados.head(n=5)

In [None]:
# Summary table restricted to count, unique, top, freq, min and max.
dados.describe(include='all').T[['count', 'unique', 'top', 'freq', 'min', 'max']]

In [None]:
def autolabel(ax, total=None, offset=100):
    """Write each bar's percentage share above it, rotated 90 degrees.

    Args:
        ax: Axes-like object exposing ``patches`` (the bars) and ``text``.
        total: Denominator for the percentages. When omitted, the sum of the
            bar heights on ``ax`` is used, so the labels are correct for any
            plot rather than only for plots built from one fixed data set.
        offset: Vertical gap, in data units, between a bar's top and its label.
    """
    import math

    # Skip bars whose height is NaN; they would otherwise be labelled 'nan %'.
    bars = [p for p in ax.patches if not math.isnan(p.get_height())]
    if total is None:
        total = sum(p.get_height() for p in bars)
    if not total:
        # Nothing to label (no bars, or only empty ones); avoids division by zero.
        return
    for p in bars:
        height = p.get_height()
        ax.text(p.get_x() + p.get_width()/2., height + offset, '%2.1f %%' % (100*height/total), ha='center', va='bottom', rotation=90)

In [None]:
# Class balance of the target variable. The column is given through
# `x=`/`data=`: newer seaborn releases no longer interpret a positional
# Series as the x variable.
ax = sns.countplot(x='Revenue', data=dados)
autolabel(ax)
plt.show()

# One count plot per categorical variable, split by Revenue.
# Month gets an explicit order so the bars are not sorted by first appearance.
o = ('Feb', 'Mar', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
for i in colunas_categoricas:
    plt.figure(figsize=(6,4))
    ordem = o if i == "Month" else None
    ax = sns.countplot(x=i, hue='Revenue', data=dados, order=ordem)
    autolabel(ax)
    plt.show()

In [None]:
# Histogram of every numeric variable, drawn separately for Revenue == True
# (left panel) and Revenue == False (right panel).
# Axes.hist is called directly: sns.distplot is deprecated, and with kde=False
# it only forwarded these arguments to Axes.hist and labelled the x axis.
paineis = ((True, "coral", "True"), (False, None, "False"))
for i in colunas_numericas:
    f, axes = plt.subplots(1, 2, figsize=(12,4))
    for eixo, (valor, cor, rotulo) in zip(axes, paineis):
        serie = dados.loc[dados['Revenue'] == valor, i].dropna()
        # color=None lets matplotlib pick the next colour of the axes' cycle.
        eixo.hist(serie, bins=10, color=cor, alpha=1, label=rotulo)
        eixo.set_xlabel(i)
        eixo.legend()
    plt.show()

In [None]:
# Sum of SpecialDay for every (Month, Revenue) pair.
dados.groupby(['Month', 'Revenue'])['SpecialDay'].agg('sum')

In [None]:
# Multivariate analysis
# Bar plot of the summed SpecialDay value per month, split by Revenue.
o = ('Feb', 'Mar', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
ax = sns.barplot(
    data=dados,
    x="Month",
    y="SpecialDay",
    hue="Revenue",
    order=o,
    estimator=sum,
    ci=0,
    dodge=True,
)
plt.show()

In [None]:
# Weekend x Revenue contingency table, normalised and with a "Total" margin, shown as a heatmap.
tabela_weekend = pd.crosstab(dados['Weekend'], dados['Revenue'], margins=True, margins_name="Total", normalize=True)
sns.heatmap(tabela_weekend, annot=True, fmt=".1%", cmap="Blues")

In [None]:
# Total page counts per page type, split by Revenue.
# reset_index() turns the group key into a real 'Revenue' column, so that
# x='Revenue' can be resolved by barplot; it is cast to str so the two
# outcomes are drawn as categories.
aux = (
    dados[['Administrative', 'Informational', 'ProductRelated', 'Revenue']]
    .groupby('Revenue')
    .sum()
    .reset_index()
)
aux['Revenue'] = aux['Revenue'].astype(str)

f, axes = plt.subplots(1, 3, figsize=(15,4))
for eixo, coluna in zip(axes, ['Informational', 'Administrative', 'ProductRelated']):
    # One value per bar, so no error bars are requested.
    sns.barplot(x='Revenue', y=coluna, data=aux, ci=None, ax=eixo)
plt.show()


In [None]:
# Revenue counts in one panel per Weekend value.
sns.catplot(data=dados, x="Revenue", col="Weekend", kind="count", aspect=1.2, height=4)

In [None]:
# Integer codes and unique values for Month.
# Pattern for storing both: codes, uniques = series.factorize()
dados['Month'].factorize()

In [None]:
# Pairwise regression plots of all numeric variables, coloured by Revenue.
colunas_pairplot = colunas_numericas + variavel_resposta
sns.pairplot(dados[colunas_pairplot], hue='Revenue', kind='reg', diag_kind='hist', corner=True);

In [None]:
# The same pair plot, drawn separately for three groups of related variables.
grupos_de_variaveis = (
    ['Administrative', 'Informational', 'ProductRelated'],
    ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration'],
    ['BounceRates', 'ExitRates', 'PageValues'],
)
for variaveis in grupos_de_variaveis:
    sns.pairplot(dados, vars=variaveis, hue='Revenue', kind='reg', diag_kind='hist', corner=True)
    plt.show()


In [None]:
plt.figure(figsize=(12,12))
# Correlate only the numeric and boolean columns: newer pandas no longer
# drops string columns silently in corr() and raises on them instead.
correlacoes = dados.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlacoes, annot=True, cmap="Blues", fmt=".2f")
plt.xticks(rotation=90)

# Criação de Modelos de Machine Learning

In [None]:
# Split the data set into training and test parts.
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Target: Revenue as 0/1. Features: every transformed column except the target.
y = dados['Revenue'].astype(int)
x = dados_transf.drop(columns=['Revenue'])

# 20% of the rows are held out for testing; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)

In [None]:
# Handle class imbalance.
# Over-sample the training set only, to balance purchases and non-purchases.
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=33)
# fit_resample is the supported method name; the older fit_sample alias was
# removed from imbalanced-learn.
x_train, y_train = sm.fit_resample(x_train, y_train)

# Class counts after resampling. The data is passed as `x=` because newer
# seaborn releases no longer interpret a positional vector as the x variable.
ax = sns.countplot(x=y_train)
ax.set(title='SMOTE')
autolabel(ax)
plt.show()

### Feature Selection

In [None]:
# Select the most important variables with a random forest.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# The selector is fitted on the training split only, so the held-out test rows
# cannot influence which features are kept. The seed makes the importances
# repeatable.
# NOTE(review): assumes x_train is still a DataFrame after resampling — confirm.
selector = SelectFromModel(estimator=RandomForestClassifier(random_state=0)).fit(x_train, y_train)

# One row per feature: its importance and whether the selector keeps it.
feat_selected = pd.DataFrame(
    {"Importances": selector.estimator_.feature_importances_, "Get": selector.get_support()},
    index=x_train.columns,
)
feat_selected.loc[feat_selected["Get"], ["Importances"]].plot(kind='barh').set(title='Variáveis Selecionadas')
plt.show()

In [None]:
# Keep only the features the selector marked as selected ("Get" is True).
# Indexing by feat_selected.index alone would keep every column and leave
# the selection unapplied.
colunas_selecionadas = feat_selected.index[feat_selected["Get"]]
x_train = x_train[colunas_selecionadas]
x_test  = x_test[colunas_selecionadas]

In [None]:
# Score every feature against the target with the chi-squared statistic.
from sklearn.feature_selection import SelectKBest, chi2

bestfeatures = SelectKBest(score_func=chi2, k=10)
bestfeatures.fit(x, y)

# Horizontal bar chart of the ten highest scores.
feat_importances = pd.Series(bestfeatures.scores_, index=x.columns)
feat_importances.nlargest(10).plot.barh()
plt.show()


### Aplicação dos modelos

In [None]:
# Tune, fit and evaluate each selected model.

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# plot_roc_curve was removed from newer scikit-learn releases in favour of
# RocCurveDisplay.from_estimator, which takes the same arguments used below.
# Prefer the new entry point and fall back to the old one where it is missing.
try:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve

# names, classifiers and hyper_param are parallel lists: position k of each
# one describes the same model.
names = ["SVM", 
         "Nearest Neighbors", 
         "Decision Tree", 
         "Random Forest",
         "Extra Trees",
         "AdaBoost",
         "Gradient Boost",
         "Neural Net", 
         "Logistic Regression"
        ]


classifiers = [
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(),
    LogisticRegression()
]

# Filled with [model name, classification report dict] by the loop below.
report_list = []

# Search space for each classifier, in the same order as `classifiers`.
hyper_param = [
    [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}],
    {'n_neighbors': [3, 5, 7]},
    {'min_samples_split': range(2, 40, 10), 'criterion' :['gini', 'entropy']},
    {'n_estimators': [50, 100, 200], 'criterion' :['gini', 'entropy']},
    {'n_estimators': [50, 100, 200], 'criterion' :['gini', 'entropy']},
    {'n_estimators': [50, 100, 200], 'learning_rate':[0.1, 1, 2]},
    {'n_estimators': [50, 100, 200], 'learning_rate':[0.1, 1, 2]},
    {'alpha': [0.00001, 0.0001, 0.001], 'max_iter': [100, 200, 300], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'], 'learning_rate': ['constant', 'invscaling', 'adaptive']},
    {'penalty' : ['l1', 'l2'], 'C' : np.logspace(-4, 1, 5), 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
]

# Shared axes for all ROC curves, with the diagonal as the chance baseline.
fig, ax = plt.subplots(figsize=(10,10))
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.set(title='ROC Curves')


for name, clf, hyper in zip(names, classifiers, hyper_param):

    # Randomized search for the best hyper-parameters (3-fold CV, up to 10 candidates).
    rs = RandomizedSearchCV(clf, hyper, verbose=0, cv=3, n_jobs=-1, n_iter=10, random_state=5)
    rs_results = rs.fit(x_train, y_train)

    # Evaluate the search's best model on the held-out test set.
    y_pred = rs.predict(x_test)

    # Store the per-class report and add this model's ROC curve to the shared axes.
    report_list.append([name, classification_report(y_test, y_pred, output_dict=True, target_names=['False','True'])])
    plot_roc_curve(rs, x_test, y_test, ax=ax, alpha=0.8, name=name)

plt.show()


# Apresentação dos Resultados

In [None]:
# Model results
# Flatten every classification report into (classifier, metric, value) rows.
# The 'support' entries and the 'macro avg' block are left out.
metricas = ('precision', 'recall', 'f1-score')

list_plot = []

for nome_modelo, relatorio in report_list:
    # Per-class metrics, class 'False' first and then class 'True'.
    for classe in ('False', 'True'):
        for metrica in metricas:
            list_plot.append([nome_modelo, classe + '_' + metrica, relatorio[classe][metrica]])
    list_plot.append([nome_modelo, 'accuracy', relatorio['accuracy']])
    for metrica in metricas:
        list_plot.append([nome_modelo, 'weighted avg_' + metrica, relatorio['weighted avg'][metrica]])

df = pd.DataFrame(list_plot, columns=['Classifier', 'Metric', 'Value'])

# One row per classifier, one column per metric.
tabela_metricas = df.pivot(index='Classifier', columns='Metric', values='Value')
sns.heatmap(tabela_metricas, annot=True, cmap="Blues", fmt=".2f").set(title='Classification Report')
