# Modelagem

In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2 = reload all modules
# before executing each cell).
%load_ext autoreload
%autoreload 2

# Dependency note: the SHAP cells further down need the `shap` package in the
# kernel environment (install once with `%pip install shap`).

In [None]:
import os

# When the kernel starts inside the "notebooks" folder, step up one level so
# that project-relative imports and paths resolve from the parent directory.
current_dir_name = os.path.split(os.getcwd())[1]
if current_dir_name == "notebooks":
    os.chdir(os.pardir)

In [None]:
from src.prepare import prepare_data

# Location of the churn table handed to the project's data-preparation helper.
dataset_location = {
    "project_id": "ca-churn-project",
    "database_name": "customer_churn",
    "table_name": "customer_churn_data",
}

# prepare_data returns the train and test frames (target column still included).
X_train, X_test = prepare_data(**dataset_location)

In [None]:
# Names of every column stored with the pandas "category" dtype.
# NOTE(review): this is captured before "churn" is popped below; if the target
# has category dtype it will remain in this index - confirm against prepare_data.
categoric_columns = X_train.select_dtypes(include=["category"]).columns

# Where the total revenue is missing, fall back to the monthly revenue.
for split in (X_train, X_test):
    split["receita_total"] = split["receita_total"].fillna(split["receita_mensal"])

# Detach the target column from the feature frames (pop mutates in place, so
# this cell cannot be re-run without reloading the data).
y_train, y_test = X_train.pop("churn"), X_test.pop("churn")

In [None]:
# Inspect the categorical feature columns of the training split.
X_train.loc[:, categoric_columns]

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

# Categorical features that are encoded as a single integer code per category.
ordinal_columns = [
    "tipo_de_empresa",
    "funcionarios",
    "_modulo_financeiro",
    "_emissao_de_nota_fiscal",
    "_integracao_bancaria",
    "_modulo_de_vendas",
    "_relatorios",
    "_utilizacao_de_apis_de_integracao",
    "contrato",
    "frequencia_de_pagamento",
]

# Continuous features that are rescaled.
numeric_columns = [
    "fundacao_da_empresa",
    "meses_de_permanencia",
    "receita_mensal",
    "receita_total",
]

# Every remaining categorical column is one-hot encoded. The list is built with
# an ordered comprehension instead of a set difference: iterating a set of
# strings yields a different order on each interpreter run (hash randomisation),
# which would shuffle the output column order and make downstream feature
# selection / model results irreproducible across kernel restarts.
one_hot_columns = [
    column for column in categoric_columns if column not in ordinal_columns
]

# Categories unseen during fit are mapped to -1 (ordinal) or to an all-zero
# row (one-hot) instead of raising at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# RobustScaler centres/scales with median and IQR, limiting the pull of outliers.
scaler = RobustScaler()

# Transformer names ("a", "b", "c") are kept as-is because they become the
# prefixes of the output column names. Columns not listed in any transformer
# are dropped (ColumnTransformer's default remainder behaviour).
preprocessing = ColumnTransformer(
    transformers=[
        ("a", ordinal_encoder, ordinal_columns),
        ("b", one_hot_encoder, one_hot_columns),
        ("c", scaler, numeric_columns),
    ]
).set_output(transform="pandas")

In [None]:
# Learn encodings and scaling statistics from the training split only, then
# apply the fitted transformer to the test split (avoids test-set leakage).
# Both names are rebound to the transformed frames, so re-running this cell
# would feed already-transformed data back into the transformer.
X_train = preprocessing.fit_transform(X_train)
X_test = preprocessing.transform(X_test)

In [None]:
# Display the training frame as produced by the preprocessing step above.
X_train

In [None]:
# Feature selection: keep the k features that share the most mutual
# information with the churn label.
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif
import pandas as pd

k = 10

# churn is a class label, so the classification variant of the mutual
# information estimator is required (the regression variant treats the target
# as a continuous quantity). The estimator is stochastic, hence the fixed seed.
selector = SelectKBest(partial(mutual_info_classif, random_state=0), k=k)

selector.fit(X_train.values, y_train)
selected_columns = X_train.columns[selector.get_support()]

# Rebuild the frames with the surviving column names and the ORIGINAL row
# index, so the features stay aligned with y_train / y_test by label.
X_train = pd.DataFrame(
    selector.transform(X_train.values),
    columns=selected_columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    selector.transform(X_test.values),
    columns=selected_columns,
    index=X_test.index,
)
X_train.shape, X_test.shape

In [None]:

# General utilities
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.utils.class_weight import compute_class_weight

# Candidate classifiers: LightGBM, random forest, linear SVC, XGBoost
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Balanced per-class weights computed from the training labels (classes 0 and 1).
# They are not passed to the classifier below, which balances classes itself
# through class_weight="balanced".
class_weights = compute_class_weight(
    "balanced",
    classes=np.array([0, 1]),
    y=y_train,
)

# Alternative models that were tried; swap one in to compare.
# clf = XGBClassifier(n_jobs=-1, random_state=0, pos_weight=10)
# clf = LinearSVC(random_state=0, max_iter=1000, class_weight="balanced")
# clf = RandomForestClassifier(n_estimators=100, random_state=0, class_weight="balanced")
clf = LGBMClassifier(n_estimators=100, random_state=0, class_weight="balanced")
clf.fit(X_train, y_train)

# Evaluate on the held-out split: per-class precision, recall and F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import shap
# Load SHAP's JavaScript visualisation code into the notebook; the interactive
# force plot rendered further down depends on it.
shap.initjs()

In [None]:
# Build a tree-model SHAP explainer for the fitted classifier and compute the
# SHAP values of every row in the test split.
explainer = shap.TreeExplainer(clf)
shap_values = explainer(X_test)

In [None]:
# Explain one individual prediction. The SHAP values and the feature values
# must come from the SAME test row, otherwise the plot labels contributions
# with feature values that belong to a different customer.
sample_idx = 0
shap.force_plot(
    explainer.expected_value,
    shap_values.values[sample_idx, :],
    X_test.iloc[sample_idx, :],
)

Importância global de cada feature

In [None]:
# Rebind explainer / shap_values using the generic shap.Explainer entry point
# (this overwrites the objects created in the TreeExplainer cell).
explainer = shap.Explainer(clf)
shap_values = explainer(X_test)

# Cluster the features hierarchically (single linkage) using the test data and
# labels, then draw the global importance bar chart with that clustering.
# NOTE(review): clustering_cutoff=1 presumably shows the full clustering
# structure - confirm against the shap.plots.bar documentation.
clust = shap.utils.hclust(X_test, y_test, linkage="single")
shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

In [None]:
# Summary plot of the SHAP values computed above for the test split, drawn
# against the corresponding feature values.
shap.summary_plot(shap_values, X_test)