<h1>Fuga de clientes</h1>

In [1]:
import pandas as pd
import numpy as np

In [18]:
dataset = pd.read_csv('../data/raw/train_clientes.csv',sep=';')

In [21]:
target = 'ATTRITION'
X, y = dataset.drop([target, 'ID_CORRELATIVO', 'CODMES'], axis=1), dataset[target]

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

## Preprocesamiento

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

imputer = ColumnTransformer(
    transformers=[
        ('mean_imputer', SimpleImputer(strategy='mean'), ['EDAD', 'ANTIGUEDAD']),
        ('mode_imputer', SimpleImputer(strategy='most_frequent'), ['RANG_INGRESO', 'FLAG_LIMA_PROVINCIA']),
        ('ohe', OneHotEncoder(categories='auto', drop='first', sparse_output=False, handle_unknown='error'), ['RANG_INGRESO','FLAG_LIMA_PROVINCIA','RANG_SDO_PASIVO_MENOS0','RANG_NRO_PRODUCTOS_MENOS0'])

    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform='pandas')

In [28]:
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

In [31]:
X_train.drop(['RANG_INGRESO', 'FLAG_LIMA_PROVINCIA'], axis=1, inplace=True)
X_test.drop(['RANG_INGRESO', 'FLAG_LIMA_PROVINCIA'], axis=1, inplace=True)

## Procesamiento

In [29]:
from sklearn.metrics import roc_auc_score

### LightGBM

In [32]:
from lightgbm import LGBMClassifier

lgbmClassifier = LGBMClassifier() 
lgbmClassifier.fit(X_train, y_train)

predict_train_lg = lgbmClassifier.predict_proba(X_train)[:,1]
predict_test_lg = lgbmClassifier.predict_proba(X_test)[:,1]

print("auc on training in LGBMClassifier data : {:.3f}".format(roc_auc_score(y_train, predict_train_lg)))
print("auc on testing in LGBMClassifier  data : {:.3f}".format(roc_auc_score(y_test, predict_test_lg))) 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 8658, number of negative: 47342
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2581
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 76
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.154607 -> initscore=-1.698914
[LightGBM] [Info] Start training from score -1.698914
auc on training in LGBMClassifier data : 0.895
auc on testing in LGBMClassifier  data : 0.851


### XGBoost

In [33]:
from xgboost import XGBClassifier

xgbClassifier = XGBClassifier() 
xgbClassifier.fit(X_train, y_train)

predict_train_lg = xgbClassifier.predict_proba(X_train)[:,1]
predict_test_lg = xgbClassifier.predict_proba(X_test)[:,1]

print("auc on training in XGBClassifier data : {:.3f}".format(roc_auc_score(y_train, predict_train_lg)))
print("auc on testing in XGBClassifier  data : {:.3f}".format(roc_auc_score(y_test, predict_test_lg))) 

auc on training in LGBMClassifier data : 0.922
auc on testing in LGBMClassifier  data : 0.850


### Catboost

In [34]:
from catboost import CatBoostClassifier

cbClassifier = CatBoostClassifier(verbose=0, n_estimators=500) 
cbClassifier.fit(X_train, y_train)

predict_train_lg = cbClassifier.predict_proba(X_train)[:,1]
predict_test_lg = cbClassifier.predict_proba(X_test)[:,1]

print("auc on training in CatBoostClassifier data : {:.3f}".format(roc_auc_score(y_train, predict_train_lg)))
print("auc on testing in CatBoostClassifier  data : {:.3f}".format(roc_auc_score(y_test, predict_test_lg))) 

auc on training in CatBoostClassifier data : 0.911
auc on testing in CatBoostClassifier  data : 0.855


### Decision tree

In [37]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(class_weight=None,
                               criterion='gini',
                               max_depth=None,
                               max_features=None,
                               max_leaf_nodes=None,
                               min_samples_leaf=1,
                               min_samples_split=2,
                               min_weight_fraction_leaf=0.0,
                               random_state=None,
                               splitter='best')
dtree.fit(X_train,y_train)

predict_train_dtree = dtree.predict_proba(X_train)[:,1]
predict_test_dtree = dtree.predict_proba(X_test)[:,1]

print("auc on training in DecisionTreeClassifier data : {:.3f}".format(roc_auc_score(y_train, predict_train_dtree)))
print("auc on testing in DecisionTreeClassifier  data : {:.3f}".format(roc_auc_score(y_test, predict_test_dtree)))

auc on training in DecisionTreeClassifier data : 1.000
auc on testing in DecisionTreeClassifier  data : 0.653


### Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            max_depth=None,
                            max_features=None,
                            max_leaf_nodes=None,
                            min_samples_leaf=1,
                            min_samples_split=2,
                            min_weight_fraction_leaf=0.0,
                            n_estimators=10,
                            n_jobs=1,
                            oob_score=False,
                            random_state=None,
                            verbose=0,
                            warm_start=False
                            )
rfc.fit(X_train, y_train)

predict_train_rfc = rfc.predict_proba(X_train)[:,1]
predict_test_rfc = rfc.predict_proba(X_test)[:,1]

print("auc on training in RandomForestClassifier data : {:.3f}".format(roc_auc_score(y_train, predict_train_rfc)))
print("auc on testing in RandomForestClassifier  data : {:.3f}".format(roc_auc_score(y_test, predict_test_rfc)))

  return fit_method(estimator, *args, **kwargs)


auc on training in RandomForestClassifier data : 0.999
auc on testing in RandomForestClassifier  data : 0.788


### Logistic regression

In [39]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

predict_train_model = model.predict_proba(X_train)[:,1]
predict_test_model = model.predict_proba(X_test)[:,1]

print("auc on training in LogisticRegression data : {:.3f}".format(roc_auc_score(y_train, predict_train_model)))
print("auc on testing in LogisticRegression  data : {:.3f}".format(roc_auc_score(y_test, predict_test_model)))

  y = column_or_1d(y, warn=True)


auc on training in LogisticRegression data : 0.708
auc on testing in LogisticRegression  data : 0.695


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
from sklearn.linear_model import LogisticRegression

logmodel1 = LogisticRegression(C=1.0,
                               class_weight=None,
                               dual=False,
                               fit_intercept=True,
                               intercept_scaling=1,
                               l1_ratio=None,
                               max_iter=1000,
                               multi_class='auto',
                               n_jobs=None,
                               penalty='l2',
                               random_state=None,
                               solver='lbfgs',
                               tol=0.0001,
                               verbose=0,
                               warm_start=False
                               )
logmodel1.fit(X_train, y_train)

predict_train_logmodel1 = logmodel1.predict_proba(X_train)[:,1]
predict_test_logmodel1 = logmodel1.predict_proba(X_test)[:,1]

print("auc on training in LogisticRegression data : {:.3f}".format(roc_auc_score(y_train, predict_train_logmodel1)))
print("auc on testing in LogisticRegression  data : {:.3f}".format(roc_auc_score(y_test, predict_test_logmodel1)))

  y = column_or_1d(y, warn=True)


auc on training in LogisticRegression data : 0.665
auc on testing in LogisticRegression  data : 0.652


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
