In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
import lightgbm
import xgboost as xgb
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
data = pd.read_csv('./data/preprocessing_train.csv')
data

Unnamed: 0,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,DroppedCalls,...,NotNewCellphoneUser,IncomeGroup,OwnsMotorcycle,AdjustmentsToCreditRating,HandsetPrice,MadeCallToRetentionTeam,CreditRating,PrizmCode,Occupation,MaritalStatus
0,1.0,64.80,479.0,50.0,0.00,139.0,0.0,-337.0,-13.3,0.7,...,1.0,3,0.0,0,8.0,0.0,0.0,2.0,4.0,2.0
1,0.0,61.02,134.0,70.0,0.50,0.0,0.0,-12.0,0.0,1.7,...,1.0,0,0.0,0,8.0,0.0,0.0,0.0,3.0,1.0
2,1.0,70.71,369.0,50.0,7.18,0.0,3.8,-7.0,-5.3,4.0,...,0.0,0,0.0,0,8.0,0.0,0.0,0.0,3.0,1.0
3,1.0,61.62,674.0,55.0,2.97,30.0,0.0,169.0,10.3,2.7,...,0.0,6,1.0,1,8.0,0.0,0.0,2.0,3.0,2.0
4,1.0,72.65,493.0,50.0,1.24,7.0,0.7,220.0,3.1,2.3,...,1.0,3,0.0,0,3.0,0.0,0.0,2.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37718,1.0,38.34,320.0,30.0,0.99,21.0,0.0,-66.0,-8.3,12.3,...,0.0,6,0.0,0,15.0,0.0,0.0,0.0,4.0,0.0
37719,0.0,0.00,76.0,30.0,0.00,0.0,0.0,0.0,0.0,5.7,...,0.0,4,0.0,0,15.0,0.0,1.0,0.0,4.0,0.0
37720,0.0,31.92,63.0,17.0,0.00,43.0,0.0,-38.0,-13.2,0.7,...,0.0,3,0.0,0,15.0,0.0,2.0,0.0,3.0,2.0
37721,0.0,71.99,724.0,70.0,0.00,4.0,0.9,-40.0,-2.0,14.3,...,0.0,7,0.0,0,15.0,0.0,0.0,0.0,4.0,1.0


In [3]:
results = {}
stratified_k_fold = StratifiedKFold(random_state=0, shuffle=True)  # n_splits: 5 (default)

for i, (train_index, test_index) in enumerate(stratified_k_fold.split(data, data.Churn)):
    temp = data.iloc[train_index]
    test = data.iloc[test_index]
    train, valid = train_test_split(temp, test_size= len(test)/len(temp), random_state=0)

    lgbm = lightgbm.LGBMClassifier(random_state=0)
    
    parameters = {
        'max_depth': np.random.randint(1, 15, 5),
        'n_estimators': np.random.randint(50, 500, 20),
        }

    random_search = RandomizedSearchCV(lgbm, parameters, random_state=0)
    random_search.fit(train.drop(columns=['Churn']), train.Churn)

    index_best_param = random_search.cv_results_.get('rank_test_score').argmax()
    best_params = random_search.cv_results_.get('params')[index_best_param]

    lgbm =lightgbm.LGBMClassifier(
        n_estimators = best_params.get('n_estimators'),
        max_depth = best_params.get('max_depth'),
        random_state=0
    ).fit(train.drop(columns=['Churn']), train.Churn)

    confusion_matrix_result = confusion_matrix(
        test.Churn,
        lgbm.predict(test.drop(columns=['Churn']))
    )

    best_importances = pd.Series(lgbm.feature_importances_, index=train.drop(columns=['Churn']).columns).sort_values(ascending=False)[:3]

    TP = confusion_matrix_result[0, 0]
    FN = confusion_matrix_result[0, 1]
    FP = confusion_matrix_result[1, 0]
    TN = confusion_matrix_result[1, 1]

    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = 2*precision*recall/(precision+recall)

    accuracy = (TP+TN) / (TP+TN+FP+FN)

    results.update({
        f'fold{i+1}':{
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "accuracy": float(accuracy),
        "best_params": best_params,
        "best_importances": best_importances
        }
    })

    print(f'fold:{i+1}| prescision: {precision: .2f}, recall: {recall: .2f}, f1: {f1: .2f}, accuracy: {accuracy: .2f}')

[LightGBM] [Info] Number of positive: 5165, number of negative: 12941
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4198
[LightGBM] [Info] Number of data points in the train set: 18106, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.285265 -> initscore=-0.918495
[LightGBM] [Info] Start training from score -0.918495
[LightGBM] [Info] Number of positive: 5165, number of negative: 12941
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4207
[LightGBM] [Info] Number of data points in the train set: 18106, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.285265 -> initscore=-0.918495
[LightGBM] [Info] Start training from score -0.918495
[LightGBM] [In

In [4]:
for fold, result in results.items():
    print(f'{fold}: {result}')

fold1: {'precision': 0.7176234979973297, 'recall': 0.9957391626528344, 'f1': 0.8341092489137182, 'accuracy': 0.7166335321404904, 'best_params': {'n_estimators': 114, 'max_depth': 1}, 'best_importances': MonthsInService      33
MonthlyMinutes       13
PercChangeMinutes    13
dtype: int32}
fold2: {'precision': 0.7442662269524809, 'recall': 0.9196147434710131, 'f1': 0.8227009113504556, 'accuracy': 0.7163684559310802, 'best_params': {'n_estimators': 466, 'max_depth': 8}, 'best_importances': PercChangeMinutes       842
PercChangeRevenues      771
CurrentEquipmentDays    751
dtype: int32}
fold3: {'precision': 0.740451260875977, 'recall': 0.9299870346360437, 'f1': 0.8244663382594417, 'accuracy': 0.7166335321404904, 'best_params': {'n_estimators': 478, 'max_depth': 8}, 'best_importances': PercChangeMinutes       873
ServiceArea             826
CurrentEquipmentDays    773
dtype: int32}
fold4: {'precision': 0.7160100809125879, 'recall': 1.0, 'f1': 0.8345056813789904, 'accuracy': 0.71619830328738

In [5]:
results = {}
stratified_k_fold = StratifiedKFold(random_state=0, shuffle=True)  # n_splits: 5 (default)

for i, (train_index, test_index) in enumerate(stratified_k_fold.split(data, data.Churn)):
    temp = data.iloc[train_index]
    test = data.iloc[test_index]
    train, valid = train_test_split(temp, test_size= len(test)/len(temp), random_state=0)

    xgboost = xgb.XGBClassifier(random_state=0)
    
    parameters = {
        'max_depth': np.random.randint(1, 15, 5),
        'n_estimators': np.random.randint(50, 500, 20),
        }

    random_search = RandomizedSearchCV(xgboost, parameters, random_state=0)
    random_search.fit(train.drop(columns=['Churn']), train.Churn)

    index_best_param = random_search.cv_results_.get('rank_test_score').argmax()
    best_params = random_search.cv_results_.get('params')[index_best_param]

    xgboost =xgb.XGBClassifier(
        n_estimators = best_params.get('n_estimators'),
        max_depth = best_params.get('max_depth'),
        random_state=0
    ).fit(train.drop(columns=['Churn']), train.Churn)

    confusion_matrix_result = confusion_matrix(
        test.Churn,
        xgboost.predict(test.drop(columns=['Churn']))
    )

    best_importances = pd.Series(xgboost.feature_importances_, index=train.drop(columns=['Churn']).columns).sort_values(ascending=False)[:3]

    TP = confusion_matrix_result[0, 0]
    FN = confusion_matrix_result[0, 1]
    FP = confusion_matrix_result[1, 0]
    TN = confusion_matrix_result[1, 1]

    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = 2*precision*recall/(precision+recall)

    accuracy = (TP+TN) / (TP+TN+FP+FN)

    results.update({
        f'fold{i+1}':{
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "accuracy": float(accuracy),
        "best_params": best_params,
        "best_importances": best_importances
        }
    })

    print(f'fold:{i+1}| prescision: {precision: .2f}, recall: {recall: .2f}, f1: {f1: .2f}, accuracy: {accuracy: .2f}')

fold:1| prescision:  0.74, recall:  0.90, f1:  0.81, accuracy:  0.70
fold:2| prescision:  0.75, recall:  0.89, f1:  0.81, accuracy:  0.70
fold:3| prescision:  0.74, recall:  0.89, f1:  0.81, accuracy:  0.70
fold:4| prescision:  0.74, recall:  0.90, f1:  0.81, accuracy:  0.70
fold:5| prescision:  0.74, recall:  0.90, f1:  0.81, accuracy:  0.71



#### fold|1| prescision|  0.72| recall|  1.00| f1|  0.83| accuracy|  0.72
#### fold|2| prescision|  0.72| recall|  1.00| f1|  0.83| accuracy|  0.72
#### fold|3| prescision|  0.72| recall|  1.00| f1|  0.83| accuracy|  0.72
#### fold|4| prescision|  0.72| recall|  1.00| f1|  0.84| accuracy|  0.72
#### fold|5| prescision|  0.72| recall|  1.00| f1|  0.83| accuracy|  0.72

In [6]:
for fold, result in results.items():
    print(f'{fold}: {result}')

fold1: {'precision': 0.7435779110906015, 'recall': 0.895516858095591, 'f1': 0.8125052525422305, 'accuracy': 0.7043074884029158, 'best_params': {'n_estimators': 377, 'max_depth': 7}, 'best_importances': HandsetRefurbished           0.040328
AdjustmentsToCreditRating    0.025944
CurrentEquipmentDays         0.025672
dtype: float32}
fold2: {'precision': 0.7454828660436137, 'recall': 0.8864604556399334, 'f1': 0.8098823927574246, 'accuracy': 0.7021868787276342, 'best_params': {'n_estimators': 479, 'max_depth': 7}, 'best_importances': HandsetRefurbished      0.029879
HandsetWebCapable       0.027884
CurrentEquipmentDays    0.025420
dtype: float32}
fold3: {'precision': 0.7445627024525683, 'recall': 0.8940544545286164, 'f1': 0.8124894798855412, 'accuracy': 0.7047051027170311, 'best_params': {'n_estimators': 295, 'max_depth': 7}, 'best_importances': HandsetRefurbished      0.031024
OwnsMotorcycle          0.027853
CurrentEquipmentDays    0.027666
dtype: float32}
fold4: {'precision': 0.743960609