In [1]:
import os
from pathlib import Path

import pandas as pd
from joblib import dump
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
proj_path = Path(os.getcwd()).parent.absolute()
churn_filepath = proj_path/'data'/'Churn_Modelling.csv'

In [3]:
df = pd.read_csv(churn_filepath)
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
2,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
3,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
4,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0


In [4]:
df.shape

(7523, 13)

In [5]:
# Are there missing values?
df.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
df['Geography'].value_counts()

France     5014
Germany    2509
Name: Geography, dtype: int64

In [7]:
cat_cols = ['Geography', 'Gender']
num_cols = ['CreditScore', 'Age', 'Tenure', 
               'Balance', 'NumOfProducts', 'HasCrCard',
               'IsActiveMember', 'EstimatedSalary']
targ_col = 'Exited'

In [8]:
X, y = df[cat_cols + num_cols], df[targ_col]

for cat_col in cat_cols:
    X[cat_col] = X[cat_col].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_col] = X[cat_col].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_col] = X[cat_col].astype("category")


In [9]:
y.mean()

0.21587132792768843

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
clf = LGBMClassifier()
clf.fit(X_train, y_train)

LGBMClassifier()

In [12]:
y_prob = clf.predict_proba(X_test)
y_pred = y_prob[:, 1] >= 0.5

In [13]:
f1_score(y_test, y_pred)

0.6063218390804598

In [14]:
roc_auc_score(y_test, y_prob[:, 1])

0.8511382501356037

In [15]:
df_test = X_test
df_test['true'] = y_test
df_test['pred'] = y_pred
df_test

Unnamed: 0,Geography,Gender,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,true,pred
2338,France,Male,445,31,7,145056.59,1,1,1,175893.53,0,False
6886,Germany,Female,630,51,0,108449.23,3,0,0,88372.69,1,True
5861,Germany,Male,712,53,6,134729.99,2,1,1,132702.64,0,False
6610,Germany,Male,850,55,0,98710.89,1,1,1,83617.17,1,True
676,France,Male,561,29,9,120268.13,1,1,1,173870.39,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
6094,Germany,Male,714,53,1,99141.86,1,1,1,72496.05,1,False
7253,Germany,Female,717,33,0,115777.23,1,1,1,81508.10,0,False
4310,France,Male,751,42,4,0.00,2,1,1,81442.60,0,False
6558,Germany,Male,590,36,6,92340.69,2,1,1,174667.58,0,False


In [16]:
df_test.groupby('Geography')[['true', 'pred']].apply(lambda x: f1_score(x['true'], x['pred']))

Geography
France     0.541667
Germany    0.666667
dtype: float64

In [17]:
model_path = proj_path/'models'
model_path.mkdir(exist_ok=True)
dump(clf, model_path/'clf.joblib')

['/Users/akim/Desktop/gh/iter/demo-bank-customer-churn/models/clf.joblib']