In [None]:
import os
from pathlib import Path

import pandas as pd
from joblib import dump
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# The notebook is expected to be launched from a sub-folder of the project,
# so the project root is the parent of the current working directory.
proj_path = Path.cwd().parent.absolute()
# Location of the raw churn dataset inside the project's data folder.
churn_filepath = proj_path.joinpath('data', 'Churn_Modelling.csv')

In [None]:
# Read the raw churn CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(churn_filepath)
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

In [None]:
# (number of rows, number of columns) of the raw dataset.
df.shape

In [None]:
# Count missing values per column (all zeros means no imputation is needed).
df.isnull().sum()

In [None]:
# Number of rows per distinct 'Geography' value, most frequent first.
df['Geography'].value_counts()

In [None]:
# Columns cast to the pandas `category` dtype before modelling.
cat_cols = ['Geography', 'Gender']

# Remaining feature columns, used as they are read from the CSV.
num_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

# Name of the label column the classifier predicts.
targ_col = 'Exited'

In [None]:
# Build the feature matrix and the target vector from the raw frame.
# The categorical cast is done with a single astype() call, which returns a new
# DataFrame: nothing is ever assigned into a column selection of `df`, so no
# SettingWithCopyWarning is raised and the cell is idempotent on re-run.
X = df[cat_cols + num_cols].astype({col: "category" for col in cat_cols})
y = df[targ_col]

In [None]:
# Mean of the target column; reads as the positive-class rate
# (assumes 'Exited' is encoded as 0/1 — TODO confirm against the data).
y.mean()

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Gradient-boosted classifier constructed with no arguments, i.e. with the
# library's default hyperparameters.
clf = LGBMClassifier()
# Fit on the training split only; the held-out rows are kept for evaluation.
clf.fit(X_train, y_train)

In [None]:
# Per-class probability estimates for the held-out rows (one column per class).
y_prob = clf.predict_proba(X_test)
# Hard labels: threshold column 1 at 0.5.
# NOTE(review): column 1 is taken to be the positive ("exited") class, which
# holds if the target classes are [0, 1] — confirm via clf.classes_.
y_pred = y_prob[:, 1] >= 0.5

In [None]:
# F1 of the thresholded predictions on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Threshold-independent ranking quality, computed from the column-1 scores.
roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])

In [None]:
# Combine the held-out features with the ground truth and the thresholded
# predictions for per-segment analysis.
# assign() returns a new DataFrame, so X_test itself is left untouched: it
# keeps only the feature columns and the prediction cell stays re-runnable.
# `true` is aligned on the index (y_test is a Series); `pred` is positional.
df_test = X_test.assign(true=y_test, pred=y_pred)
# Show a preview rather than the whole evaluation frame.
df_test.head()

In [None]:
# F1 score per country on the held-out predictions.
# 'Geography' is a categorical column, so `observed` is set explicitly: only
# categories actually present in df_test are scored, and the result does not
# depend on the pandas version's default for categorical groupers.
df_test.groupby('Geography', observed=True)[['true', 'pred']].apply(
    lambda grp: f1_score(grp['true'], grp['pred'])
)

In [None]:
# Persist the fitted classifier under <project root>/models.
model_path = proj_path.joinpath('models')
# Create the folder on first run; an already existing folder is not an error.
model_path.mkdir(exist_ok=True)
dump(clf, model_path.joinpath('clf.joblib'))