# Churn Prediction in the Credit Card Industry
![Churn Prediction](Images/Churn.jpg)
## Objective:
<p align="justify">
The goal is the early detection of credit card customers who are at risk of churning, together with the identification of the features that influence churn the most. A structured approach is followed, covering data understanding, data preparation, algorithm selection, modelling, and evaluation, and every step is presented. Models are extensively evaluated using accuracy, precision, sensitivity, specificity, ROC curves, AUC, and confusion matrices.
</p>


Dataset: [churns.csv](https://github.com/justgrossi/Portfolio/blob/main/2.Churn_Prediction/churns.csv)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import cohen_kappa_score
from imblearn.over_sampling import SMOTE
from collections import Counter

In [20]:
# Data exploration
def load_and_inspect_data(file_path):
    """Read a CSV file into a DataFrame and print a quick overview of it.

    The overview consists of a banner, the frame's shape, the dtype of each
    column, and the null count of every column that has at least one null.
    The first rows are then rendered with ``display``.

    Parameters
    ----------
    file_path
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, exactly as returned by ``pandas.read_csv``.
    """
    banner = "=" * 60
    print(banner)
    print("LOADING AND INSPECTING DATA")
    print(banner)

    frame = pd.read_csv(file_path)

    print(f"Dataset shape: {frame.shape}")
    print(f"\nColumn names and types:")
    print(frame.dtypes)

    print(f"\nMissing values per column:")
    null_counts = frame.isnull().sum()
    has_nulls = null_counts > 0
    # Only columns with at least one null are listed; an empty Series is
    # printed when the data has no missing values.
    print(null_counts[has_nulls])

    # `display` is not imported in this notebook; it relies on the name that
    # the IPython/Jupyter session makes available.
    display(frame.head())

    return frame


def check_values(cols, data=None):
    """Print the value counts of each listed column.

    For every name in ``cols`` a ``VARIABLE: <name>`` header is printed,
    followed by that column's ``value_counts()`` and a separator line.

    Parameters
    ----------
    cols : iterable of str
        Column names to inspect, in the order they should be printed.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used so
        that existing ``check_values(cols)`` calls keep working. Passing the
        frame explicitly removes the dependency on cell execution order and
        lets the same helper be reused on other frames.

    Raises
    ------
    NameError
        If ``data`` is omitted and no global ``df`` has been defined yet.
    KeyError
        If a name in ``cols`` is not a column of the inspected frame.
    """
    # Resolve the fallback at call time so the helper always sees the current
    # notebook-level frame when no explicit one is supplied.
    frame = df if data is None else data
    for col in cols:
        print(f'VARIABLE: {col}')
        print(frame[col].value_counts())
        print('================')


# Model final report
def final_report(y_pred_MODEL, y_true=None):
    """Print a summary of classification metrics for a set of predictions.

    Prints, in order: accuracy, the confusion matrix, scikit-learn's
    classification report, and Cohen's kappa.

    Parameters
    ----------
    y_pred_MODEL : array-like
        Predicted class labels to evaluate.
    y_true : array-like, optional
        Ground-truth labels. When omitted, the notebook-level ``y_test`` is
        used so that existing one-argument calls keep working. Passing the
        labels explicitly matches ``model_roc_curve`` and removes the
        dependency on cell execution order.

    Raises
    ------
    NameError
        If ``y_true`` is omitted and no global ``y_test`` has been defined.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook's held-out labels.
        y_true = y_test

    # NOTE(review): the "(reduced features)" label is hard-coded; confirm it is
    # accurate for every model this helper is called with.
    print("Accuracy (reduced features):", accuracy_score(y_true, y_pred_MODEL))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred_MODEL))
    print("Classification Report:\n", classification_report(y_true, y_pred_MODEL))
    kappa = cohen_kappa_score(y_true, y_pred_MODEL)
    print("Cohen's Kappa:", kappa)




# Model ROC curve
def model_roc_curve(y_true, y_prob, model_name="Model"):
    """Print the ROC-AUC for a set of scores and plot the matching ROC curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed to ``roc_auc_score`` and ``roc_curve``.
    y_prob : array-like
        Scores for the same samples, passed to ``roc_auc_score`` and
        ``roc_curve``.
    model_name : str, default "Model"
        Label used in the printed line, the legend entry and the plot title.

    Returns
    -------
    None
        The AUC is printed and the figure is shown; nothing is returned.
    """
    auc_value = roc_auc_score(y_true, y_prob)
    print(f"{model_name} ROC-AUC: {auc_value:.4f}")

    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)

    # The style is applied through pyplot's global state, so figures drawn
    # after this call also pick up 'ggplot'.
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(8,6))
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        label=f'{model_name} (AUC = {auc_value:.2f})',
    )
    # Diagonal reference line for a classifier with no discriminative power.
    ax.plot([0,1], [0,1], linestyle='--', color='red', label='Random guess')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc='lower right')
    fig.tight_layout()
    plt.show()

In [3]:
# Relative path: the CSV is expected in the notebook's working directory.
file_path='churns.csv'
# Keep the loaded frame in the notebook-level `df`, which check_values reads.
df=load_and_inspect_data(file_path)

LOADING AND INSPECTING DATA
Dataset shape: (10127, 20)

Column names and types:
churned            object
age                 int64
gender             object
numDep              int64
eduLevel           object
marital            object
incomeCat          object
cardCat            object
monthsOnBook        int64
totBought           int64
monthsInactive      int64
numContacts         int64
credLim           float64
revBal              int64
credBought        float64
delta12amt        float64
totTransAmt         int64
totTransCount       int64
delta12count      float64
ratio             float64
dtype: object

Missing values per column:
Series([], dtype: int64)


Unnamed: 0,churned,age,gender,numDep,eduLevel,marital,incomeCat,cardCat,monthsOnBook,totBought,monthsInactive,numContacts,credLim,revBal,credBought,delta12amt,totTransAmt,totTransCount,delta12count,ratio
0,no,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,no,49,F,5,Graduate,Single,Less than $40K,blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,no,51,M,3,Graduate,Married,$80K - $120K,blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,no,40,F,4,High School,Unknown,Less than $40K,blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,no,40,M,3,Uneducated,Married,$60K - $80K,blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [21]:
# The five feature columns reported with dtype `object` by the load step.
cols=['gender', 'eduLevel', 'marital', 'incomeCat', 'cardCat']
# Print the level counts of each one; the output below shows that cardCat
# mixes capitalisations of the same level (e.g. 'Blue' and 'blue').
check_values(cols)

VARIABLE: gender
gender
F    5358
M    4769
Name: count, dtype: int64
VARIABLE: eduLevel
eduLevel
Graduate         3128
High School      2013
Unknown          1519
Uneducated       1487
College          1013
Post-Graduate     516
Doctorate         451
Name: count, dtype: int64
VARIABLE: marital
marital
Married     4687
Single      3943
Unknown      749
Divorced     748
Name: count, dtype: int64
VARIABLE: incomeCat
incomeCat
Less than $40K    3561
$40K - $60K       1790
$80K - $120K      1535
$60K - $80K       1402
Unknown           1112
$120K +            727
Name: count, dtype: int64
VARIABLE: cardCat
cardCat
Blue        9421
Silver       538
Gold          98
gold          18
silver        17
Platinum      16
blue          15
platinum       4
Name: count, dtype: int64
