In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is shown in cell output.
# NOTE(review): this also hides any deprecation or convergence warnings the libraries may emit,
# so consider narrowing the filter if results ever look suspicious.
warnings.filterwarnings(action="ignore")

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


## Customer Segmentation Based on Credit Card Usage Behaviour

The dataset for this notebook describes the credit card usage behavior of customers through 18 behavioral features. Segmenting customers by this behavior can be used to define marketing strategies.


**Content of this Kernel:**

* Data Preprocessing
* Clustering using KMeans
* Interpretation of Clusters
* Visualization of Clusters using PCA

In [None]:
# Load the raw credit-card usage table.
# `df` is kept as the raw frame; `data` is an independent working copy, so the
# in-place edits made to `data` during preprocessing do not silently change `df`
# (a plain `data = df` would make both names point at the same object).
df = pd.read_csv("CC GENERAL.csv")
data = df.copy()
print(df.shape)
data.head()

Data Preprocessing

Descriptive Statistics of Data

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
summary_stats = data.describe()
summary_stats

In [None]:
# The customer identifier is not a behavioural feature, so it is excluded before clustering.
# The result is assigned back instead of dropping in place: the object `data` pointed at
# before this cell is left unmodified, and errors='ignore' turns a re-run of this cell into
# a no-op instead of a KeyError.
data = data.drop(columns=['CUST_ID'], errors='ignore')

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# Missing values can either be dropped row-wise or imputed; here each column's
# gaps are filled with that column's mean.
column_means = data.mean()
data = data.fillna(column_means)

# Confirm that no column has missing values left.
data.isna().sum()

In [None]:
# K-means assigns points to clusters by distance, so features measured on large
# scales would dominate features measured on small ones. Standardising every
# column (zero mean, unit variance) puts them on a comparable footing.
# StandardScaler is already imported in the first cell, so no extra import is needed here.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

In [None]:
#PART 3

In [None]:
from sklearn.cluster import KMeans

# Baseline model: four clusters on the standardised features, seeded for repeatability.
# `fit` returns the estimator itself, so the fitted model is bound directly and then displayed.
kmeans = KMeans(n_clusters=4, random_state=0).fit(data_scaled)
kmeans

In [None]:
# Evaluating the model with internal cluster-validity metrics:
#   * Silhouette coefficient: closer to +1 is better.
#   * Calinski-Harabasz index: higher is better.
#   * Davies-Bouldin index: lower is better.
#
# Both the labels and the scores are computed on `data_scaled`, the standardised
# features the baseline model was fitted on. Using the unscaled `data` here would
# refit the model on raw values and score a different clustering from the one built above.

from sklearn.metrics import silhouette_score,calinski_harabasz_score,davies_bouldin_score

labels = kmeans.fit_predict(data_scaled)

print("Silhouette Coefficient: %0.3f" % silhouette_score(data_scaled, labels))
print("Calinski-Harabasz Index: %0.3f" % calinski_harabasz_score(data_scaled, labels))
print("Davies-Bouldin Index: %0.3f" % davies_bouldin_score(data_scaled, labels))

In [None]:
#PART 4

In [None]:
# Elbow method with WCSS (within-cluster sum of squares).
#
# One model is fitted per candidate K (2..15) and its WCSS is read from the
# `inertia_` attribute; the values are collected in `wcss` for plotting.
# The models are fitted on `data_scaled` so that every feature contributes on the
# same scale, consistent with the baseline model; fitting on the raw `data` would
# let the largest-valued columns dominate the distances.

wcss = []

for i in range(2, 16):
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)
    

In [None]:
# Plot WCSS against K; the "elbow" where the curve flattens suggests a reasonable K.
# `kvalue` must match the K range used when `wcss` was collected (2..15).
kvalue = range(2, 16)

fig, ax = plt.subplots()
ax.plot(kvalue, wcss, marker='x')
ax.set_xlabel("Number of clusters (K)")
ax.set_ylabel("WCSS (inertia)")
ax.set_title("Elbow method")
plt.show()

In [None]:
# Collect the silhouette coefficient and Davies-Bouldin index for each candidate K (2..15).
# Models are fitted and scored on `data_scaled`, the standardised features, so that the
# distances behind both metrics are not dominated by the largest-valued raw columns.

silhouette = []
davies_bouldin = []

for i in range(2, 16):
    kmeans = KMeans(n_clusters=i, random_state=0)
    labels = kmeans.fit_predict(data_scaled)
    silhouette.append(silhouette_score(data_scaled, labels))
    davies_bouldin.append(davies_bouldin_score(data_scaled, labels))

In [None]:
# Plot both metrics against K. Each curve is labelled and a legend is shown,
# otherwise the two lines cannot be told apart.
# `kvalue` must match the K range used when the metrics were collected (2..15).
kvalue = range(2, 16)

fig, ax = plt.subplots()
ax.plot(kvalue, silhouette, marker='o', label="Silhouette coefficient (higher is better)")
ax.plot(kvalue, davies_bouldin, marker='*', label="Davies-Bouldin index (lower is better)")
ax.set_xlabel("Number of clusters (K)")
ax.set_ylabel("Score")
ax.set_title("Cluster-validity metrics by K")
ax.legend()
plt.show()

In [None]:
# Restrict to numeric columns (this excludes any leftover string column such as 'CUST_ID').
df_numeric = df.select_dtypes(include='number')

# Pairwise correlation between the numeric features.
corr = df_numeric.corr()

# Draw the annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
# Show the five columns with the most missing values.
null_counts = data.isna().sum()
null_counts.sort_values(ascending=False).head(5)

In [None]:
columns=['BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'CREDIT_LIMIT',
        'PAYMENTS', 'MINIMUM_PAYMENTS']

# Lower edges of bins 1..6. A value's bin code is the number of edges it strictly
# exceeds: 0 for values <= 0, 1 for (0, 500], 2 for (500, 1000], 3 for (1000, 3000],
# 4 for (3000, 5000], 5 for (5000, 10000] and 6 for anything above 10000.
# Comparisons with a missing value are False, so missing values land in bin 0.
amount_edges = [0, 500, 1000, 3000, 5000, 10000]

for col in columns:
    values = data[col]
    data[col + '_RANGE'] = sum((values > edge).astype('int64') for edge in amount_edges)
 

In [None]:
columns=['BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY', 
         'CASH_ADVANCE_FREQUENCY', 'PRC_FULL_PAYMENT']

# Lower edges of bins 1..10 (tenths). A value's bin code is the number of edges it
# strictly exceeds: 0 for values <= 0, 1 for (0, 0.1], 2 for (0.1, 0.2], ...,
# 9 for (0.8, 0.9] and 10 for anything above 0.9.
# The top bin is open-ended, matching the amount and transaction-count binning cells:
# a value above 1.0 is coded 10 instead of falling back to 0, which would make it
# indistinguishable from "never".
# Comparisons with a missing value are False, so missing values land in bin 0.
frequency_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for col in columns:
    values = data[col]
    data[col + '_RANGE'] = sum((values > edge).astype('int64') for edge in frequency_edges)
    

In [None]:
columns=['PURCHASES_TRX', 'CASH_ADVANCE_TRX']  

# Lower edges of bins 1..8 for transaction counts: 0 for values <= 0, 1 for (0, 5],
# 2 for (5, 10], 3 for (10, 15], 4 for (15, 20], 5 for (20, 30], 6 for (30, 50],
# 7 for (50, 100] and 8 for anything above 100.
trx_edges = np.array([0, 5, 10, 15, 20, 30, 50, 100])

for col in columns:
    # Compare every value against every edge at once (rows = customers, columns = edges);
    # the number of edges a value strictly exceeds is its bin code.
    # Comparisons with a missing value are False, so missing values land in bin 0.
    exceeded = data[col].to_numpy()[:, None] > trx_edges
    data[col + '_RANGE'] = exceeded.sum(axis=1).astype('int64')

In [None]:
# The raw columns have been replaced by their *_RANGE bin codes, so remove them
# and keep only the remaining columns for clustering.
raw_feature_columns = [
    'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
    'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
    'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
    'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
]
data.drop(columns=raw_feature_columns, inplace=True)

# Feature matrix as a plain array for scikit-learn.
X = np.asarray(data)

In [None]:
# Standardise the binned features: learn the per-column statistics first, then apply them.
scale = StandardScaler().fit(X)
X = scale.transform(X)
X.shape

In [None]:
# Inertia (within-cluster sum of squares) for each candidate K.
# range(1, n_clusters) stops before n_clusters, so K runs from 1 to n_clusters - 1.
# A fixed random_state makes the centroid initialisation, and therefore the cost
# curve, repeatable across runs, consistent with the earlier seeded KMeans cells.
n_clusters = 30
cost = []
for i in range(1, n_clusters):
    kmean = KMeans(n_clusters=i, random_state=0)
    kmean.fit(X)
    cost.append(kmean.inertia_)

In [None]:
# `cost` was collected for K starting at 1, so cost[j] belongs to K = j + 1.
# Passing the K values explicitly keeps the x-axis aligned with K; plotting
# `cost` alone would label the points 0, 1, 2, ... (the list index).
k_values = range(1, len(cost) + 1)
plt.plot(k_values, cost, 'bx-')
plt.xlabel("Number of clusters (K)")
plt.ylabel("Inertia")
plt.title("Elbow curve on binned features")
plt.show()

In [None]:
# Final model: six clusters on the standardised, binned features.
# random_state is fixed so the same cluster ids are produced on every run; the
# plotting cell further down attaches a colour and a description to each id, which
# only makes sense if the numbering is stable.
# NOTE(review): confirm that those descriptions match the clusters this seed produces.
kmean = KMeans(n_clusters=6, random_state=0)
kmean.fit(X)
labels = kmean.labels_

In [None]:
# Attach each row's cluster id as a new 'cluster' column.
# `assign` places the labels positionally on the rows of `data`; concatenating a
# freshly built DataFrame instead would align on index values and introduce NaN
# rows whenever `data` does not carry a default 0..n-1 index.
clusters = data.assign(cluster=labels)
clusters.head()


In [None]:
# One row of histograms per feature, with one panel per cluster.
# The 'cluster' column itself is skipped: it is the facet variable, so its
# per-cluster histogram would just be a single bar in every panel.
for col in clusters.columns.drop('cluster'):
    grid = sns.FacetGrid(clusters, col='cluster')
    grid.map(plt.hist, col)

In [None]:
# Pairwise cosine distance between all rows of X (a square rows-by-rows matrix).
dist = 1 - cosine_similarity(X)

# Project the distance matrix onto its first two principal components for plotting.
# random_state is fixed because scikit-learn may choose a randomized SVD solver for
# large inputs; without a seed the 2-D coordinates can change from run to run.
pca = PCA(n_components=2, random_state=0)
pca.fit(dist)
X_PCA = pca.transform(dist)
X_PCA.shape

In [None]:
# Scatter plot of the customers in the 2-D PCA space, coloured by cluster.
x, y = X_PCA[:, 0], X_PCA[:, 1]

# Colour and legend text for each cluster id.
colors = {0: 'red',
          1: 'blue',
          2: 'green',
          3: 'yellow',
          4: 'orange',
          5: 'purple'}

names = {0: 'who make all type of purchases',
         1: 'more people with due payments',
         2: 'who purchases mostly in installments',
         3: 'who take more cash in advance',
         4: 'who make expensive purchases',
         5: "who don't spend much money"}

# A dedicated name is used for the plotting frame so the raw `df` loaded at the
# top of the notebook is not overwritten (earlier cells that read `df` stay re-runnable).
plot_df = pd.DataFrame({'x': x, 'y': y, 'label': labels})
groups = plot_df.groupby('label')

fig, ax = plt.subplots(figsize=(20, 13))

# One marker series per cluster so each gets its own colour and legend entry.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5,
            color=colors[name], label=names[name], mec='none')

# Axes-level settings are applied once, outside the loop.
# Tick visibility flags must be booleans: the string 'off' is truthy and would
# leave the ticks visible. The y-axis uses left/right (not top).
ax.set_aspect('auto')
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

ax.legend()
ax.set_title("Customers Segmentation based on their Credit Card usage behaviour.")
plt.show()

May 18th