## Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Show every column when a DataFrame is displayed (no truncation with "...").
# Uses the public pd.set_option entry point instead of the accidental pd.pandas alias.
pd.set_option('display.max_columns', None)

## Q1  Univariate Analysis

In [None]:
# Load the raw credit-card customer records from the Kaggle input directory.
dataset = pd.read_csv(
    filepath_or_buffer="../input/credit-card-customer-data/Credit Card Customer Data.csv",
)

In [None]:
dataset.head()  # Preview the first rows of the dataframe

In [None]:
dataset.info()  # Summary of the dataframe; the recorded run reported 660 entries in total

In [None]:
# The three customer-contact channels analysed in the univariate section.
features = [
    'Total_visits_bank',
    'Total_visits_online',
    'Total_calls_made',
]

In [None]:
# Plot the distribution of each interaction channel, one figure per feature.
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation is the documented replacement for the same picture.
for feature in features:
    sns.histplot(dataset[feature], kde=True, stat="density")
    plt.show()

In [None]:
# Summary statistics for the customer features; the two identifier columns are excluded.
dataset.drop(columns=['Sl_No', 'Customer Key']).describe()

 1. There are no missing values
 2. On average, a customer holds ~5 credit cards and visits the bank ~2 times
 3. Some customers never visit the bank, never visit online, or never make calls


In [None]:
# Share of each contact channel in the total number of customer interactions.
data = dataset.copy()  # work on a copy so `dataset` itself is left unchanged

# Total interactions = bank visits + online visits + calls made
data['Total_interactions'] = (
    data['Total_visits_bank']
    + data['Total_visits_online']
    + data['Total_calls_made']
)

# Percentage contributed by each channel listed in `features`.
interactions_total = data['Total_interactions'].sum()
feature_perc = [(data[feature].sum() / interactions_total) * 100 for feature in features]

plt.figure(figsize=(12, 8))
# The labels are positional: they rely on `features` being ordered bank / online / calls.
plt.pie(
    feature_perc,
    labels=['Bank Visits', 'Online Visits', 'Calls Made'],
    autopct='%1.2f',
    textprops=dict(color="w"),
)
plt.legend()
plt.title("% age of interactions with respect to the medium")
plt.show()



##  Q2,Q3,Q4 - Perform EDA  ,Create Visualizations and present insights

In [None]:
# Show rows that repeat an earlier row across all columns (the first occurrence is not flagged).
is_duplicate = dataset.duplicated()
dataset[is_duplicate]

* There are no duplicate entries in the dataset

In [None]:
# Pairwise scatter/histogram grid of the customer features (identifier columns removed).
data = dataset.drop(columns=['Sl_No', 'Customer Key']).copy()
sns.pairplot(data)

In [None]:
data.corr()  # Correlation matrix of the columns currently held in `data`

## Key Observations:

**Customers with Avg_Credit_Limit > ~60,000, who may be loyal customers, make fewer calls (0-2.5) than customers with Avg_Credit_Limit of ~10,000 to 25,000**

**Customers with 1-4 Credit Cards make more than 5 calls (5-10)**

**People with a lower credit limit make more visits to the bank**

**Customers who visit the bank less frequently make more phone calls**

### Total_Credit_Cards v/s Avg_Credit_Limit

In [None]:
# Number of credit cards held against the average credit limit.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(data=dataset, x='Avg_Credit_Limit', y='Total_Credit_Cards', ax=ax)

1. There are two clusters:
    **Customers with Average Credit Limit > 100000 and customers with Average Credit Limit < 75000**

### Avg_Credit_Limit vs Total_visits_banks


In [None]:
# Average credit limit against the number of bank visits.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(data=dataset, x='Total_visits_bank', y='Avg_Credit_Limit', ax=ax)

**People with Average Credit Limit > ~70000 make 0 to 1 visits to the bank**

**People with Average Credit Limit < ~70000 visit more frequently, i.e. 2 to 5 times**

### Avg_Credit_Limit v/s Total_visits_online 

In [None]:
# Average credit limit against the number of online visits.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(data=dataset, x='Total_visits_online', y='Avg_Credit_Limit', ax=ax)

**There clearly seem to be 2 distinct clusters on the basis of the above two features**

**People with Average Credit Limit > 75000 visit online more frequently (> 7 times)**

###  Total_Credit_Cards v/s Total_visits_online graph

In [None]:
# Credit cards held against online visits; marker size encodes Avg_Credit_Limit.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(
    data=dataset,
    x='Total_visits_online',
    y='Total_Credit_Cards',
    size='Avg_Credit_Limit',
    ax=ax,
)

**There seems to be two differentiable clusters on the basis of above two features**

**Customers with more than 7 credit cards and with Average Limit > 80000 visits  online more frequently**

### Total_visits_online v/s Total_calls_made


In [None]:
# Calls made against online visits; marker size encodes Avg_Credit_Limit.
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(
    data=dataset,
    x='Total_visits_online',
    y='Total_calls_made',
    size='Avg_Credit_Limit',
    ax=ax,
)

#Dots are sized relative to the Avg_Credit_Limit

**There clearly seem to be two clusters on the above two features**

**Customers who visit less frequently online make more phone calls**

**People with Average credit limit >80000 make more frequent online visits**

### Creating a new feature with total interaction with banks  for analysis

In [None]:
# Rebuild the analysis frame with a per-customer total of all contact channels.
data = dataset.copy()  # work on a copy so `dataset` itself is left unchanged

# Total interactions = bank visits + online visits + calls made
data['Total_interactions'] = (
    data['Total_visits_bank']
    + data['Total_visits_online']
    + data['Total_calls_made']
)

In [None]:
# Total interactions per customer against the average credit limit.
sns.scatterplot(
    data=data,
    x='Avg_Credit_Limit',
    y='Total_interactions',
)

**Customers can be segmented on the basis of Total interactions or total complaints with bank and the average credit limit**

## Q5 Execute KMeans

In [None]:
# Feature matrix for clustering: every column from position 2 onwards
# (the first two columns are skipped).
feature_frame = dataset.iloc[:, 2:]
X = feature_frame.to_numpy()

In [None]:
X.shape  # (number of rows, number of selected feature columns)

**As there is a difference in magnitude of Credit Limit and other features, we will scale the features**

**As there are not any negative values, we can use MinMaxScaler**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of X so that the credit limit does not dominate
# the other features purely because of its magnitude.
scaler = MinMaxScaler()
scaled_features = scaler.fit(X).transform(X)

In [None]:
from sklearn.cluster import KMeans #Importing KMeans from sklearn

In [None]:
## Writing a for loop to plot the graph for within cluster sum of squares

In [None]:
# Elbow method: fit KMeans for 1..10 clusters on the scaled features and
# record the within-cluster sum of squares (inertia) of each fit.
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    kmeans.fit(scaled_features)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

**As the descent is smooth after n_clusters = 3, we will choose 3 as the number of clusters**

In [None]:
## Training the K-Means model on the dataset

In [None]:
!pip install yellowbrick

In [None]:
# Silhouette plot for a 3-cluster KMeans model on the scaled features.
from yellowbrick.cluster import SilhouetteVisualizer
kmeans=KMeans(n_clusters = 3, init = 'k-means++', random_state = 42) # Unfitted estimator handed to the visualizer
visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
visualizer.fit(scaled_features)        # Fit the data to the visualizer
visualizer.show()                      # Render the figure

In [None]:
# Final 3-cluster KMeans model.
# It is fitted on the SCALED features so that the labels are consistent with
# the elbow plot, the silhouette visualizer and the silhouette_score
# evaluation, all of which use `scaled_features`. Fitting on the raw `X`
# would let Avg_Credit_Limit dominate the Euclidean distance.
kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(scaled_features)

In [None]:
print(y_kmeans)  # Cluster label that KMeans assigned to each row

In [None]:
# Wrap the KMeans labels in a one-column frame so they can be joined to the data.
Segment = pd.DataFrame({'Segment': y_kmeans})
Segment

In [None]:
# Attach the KMeans segment label as an extra column next to the original columns.
data = pd.concat([dataset.copy(), Segment], axis=1)

In [None]:
# Columns compared across clusters in the box plots.
features = [
    'Avg_Credit_Limit',
    'Total_Credit_Cards',
    'Total_visits_bank',
    'Total_visits_online',
    'Total_calls_made',
]

In [None]:
## Analyzing the results

In [None]:
# One box plot per feature, split by KMeans segment.
for feature in features:
    sns.boxplot(data=data, x='Segment', y=feature)
    plt.show()

# Q6 Hierarchical Clustering

In [None]:
import scipy.cluster.hierarchy as sch

# Ward-linkage dendrogram of the scaled features, used to choose the number of clusters.
ward_linkage = sch.linkage(scaled_features, method='ward')

plt.figure(figsize=(12, 8))
dendrogram = sch.dendrogram(ward_linkage)
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

**Based on the dendrogram number of clusters = 3**

In [None]:
from scipy.cluster.hierarchy import cophenet

In [None]:
from scipy.spatial.distance import pdist

# Cophenetic correlation coefficient of the ward linkage: how faithfully the
# dendrogram preserves the original pairwise distances (closer to 1 is better).
# cophenet(Z) alone returns only the cophenetic distances; passing the
# condensed pairwise distances as the second argument makes it also return
# the correlation coefficient as the first element of the result.
# The linkage method is 'ward' to match the dendrogram and the
# agglomerative model (sch.linkage defaults to 'single').
ward_linkage = sch.linkage(scaled_features, method='ward')
cophenet_ = cophenet(ward_linkage, pdist(scaled_features))[0]
cophenet_

In [None]:
from sklearn.cluster import AgglomerativeClustering

# 3-cluster agglomerative model with ward linkage.
# - The deprecated `affinity` argument is omitted: ward linkage only supports
#   Euclidean distance, which is the default.
# - The model is fitted on the SCALED features so that its labels match the
#   dendrogram and the silhouette_score evaluation, both of which use
#   `scaled_features`.
hc = AgglomerativeClustering(n_clusters = 3, linkage = 'ward')
y_hc = hc.fit_predict(scaled_features)

In [None]:
y_hc  # Cluster label that agglomerative clustering assigned to each row

In [None]:
# Wrap the hierarchical-cluster labels in a one-column frame for joining.
df_yhc = pd.DataFrame({'HCluster': y_hc})

In [None]:
# Attach the hierarchical-cluster label as an extra column and preview the result.
data = pd.concat([dataset.copy(), df_yhc], axis=1)
data.head()

In [None]:
# One box plot per feature, split by hierarchical cluster.
for feature in features:
    sns.boxplot(data=data, x='HCluster', y=feature)
    plt.show()

## Q7 Calculate avg silhouette scores

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Average silhouette score of the KMeans labels, measured on the scaled features.
kmeans_score = silhouette_score(X=scaled_features, labels=y_kmeans)
kmeans_score

In [None]:
# Average silhouette score of the hierarchical-cluster labels, measured on the scaled features.
hc_score = silhouette_score(X=scaled_features, labels=y_hc)
hc_score

In [None]:
# Report both silhouette scores side by side.
summary_template = "The silhoutte score of kmeans and Hierarchical Cluster are {} and {} respectively"
print(summary_template.format(kmeans_score, hc_score))


**The hc_score is greater than kmeans_score**

The silhouette score is interpreted as follows −

+1 Score − Near +1 Silhouette score indicates that the sample is far away from its neighboring cluster.

0 Score − 0 Silhouette score indicates that the sample is on or very close to the decision boundary separating two neighboring clusters.

-1 Score − A Silhouette score near -1 indicates that the samples have been assigned to the wrong clusters.

# Q8 Comparing the clusters

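Overall, for this dataset Agglomerative Clustering gave better results.
In the recorded run, the silhouette scores of KMeans and Hierarchical Clustering were 0.058573709938719964 and 0.11914770655113151 respectively (these figures come from that run and will change if the models are re-fitted).
Hierarchical Clustering has the higher silhouette score.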

# Q9 Final answers

Customers in clusters 0 and 1 prefer to make phone calls or visit a branch to solve problems; these clusters should therefore be targeted through phone calls with Relationship Managers and through cross-selling over the phone and at bank branches. Since these two clusters have lower credit limits and fewer cards, we recommend targeting them to cover the operational costs.

Customers in cluster 2 have high limits and more cards, and make the most use of online channels. They should be targeted through loyalty programs and luxury online offers to maintain and improve retention, as they are affluent, premium customers.