In [None]:
# Load the necessary libraries
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
from numpy import inf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from jupyterthemes import jtplot
# Apply the jupyterthemes 'monokai' plotting style (notebook context, ticks on, grid off)
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
# Load the data from 'Marketing_data.csv'; the path is relative, so the file
# must be reachable from the notebook's working directory
data = pd.read_csv('Marketing_data.csv')

In [None]:
# One-line exploratory data analysis: build a pandas-profiling report
# for the whole frame
data_profile = ProfileReport(data)
# Leaving the report as the cell's last expression displays it as the cell output
data_profile

In [None]:
# Count the missing values in every column
data.isna().sum()

In [None]:
# Replace the missing entries of 'CREDIT_LIMIT' and 'MINIMUM_PAYMENTS' with the
# mean of the respective column (the mean itself skips the missing entries)
data = data.fillna({
    'CREDIT_LIMIT': data['CREDIT_LIMIT'].mean(),
    'MINIMUM_PAYMENTS': data['MINIMUM_PAYMENTS'].mean(),
})

In [None]:
# Customer ID is only an identifier and carries no meaning for the analysis.
# errors="ignore" keeps the cell re-runnable once the column is already gone
# (the previous in-place drop raised KeyError on a second run).
data = data.drop(columns="CUST_ID", errors="ignore")

# One stacked panel per remaining column. The number of subplot rows follows
# the actual column count instead of a hard-coded 17.
n_features = len(data.columns)
plt.figure(figsize=(10,50))
for position, column in enumerate(data.columns, start=1):
  plt.subplot(n_features, 1, position)
  # Histogram (green) overlaid with a kernel density estimate (blue).
  # NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept
  # so the figure stays unchanged -- consider migrating to histplot.
  sns.distplot(data[column], kde_kws={"color": "b", "lw": 3, "label": "KDE"},
               hist_kws={"color": "g"})
  plt.title(column)
plt.tight_layout()

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap
f, ax = plt.subplots(figsize=(20, 20))
correlations = data.corr()
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
# IDENTIFY OPTIMAL NUMBERS OF CLUSTERS

# Standardise the features before clustering: fit the scaler on the data,
# then transform that same data with it
scaler = StandardScaler().fit(data)
data_scaled = scaler.transform(data)
data_scaled

In [None]:
# Fit 10 K-Means models, with 1 to 10 clusters, and record the WCSS (inertia)
# of each. The same range drives both the fitting and the x-axis of the plot,
# so the two can never disagree in length (previously 9 models were fitted but
# 10 x-values were plotted, which made plt.plot raise a ValueError).
cluster_counts = range(1, 11)
wcss = [
    # Fixed random_state so the elbow curve is reproducible between runs
    KMeans(n_clusters=k, random_state=42).fit(data_scaled).inertia_
    for k in cluster_counts
]

# Plot the WCSS values onto a line graph
plt.plot(cluster_counts, wcss)
plt.title('WCSS by Clusters')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# RUNNING THE K-MEANS ALGORITHM

# Fit the final model with 4 clusters. K-Means starts from random centroids,
# so random_state is fixed to keep the cluster labels stable across
# Restart & Run All.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(data_scaled)
# Cluster index assigned to each row of data_scaled
labels = kmeans.labels_
# One row per cluster, one column per feature
kmeans.cluster_centers_.shape

In [None]:
# Cluster centres in the scaled feature space, labelled with the feature names.
# data.columns is passed directly: wrapping it in a list ([data.columns])
# builds a one-level MultiIndex header instead of plain column names.
cluster_centers = pd.DataFrame(data = kmeans.cluster_centers_, columns = data.columns)
cluster_centers

In [None]:
# Perform inverse transformation to express the cluster centres in the
# original feature units. The fitted model's centres are used as the input
# (rather than the cluster_centers variable this cell overwrites), so
# re-running the cell does not un-scale the values a second time.
cluster_centers = pd.DataFrame(
    data = scaler.inverse_transform(kmeans.cluster_centers_),
    # Plain Index, not [data.columns], to avoid a one-level MultiIndex header
    columns = data.columns,
)
cluster_centers

In [None]:
# Add the cluster labels to the original dataframe as a new 'cluster' column
creditcard_df_cluster = data.assign(cluster=labels)
creditcard_df_cluster.head()

In [None]:
# Plot, for every feature, one histogram per cluster side by side.
# The cluster ids are read from the 'cluster' column instead of assuming
# exactly 4 clusters, so the grid follows whatever number was fitted.
cluster_ids = sorted(creditcard_df_cluster['cluster'].unique())
for feature in data.columns:
  plt.figure(figsize = (35, 5))
  for position, cluster_id in enumerate(cluster_ids, start=1):
    plt.subplot(1, len(cluster_ids), position)
    # Rows belonging to the current cluster only
    members = creditcard_df_cluster[creditcard_df_cluster['cluster'] == cluster_id]
    members[feature].hist(bins = 20)
    plt.title('{}    \nCluster {} '.format(feature, cluster_id))

  plt.show()

In [None]:
# Principal Component Analysis

# Reduce the scaled features to their 2 principal components
pca = PCA(n_components=2)
principal_comp = pca.fit_transform(data_scaled)
# Tabulate the two components and add the cluster labels as a third column
pca_df = pd.DataFrame(principal_comp, columns=['pca1', 'pca2']).assign(cluster=labels)
pca_df.head()

In [None]:
# Scatter the two principal components, one colour per cluster
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=pca_df,
    x="pca1",
    y="pca2",
    hue="cluster",
    palette=['red', 'green', 'blue', 'pink'],
    ax=ax,
)
plt.show()

In [None]:
# INTERPRETING THE RESULTS

# Number of rows assigned to each cluster label
creditcard_df_cluster['cluster'].value_counts()


https://github.com/manilwagle/medium