## Mini Project III
### Customer segmentation using PCA and K-means

In [None]:
!pip install -r requirements.txt

In [None]:
# import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import plotly.graph_objects as go
from sklearn.decomposition import PCA

## CUSTOMER SEGMENTATION

In [None]:
# Load the customer table; the CSV uses ';' as its field separator
customers_path = 'data/twm_customer.csv'
customers = pd.read_csv(customers_path, sep=';')

In [None]:
# Preview the first rows of the customers table to check that it loaded as expected
customers.head()

In [None]:
# Export income together with the address fields for the Google Places
# location analysis; the row index is left out of the file.
geo_columns = ['income', 'street_nbr', 'street_name', 'postal_code', 'city_name', 'state_code']
geostuff = customers[geo_columns]
geostuff.to_csv('data/cust_loc.csv', index=False)

### DATA WRANGLING

In [None]:
# Remove the identifier, name and address columns from customers;
# whatever remains is kept as the clustering feature table df1.
unneeded_columns = [
    'cust_id', 'name_prefix', 'first_name', 'last_name',
    'street_nbr', 'street_name', 'postal_code', 'city_name', 'state_code',
]
df1 = customers.drop(columns=unneeded_columns)

In [None]:
# Preview the feature table after the identifier/address columns were dropped
df1.head()

In [None]:
# One-hot encode marital status and gender in a single pass. The source
# columns are removed and the indicator columns are appended at the end,
# marital-status indicators first, then gender indicators.
df1 = pd.get_dummies(
    df1,
    columns=['marital_status', 'gender'],
    prefix=['MaritalStatus', 'gender'],
)

In [None]:
# List the column names after one-hot encoding
df1.columns

In [None]:
# scale the data
# A MinMaxScaler is fitted on df1 and applied in the same call; df2 holds the
# scaled values that the elbow plot and PCA cells below consume.
# NOTE(review): StandardScaler is imported here but not used in the visible cells.
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scaler = MinMaxScaler()
df2 = scaler.fit_transform(df1)

### HELPER FUNCTIONS FOR PLOTS

In [None]:
def plot_clusters(X,y_res, plt_cluster_centers = False):
    """Scatter-plot the first two columns of ``X``, one series per cluster.

    Parameters
    ----------
    X : 2-D array-like
        Points to draw; column 0 goes on the x axis and column 1 on the y axis.
    y_res : 1-D array-like
        Cluster label for each row of ``X``.
    plt_cluster_centers : bool, default False
        When true, also mark the centre of every cluster with a red star.
        The centres are computed here as the mean of the plotted points of
        each cluster; they are not read from a fitted model.

    The figure is shown immediately; nothing is returned.
    """
    # Coerce to arrays so that a plain list of labels yields an element-wise
    # boolean mask below instead of a single True/False.
    X = np.asarray(X)
    y_res = np.asarray(y_res)

    X_centroids = []
    Y_centroids = []

    # np.unique returns the labels sorted, which keeps the legend and colour
    # assignment stable from run to run.
    for cluster in np.unique(y_res):
        members = y_res == cluster
        x = X[members, 0]
        y = X[members, 1]
        X_centroids.append(np.mean(x))
        Y_centroids.append(np.mean(y))

        plt.scatter(x,
                    y,
                    s=50,
                    marker='s',
                    label=f'cluster {cluster}')

    if plt_cluster_centers:
        plt.scatter(X_centroids,
                    Y_centroids,
                    marker='*',
                    c='red',
                    s=250,
                    label='centroids')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
def plot_distortion(X,max_clusters = 10):
    """Draw an elbow plot of k-means inertia for k = 1 .. ``max_clusters``.

    For every candidate cluster count a KMeans model (k-means++ init,
    10 restarts, fixed random_state of 0) is fitted on ``X`` and its
    ``inertia_`` is recorded. The values are then plotted against the
    cluster count and the figure is shown. Nothing is returned.
    """
    cluster_counts = range(1, max_clusters + 1)

    # One fitted model per candidate k; only its inertia is kept
    distortions = [
        KMeans(n_clusters=k,
               init='k-means++',
               n_init=10,
               random_state=0).fit(X).inertia_
        for k in cluster_counts
    ]

    plt.plot(cluster_counts, distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.show()

In [None]:
# elbow rule to find best number of clusters
# Runs k-means for k = 1..10 on the scaled customer features and plots the inertia curve
plot_distortion(df2,max_clusters=10)

### PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component so the variance carried by each
# one can be inspected in the scree plot; fit() returns the estimator.
pca = PCA().fit(df2)
pca_data = pca.transform(df2)

In [None]:
# Share of variance explained by each component, as a percentage with one decimal
explained_pct = pca.explained_variance_ratio_ * 100
per_var = np.round(explained_pct, decimals=1)
# Tick labels PC1, PC2, ... matching the components one-to-one
labels = [f'PC{rank}' for rank in range(1, per_var.size + 1)]

In [None]:
# Scree plot: one bar per principal component, height = % of variance explained
component_positions = range(1, len(per_var) + 1)
plt.bar(component_positions, per_var, tick_label=labels)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.show()

In [None]:
# do PCA on the data using two components
# Rebinds `pca` to a new, not-yet-fitted estimator limited to two components
pca = PCA(n_components=2)

In [None]:
# transform the data
# Fits the two-component PCA on the scaled features and projects them; df3 is the projection
df3 = pca.fit_transform(df2)

In [None]:
# Check the dimensions of the projected data
df3.shape

### K-means

In [None]:
# instantiate kmeans
# Six clusters, 10 restarts, and a fixed random_state so repeated runs give the same result
km = KMeans(n_clusters=6, n_init=10, random_state=0)

In [None]:
# fit and predict
# Clusters the PCA projection (df3); `label` holds the cluster assigned to each row
label = km.fit_predict(df3)

In [None]:
# check the cluster labels
# Prints the per-row cluster assignments returned by fit_predict
print(label)

In [None]:
# plot a few
# Rows of the projection that were assigned to cluster 0 and to cluster 1.
# NOTE(review): filtered_label1 is not used by any later visible cell.
filtered_label0 = df3[label == 0]
filtered_label1 = df3[label == 1]

In [None]:
# Scatter the cluster-0 points: first projected column on x, second on y
plt.scatter(filtered_label0[:,0], filtered_label0[:, 1])

In [None]:
# plot all
# Distinct cluster labels present in `label` (the next cell recomputes the same value)
u_labels = np.unique(label)

In [None]:
# Distinct cluster labels and the cluster centres found by k-means
u_labels = np.unique(label)
centroids = km.cluster_centers_

# One scatter series per cluster, then the centres overlaid in black
for i in u_labels:
    members = df3[label == i]
    plt.scatter(members[:, 0], members[:, 1], label=i)
plt.scatter(centroids[:, 0], centroids[:, 1], s=80, color='k')
plt.legend()
plt.show()

In [None]:
 # Plot the same clustering with the plot_clusters helper, including the per-cluster mean markers
plot_clusters(df3, label, plt_cluster_centers= True)

In [None]:
# Display the scaled feature array
df2

In [None]:
# make dataframe out of scaled data
# Reattaches df1's column names to the scaled values so they can be grouped by cluster
df4 = pd.DataFrame(df2, columns=df1.columns)

In [None]:
# get stuff ready for radar plot
# add cluster: attach each row's k-means label as a new column
df4['cluster'] = label
# aggregate by cluster: one row per cluster holding the mean of every scaled feature
dfradar = df4.groupby('cluster').mean()

In [None]:
# Display the per-cluster feature means
dfradar

In [None]:
# Feature names used as the angular axis of the radar charts
categories = dfradar.columns.tolist()

In [None]:
# Radar chart overlaying one trace per cluster, each trace showing that
# cluster's mean value for every feature.
fig = go.Figure()

# Walk the rows of dfradar instead of a hard-coded range so the chart always
# matches the number of clusters that were actually produced.
for cluster_id, cluster_means in dfradar.iterrows():
    fig.add_trace(go.Scatterpolar(
        r=cluster_means,
        theta=categories,
        fill='toself',
        name=f'Cluster {cluster_id}'
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True
        )),
    showlegend=False
)

fig.show()

In [None]:
# Radar chart for a single cluster; change i to inspect a different one
i=5

cluster_trace = go.Scatterpolar(
    r=dfradar.iloc[i, :],
    theta=categories,
    fill='toself',
    name=f'Cluster {i}',
)

fig = go.Figure()
fig.add_trace(cluster_trace)
fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()

## ACCOUNTS AND TRANSACTIONS

In [None]:
# Load the accounts and transactions tables (both ';'-separated)
accounts = pd.read_csv('data/twm_accounts.csv', sep=';')
transactions = pd.read_csv('data/twm_transactions.csv', sep=';')

# Outer join on the account number: rows from either table that have no
# match in the other are kept, with the missing side left as NaN.
joint_df = accounts.merge(transactions, on='acct_nbr', how='outer')

In [None]:
# Preview the merged accounts/transactions table
joint_df.head()

In [None]:
# One-hot encode acct_type and account_active. The source columns are
# replaced by indicator columns appended at the end, prefixed with the
# original column name. (channel and tran_code are not encoded; they are
# dropped further down.)
hotenc = ['acct_type', 'account_active']

joint_df = pd.get_dummies(joint_df, columns=hotenc)

In [None]:
# List the column names after one-hot encoding
joint_df.columns

In [None]:
# Replace tran_date with the number of whole days between the account
# start date and the transaction date.
tran_dates = pd.to_datetime(joint_df['tran_date'])
start_dates = pd.to_datetime(joint_df['acct_start_date'])
joint_df['tran_date'] = (tran_dates - start_dates).dt.days

In [None]:
# Preview the table after tran_date was converted to a day count
joint_df.head()

In [None]:
# Drop identifiers, the raw date/time fields and the un-encoded
# categorical columns so only the clustering features remain.
columns_to_remove = [
    'acct_nbr', 'cust_id', 'acct_start_date', 'acct_end_date',
    'tran_id', 'tran_time', 'channel', 'tran_code',
]
joint_df = joint_df.drop(columns=columns_to_remove)

In [None]:
# Replace every missing value with 0
joint_df = joint_df.fillna(0)

In [None]:
# scale the data
# Rebinds `scaler` to a fresh MinMaxScaler fitted on the account/transaction
# features; joint_df2 holds the scaled values.
scaler = MinMaxScaler()
joint_df2 = scaler.fit_transform(joint_df)

In [None]:
# Elbow plot: k-means inertia for k = 1..10 on the scaled account/transaction features
plot_distortion(joint_df2, max_clusters=10)

In [None]:
# Fit a PCA that keeps every component so the variance carried by each
# one can be inspected in the scree plot; fit() returns the estimator.
pca = PCA().fit(joint_df2)
pca_data = pca.transform(joint_df2)

In [None]:
# Share of variance explained by each component, as a percentage with one decimal
explained_pct = pca.explained_variance_ratio_ * 100
per_var = np.round(explained_pct, decimals=1)
# Tick labels PC1, PC2, ... matching the components one-to-one
labels = [f'PC{rank}' for rank in range(1, per_var.size + 1)]

In [None]:
# Scree plot: one bar per principal component, height = % of variance explained
component_positions = range(1, len(per_var) + 1)
plt.bar(component_positions, per_var, tick_label=labels)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.show()

In [None]:
# Rebind `pca` to a new, not-yet-fitted estimator limited to three components
pca = PCA(n_components=3)

In [None]:
# Fit the three-component PCA on the scaled features and project them
joint_df3=pca.fit_transform(joint_df2)

In [None]:
# create kmeans model
# Four clusters, 10 restarts, and a fixed random_state so repeated runs give the same result
km = KMeans(n_clusters=4, n_init=10, random_state=0)

In [None]:
# fit and predict
# Clusters the PCA projection (joint_df3); `label` is rebound to the new per-row assignments
label = km.fit_predict(joint_df3)

In [None]:
# Distinct cluster labels and the cluster centres found by k-means
u_labels = np.unique(label)
centroids = km.cluster_centers_

# One scatter series per cluster, then the centres overlaid in black.
# Only the first two columns of the projection are drawn.
for i in u_labels:
    members = joint_df3[label == i]
    plt.scatter(members[:, 0], members[:, 1], label=i)
plt.scatter(centroids[:, 0], centroids[:, 1], s=80, color='k')
plt.legend()
plt.show()

In [None]:
# make dataframe out of scaled data and column names
# Rebinds joint_df to the scaled values while reusing its current column names.
# NOTE(review): the next cell adds a 'cluster' column to joint_df, so re-running
# this cell afterwards would pass one more column name than joint_df2 has
# columns — confirm whether a separate variable name is wanted here.
joint_df = pd.DataFrame(joint_df2, columns=joint_df.columns)

In [None]:
# add cluster: attach each row's k-means label as a new column
joint_df['cluster'] = label
# aggregate by cluster: one row per cluster holding the mean of every scaled feature
dfradar = joint_df.groupby('cluster').mean()

In [None]:
# Preview the per-cluster feature means
dfradar.head()

In [None]:
# Feature names used as the angular axis of the radar chart
categories = dfradar.columns.tolist()

In [None]:
# Radar chart overlaying one trace per cluster, each trace showing that
# cluster's mean value for every feature.
fig = go.Figure()

# Walk the rows of dfradar instead of a hard-coded range so the chart always
# matches the number of clusters that were actually produced.
for cluster_id, cluster_means in dfradar.iterrows():
    fig.add_trace(go.Scatterpolar(
        r=cluster_means,
        theta=categories,
        fill='toself',
        name=f'Cluster {cluster_id}'
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True
        )),
    showlegend=False
)

fig.show()

In [None]:
import plotly.express as px

In [None]:
# Pairwise scatter matrix of the leading principal components of the scaled
# account/transaction features, titled with the total variance they explain.
n_components = 3

pca = PCA(n_components=n_components)
components = pca.fit_transform(joint_df2)

# Combined share of variance captured by the retained components, in percent
total_var = pca.explained_variance_ratio_.sum() * 100

# Axis titles: column index (as a string) -> "PC <n>". No `color` argument is
# passed to the plot, so no colour label is defined.
labels = {str(i): f"PC {i+1}" for i in range(n_components)}

fig = px.scatter_matrix(
    components,
    dimensions=range(n_components),
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}%',
)
# Hide the diagonal panels, which would plot each component against itself
fig.update_traces(diagonal_visible=False)
fig.show()