## Calculating Gower matrix for mixed variable types
- If we have a mix of continuous and categorical variables from our NLP analysis, we will have to calculate a Gower matrix in order to perform clustering. 

### Installations
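Run this in a notebook cell so the packages are installed into the kernel's environment (`gower` for the distance matrix, `scikit-learn-extra` for `sklearn_extra.cluster.KMedoids`):

`%pip install gower scikit-learn-extra`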

In [9]:
import pandas as pd
import numpy as np
import gower
import matplotlib.pyplot as plt

from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.metrics import silhouette_samples, silhouette_score

from sklearn_extra.cluster import KMedoids

ModuleNotFoundError: No module named 'gower'

In [10]:
# Toy feature table: one row per city, mixing numeric columns with a
# categorical flag that is stored as the strings '0' / '1'.
city_records = {
    'City': ['Atlanta', 'Baltimore', 'Chicago', 'Philadelphia'],
    'N-gram_Phrase_occurence': [14, 35, 9, 12],
    'Requires_Comprehensive_Reporting': ['1', '0', '1', '1'],
    'TF_IDF_score': [2, 9, 6, 3],
}
# Index by city so that only feature columns remain in the frame.
df = pd.DataFrame(city_records).set_index('City')

In [11]:
# Display the city-indexed feature table built in the previous cell.
df

Unnamed: 0_level_0,N-gram_Phrase_occurence,Requires_Comprehensive_Reporting,TF_IDF_score
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Atlanta,14,1,2
Baltimore,35,0,9
Chicago,9,1,6
Philadelphia,12,1,3


### Specify categorical features

In [12]:
# Names of the columns that should be treated as categorical.
cat_feats = ['Requires_Comprehensive_Reporting']
# Boolean mask aligned with df.columns: True where the column is categorical.
cat_idx = [column in cat_feats for column in df.columns]

### Create gower matrix

In [8]:
# Build the Gower matrix for the mixed-type frame; `cat_idx` flags which
# columns are categorical.
# NOTE(review): presumably `gw` is a square (n_rows x n_rows) array of
# pairwise distances -- confirm against the gower package documentation.
gw = gower.gower_matrix(df, cat_features=cat_idx)
gw

NameError: name 'gower' is not defined

In [None]:
# Fit a single-cluster K-medoids model on the Gower matrix.
# NOTE(review): `gw` is passed with KMedoids' default metric, so its rows are
# treated as feature vectors rather than as precomputed distances. Consider
# metric='precomputed' -- but confirm first how `cluster_centers_` behaves in
# that mode, because later cells subtract it from the rows of `gw`.
kmedoids = KMedoids(n_clusters=1, random_state=0).fit(gw)

# Only a cell's last expression is rendered, so return both attributes
# together instead of leaving `labels_` as a discarded bare expression.
kmedoids.labels_, kmedoids.cluster_centers_

In [None]:
# Offset of every row of the Gower matrix from the fitted medoid centre(s).
ls = [gw_row - kmedoids.cluster_centers_ for gw_row in gw]
ls

In [None]:
# Inspect the offset computed for the first row of `gw`.
ls[0]

In [None]:
# Axes3D is not referenced directly below.
# NOTE(review): presumably imported to register the '3d' projection on older
# matplotlib releases -- confirm whether it is still required.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

# 4-D view of the medoid offsets: the first three offset vectors give the
# x/y/z coordinates and the fourth is encoded as the marker colour.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Each entry of `ls` may be 2-D (one row per medoid); flatten to 1-D series.
xs, ys, zs, colour_values = (np.ravel(ls[k]) for k in range(4))

# The fourth positional parameter of Axes3D.scatter is `zdir`, so the colour
# series has to be passed through `c=`. `plt.hot()` returns None (it only
# switches the global default colormap), so name the colormap explicitly.
img = ax.scatter(xs, ys, zs, c=colour_values, cmap='hot')
fig.colorbar(img)
plt.show()

### K-medoids calculations

In [None]:
def init_medoids(X, k, seed=1):
    """Pick ``k`` distinct rows of ``X`` as the initial medoids.

    Parameters
    ----------
    X : numpy.ndarray
        Array indexed as ``X[rows, :]``; each row is a candidate medoid.
    k : int
        Number of medoids to draw (sampled without replacement).
    seed : int, optional
        Seed for the private random generator. Defaults to 1, the value that
        was previously hard-coded, so existing calls select the same rows.

    Returns
    -------
    numpy.ndarray
        The ``k`` selected rows, ``X[samples, :]``.

    Raises
    ------
    ValueError
        Propagated from ``choice`` when ``k`` exceeds ``len(X)``, because
        sampling is done without replacement.
    """
    from numpy.random import RandomState

    # Use a private generator rather than reseeding NumPy's global one, so
    # calling this function no longer resets random state for the rest of
    # the notebook. RandomState(seed) yields the same draw as the legacy
    # seed(seed) followed by choice(...).
    rng = RandomState(seed)
    samples = rng.choice(len(X), size=k, replace=False)
    return X[samples, :]

In [None]:
# Draw one row of the Gower matrix to act as the single starting medoid.
meds = init_medoids(gw, 1)
meds

In [None]:
def compute_d_p(X, medoids, p):
    """Return the p-norm distance, raised to the power ``p``, from each row
    of ``X`` to each medoid.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array, one observation per row.
    medoids : numpy.ndarray
        Either a 2-D array with one medoid per row, or a 1-D array holding a
        single medoid (it is promoted to a one-row 2-D array).
    p : int or float
        Order of the norm; also the exponent applied to the result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), number_of_medoids)`` whose ``[i, j]`` entry
        is ``||X[i] - medoids[j]||_p ** p``.
    """
    # Promote a single 1-D medoid to a one-row matrix.
    if medoids.ndim == 1:
        medoids = medoids[np.newaxis, :]

    n_points, n_medoids = len(X), len(medoids)
    powered = np.empty((n_points, n_medoids))

    for row_idx in range(n_points):
        # Broadcasting subtracts this observation from every medoid at once.
        offsets = X[row_idx, :] - medoids
        powered[row_idx, :] = np.power(np.linalg.norm(offsets, ord=p, axis=1), p)

    return powered

In [None]:
# Squared (p=2) distance from every row of the Gower matrix to the chosen
# medoid(s), flattened into a 1-D vector.
dists = compute_d_p(gw, meds, 2).flatten()
dists

In [None]:
# Rank the cities by distance from the medoid and flag those whose
# percentile rank is at or above 0.75 as high risk.
cities = ['Atlanta','Baltimore','Chicago','Philadelphia']

dist = pd.DataFrame({'City': cities, 'Distance':dists})
pct_rank = dist['Distance'].rank(pct=True)
dist = dist.assign(
    Pct=pct_rank,
    Is_High_Risk=np.where(pct_rank >= 0.75, 'Yes', 'No'),
)
dist