In [12]:
import pandas as pd
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
from seaborn import countplot

In [2]:
# Load the CSV into a DataFrame and discard the columns this analysis
# does not use, then preview the first rows.
unused_columns = ['DATE.OCC', 'DR_NO', 'AREA.NAME']
data = pd.read_csv(file_path).drop(columns=unused_columns)
data.head(10)

In [3]:
# Keep only the records with the most meaningful data:
#   * drop rows whose Vict.Sex is 'H'
#   * keep only rows whose Vict.Descent is one of the codes listed in toKeep
# Rows with a missing Vict.Sex are kept (NaN != 'H'); rows with a missing
# Vict.Descent are dropped (NaN is not in toKeep).
toKeep = ['B', 'H', 'O', 'W', 'X']
keep_mask = (data['Vict.Sex'] != 'H') & data['Vict.Descent'].isin(toKeep)
cleaned = data.loc[keep_mask]
# Preview the cleaned frame. (The original cell referenced an undefined name
# `encode` here, which raises NameError on a fresh kernel.)
cleaned.head(10)

In [4]:
# Prepare the input for KModes (similar to KMeans, but clusters categorical
# variables rather than numerical ones).
# For now the location columns (LAT, LON) are stripped; what is left is
# categorical, except for age and time, which are grouped into ranges below.
KmodesData = cleaned.drop(columns=['LAT', 'LON'])

# Group ages into bins. include_lowest=True keeps age 0 in the first bin
# instead of turning it into NaN, and the open-ended last bin avoids a
# "bins must increase monotonically" error when no victim is older than 80.
age_edges = [0, 20, 30, 40, 50, 60, 70, 80, float('inf')]
KmodesData['AgeBins'] = pd.cut(
    KmodesData['Vict.Age'], bins=age_edges, include_lowest=True
)

# Group times into bins (the edges assume TIME.OCC is an HHMM number --
# TODO confirm). include_lowest=True keeps 0 (midnight) in the first bin
# instead of turning it into NaN.
time_edges = [0, 600, 1200, 1600, 2100, 2400]
KmodesData['TimeOccBins'] = pd.cut(
    KmodesData['TIME.OCC'], bins=time_edges, include_lowest=True
)

# Replace the raw numeric columns with their binned versions and convert
# everything to strings so KModes treats every value as a category.
KmodesData = KmodesData.drop(columns=['Vict.Age', 'TIME.OCC']).astype('str')
KmodesData.head(10)

In [7]:
# KModes requires the number of clusters K up front.
# We use the elbow method to choose it: fit one model per candidate K and
# plot the resulting clustering cost against K.

# Fit on the feature columns only. A later cell appends a 'Cluster' column to
# KmodesData; excluding it here keeps the elbow plot the same if this cell is
# re-run after that one.
elbow_features = KmodesData.drop(columns=['Cluster'], errors='ignore')

cost = []
K = [1, 2, 3, 4]
for n_clusters in K:
    kout = KModes(n_clusters=n_clusters, init='Cao', n_init=4)
    # Only the final cost is needed here, so fit() is enough; the per-row
    # predictions are not used.
    kout.fit(elbow_features)
    cost.append(kout.cost_)

plt.plot(K, cost)
plt.xlabel('K')
plt.ylabel('Cost')
plt.show()
# We select the farthest-right significant bend in the curve.
# The bend was observed at K=2, so 2 clusters are used below.

In [8]:
# Fit the final KModes model with the K chosen from the elbow plot.
kout = KModes(n_clusters=2, init='Cao', n_init=4)

# Cluster on the feature columns only. This cell writes a 'Cluster' column
# into KmodesData below, so without this guard a re-run of the cell would feed
# the previous run's labels back in as a feature.
kmodes_features = KmodesData.drop(columns=['Cluster'], errors='ignore')

# Fit the model and assign a cluster to each victim.
clusters = kout.fit_predict(kmodes_features)

# Append the cluster labels to the categorical frame (used for plotting).
KmodesData['Cluster'] = clusters

# Make a copy of the cleaned data, which still has LAT and LON, attach the
# cluster labels, and save it for further analysis. `clusters` is a positional
# array with one label per row of `cleaned`.
csv = pd.DataFrame.copy(cleaned)
csv['Cluster'] = clusters
csv.to_csv(path)

In [10]:
# One count plot per feature column: bars are grouped by cluster and coloured
# by the feature's categories. The last column is skipped because it holds
# the cluster labels themselves.
feature_columns = KmodesData.columns[:-1]
for feature in feature_columns:
    fig, ax = plt.subplots()
    countplot(data=KmodesData, x='Cluster', hue=feature, ax=ax)
    plt.show()