In [1]:
import sys
import numpy as np
import pandas as pd

In [2]:
## load the raw crime records and keep only the first 5000 rows
crime = pd.read_csv('Crime_Data_Chicago.csv', low_memory=False).iloc[:5000]

In [3]:
## split each "(lat, lon)" location string into two numeric columns:
## the text before the comma (minus its first character) is the latitude,
## the text after it (minus its last character) is the longitude
crime['latitude'] = np.array(
    list(map(lambda loc: float(loc.split(',')[0][1:]), crime['Location ']))
)
crime['longitude'] = np.array(
    list(map(lambda loc: float(loc.split(',')[1][:-1]), crime['Location ']))
)

In [4]:
## preview the first five rows, including the new latitude/longitude columns
crime.head(n=5)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,latitude,longitude
0,1208575,3/14/2013,3/11/2013,1800,12,77th Street,1241,626,INTIMATE PARTNER - SIMPLE ASSAULT,0416 0446 1243 2000,...,Adult Other,626,,,,6300 BRYNHURST AV,,"(33.9829, -118.3338)",33.9829,-118.3338
1,102005556,1/25/2010,1/22/2010,2300,20,Olympic,2071,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,VAN NESS,15TH,"(34.0454, -118.3157)",34.0454,-118.3157
2,418,3/19/2013,3/18/2013,2030,18,Southeast,1823,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,200 E 104TH ST,,"(33.942, -118.2717)",33.942,-118.2717
3,101822289,11/11/2010,11/10/2010,1800,18,Southeast,1803,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,88TH,WALL,"(33.9572, -118.2717)",33.9572,-118.2717
4,42104479,1/11/2014,1/4/2014,2300,21,Topanga,2133,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),329,...,Invest Cont,745,,,,7200 CIRRUS WY,,"(34.2009, -118.6369)",34.2009,-118.6369


In [5]:
## rescale time variable
def newTimes(df):
    """Shift 'Time Occurred' so that the day starts at 0800 instead of 0000.

    Values below 800 wrap around to the end of the shifted day
    (t -> 2400 - (800 - t), i.e. t + 1600); values of 800 or more are moved
    back by 800. The column is rewritten on `df` itself; nothing is returned.
    (Values are presumably HHMM-encoded clock times -- confirm against the
    data dictionary.)

    Both branches are evaluated from the ORIGINAL values in a single step.
    Applying them one after the other would subtract 800 a second time from
    the rows that had just been wrapped past 1600, folding e.g. 0100 and
    1700 onto the same value.
    """
    times = df['Time Occurred']
    beforeEight = times < 800
    # One whole-column assignment: no chained indexing, and no reliance on
    # the index being 0..n-1.
    df['Time Occurred'] = np.where(beforeEight, times + 1600, times - 800)
    
## make sex binary!
def newSex(df):
    """Add a binary float column 'Sex' to `df`: 1.0 where 'Victim Sex' is 'M'.

    Every other value ('F', the empty string, missing, anything else) maps
    to 0.0. `df` is modified in place; nothing is returned.

    The column is built with one vectorised comparison rather than chained
    `df['Sex'][positions] = ...` writes, which emit SettingWithCopyWarning,
    silently do nothing under pandas copy-on-write, and treat positions as
    labels (so they break when the index is not 0..n-1).
    """
    df['Sex'] = (df['Victim Sex'] == 'M').astype(float)

## clean age (remove missing values)
def cleanAge(df):
    """Replace missing 'Victim Age' values with the mean of the known ages.

    `df` is modified in place; nothing is returned. The mean ignores missing
    entries, so a column that is entirely missing stays missing.

    `fillna` plus a whole-column assignment replaces the chained
    `df['Victim Age'][mask] = ...` write, which emits SettingWithCopyWarning
    and has no effect under pandas copy-on-write.
    """
    ages = df['Victim Age']
    df['Victim Age'] = ages.fillna(ages.mean())

## Normalize all variables
def normalizeDF(df,columns):
    """Return `df[columns]` centred on each column's mean and divided by that
    column's largest absolute value.

    Only the requested columns appear in the result; `df` is not modified.
    """
    selected = df[columns]
    centred = selected - selected.mean()
    scale = selected.abs().max()
    return centred / scale

In [6]:
## distance formulas
def L2Norm(pt1,pt2):
    """Euclidean (L2) distance between row 0 of `pt1` and row 0 of `pt2`.

    Both arguments must expose `.values`; only the first row of each is used.
    """
    firstRowA = pt1.values[0]
    firstRowB = pt2.values[0]
    squaredGaps = (firstRowA - firstRowB) ** 2
    return np.sqrt(np.sum(squaredGaps))

def L1Norm(pt1,pt2):
    """Manhattan (L1) distance between row 0 of `pt1` and row 0 of `pt2`.

    Both arguments must expose `.values`; only the first row of each is used.
    """
    firstRowA = pt1.values[0]
    firstRowB = pt2.values[0]
    gaps = np.abs(firstRowA - firstRowB)
    return np.sum(gaps)

In [7]:
def classifyPoint(point, centroids, clusters, distFunc):
    """Append `point` to the list in `clusters` belonging to its nearest centroid.

    point     -- the item to classify; passed to `distFunc` and stored as-is
    centroids -- DataFrame whose rows are the candidate centroids
    clusters  -- dict mapping centroid position -> list of points (mutated)
    distFunc  -- callable(centroidRow, point) returning a comparable distance

    Ties go to the lowest centroid position. If no distance compares below
    infinity (e.g. all are NaN) the point falls into cluster 0.
    """
    # Distances are computed in centroid order, each centroid as a 1-row frame.
    distances = [
        distFunc(centroids.iloc[[position]], point)
        for position in range(len(centroids))
    ]

    nearest, shortest = 0, np.inf
    for position, distance in enumerate(distances):
        if distance < shortest:
            nearest, shortest = position, distance

    clusters[nearest].append(point)

In [8]:
def getCentroids(df, centroids, clusters):
    """Compute the next set of centroids from the current cluster assignment.

    df        -- frame whose columns define the columns of the result
    centroids -- current centroids; row `key` is reused when cluster `key`
                 received no points
    clusters  -- dict mapping centroid position -> list of single-row
                 DataFrames assigned to that centroid

    Returns a DataFrame with one row per entry of `clusters` (in the dict's
    iteration order), indexed 0..n-1, holding the column-wise mean of each
    cluster's points.

    Each cluster is stacked with a single `pd.concat` and averaged with
    `.mean(axis=0)`. The previous row-by-row `concat` was quadratic, used
    `DataFrame.append` (removed in pandas 2.0), and `np.mean(DataFrame)`
    collapses to one scalar on pandas 2.x.
    """
    rows = []
    for key, points in clusters.items():
        if len(points) == 0:
            # Empty cluster: keep its previous centroid unchanged.
            rows.append(centroids.iloc[key])
        else:
            members = pd.concat([pd.DataFrame(p) for p in points], axis=0)
            rows.append(members.mean(axis=0))

    # Rows are Series keyed by column name; align them to df's columns and
    # drop whatever labels they carried so the index is always 0..n-1.
    return pd.DataFrame(rows, columns=df.columns).reset_index(drop=True)

In [9]:
def cluster(df,columns,k,distFunc):
    """Partition the rows of `df[columns]` into `k` clusters (k-means style).

    df       -- source DataFrame
    columns  -- columns to cluster on
    k        -- number of clusters; must satisfy 1 <= k <= len(df)
    distFunc -- callable(centroidRow, point) returning a distance, e.g.
                L2Norm or L1Norm

    Returns a dict mapping cluster id (0..k-1) to the list of single-row
    DataFrames assigned to it.

    Raises ValueError when `k` is outside 1..len(df). Previously k > len(df)
    made the unique-start-point retry loop spin forever.

    Starting centroids come from NumPy's global random state, so call
    `np.random.seed(...)` beforehand for reproducible clusters.
    """
    df = df[columns]
    numPoints = len(df)

    if not 1 <= k <= numPoints:
        raise ValueError(
            "k must be between 1 and the number of rows (%d), got %r" % (numPoints, k)
        )

    ## pick k distinct random rows as the starting centroids
    firstIDs = np.random.choice(numPoints, size=k, replace=False)
    centroids = df.iloc[firstIDs]
    centroids.index = range(k)

    ## iterate until the centroids stop changing
    while True:
        # Use a distinct loop name so `k` is not shadowed inside the comprehension.
        clusters = {label: [] for label in range(k)}
        for p in range(numPoints):
            # Keep each point as a 1-row DataFrame: distFunc and getCentroids
            # both rely on that shape.
            point = df.iloc[[p]]
            classifyPoint(point, centroids, clusters, distFunc)

        newCentroids = getCentroids(df, centroids, clusters)

        # Exact equality: stop once an iteration reproduces the same centroids.
        if centroids.equals(newCentroids):
            break
        centroids = newCentroids

    return clusters

In [None]:
## Lookup table pairing each crime code with its description.
## .copy() makes crimeCodes an independent frame, so overwriting its
## 'Crime Code' column below does not raise SettingWithCopyWarning.
crimeCodes = crime[['Crime Code', 'Crime Code Description']].copy()
## Scale the code with the same formula normalizeDF uses (centre on the mean,
## divide by the largest absolute value) so it can serve as a join key
## against the normalized data.
crimeCodes['Crime Code'] = (
    (crimeCodes['Crime Code'] - crimeCodes['Crime Code'].mean())
    / (np.absolute(crimeCodes['Crime Code']).max())
)
#crimeCodesMap = {code: desc for code, desc in crimeCodes.itertuples(index=False)}
#crimeCodesMap

In [None]:
## feature columns used for normalization and clustering
cols = [
    'Time Occurred',
    'Area ID',
    'Reporting District',
    'Crime Code',
    'Victim Age',
    'Sex',
    'Premise Code',
    'latitude',
    'longitude',
]

In [None]:
## data cleaning
## The three helpers below rewrite columns of `crime` in place:
##   newTimes -> 'Time Occurred', newSex -> adds 'Sex', cleanAge -> 'Victim Age'.
## NOTE: the last line rebinds `crime` to the normalized subset containing only
## `cols`. 'Victim Sex' is not in `cols`, so running this cell a second time
## without reloading the data fails inside newSex.
newTimes(crime)
newSex(crime)
cleanAge(crime)
crime = normalizeDF(crime, cols)

In [None]:
## cluster() draws its starting centroids from NumPy's global random state.
## Seed it so the cluster numbering (the later cells inspect "Cluster 0")
## is the same on every run.
np.random.seed(0)
## 3 clusters, Euclidean distance
clusters = cluster(crime, cols, 3, L2Norm)

In [None]:
## Flatten the {cluster id: [points]} dict into one frame with a 'Cluster' column.
## - The loop name is `clusterId`, not `cluster`: reusing `cluster` here would
##   overwrite the cluster() function and break any re-run of the clustering cell.
## - Rows are collected first and concatenated once; DataFrame.append was
##   removed in pandas 2.0 and is quadratic when called per row.
## - .assign returns a new frame, so the stored points are left untouched.
labeledPoints = [
    pd.DataFrame(point).assign(Cluster=clusterId)
    for clusterId, points in clusters.items()
    for point in points
]
niceDisplay = (
    pd.concat(labeledPoints, ignore_index=True)
    if labeledPoints
    else pd.DataFrame(data=None, columns=cols + ['Cluster'])
)

In [None]:
## crimeCodes holds one row per incident, so each code appears many times.
## Joining against it directly is many-to-many: every clustered row would be
## repeated once per incident sharing its code, inflating the counts computed
## further down. Reduce it to one row per code before attaching descriptions.
niceDisplay = niceDisplay.merge(
    crimeCodes.drop_duplicates(subset=['Crime Code']),
    on=['Crime Code'],
)

# Clustered Data

In [None]:
## show the clustered rows with the cluster label as the index
niceDisplay.set_index(keys='Cluster')

# Displays Top 5 Crimes in Cluster 0

In [None]:
## rows of cluster 0, counted per crime description, five largest counts first
(
    niceDisplay[niceDisplay['Cluster'] == 0]
    .groupby(['Crime Code Description'])
    .count()
    .sort_values(by=['Crime Code'], ascending=False)
    .head(5)
)