In [31]:
import sys
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
## read in the data
crime = pd.read_csv('Crime_Data_Chicago.csv',low_memory = False)
crime = crime[:5000]

In [3]:
## create latitude and longitude columns separately
crime['latitude'] = np.array([float(s.split(',')[0][1:]) for s in crime['Location ']])
crime['longitude'] = np.array([float(s.split(',')[1][:-1]) for s in crime['Location ']])

In [4]:
crime

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,latitude,longitude
0,1208575,3/14/2013,3/11/2013,1800,12,77th Street,1241,626,INTIMATE PARTNER - SIMPLE ASSAULT,0416 0446 1243 2000,...,Adult Other,626,,,,6300 BRYNHURST AV,,"(33.9829, -118.3338)",33.9829,-118.3338
1,102005556,1/25/2010,1/22/2010,2300,20,Olympic,2071,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,VAN NESS,15TH,"(34.0454, -118.3157)",34.0454,-118.3157
2,418,3/19/2013,3/18/2013,2030,18,Southeast,1823,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,200 E 104TH ST,,"(33.942, -118.2717)",33.9420,-118.2717
3,101822289,11/11/2010,11/10/2010,1800,18,Southeast,1803,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,88TH,WALL,"(33.9572, -118.2717)",33.9572,-118.2717
4,42104479,1/11/2014,1/4/2014,2300,21,Topanga,2133,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),329,...,Invest Cont,745,,,,7200 CIRRUS WY,,"(34.2009, -118.6369)",34.2009,-118.6369
5,120125367,1/8/2013,1/8/2013,1400,1,Central,111,110,CRIMINAL HOMICIDE,1243 2000 1813 1814 2002 0416 0400,...,Adult Arrest,110,,,,600 N HILL ST,,"(34.0591, -118.2412)",34.0591,-118.2412
6,101105609,1/28/2010,1/27/2010,2230,11,Northeast,1125,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,YORK,AVENUE 51,"(34.1211, -118.2048)",34.1211,-118.2048
7,101620051,11/11/2010,11/7/2010,1600,16,Foothill,1641,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,EL DORADO,TRUESDALE,"(34.241, -118.3987)",34.2410,-118.3987
8,101910498,4/7/2010,4/7/2010,1600,19,Mission,1902,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,GLENOAKS,DRELL,"(34.3147, -118.4589)",34.3147,-118.4589
9,120908292,3/29/2013,1/15/2013,800,9,Van Nuys,904,668,"EMBEZZLEMENT, GRAND THEFT ($950.01 & OVER)",0344 1300,...,Invest Cont,668,,,,7200 SEPULVEDA BL,,"(34.2012, -118.4662)",34.2012,-118.4662


In [5]:
## rescale time variable
def newTimes(df):
    df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]] = 2400 - (800 - df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]])
    df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] = df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] - 800
    
## make sex binary!
def newSex(df):
    df['Sex'] = np.zeros(len(df['Victim Sex']))
    df['Sex'][np.where(df['Victim Sex'] == 'M')[0]] = 1
    df['Sex'][np.where(df['Victim Sex'] == 'F')[0]] = 0
    df['Sex'][np.where(df['Victim Sex'] == '')[0]] = 0

## clean age (remove missing values)
def cleanAge(df):
    meanAge = np.mean(df['Victim Age'])
    df['Victim Age'][np.isnan(df['Victim Age'])] = meanAge

## Normilze All Variables
def normalizeDF(df,columns):
    return (df[columns] - df[columns].mean()) / (np.absolute(df[columns]).max()) 

In [6]:
## distance formulas
def L2Norm(pt1,pt2):
    return np.sqrt(np.sum((pt1.values[0] - pt2.values[0]) ** 2))

def L1Norm(pt1,pt2):
    return np.sum(np.absolute((pt1.values[0] - pt2.values[0])))

In [7]:
def classifyPoint(point, centroids, clusters, distFunc):
    smallestDist = np.inf
    closestCentroid = 0
    
    for c in range(len(centroids)):
        #if (centroids.iloc[[c]].equals(point)):
        #    closestCentroid = c
        
        temp = distFunc(centroids.iloc[[c]], point)
        if temp < smallestDist:
            closestCentroid = c
            smallestDist = temp
            
    # Add point to closest cluster
    clusters[closestCentroid].append(point)

In [8]:
def getCentroids(df, centroids, clusters):
    newClusters = pd.DataFrame(data=None, columns=df.columns)
    for key, points in clusters.items():
        if points == []:
            newClusters = pd.concat([newClusters, centroids.iloc[[key]]], axis=0)
        else:
            cluster = pd.DataFrame(data=None, columns=df.columns)
            for p in points:
                cluster = pd.concat([cluster, pd.DataFrame(p)], axis=0)
            newClusters = newClusters.append(np.mean(cluster), ignore_index=True)
    
    return newClusters

In [9]:
def cluster(df,columns,k):
    df = df[columns]
    
    ## pick k random points from the data set 
    ## and make sure they are unique
    firstIDs = np.random.randint(len(df), size = k)
    while (len(np.unique(firstIDs)) < k):
        firstIDs = np.random.randint(len(df), size = k)
    
    ## set centroids
    centroids = df.iloc[firstIDs]
    centroids.index = range(k)
    
    ## run until clusters don't change (hopefully not forever)
    while True:
        clusters = {k: [] for k in range(k)}
        for p in range(len(df)):
            point = df.iloc[[p]]
            classifyPoint(point, centroids, clusters, L2Norm)
            
        newCentroids = getCentroids(df, centroids, clusters)
        
        # maybe check if it hits a certain threshold?
        if (centroids.equals(newCentroids)):
            break
        else:
            centroids = newCentroids
            
    return clusters

In [10]:
crimeCodes = crime[['Crime Code', 'Crime Code Description']]
crimeCodes['Crime Code'] = (crimeCodes['Crime Code'] - crimeCodes['Crime Code'].mean()) / (np.absolute(crimeCodes['Crime Code']).max())
crimeCodesMap = {code: desc for code, desc in crimeCodes.itertuples(index=False)}
#crimeCodesMap

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
cols = ['Time Occurred','Area ID','Reporting District','Crime Code','Victim Age','Sex','Premise Code','latitude','longitude']

In [12]:
## data cleaning
newTimes(crime)
newSex(crime)
cleanAge(crime)
crime = normalizeDF(crime, cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy o

In [13]:
clusters = cluster(crime,cols,3)

In [None]:
clusters.keys()

In [43]:
clusters.items()[0]

TypeError: 'dict_items' object does not support indexing

In [54]:
niceDisplay = pd.DataFrame(data=None, columns=cols)
for cluster, points in clusters.items():
    for point in points:
        temp = pd.DataFrame(point)
        temp['Cluster'] = cluster
        niceDisplay = niceDisplay.append(temp, ignore_index=True)

In [50]:
crimeCodes.head()

Unnamed: 0,Crime Code,Crime Code Description
0,0.107352,INTIMATE PARTNER - SIMPLE ASSAULT
1,-0.013987,VEHICLE - STOLEN
2,-0.013987,VEHICLE - STOLEN
3,-0.013987,VEHICLE - STOLEN
4,0.231829,VANDALISM - MISDEAMEANOR ($399 OR UNDER)


In [51]:
niceDisplay.head()

Unnamed: 0,Area ID,Cluster,Crime Code,Premise Code,Reporting District,Sex,Time Occurred,Victim Age,latitude,longitude
0,0.37141,0.0,-0.087209,-0.176571,0.360983,0.5086,-0.306978,0.045669,-0.002706,-6e-06
1,0.37141,0.0,0.250658,-0.185203,0.361893,0.5086,-0.306978,-0.085644,-0.002706,1.8e-05
2,0.514267,0.0,-0.085117,0.18841,0.472448,0.5086,0.225415,-0.257361,0.004162,-0.001167
3,-0.1524,0.0,0.10526,-0.18397,-0.135833,0.5086,-0.165862,-0.14625,-0.000437,0.000202
4,-0.1524,0.0,0.153377,0.18841,-0.141292,0.5086,0.032984,-0.126048,-0.000294,0.000167


In [56]:
niceDisplay1 = pd.merge_asof(niceDisplay.sort_values(by=["Crime Code"]), crimeCodes., on=['Crime Code'])

ValueError: left keys must be sorted

In [None]:
niceDisplay1.shape

# Clustered Data

In [16]:
#niceDisplay
niceDisplay.set_index('Cluster')

Unnamed: 0_level_0,Area ID,Crime Code,Premise Code,Reporting District,Sex,Time Occurred,Victim Age,latitude,longitude,Crime Code Description
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)
0.0,0.371410,-0.087209,-0.176571,0.360983,0.5086,-0.306978,4.566910e-02,-0.002706,-0.000006,THEFT PLAIN - PETTY ($950 & UNDER)


# Displays Top 5 Crimes in Cluster 0

In [17]:
niceDisplay.loc[niceDisplay['Cluster'] == 0].groupby(['Crime Code Description']).count().sort_values(by=['Crime Code'], ascending=False).head()

Unnamed: 0_level_0,Area ID,Cluster,Crime Code,Premise Code,Reporting District,Sex,Time Occurred,Victim Age,latitude,longitude
Crime Code Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BATTERY - SIMPLE ASSAULT,285520,285520,285520,285520,285520,285520,285520,285520,285520,285520
THEFT PLAIN - PETTY ($950 & UNDER),119700,119700,119700,119700,119700,119700,119700,119700,119700,119700
BURGLARY FROM VEHICLE,56917,56917,56917,56917,56917,56917,56917,56917,56917,56917
ROBBERY,32864,32864,32864,32864,32864,32864,32864,32864,32864,32864
VANDALISM - MISDEAMEANOR ($399 OR UNDER),32560,32560,32560,32560,32560,32560,32560,32560,32560,32560


In [18]:
niceDisplay.loc[niceDisplay['Cluster'] == 1].groupby(['Crime Code Description']).count().sort_values(by=['Crime Code'], ascending=False).head()

Unnamed: 0_level_0,Area ID,Cluster,Crime Code,Premise Code,Reporting District,Sex,Time Occurred,Victim Age,latitude,longitude
Crime Code Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BATTERY - SIMPLE ASSAULT,187136,187136,187136,187136,187136,187136,187136,187136,187136,187136
VEHICLE - STOLEN,125736,125736,125736,125736,125736,125736,125736,125736,125736,125736
THEFT PLAIN - PETTY ($950 & UNDER),80550,80550,80550,80550,80550,80550,80550,80550,80550,80550
BURGLARY FROM VEHICLE,39151,39151,39151,39151,39151,39151,39151,39151,39151,39151
INTIMATE PARTNER - SIMPLE ASSAULT,23270,23270,23270,23270,23270,23270,23270,23270,23270,23270


In [19]:
niceDisplay.loc[niceDisplay['Cluster'] == 2].groupby(['Crime Code Description']).count().sort_values(by=['Crime Code'], ascending=False).head()

Unnamed: 0_level_0,Area ID,Cluster,Crime Code,Premise Code,Reporting District,Sex,Time Occurred,Victim Age,latitude,longitude
Crime Code Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
VEHICLE - STOLEN,897390,897390,897390,897390,897390,897390,897390,897390,897390,897390
BURGLARY FROM VEHICLE,12173,12173,12173,12173,12173,12173,12173,12173,12173,12173
BURGLARY,3036,3036,3036,3036,3036,3036,3036,3036,3036,3036
THEFT PLAIN - PETTY ($950 & UNDER),2250,2250,2250,2250,2250,2250,2250,2250,2250,2250
ROBBERY,1248,1248,1248,1248,1248,1248,1248,1248,1248,1248


In [None]:
## Was trying to see if there was a good way to chart some of this stuff for the presentation.
## Still thinking about it

In [27]:
import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
niceDisplay.head()

Unnamed: 0,Area ID,Cluster,Crime Code,Premise Code,Reporting District,Sex,Time Occurred,Victim Age,latitude,longitude,Crime Code Description
0,0.37141,0.0,-0.087209,-0.176571,0.360983,0.5086,-0.306978,0.045669,-0.002706,-6e-06,THEFT PLAIN - PETTY ($950 & UNDER)
1,0.37141,0.0,-0.087209,-0.176571,0.360983,0.5086,-0.306978,0.045669,-0.002706,-6e-06,THEFT PLAIN - PETTY ($950 & UNDER)
2,0.37141,0.0,-0.087209,-0.176571,0.360983,0.5086,-0.306978,0.045669,-0.002706,-6e-06,THEFT PLAIN - PETTY ($950 & UNDER)
3,0.37141,0.0,-0.087209,-0.176571,0.360983,0.5086,-0.306978,0.045669,-0.002706,-6e-06,THEFT PLAIN - PETTY ($950 & UNDER)
4,0.37141,0.0,-0.087209,-0.176571,0.360983,0.5086,-0.306978,0.045669,-0.002706,-6e-06,THEFT PLAIN - PETTY ($950 & UNDER)


In [34]:
niceDisplay.loc[niceDisplay["Cluster"] == 0.0].groupby(["Cluster", "Crime Code Description"]).count()["Area ID"]

Cluster  Crime Code Description                                         
0.0      ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER                            1
         ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT                      23355
         ATTEMPTED ROBBERY                                                     468
         BATTERY - SIMPLE ASSAULT                                           285520
         BATTERY POLICE (SIMPLE)                                               837
         BATTERY WITH SEXUAL CONTACT                                            66
         BEASTIALITY, CRIME AGAINST NATURE SEXUAL ASSLT WITH ANIM0065            1
         BIKE - ATTEMPTED STOLEN                                                 1
         BIKE - STOLEN                                                       10324
         BOMB SCARE                                                             28
         BRANDISH WEAPON                                                       980
         BUNCO

In [None]:
#cluster1 = niceDisplay[niceDisplay['Cluster'] == 0]
#cluster1

In [None]:
#cluster1.groupby('Crime Code').count()['Sex'].plot.bar()

In [None]:
#cluster1.set_index("Cluster")["Crime Code"].plot.bar()

In [None]:
#cluster1