In [1]:
import sys
import numpy as np
import pandas as pd

In [2]:
## read in the data
crime = pd.read_csv('Crime_Data_Chicago.csv',low_memory = False)
crime = crime[:5000]

In [3]:
## create latitude and longitude columns separately
crime['latitude'] = np.array([float(s.split(',')[0][1:]) for s in crime['Location ']])
crime['longitude'] = np.array([float(s.split(',')[1][:-1]) for s in crime['Location ']])

In [4]:
crime

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,latitude,longitude
0,1208575,3/14/2013,3/11/2013,1800,12,77th Street,1241,626,INTIMATE PARTNER - SIMPLE ASSAULT,0416 0446 1243 2000,...,Adult Other,626,,,,6300 BRYNHURST AV,,"(33.9829, -118.3338)",33.9829,-118.3338
1,102005556,1/25/2010,1/22/2010,2300,20,Olympic,2071,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,VAN NESS,15TH,"(34.0454, -118.3157)",34.0454,-118.3157
2,418,3/19/2013,3/18/2013,2030,18,Southeast,1823,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,200 E 104TH ST,,"(33.942, -118.2717)",33.9420,-118.2717
3,101822289,11/11/2010,11/10/2010,1800,18,Southeast,1803,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,88TH,WALL,"(33.9572, -118.2717)",33.9572,-118.2717
4,42104479,1/11/2014,1/4/2014,2300,21,Topanga,2133,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),329,...,Invest Cont,745,,,,7200 CIRRUS WY,,"(34.2009, -118.6369)",34.2009,-118.6369
5,120125367,1/8/2013,1/8/2013,1400,1,Central,111,110,CRIMINAL HOMICIDE,1243 2000 1813 1814 2002 0416 0400,...,Adult Arrest,110,,,,600 N HILL ST,,"(34.0591, -118.2412)",34.0591,-118.2412
6,101105609,1/28/2010,1/27/2010,2230,11,Northeast,1125,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,YORK,AVENUE 51,"(34.1211, -118.2048)",34.1211,-118.2048
7,101620051,11/11/2010,11/7/2010,1600,16,Foothill,1641,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,EL DORADO,TRUESDALE,"(34.241, -118.3987)",34.2410,-118.3987
8,101910498,4/7/2010,4/7/2010,1600,19,Mission,1902,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,GLENOAKS,DRELL,"(34.3147, -118.4589)",34.3147,-118.4589
9,120908292,3/29/2013,1/15/2013,800,9,Van Nuys,904,668,"EMBEZZLEMENT, GRAND THEFT ($950.01 & OVER)",0344 1300,...,Invest Cont,668,,,,7200 SEPULVEDA BL,,"(34.2012, -118.4662)",34.2012,-118.4662


In [5]:
## rescale time variable
def newTimes(df):
    df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]] = 2400 - (800 - df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]])
    df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] = df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] - 800
    
## make sex binary!
def newSex(df):
    df['Sex'] = np.zeros(len(df['Victim Sex']))
    df['Sex'][np.where(df['Victim Sex'] == 'M')[0]] = 1
    df['Sex'][np.where(df['Victim Sex'] == 'F')[0]] = 0
    df['Sex'][np.where(df['Victim Sex'] == '')[0]] = 0

## clean age (remove missing values)
def cleanAge(df):
    meanAge = np.mean(df['Victim Age'])
    df['Victim Age'][np.isnan(df['Victim Age'])] = meanAge

In [6]:
## distance formulas
def L2Norm(pt1,pt2):
    return np.sqrt(np.sum((pt1.values[0] - pt2.values[0]) ** 2))

def L1Norm(pt1,pt2):
    return np.sum(np.absolute((pt1.values[0] - pt2.values[0])))

In [7]:
def classifyPoint(point, centroids, clusters):
    smallestDist = np.inf
    closestCentroid = -1
    
    for c in range(len(centroids)):
        if (centroids.iloc[[c]].equals(point)):
            break
        temp = L2Norm(centroids.iloc[[c]], point)
        if temp < smallestDist:
            closestCentroid = c
            smallestDist = temp

    # Add point to closest cluster  
    if c not in clusters:
        clusters[c] = []
    clusters[c].append(point)

In [8]:
def getNewCentroids(df, clusters):
    newClusters = pd.DataFrame(data=None, columns=df.columns)
    for key, points in clusters.items():
        cluster = pd.DataFrame(data=None, columns=df.columns)
        for p in points:
            cluster = pd.concat([cluster, pd.DataFrame(p)], axis=0)
        
        newClusters = newClusters.append(np.mean(cluster), ignore_index=True)
    return newClusters

In [9]:
def cluster(df,columns,k):
    df = df[columns]
    
    ## pick k random points from the data set 
    ## and make sure they are unique
    firstIDs = np.random.randint(len(df), size = 3)
    while (len(np.unique(firstIDs)) < 3):
        firstIDs = np.random.randint(len(df), size = 3)
    
    ## set centroids
    centroids = df.iloc[firstIDs]
    #centroids.reindex(index=range(k))
    
    #clusters = {}#{k: [] for k in range(k)}
    ## run until clusters don't change (hopefully not forever)
    while True:
        clusters = {}
        for p in range(len(df)):
            point = df.iloc[[p]]
            classifyPoint(point, centroids, clusters)
            
        newCentroids = getNewCentroids(df, clusters)
        
        # maybe check if it hits a certain threshold?
        if (centroids.equals(newCentroids)):
            break
        else:
            centroids = newCentroids
    return clusters

In [10]:
## data cleaning
newTimes(crime)
newSex(crime)
cleanAge(crime)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set

In [11]:
cols = ['Time Occurred','Area ID','Reporting District','Crime Code','Victim Age','Sex','Premise Code','latitude','longitude']

In [12]:
cluster(crime,cols,50)

{0: [   Time Occurred  Area ID  Reporting District  Crime Code  Victim Age  Sex  \
  0           1000       12                1241         626        30.0  0.0   
  
     Premise Code  latitude  longitude  
  0         502.0   33.9829  -118.3338  ,
     Time Occurred  Area ID  Reporting District  Crime Code  Victim Age  Sex  \
  1           1500       20                2071         510   37.478759  0.0   
  
     Premise Code  latitude  longitude  
  1         101.0   34.0454  -118.3157  ,
     Time Occurred  Area ID  Reporting District  Crime Code  Victim Age  Sex  \
  2           1230       18                1823         510        12.0  0.0   
  
     Premise Code  latitude  longitude  
  2         101.0    33.942  -118.2717  ,
     Time Occurred  Area ID  Reporting District  Crime Code  Victim Age  Sex  \
  3           1000       18                1803         510   37.478759  0.0   
  
     Premise Code  latitude  longitude  
  3         101.0   33.9572  -118.2717  ,
     Time Occ