In [1]:
import sys
import numpy as np
import pandas as pd

from pandas.util.testing import assert_frame_equal

## read in the data
crime = pd.read_csv('Crime_Data_Chicago.csv',low_memory = False)

In [2]:
crime.shape

(100000, 26)

In [3]:
## create latitude and longitude columns separately
crime['latitude'] = np.array([float(s.split(',')[0][1:]) for s in crime['Location ']])
crime['longitude'] = np.array([float(s.split(',')[1][:-1]) for s in crime['Location ']])

In [4]:
crime.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,latitude,longitude
0,1208575,3/14/2013,3/11/2013,1800,12,77th Street,1241,626,INTIMATE PARTNER - SIMPLE ASSAULT,0416 0446 1243 2000,...,Adult Other,626,,,,6300 BRYNHURST AV,,"(33.9829, -118.3338)",33.9829,-118.3338
1,102005556,1/25/2010,1/22/2010,2300,20,Olympic,2071,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,VAN NESS,15TH,"(34.0454, -118.3157)",34.0454,-118.3157
2,418,3/19/2013,3/18/2013,2030,18,Southeast,1823,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,200 E 104TH ST,,"(33.942, -118.2717)",33.942,-118.2717
3,101822289,11/11/2010,11/10/2010,1800,18,Southeast,1803,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,88TH,WALL,"(33.9572, -118.2717)",33.9572,-118.2717
4,42104479,1/11/2014,1/4/2014,2300,21,Topanga,2133,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),329,...,Invest Cont,745,,,,7200 CIRRUS WY,,"(34.2009, -118.6369)",34.2009,-118.6369


In [5]:
## rescale time variable
def newTimes(df):
    df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]] = 2400 - (800 - df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]])
    df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] = df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] - 800
    
## make sex binary!
def newSex(df):
    df['Sex'] = np.zeros(len(df['Victim Sex']))
    df['Sex'][np.where(df['Victim Sex'] == 'M')[0]] = 1
    df['Sex'][np.where(df['Victim Sex'] == 'F')[0]] = 0
    df['Sex'][np.where(df['Victim Sex'] == '')[0]] = 0


# def classifyPoints(df,centroids):


In [6]:
## clean age (remove missing values)
def cleanAge(df):
    meanAge = np.mean(df['Victim Age'])
    df['Victim Age'][np.isnan(df['Victim Age'])] = meanAge
    
## normalize all the variables
def normalizeDF(df,columns):
    return (df[columns] - df[columns].mean()) / (np.absolute(df[columns]).max()) 

## distance formulas
def L2Norm(pt1,pt2):
    return np.sqrt(np.sum((pt1.values[0] - pt2.values[0]) ** 2))
    
def L1Norm(pt1,pt2):
    return np.sum(np.absolute((pt1.values[0] - pt2.values[0])))
    
## getting the new centroids
def getCentroids(df, clusters):
    newCentroids = {k: None for k in range(len(clusters))}
    for key, value in clusters.items():
        newCentroids[key] = np.mean(df.loc[value])
    return pd.DataFrame.from_dict(newCentroids, orient="index")

## classifies points
def classifyPoint(clusters, centroids, point, index, distFunc, k):
    distDict = {key: distFunc(point, centroids.loc[key]) for key in range(k)}
    clusters[min(distDict, key=distDict.get)].append(index)

In [7]:
## the big function
def cluster(df, columns, k):
    df = df[columns]
    ## pick k random points from the data set
    firstIDs = np.random.randint(len(df), size=k)
    ## set centroids
    centroids = df.iloc[firstIDs].reset_index(drop=True)
    ## run until clusters don't change (hopefully not forever)
    while True:
        clusters = {k: [] for k in range(k)}
        # print ("Here")
        for i in range(len(df)):
            classifyPoint(clusters, centroids, df.loc[i], i, L2Norm, k)
        newCentroids = getCentroids(df, clusters)
        # maybe check if it hits a certain threshold?
        try:
            assert_frame_equal(centroids, newCentroids)
            break
        except:
            centroids = newCentroids
    return clusters

In [8]:
## columns we wanna use
cols = ['Time Occurred','Area ID','Reporting District','Crime Code','Victim Age','Sex','Premise Code','latitude','longitude']

## data cleaning
newTimes(crime)
newSex(crime)
cleanAge(crime)
crime = normalizeDF(crime, cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy o

In [9]:
crimeSmall = crime.head(100)

In [10]:
cluster(crimeSmall, cols, 3)

{0: [1,
  2,
  4,
  6,
  10,
  14,
  15,
  21,
  23,
  24,
  36,
  38,
  45,
  46,
  49,
  52,
  54,
  55,
  62,
  64,
  66,
  68,
  72,
  75,
  76,
  77,
  78,
  85,
  86,
  87,
  88,
  92,
  93,
  96,
  98,
  99],
 1: [5,
  9,
  11,
  12,
  13,
  16,
  17,
  18,
  25,
  26,
  33,
  34,
  39,
  40,
  44,
  48,
  51,
  59,
  63,
  67,
  70,
  82,
  89,
  94,
  97],
 2: [0,
  3,
  7,
  8,
  19,
  20,
  22,
  27,
  28,
  29,
  30,
  31,
  32,
  35,
  37,
  41,
  42,
  43,
  47,
  50,
  53,
  56,
  57,
  58,
  60,
  61,
  65,
  69,
  71,
  73,
  74,
  79,
  80,
  81,
  83,
  84,
  90,
  91,
  95]}

In [None]:
crime.head()

In [None]:
## still need to fix and use the normalizing of variables
cluster(crime,cols,50)