In [1]:
import sys
import numpy as np
import pandas as pd

## read in the data
crime = pd.read_csv('Crime_Data_Chicago.csv',low_memory = False)

In [7]:
crime.shape

(100000, 28)

In [8]:
## create latitude and longitude columns separately
crime['latitude'] = np.array([float(s.split(',')[0][1:]) for s in crime['Location ']])
crime['longitude'] = np.array([float(s.split(',')[1][:-1]) for s in crime['Location ']])

In [9]:
crime.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,latitude,longitude
0,1208575,3/14/2013,3/11/2013,1800,12,77th Street,1241,626,INTIMATE PARTNER - SIMPLE ASSAULT,0416 0446 1243 2000,...,Adult Other,626,,,,6300 BRYNHURST AV,,"(33.9829, -118.3338)",33.9829,-118.3338
1,102005556,1/25/2010,1/22/2010,2300,20,Olympic,2071,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,VAN NESS,15TH,"(34.0454, -118.3157)",34.0454,-118.3157
2,418,3/19/2013,3/18/2013,2030,18,Southeast,1823,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,200 E 104TH ST,,"(33.942, -118.2717)",33.942,-118.2717
3,101822289,11/11/2010,11/10/2010,1800,18,Southeast,1803,510,VEHICLE - STOLEN,,...,Invest Cont,510,,,,88TH,WALL,"(33.9572, -118.2717)",33.9572,-118.2717
4,42104479,1/11/2014,1/4/2014,2300,21,Topanga,2133,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),329,...,Invest Cont,745,,,,7200 CIRRUS WY,,"(34.2009, -118.6369)",34.2009,-118.6369


In [15]:
## rescale time variable
def newTimes(df):
    df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]] = 2400 - (800 - df['Time Occurred'][np.where(df['Time Occurred'] < 800)[0]])
    df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] = df['Time Occurred'][np.where(df['Time Occurred'] >= 800)[0]] - 800
    
## make sex binary!
def newSex(df):
    df['Sex'] = np.zeros(len(df['Victim Sex']))
    df['Sex'][np.where(df['Victim Sex'] == 'M')[0]] = 1
    df['Sex'][np.where(df['Victim Sex'] == 'F')[0]] = 0
    df['Sex'][np.where(df['Victim Sex'] == '')[0]] = 0

## clean age (remove missing values)
def cleanAge(df):
    meanAge = np.mean(df['Victim Age'])
    df['Victim Age'][np.isnan(df['Victim Age'])] = meanAge
    
## normalize all the variables
def normalizeDF(df,columns):
    return (df[columns] - df[columns].mean()) / (np.absolute(df[columns]).max()) 

## distance formulas
def L2Norm(pt1,pt2):
    return np.sqrt(np.sum((pt1.values[0] - pt2.values[0]) ** 2))
    
def L1Norm(pt1,pt2):
    return np.sum(np.absolute((pt1.values[0] - pt2.values[0])))
    
## getting the new centroids
def getCentroids(df, clusters):
    newClusters = pd.DataFrame(data=None, columns=df.columns)
    for key, value in clusters.items():
        rows = df[value]
        newClusters.append(np.mean(rows))
    return newClusters
    #for i in range(k):
    #   for col in df.columns:
    #        cent[i][col] = np.sum(df[col]) / len(df[col])
    #return cent

## classifies points
def classifyPoint(df,centroids,point):
    smallestDist = np.inf
    closest = 0
    i = 0
    for c in range(len(centroids)):
        i += 1
        if i%1000 == 0:
            print ("Hans")
        temp = L2Norm(centroids.iloc[[c]],df.iloc[[point]])
        if temp < smallestDist:
            closest = i
            smallestDist = temp
    df.iloc[[point]]['Class'] = closest

# def classifyPoints(df,centroids):


## the big function
def cluster(df,columns,k):
    df = df[columns]
    ## pick k random points from the data set
    firstIDs = np.random.randint(len(df),size = k)
    ## set centroids
    centroids = df.iloc[firstIDs]
    ## run until clusters don't change (hopefully not forever)
    while True:
        clusters = {k: [] for k in range(k)}
        # print ("Here")
        df['Class'] = np.zeros(len(df))
        centroids['Class'] = np.zeros(len(centroids))
        for i in range(len(df)):
            classifyPoint(df,centroids,i)
        newCentroids = getCentroids(df, clusters)
        # maybe check if it hits a certain threshold?
        if (centroids == newCentroids):
            break
        else:
            centroids = newCentroids
    return clusters

In [11]:
## columns we wanna use
cols = ['Time Occurred','Area ID','Reporting District','Crime Code','Victim Age','Sex','Premise Code','latitude','longitude']

## data cleaning
newTimes(crime)
newSex(crime)
cleanAge(crime)
crime = normalizeDF(crime, cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy o

In [12]:
crime.head()

Unnamed: 0,Time Occurred,Area ID,Reporting District,Crime Code,Victim Age,Sex,Premise Code,latitude,longitude
0,0.091088,0.264772,0.250982,0.120058,-0.06467954,-0.48127,0.228746,-0.002317,5.6e-05
1,0.411807,0.645724,0.628598,-0.001281,-8.35426e-14,-0.48127,-0.265705,-0.000515,0.000209
2,0.238619,0.550486,0.515769,-0.001281,-0.2464977,-0.48127,-0.265705,-0.003496,0.000579
3,0.091088,0.550486,0.506669,-0.001281,-8.35426e-14,-0.48127,-0.265705,-0.003058,0.000579
4,0.411807,0.693343,0.656806,0.244535,0.480775,0.51873,0.227513,0.003969,-0.002494


In [None]:
## still need to fix and use the normalizing of variables
cluster(crime,cols,50)