In [1]:
import ast
import json

import numpy as np
import pandas as pd

In [2]:
%%time
# Event-level tables: one row per reported crime / arrest.
crime_data = pd.read_csv('data/crime-data-from-2010-to-present.csv')
arrest_data = pd.read_csv('data/arrest-data-from-2010-to-present.csv')

# Reference tables keyed by ZIP code. ZIP is read as text so it is never
# treated as a number; the income file is tab-delimited despite its .csv name.
zip_data = pd.read_csv('data/ziplatlon.csv', dtype={'ZIP': str})
race_data = pd.read_csv('data/LARace.csv')
income_data = pd.read_csv('data/LAIncome.csv', delimiter='\t')

# Full-moon dates and times.
moon_data = pd.read_csv('data/full_moon.csv')

Wall time: 11.8 s


In [3]:
# List the arrest table's column names to see which fields are available.
arrest_data.columns

Index(['Report ID', 'Arrest Date', 'Time', 'Area ID', 'Area Name',
       'Reporting District', 'Age', 'Sex Code', 'Descent Code',
       'Charge Group Code', 'Charge Group Description', 'Arrest Type Code',
       'Charge', 'Charge Description', 'Address', 'Cross Street', 'Location',
       'Zip Codes', 'Census Tracts', 'Precinct Boundaries',
       'LA Specific Plans', 'Council Districts',
       'Neighborhood Councils (Certified)'],
      dtype='object')

In [4]:
# Preview the first rows of the income table (Zip, Community, Amount).
income_data.head()

Unnamed: 0,Zip,Community,Amount
0,90001,"Los Angeles (South Los Angeles), Florence-Graham","$43,360"
1,90002,"Los Angeles (Southeast Los Angeles, Watts)","$37,285"
2,90003,"Los Angeles (South Los Angeles, Southeast Los ...","$40,598"
3,90004,"Los Angeles (Hancock Park, Rampart Village, Vi...","$49,675"
4,90005,"Los Angeles (Hancock Park, Koreatown, Wilshire...","$38,491"


In [5]:
# Preview the first rows of the full-moon table (Day, Date, Time).
moon_data.head()

Unnamed: 0,Day,Date,Time
0,Friday,21 January 2000,05:40:24 am [**]
1,Saturday,19 February 2000,05:26:42 pm
2,Monday,20 March 2000,05:44:24 am
3,Tuesday,18 April 2000,07:41:30 pm
4,Thursday,18 May 2000,09:34:24 am


### Dealing with location

In [6]:
# Parse the first arrest's Location text to inspect its structure.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in a CSV cell.
ast.literal_eval(arrest_data['Location'][0])

{'latitude': '33.992',
 'human_address': '{"address": "", "city": "", "state": "", "zip": ""}',
 'longitude': '-118.4201'}

In [7]:
# Count how many distinct ZIP codes the race table covers.
race_data['Zip Code'].nunique()

110

In [8]:
def convert_lat_lon(x):
    """Extract the latitude and longitude fields from one ``Location`` value.

    Parameters
    ----------
    x : str
        Text of a dict literal that carries ``'latitude'`` and ``'longitude'``
        keys, e.g. ``"{'latitude': '33.992', 'longitude': '-118.4201'}"``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` exactly as stored in the literal; no numeric
        conversion is done here.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is not a plain Python literal.
    KeyError
        If the parsed dict lacks ``'latitude'`` or ``'longitude'``.
    """
    # literal_eval accepts literals only, so a crafted CSV cell cannot run code
    # the way eval() would. Parsing once (instead of once per field) also
    # halves the per-row work.
    location = ast.literal_eval(x)
    return (location['latitude'], location['longitude'])


# Wrap the scalar function so it can be applied to a whole column; the wrapped
# callable returns one array of latitudes and one array of longitudes.
convert_lat_lon = np.vectorize(convert_lat_lon)

In [9]:
# Split each arrest's Location literal into separate Lat and Lon columns.
arrest_data['Lat'], arrest_data['Lon'] = convert_lat_lon(arrest_data['Location'])

In [10]:
# Keep only ZIP codes that begin with '90'. The vectorised .str accessor
# replaces the per-row lambda, and na=False drops rows whose ZIP is missing
# instead of failing on them.
zip_data = zip_data[zip_data['ZIP'].str.startswith('90', na=False)]
zip_data

Unnamed: 0,ZIP,LAT,LNG
30021,90001,33.974027,-118.249509
30022,90002,33.949099,-118.246737
30023,90003,33.964131,-118.272783
30024,90004,34.076198,-118.310722
30025,90005,34.059163,-118.306892
...,...,...,...
30187,90813,33.782259,-118.196793
30188,90814,33.771616,-118.143631
30189,90815,33.794348,-118.116391
30190,90822,33.778436,-118.118648


In [11]:
# Take the coordinates of the arrest labelled 0 as a worked example.
sample_lat = arrest_data.loc[0, 'Lat']
sample_lon = arrest_data.loc[0, 'Lon']

In [12]:
# For every ZIP row, record the offset from the sample point in each axis and
# the squared straight-line distance (ToMin) that the nearest ZIP minimises.
temp = (
    zip_data[['ZIP']]
    .assign(
        DifLat=zip_data['LAT'] - float(sample_lat),
        DifLon=zip_data['LNG'] - float(sample_lon),
    )
    .assign(ToMin=lambda offsets: offsets['DifLat']**2 + offsets['DifLon']**2)
)
temp

Unnamed: 0,ZIP,DifLat,DifLon,ToMin
30021,90001,-0.017973,0.170591,0.029424
30022,90002,-0.042901,0.173363,0.031895
30023,90003,-0.027869,0.147317,0.022479
30024,90004,0.084198,0.109378,0.019053
30025,90005,0.067163,0.113208,0.017327
...,...,...,...,...
30187,90813,-0.209741,0.223307,0.093857
30188,90814,-0.220384,0.276469,0.125004
30189,90815,-0.197652,0.303709,0.131305
30190,90822,-0.213564,0.301452,0.136483


In [13]:
# ZIP of the first row whose squared distance equals the smallest one.
temp.loc[temp['ToMin'].eq(temp['ToMin'].min()), 'ZIP'].iloc[0]

'90066'

In [14]:
# Show the sample coordinates that the lookup above was run for.
sample_lat, sample_lon

('33.992', '-118.4201')

In [15]:
def calc_zip(lat, lon, zip_table=None, chunk_size=10_000):
    """Return the ZIP whose (LAT, LNG) point is nearest to each coordinate.

    "Nearest" means the smallest squared difference in raw degrees,
    ``(LAT - lat)**2 + (LNG - lon)**2``; ties go to the earliest row of the
    table. NOTE: degrees of longitude are not scaled by latitude, so this is
    an approximation of true ground distance.

    Parameters
    ----------
    lat, lon : scalar or array-like
        Coordinates as numbers or numeric strings. The two inputs are
        broadcast against each other.
    zip_table : pandas.DataFrame, optional
        Table with ``ZIP``, ``LAT`` and ``LNG`` columns. Defaults to the
        notebook-level ``zip_data``.
    chunk_size : int, optional
        Number of points processed per NumPy block.

    Returns
    -------
    numpy.ndarray
        String array of ZIP codes with the broadcast shape of ``lat``/``lon``
        (0-d for scalar input, empty for empty input).

    Raises
    ------
    ValueError
        If ``chunk_size`` is below 1, the table has no row with usable
        coordinates, or any input coordinate is NaN.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')
    if zip_table is None:
        zip_table = zip_data

    # Pull the reference columns out once; rows without coordinates can never
    # be the nearest match, so they are dropped up front.
    ref_lat = zip_table['LAT'].to_numpy(dtype=float)
    ref_lon = zip_table['LNG'].to_numpy(dtype=float)
    usable = ~(np.isnan(ref_lat) | np.isnan(ref_lon))
    if not usable.any():
        raise ValueError('zip table has no rows with LAT/LNG values')
    ref_lat, ref_lon = ref_lat[usable], ref_lon[usable]
    ref_zip = zip_table['ZIP'].to_numpy(dtype=str)[usable]

    query_lat, query_lon = np.broadcast_arrays(
        np.asarray(lat, dtype=float), np.asarray(lon, dtype=float)
    )
    out_shape = query_lat.shape
    query_lat, query_lon = query_lat.ravel(), query_lon.ravel()
    # argmin would silently pick row 0 for a NaN point, so reject it instead.
    if np.isnan(query_lat).any() or np.isnan(query_lon).any():
        raise ValueError('lat and lon must not contain NaN')

    # Work in blocks of points: each block needs one
    # (chunk_size x number-of-ZIPs) distance matrix, which keeps memory
    # bounded while avoiding any per-point Python or pandas overhead.
    nearest = np.empty(query_lat.size, dtype=np.intp)
    for start in range(0, query_lat.size, chunk_size):
        rows = slice(start, start + chunk_size)
        sq_dist = (
            (ref_lat - query_lat[rows, None]) ** 2
            + (ref_lon - query_lon[rows, None]) ** 2
        )
        nearest[rows] = sq_dist.argmin(axis=1)
    return ref_zip[nearest].reshape(out_shape)

In [16]:
%%time
# Tag every arrest with the ZIP that calc_zip selects for its Lat/Lon pair.
arrest_data['ZipCode'] = calc_zip(arrest_data['Lat'], arrest_data['Lon'])

KeyboardInterrupt: 

In [17]:
# Save the arrest table (without the index) so it can be reloaded from disk.
arrest_data.to_csv('arrest_with_correct_zipcode.csv', index=False)

KeyboardInterrupt: 

In [None]:
%%time
# Read the saved file back to time the reload. Without an explicit dtype the
# all-digit ZipCode column would be inferred as integers; reading it as str
# keeps it comparable with zip_data['ZIP'], which is also loaded as str.
pd.read_csv('arrest_with_correct_zipcode.csv', dtype={'ZipCode': str})

In [None]:
# Split each crime's location literal into separate Lat and Lon columns.
# NOTE(review): the key 'Location ' ends with a space, unlike the arrest
# table's 'Location' — confirm it matches the crime CSV header exactly.
crime_data['Lat'], crime_data['Lon'] = convert_lat_lon(crime_data['Location '])

In [None]:
%%time
# Tag every crime with the ZIP that calc_zip selects for its Lat/Lon pair.
crime_data['ZipCode'] = calc_zip(crime_data['Lat'], crime_data['Lon'])

In [None]:
# Save the crime table (without the index) so it can be reloaded from disk.
crime_data.to_csv('crime_with_correct_zipcode.csv', index=False)

In [None]:
%%time
# Read the saved file back to time the reload. Without an explicit dtype the
# all-digit ZipCode column would be inferred as integers; reading it as str
# keeps it comparable with zip_data['ZIP'], which is also loaded as str.
pd.read_csv('crime_with_correct_zipcode.csv', dtype={'ZipCode': str})