# Merge Crime data


In [None]:
import pandas as pd 
import numpy as np
import urllib2

### Function to pull relevant data

In [None]:
def get_data(city, startyear, endyear):
    """Download and combine the yearly crime CSV files of one city.

    For every year from startyear to endyear (inclusive) the file
    ``<city>-<year>-crime.csv`` is read from the Data4Democracy/usa-dashboard
    GitHub repo: first from ``<city>/data/`` and, when that location answers
    with HTTP 404, from ``<city>/`` directly.

    city: city prefix/folder name used in the repo (e.g. 'bal').
    startyear, endyear: first and last year (both inclusive) to download.

    Returns a single DataFrame with all years that could be loaded, plus a
    'city' column set to ``city`` and a 'date' column built from the 'year',
    'month' and 'day' columns of the files.

    Raises ValueError when not a single file could be loaded (empty year
    range, or every year missing/failing).
    """
    # Local import so the function works on both Python 2 and Python 3.
    try:
        from urllib2 import HTTPError
    except ImportError:
        from urllib.error import HTTPError

    base = 'https://raw.githubusercontent.com/Data4Democracy/usa-dashboard/master/'
    frames = []
    for year in range(startyear, endyear + 1):
        filename = str(city) + '-' + str(year) + '-crime.csv'
        # The repo layout is not uniform: some cities keep the files in a
        # data/ sub-folder, others directly in the city folder.
        candidates = [
            base + str(city) + '/data/' + filename,
            base + str(city) + '/' + filename,
        ]
        for url in candidates:
            try:
                frames.append(pd.read_csv(url))
                break
            except HTTPError as err:
                if err.code != 404:
                    # Not a plain "file absent": report it instead of
                    # dropping the year silently, then skip this year.
                    print("could not download " + url + " (HTTP " + str(err.code) + ")")
                    break
        else:
            # Both locations answered 404: no data for this city/year.
            print("data for " + str(city) + str(year) + " doesn't exist")

    if not frames:
        raise ValueError(
            "no crime data could be loaded for city " + repr(city)
            + " between " + str(startyear) + " and " + str(endyear))

    data = pd.concat(frames)
    data['city'] = city
    # Build a proper datetime from the separate year/month/day columns.
    data['date'] = pd.to_datetime(
        data['year'].astype(str) + '/'
        + data['month'].astype(str) + '/'
        + data['day'].astype(str))
    return data
    

In [None]:
# Baltimore crime reports, 2006 through 2016
bal = get_data('bal', 2006, 2016)

In [None]:
# Rename Baltimore's 'description' column to the shared 'crime_type' name
bal.rename(columns={'description': 'crime_type'}, inplace=True)

In [None]:
# Chicago crime reports, 2006 through 2016
chi = get_data('chi', 2006, 2016)

In [None]:
# Rename Chicago's 'primary_type' column to the shared 'crime_type' name
chi.rename(columns={'primary_type': 'crime_type'}, inplace=True)

In [None]:
# New York City crime reports, 2006 through 2016
nyc = get_data('nyc', 2006, 2016)

In [None]:
# Rename NYC's 'ofns_desc' column to the shared 'crime_type' name
nyc.rename(columns={'ofns_desc': 'crime_type'}, inplace=True)

In [None]:
# Seattle crime reports, 2005 through 2015 (a different year range than the other cities)
# NOTE(review): unlike the other cities, no column is renamed to 'crime_type'
# for Seattle -- confirm its files already use that column name.
sea = get_data('sea', 2005, 2015)

In [None]:
# Stack the four city tables into one frame with a fresh running index
df = pd.concat([nyc, sea, bal, chi], ignore_index=True)

In [None]:
# Lower-case every crime type so later text comparisons do not depend on case
df['crime_type'] = df['crime_type'].str.lower()

In [None]:
def normalize_name(name):
    """
    Standardizes names to make for easier comparisons

    Every ' and ' is replaced by ' & ' and every ' codes' by ' code'.

    name: full name of a crime type, can be in any format. Values that are
          not strings (e.g. None or NaN for a missing crime type) are
          returned unchanged instead of raising.
    """
    try:
        # str.replace is a no-op when the pattern is absent, so no
        # membership pre-check is needed.
        return name.replace(' and ', ' & ').replace(' codes', ' code')
    except AttributeError:
        # Not a string (missing value): nothing to normalize.
        return name

In [None]:
# Run every crime type through normalize_name so spellings agree across cities
df['crime_type'] = df['crime_type'].map(normalize_name)

In [None]:
## Create standard categories based on FBI Uniform Crime Reporting

# Ordered (keywords, UCR category) rules: the first rule with a keyword
# contained in the crime type wins, so the order matters.
# NOTE(review): the original chain also listed 'robbery - carjacking' under
# motor vehicle theft, but that test could never fire because 'robbery'
# matches first; carjackings are therefore labelled 'robbery' -- confirm
# that this is the intended category.
_UCR_RULES = [
    (('rape',), 'rape'),
    (('robbery',), 'robbery'),
    (('agg. assault', 'aggravated assault'), 'aggravated assault'),
    (('auto theft', 'motor vehicle theft'), 'motor vehicle theft'),
    (('arson',), 'arson'),
    (('larceny',), 'larceny'),
    (('burglar',), 'burglary'),
    (('murder',), 'murder & non-negl. manslaughter'),
]


def _ucr_category(crime_type):
    """Return the UCR category matching crime_type, or None.

    crime_type: lower-cased crime description. None is returned when no
    rule matches (crime reports without an equivalent UCR category) and
    for missing values (None/NaN), which cannot be searched for keywords.
    """
    try:
        for keywords, category in _UCR_RULES:
            if any(keyword in crime_type for keyword in keywords):
                return category
    except TypeError:
        # Missing value: `in` is not supported on None/NaN.
        return None
    return None


ucr = [_ucr_category(row) for row in df['crime_type']]

In [None]:
# Attach the per-row UCR categories to the combined frame as column 'ucr'
df['ucr'] = ucr