# This is currently a work in progress.  Feel free to contribute as it moves along!

## This notebook is a gradual work in progress to make this data set more complete and easier for people to import and use for their own data analysis with minimal data cleaning.

In [138]:
import pandas as pd
import os

## Read in the raw data files and combine into one data frame

In [139]:
# Collect the raw extracts to load.  os.listdir returns every directory entry in
# arbitrary order, so keep only the .csv files (anything else would make
# pd.read_csv fail) and sort them so the combined frame is built in the same
# order on every machine.
file_list = sorted(f for f in os.listdir('raw_data/') if f.lower().endswith('.csv'))

In [140]:
# Read each raw file into its own frame, then stack them.  The outer index level
# ('FileName') records which file a row came from; the inner level ('RowNumber')
# is the row's position within that file.
raw_frames = [pd.read_csv('raw_data/' + file_name) for file_name in file_list]
d = pd.concat(raw_frames, keys=file_list, names=['FileName', 'RowNumber'])

In [141]:
# Throw away the per-file row counter, then turn the remaining 'FileName' index
# level into an ordinary leading column with a fresh 0..n-1 index.
df = d.reset_index(level='RowNumber', drop=True).reset_index()

## Load in neighborhood lookup table and join it with the data frame

In [142]:
# Neighborhood lookup table, keyed by its 'Neighborhood Number' column
# (that key is what the merge below joins on).
nbhds = pd.read_csv(filepath_or_buffer='neighborhood_lookup.csv')

In [143]:
# Left-join the lookup table onto the crime rows so every row is kept even when
# its neighborhood has no match, then discard the lookup's key column because it
# just repeats 'Neighborhood'.
df2 = pd.merge(
    df,
    nbhds,
    how='left',
    left_on='Neighborhood',
    right_on='Neighborhood Number'
).drop('Neighborhood Number', axis=1)

## Fixing column names to something more consistent

In [144]:
# Remove every space from the column labels so all columns share one naming style.
col_names = list(df2.columns)
df2.columns = [''.join(label.split(' ')) for label in col_names]

In [145]:
#df2.columns

## Here we're trying to align the crime codes in the Crime field to the Uniform Crime Reporting codes.
First we need to pad out the Crime column to be six digits, since the leading zero was trimmed in the .csv files

*Need to figure out the numbering system for these crimes.  See UCR, NIBRS, NCIC crime codes and try to make sense of it.*

In [146]:
# Render the crime code as text and left-pad it with zeros to six characters,
# restoring the leading zero that was lost in the .csv files.
# NOTE(review): assumes 'Crime' was read as an integer column; a float column
# would stringify with a trailing '.0' -- confirm against the raw files.
crime_as_text = df2['Crime'].astype(str)
df2['Crime'] = crime_as_text.map(lambda code: code.zfill(6))

In [147]:
# The first two characters of the padded crime code are what gets matched
# against the UCR lookup table.
df2['ShortCrimeCode'] = df2['Crime'].str[:2]

In [148]:
# Load the UCR code lookup table.  It is merged onto df2 further down by matching
# df2's 'ShortCrimeCode' column against this table's 'UCRCode' column.
ucr_codes = pd.read_csv('UCRCodes.csv')

In [149]:
# Zero-pad the lookup key to two characters so it has the same textual form as
# df2['ShortCrimeCode'].
ucr_code_text = ucr_codes['UCRCode'].astype(str)
ucr_codes['UCRCode'] = ucr_code_text.map(lambda code: code.zfill(2))

In [150]:
# Left-join the UCR lookup fields onto every crime row, then discard the lookup's
# key column since it duplicates 'ShortCrimeCode'.
df3 = pd.merge(
    df2,
    ucr_codes,
    how='left',
    left_on='ShortCrimeCode',
    right_on='UCRCode'
).drop('UCRCode', axis=1)

# Working to fill in missing data

## The below code merges two disjoint columns that were renamed at some point.  The merged columns are:
### DateOccured/DateOccur
### CodedMonth/MonthReportedtoMSHP
### FlagUnfounded/UnfoundedCrimeIndicator
### FlagAdministrative/AdministrativeAdjustmentIndicator
### FlagCrime/NewCrimeIndicator

In [151]:
# Each of these fields is split across two columns because it was renamed at some
# point.  Coalesce every pair into the column that is kept.
# combine_first keeps the preferred column's value where it is present and falls
# back to the other column otherwise.  Unlike the previous concat + reindex_like
# approach, it does not raise a duplicate-index error when a row happens to have
# both columns populated.
renamed_column_pairs = [
    # (column kept, preferred source, fallback source)
    ('DateOccured', 'DateOccur', 'DateOccured'),
    ('CodedMonth', 'CodedMonth', 'MonthReportedtoMSHP'),
    ('FlagUnfounded', 'FlagUnfounded', 'UnfoundedCrimeIndicator'),
    ('FlagAdministrative', 'FlagAdministrative', 'AdministrativeAdjustmentIndicator'),
    ('FlagCrime', 'FlagCrime', 'NewCrimeIndicator'),
]
for kept_column, preferred_column, fallback_column in renamed_column_pairs:
    df3[kept_column] = df3[preferred_column].combine_first(df3[fallback_column])

In [152]:
# Now that columns are merged, clean up missing values appropriately.
# (pd.np was deprecated in pandas 1.0 and removed in 2.0, so this cell uses
# fillna and a plain float NaN instead of pd.np.NaN.)

# A single blank in a flag column means the flag is not set.
for flag_column in ('FlagAdministrative', 'FlagCrime', 'FlagCleanup', 'FlagUnfounded'):
    df3[flag_column] = df3[flag_column].replace(' ', 'N')

# FlagCleanup actually contains no 'Y' flags.  I suspect it's just a flag column
# that has yet to be used, so its missing values are treated as 'N' as well.
df3['FlagCleanup'] = df3['FlagCleanup'].fillna('N')

# In the free-text location fields a single blank means "no value": store NaN.
missing_value = float('nan')
for text_column in ('LocationName', 'LocationComment'):
    df3[text_column] = df3[text_column].replace(' ', missing_value)

# Let's change the X/Y to WGS84 lat/lng

### Import pyproj to use Proj and transform

In [153]:
import pyproj

### Define the convert function

In [154]:
# Source projection: EPSG:26996, identified by the original author as the East
# Missouri State Plane coordinate system.
# NOTE(review): preserve_units=True presumably keeps the projection's own units
# rather than forcing meters -- confirm how that interacts with the
# feet-to-meters scaling applied in convert().
# NOTE(review): the init= form appears to be deprecated in pyproj 2 and later --
# verify against the installed version.
state_plane = pyproj.Proj(init='EPSG:26996', preserve_units=True) #East MO State Plane Coord
# Target coordinate system: plain longitude/latitude on the WGS84 datum.
wgs = pyproj.Proj(proj='latlong', datum='WGS84', ellps='WGS84') #Want WGS84

# Scale factor applied to the raw X/Y coordinates before projecting (feet -> meters).
FEET_TO_METERS = 0.3048

def convert(x, y):
    """Convert a state-plane X/Y coordinate pair to WGS84 latitude/longitude.

    Parameters
    ----------
    x, y : number (or array of numbers)
        State-plane easting and northing, in feet.

    Returns
    -------
    tuple
        ``(lat, lng)`` -- note that latitude comes first, the reverse of the
        ``(lng, lat)`` order returned by ``pyproj.transform``.

    The inputs are scaled into new objects rather than with ``*=``, so arrays
    passed in by the caller are never modified in place.
    """
    x_meters = x * FEET_TO_METERS
    y_meters = y * FEET_TO_METERS
    lng, lat = pyproj.transform(state_plane, wgs, x_meters, y_meters)
    return lat, lng

### Apply the convert function to the X/Y Coord

In [155]:
# Run convert() on every (XCoord, YCoord) pair.  The X column is cast to object
# dtype first so each result can be stored as a (lat, lng) tuple.
x_values = df3['XCoord'].astype(object)
a = x_values.combine(df3['YCoord'], convert)
# Wrap the resulting Series in a one-column frame named 'LatLong'.
y = pd.DataFrame({'LatLong': a})

### Adds Latitude and Longitude columns to df3

In [156]:
# Each entry of y['LatLong'] is the (lat, lng) tuple returned by convert().
# Index the tuple directly instead of turning it into a string and parsing the
# text back: the values are the same, but this does not depend on how the
# numbers happen to be printed.
latitudes = [float(pair[0]) for pair in y['LatLong']]
longitudes = [float(pair[1]) for pair in y['LatLong']]
df3['Latitude'] = latitudes
df3['Longitude'] = longitudes

## Write the data out to a tab-delimited file

In [157]:
# List every column currently in df3.  The following cells use this listing to
# drop the columns we don't need and reorder the rest in a way that makes some
# sort of sense.
df3.columns

Index([u'FileName', u'AdministrativeAdjustmentIndicator', u'CADAddress', u'CADStreet', u'CodedMonth', u'Complaint', u'Count', u'Crime', u'DateOccur', u'DateOccured', u'Description', u'District', u'FlagAdministrative', u'FlagCleanup', u'FlagCrime', u'FlagUnfounded', u'ILEADSAddress', u'ILEADSStreet', u'LocationComment', u'LocationName', u'MonthReportedtoMSHP', u'Neighborhood', u'NewCrimeIndicator', u'UnfoundedCrimeIndicator', u'XCoord', u'YCoord', u'NeighborhoodName', u'NeighborhoodPrimaryDistrict', u'NeighborhoodAddlDistrict', u'ShortCrimeCode', u'UCRType', u'UCRCrime', u'Latitude', u'Longitude'], dtype='object')

In [166]:
# Columns written to the cleaned data set, in output order.  The superseded
# halves of the renamed column pairs, the raw X/Y coordinates and 'UCRCrime'
# are intentionally left out.
keep_columns = [
    # provenance and address
    u'FileName', u'CADAddress', u'CADStreet',
    # reporting month, complaint number, count
    u'CodedMonth', u'Complaint', u'Count',
    # crime classification
    u'Crime', u'ShortCrimeCode', u'UCRType',
    # when, what, which district
    u'DateOccured', u'Description', u'District',
    # flags
    u'FlagAdministrative', u'FlagCleanup', u'FlagCrime', u'FlagUnfounded',
    # ILEADS address and location details
    u'ILEADSAddress', u'ILEADSStreet', u'LocationComment', u'LocationName',
    # neighborhood
    u'Neighborhood', u'NeighborhoodName',
    u'NeighborhoodPrimaryDistrict', u'NeighborhoodAddlDistrict',
    # WGS84 position
    u'Latitude', u'Longitude',
]

In [167]:
# Select only the columns named in keep_columns, in that order.
df4 = df3[keep_columns]

In [168]:
# Create the output directory if it is missing, so the final step of a long run
# cannot fail just because clean_data/ does not exist yet.
if not os.path.isdir('clean_data'):
    os.makedirs('clean_data')

# Write the cleaned data as a tab-delimited text file.  The frame's index is
# written as the first, unnamed column, exactly as before.
df4.to_csv('clean_data/stl_crime_data.txt', sep='\t')