In [1]:
##########################################
#Step 0: Import libraries
##########################################
import os
import pandas as pd
from datetime import date
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import missingno as msno

In [2]:
##########################################
#Step 1: Import data
##########################################

In [3]:
#Setting directory
#The project folder defaults to the author's local path. To run the notebook
#on another machine, set the UCDPA_PROJECT_DIR environment variable instead of
#editing this cell; relative paths used by later cells resolve against it.
project_dir = os.environ.get(
    'UCDPA_PROJECT_DIR',
    r'C:\Users\jlenehan\OneDrive - Intel Corporation\Documents\0 - Data Science\Data Analytics Essentials\UCDPA_JohnLenehan\UCDPA_JohnLenehan')

os.chdir(project_dir)

In [None]:
#importing car crash data from chicago data portal
#url to overview page - https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if
#NOTE(review): $limit is set very high, presumably so one request returns the full dataset - confirm
collision_json = r'https://data.cityofchicago.org/resource/85ca-t3if.json?$limit=99999999' #json url

#the whole response is read in a single call - no chunksize is passed to read_json here,
#so the complete dataset has to fit in memory
collision_raw = pd.read_json(collision_json) #reading collisions json

In [None]:
#preview the first rows of the raw collision data as a sanity check on the download
print(collision_raw.head())

In [None]:
#importing beat data to join to main dataset
#(the file name is relative, so it is resolved against the current working directory)
beat_data=pd.read_csv('PoliceBeatDec2012.csv')

#info() writes its summary to stdout itself and returns None, so it is called
#directly - wrapping it in print() would append a stray "None" line
beat_data.info()
print(beat_data.describe())

In [None]:
##########################################
#Step 2: Merge Data
##########################################

In [None]:
#inner join of the collision records onto the police beat table:
#only rows whose collision beat matches a BEAT_NUM in the beat data are kept,
#and clashing column names get the suffixes _df1 (collisions) / _df2 (beats)
#NOTE(review): the left key is spelled 'beat_of_occurence' - confirm this
#matches the column name shown by collision_raw.columns
collisions = pd.merge(
    collision_raw,
    beat_data,
    how='inner',
    left_on='beat_of_occurence',
    right_on='BEAT_NUM',
    suffixes=('_df1','_df2'),
)

print(collisions.head())

In [None]:
##########################################
#Step 3: Describe data
##########################################

In [None]:
#Describe the merged collisions dataset
print(collisions.columns)
#info() prints its own report and returns None, so it is not wrapped in
#print() (that would add a stray "None" line to the output)
collisions.info()
print(collisions.describe())
print(collisions.shape)


In [None]:
#converting location data to string
#NOTE(review): presumably location holds nested values that unique()/nunique()
#cannot process directly - confirm against the raw data
collisions['location']=collisions['location'].astype(str)

#list the distinct values and their count for every column;
#the heading names the dataset actually loaded in this notebook
print('\nChicago Traffic Collisions - Unique Values:')
for x in collisions.columns:
    print(x+':')
    print(collisions[x].unique())
    print(str(collisions[x].nunique()) + ' unique values')

In [None]:
##########################################
#Step 4: Clean + Manipulate data
##########################################

In [None]:
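#TODO: convert the date columns of the collisions data (e.g. crash_date) to datetime objects
#The commented-out line below is a leftover template: it refers to a frame
#(sfpd_incident_current) and columns that are not defined in this notebook
#sfpd_incident_current[['incident_datetime','report_datetime']] = sfpd_incident_current[['incident_datetime','report_datetime']].apply(pd.to_datetime)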

In [None]:
#Visualising missing data

#Sorting values by crash date, oldest first, so the matrix below is laid out
#in crash_date order
collisions = collisions.sort_values(by='crash_date',ascending=True)

#plotting matrix of missing data
msno.matrix(collisions)
plt.show()

#info of sorted data
#info() prints its report itself and returns None, so it is called directly
#rather than through print() (which would add a stray "None" line)
collisions.info()

In [None]:
#dropping unnecessary columns
#NOTE: this cell reassigns collisions, so running it a second time raises a
#KeyError because the listed columns have already been removed
drop_cols = ['location', 'report_type', 'intersection_related_i',
       'hit_and_run_i', 'photos_taken_i', 'crash_date_est_i', 'beat_of_occurence',
       'private_property_i', 'statements_taken_i', 'dooring_i', 'work_zone_i',
       'work_zone_type', 'workers_present_i','lane_cnt'
            ]

collisions=collisions.drop(columns=drop_cols)

#plotting matrix of missing data for the remaining columns
msno.matrix(collisions)
plt.show()

#info of the reduced data
#info() prints its report itself and returns None, so it is called directly
#rather than through print() (which would add a stray "None" line)
collisions.info()

In [None]:
#exclude rows missing geolocation data
#(only the latitude column is checked; rows with a missing latitude are removed)
collisions = collisions.dropna(subset = ['latitude'])

#plotting matrix of data
msno.matrix(collisions)
plt.show()

#data info
#info() prints its report itself and returns None, so it is called directly
#rather than through print() (which would add a stray "None" line)
collisions.info()


In [None]:
##########################################
#Step 5: Plot data
##########################################

In [None]:
##########################################
#Step 6: Machine Learning
##########################################