In [54]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

BASE_DIR = '../'


In [55]:
import os
import fiona
BASE_DIR = os.path.dirname(os.getcwd())
os.chdir(BASE_DIR + '/src/data/')
import util
os.chdir(BASE_DIR + '/notebooks/')

In [56]:
def is_inter(id):
    if len(str(id)) > 1 and str(id)[0:2] == '00':
        return False
    return True

In [57]:
def summary_crash_rate(crashes, inter_count, non_inter_count):
    """                                                                                                                                                                 
    Info about intersections vs non intersections and their crash rate                                                                                                  
    """
    counts = {
        'inter': 0,
        'non_inter': 0,
        'no_match': 0,
        'inter_plus': 0,
        'non_inter_plus': 0,
    }
    total_crashes = 0
    for k, v in crashes.iteritems():
        total_crashes += v['count']
        if str(k) == '':
            # Sometimes crashes can't be snapped to a segment, probably due to bad data entry from the crash
            counts['no_match'] += 1
        elif is_inter(k):
            if int(v['count']) > 1:
                counts['inter_plus'] += 1
            counts['inter'] += 1
        else:
            if int(v['count']) > 1:
                counts['non_inter_plus'] += 1
            counts['non_inter'] += 1
    print "Total number of crashes:\t\t\t\t" + str(total_crashes)
    print "Total number of crash locations:\t\t\t\t" + str(len(crashes))
    print "Number of intersections:\t\t\t\t" + str(inter_count)
    print "Number of non-intersections:\t\t\t\t" + str(non_inter_count)
    print "Percentage of segments that are intersections:\t\t" \
        + str(round(100 * float(inter_count)/float(inter_count + non_inter_count)))
    print ""

    print "Number of intersections with at least 1 crash:\t\t" + str(counts['inter']) 
    print 'Number of intersections with more than 1 crash:\t\t' + str(counts['inter_plus'])
    print "Number of non-intersections with at least 1 crash:\t" + str(counts['non_inter'])
    print "Number of non-intersections with more than 1 crash:\t" + str(counts['non_inter_plus'])
    print ""

    total_percent = round(float(100 * (counts['inter'] + counts['non_inter']) / float(inter_count + non_inter_count)))
    print "percent of all segments with crash:\t\t\t" + str(total_percent)

    # Percentage of intersections/non-intersections                                                                                                                     
    # that have at least one crash
    inter_percent = round(float(100 * counts['inter'])/float(inter_count))
    print "percent of intersections with crash:\t\t\t" + str(inter_percent)

    non_inter_percent = round(100 * float(counts['non_inter'])/float(non_inter_count))
    print "percentage of non-intersections with crash:\t\t" + str(non_inter_percent)

    print "percentage of intersections with more than 1 crash:\t" + str(
        round(float(100 * counts['inter_plus'])/float(inter_count)))

    print "percentage of non-intersections with more than 1 crash:\t" + str(
        round(float(100 * counts['non_inter_plus'])/float(non_inter_count)))


In [58]:
def summary_concern_counts(crashes, concerns):
    """                                                                                                                                                                 
    What percentage of intersections with concerns had crashes                                                                                                          
    at varying counts of concerns?                                                                                                                                      
    """

    matching = {}
    # Go through each concern location
    # Increment counts for crash/no crash intersection/no intersection at this location
    for id, d in concerns.iteritems():
        if d['count'] not in matching.keys():
            matching[d['count']] = {
                'inter': {'crash': 0, 'no_crash': 0},
                'non_inter': {'crash': 0, 'no_crash': 0}
            }
        if is_inter(id):
            key = 'inter'
        else:
            key = 'non_inter'

        if id in crashes.keys():
            matching[d['count']][key]['crash'] += 1
        else:
            matching[d['count']][key]['no_crash'] += 1

    sorted_matching = sorted(matching.items())

    results = []
    for key, value in sorted_matching:

        # Do the 1+,2+ stats as well as 1, 2                                                                                                                            
        # Still need to break it out by int/non-int                                                                                                                     
        counts = {
            'total': value['inter']['crash'] + value['inter']['no_crash'] \
                + value['non_inter']['crash'] + value['non_inter']['no_crash'],
            'crashes': value['inter']['crash'] + value['non_inter']['crash'],
            'inters_total': value['inter']['crash'] + value['inter']['no_crash'], # Count of intersections with a concern
            'inters_crashes': value['inter']['crash'],
            'non_inters_total': value['non_inter']['crash'] + value['non_inter']['no_crash'],
            'non_inters_crashes':  value['non_inter']['crash'],
        }
        # Add all the data for segments with more complaints than the current complaint # we're on
        for key2, value2 in sorted_matching[key:len(sorted_matching)]:

            if key2 > key:
                counts['total'] += value2['inter']['crash'] + value2['inter']['no_crash'] \
                    + value2['non_inter']['crash'] + value2['non_inter']['no_crash']
                counts['crashes'] += value2['inter']['crash'] + value2['non_inter']['crash']
                counts['inters_total'] += value2['inter']['crash'] + value2['inter']['no_crash']
                counts['inters_crashes'] += value2['inter']['crash']
                counts['non_inters_total'] += value2['non_inter']['crash'] + value2['non_inter']['no_crash']
                counts['non_inters_crashes'] += value2['non_inter']['crash']
        total_percent_v0 = round(100 * float(counts['crashes'])/float(counts['total']))
        inter_percent_v0 = round(100 * float(counts['inters_crashes'])/float(counts['inters_total'])) if counts['inters_total'] else 0
        non_inter_percent_v0 = round((100 * float(counts['non_inters_crashes'])/float(counts['non_inters_total'])) if counts['non_inters_total'] else 0)
        results.append([
            key,
            total_percent_v0,
            counts['total'], # total count      
            inter_percent_v0,
            counts['inters_total'], # total # of intersections with this many or more complaints
            non_inter_percent_v0,
            counts['non_inters_total'], # total # of non-intersections with this many or more complaints
            round(100 * float(counts['inters_total'])/float(counts['total']))
        ])

    results = results[0:5]

    display(pd.DataFrame(results, columns=[
        '# of concerns at this segment',
        '% of segments w/ v0 complaint w/ crash',
        'total # of segments with this many or more concerns',
        '% of intersections with this many or more concerns with a crash',
        '# of intersections with this many concerns',
        '% of non-intersections with this many concerns with a crash',
        '# of non-intersections with this many concerns',
        '% of these locations that were at an intersection'
    ]))

In [59]:
def concerns_by_type(concerns, concern_data, crashes, category_field='REQUEST_TYPE'):
    # For intersections with a specific complaint type:                                                                                                                 
    #    what percentage had a crash?                                                                                                                                   
    requests = {}
    for data in concern_data:
        if data[category_field] not in requests.keys():
            requests[data[category_field]] = 1

    requests = {}
    all_unique = []

    for k, v in concerns.iteritems():
        unique_requests = {}
        for request in v[category_field]:
            # Clean up badly formatted request types                                                                                                                    
            vals = request.split('nbsp;')
            if len(vals) > 1:
                request = vals[1]

            if request not in unique_requests.keys():
                unique_requests[request] = 0
            unique_requests[request] += 1

            if request not in requests.keys():
                requests[request] = {
                    'crashes': 0,
                    'total': 0,
                    'inter_crashes': 0,
                    'inter_total': 0,
                    'non_inter_crashes': 0,
                    'non_inter_total': 0,
                    'ped_inter_crashes': 0,
                    'ped_non_inter_crashes': 0
                }
            if str(k) in crashes.keys():
                requests[request]['crashes'] += 1
            requests[request]['total'] += 1
            if is_inter(k):
                if str(k) in crashes.keys():
                    requests[request]['inter_crashes'] += 1
                requests[request]['inter_total'] += 1
            else:
                if str(k) in crashes.keys():
                    requests[request]['non_inter_crashes'] += 1
                requests[request]['non_inter_total'] += 1


        for key, value in unique_requests.iteritems():
            if value > 1:
                all_unique.append([key, value])

    by_type = {}
    for k, v in all_unique:
        if k not in by_type.keys():
            by_type[k] = 0
        by_type[k] += 1

    results = []
    for k, v in requests.iteritems():
        if requests[k]['total'] >= 100:
            total_percent = round(100 * float(requests[k]['crashes'])/float(requests[k]['total']))
            inter_percent = round(100 * float(requests[k]['inter_crashes'])/float(requests[k]['inter_total']))
            non_inter_percent = round(100 * float(requests[k]['non_inter_crashes'])/float(requests[k]['non_inter_total']) if requests[k]['non_inter_total'] else 0)
            results.append([
                k,
                by_type[k],
                total_percent,
                requests[k]['crashes'],
                requests[k]['total'],
                inter_percent,
                requests[k]['inter_crashes'],
                requests[k]['inter_total'],
                non_inter_percent,
                requests[k]['non_inter_crashes'],
                requests[k]['non_inter_total'],
                round(100 * float(requests[k]['inter_total'])/float(requests[k]['total']))
            ])


    df = pd.DataFrame(results, columns=[
        'Request type',
        '# of requests of this type that appear more than once at a segment',
        '% of segments with this type of request with a crash',
        '# of segments with this type of request with a crash',
        '# of segments with this type of a request total',
        '% of intersections with this type of request with a crash',
        '# of intersections with this type of request with a crash',
        '# of intersections with this type of request total',
        '% of non-intersections with this type of request with a crash',
        '# of non-intersections with this type of request with a crash',
        '# of non-intersections with this type of request total',
        '% of complaint locations that were intersections'
        #'% of intersections with this type of request with a ped crash',
        #'# of ped crashes in intersection with this request',
        #'# of ped crashes in non-intersection with this request'
    ])
    df.sort_values(by="# of segments with this type of a request total", ascending=0)
    display(df)


In [60]:
def get_analysis_for_city(datadir, category_field='REQUESTTYPE', years=None):
    
    DATA_FP = BASE_DIR + '/' + datadir + '/processed/'
    MAP_FP = DATA_FP + 'maps/'
    crash_data, crashes = util.group_json_by_location(DATA_FP + 'crash_joined.json', years=years, yearfield='CALENDAR_DATE')

    concern_data, concerns = util.group_json_by_location(
        DATA_FP + 'concern_joined.json',
        otherfields=[category_field])
    print 'len concern data:' + str(len(concern_data))
    inters = util.read_segments(MAP_FP, get_non_inter=False)
    non_inters = util.read_segments(MAP_FP, get_inter=False)
    inter_count = len(inters[0])
    non_inter_count = len(non_inters[0])
    
    summary_crash_rate(crashes, inter_count, non_inter_count)
    summary_concern_counts(crashes, concerns)
    concerns_by_type(concerns, concern_data, crashes, category_field)
    

In [61]:
get_analysis_for_city('osm-data', years=[2016])

len concern data:8503
Read in 7845 intersection, 0 non-intersection segments
Read in 0 intersection, 15434 non-intersection segments
Total number of crashes:				3202
Total number of crash locations:				1958
Number of intersections:				7845
Number of non-intersections:				15434
Percentage of segments that are intersections:		34.0

Number of intersections with at least 1 crash:		1110
Number of intersections with more than 1 crash:		397
Number of non-intersections with at least 1 crash:	847
Number of non-intersections with more than 1 crash:	152

percent of all segments with crash:			8.0
percent of intersections with crash:			14.0
percentage of non-intersections with crash:		5.0
percentage of intersections with more than 1 crash:	5.0
percentage of non-intersections with more than 1 crash:	1.0


Unnamed: 0,# of concerns at this segment,% of segments w/ v0 complaint w/ crash,total # of segments with this many or more concerns,% of intersections with this many or more concerns with a crash,# of intersections with this many concerns,% of non-intersections with this many concerns with a crash,# of non-intersections with this many concerns,% of these locations that were at an intersection
0,1,20.0,4345,30.0,1981,11.0,2364,46.0
1,2,29.0,1596,42.0,814,15.0,782,51.0
2,3,36.0,804,51.0,430,19.0,374,53.0
3,4,41.0,474,52.0,271,27.0,203,57.0
4,5,49.0,308,62.0,181,31.0,127,59.0


Unnamed: 0,Request type,# of requests of this type that appear more than once at a segment,% of segments with this type of request with a crash,# of segments with this type of request with a crash,# of segments with this type of a request total,% of intersections with this type of request with a crash,# of intersections with this type of request with a crash,# of intersections with this type of request total,% of non-intersections with this type of request with a crash,# of non-intersections with this type of request with a crash,# of non-intersections with this type of request total,% of complaint locations that were intersections
0,people speed,126,16.0,191,1226,28.0,99,352,11.0,92,874,29.0
1,people double park their vehicles,90,22.0,141,631,35.0,57,161,18.0,84,470,26.0
2,there are no bike facilities or they need main...,9,24.0,34,143,59.0,17,29,15.0,17,114,20.0
3,bike facilities don't exist or need improvement,97,27.0,311,1136,49.0,148,303,20.0,163,833,27.0
4,it’s hard to see / low visibility,31,26.0,110,429,31.0,93,296,13.0,17,133,69.0
5,people cross away from the crosswalks,33,31.0,90,293,42.0,59,141,20.0,31,152,48.0
6,people don't yield while going straight,37,32.0,117,361,36.0,92,253,23.0,25,108,70.0
7,people don't yield while turning,88,46.0,297,644,53.0,275,515,17.0,22,129,80.0
8,the roadway surface needs improvement,27,32.0,90,277,52.0,53,102,21.0,37,175,37.0
9,sidewalks/ramps don't exist or need improvement,32,22.0,72,329,32.0,42,130,15.0,30,199,40.0


In [62]:
get_analysis_for_city('osm-dc', years=[2016])

len concern data:5357
Read in 7480 intersection, 0 non-intersection segments
Read in 0 intersection, 14901 non-intersection segments
Total number of crashes:				28801
Total number of crash locations:				8797
Number of intersections:				7480
Number of non-intersections:				14901
Percentage of segments that are intersections:		33.0

Number of intersections with at least 1 crash:		3074
Number of intersections with more than 1 crash:		1893
Number of non-intersections with at least 1 crash:	5722
Number of non-intersections with more than 1 crash:	3231

percent of all segments with crash:			39.0
percent of intersections with crash:			41.0
percentage of non-intersections with crash:		38.0
percentage of intersections with more than 1 crash:	25.0
percentage of non-intersections with more than 1 crash:	22.0


Unnamed: 0,# of concerns at this segment,% of segments w/ v0 complaint w/ crash,total # of segments with this many or more concerns,% of intersections with this many or more concerns with a crash,# of intersections with this many concerns,% of non-intersections with this many concerns with a crash,# of non-intersections with this many concerns,% of these locations that were at an intersection
0,1,66.0,3342,68.0,1652,63.0,1690,49.0
1,2,78.0,1023,82.0,531,72.0,492,52.0
2,3,83.0,422,87.0,245,77.0,177,58.0
3,4,91.0,206,89.0,129,94.0,77,63.0
4,5,95.0,115,93.0,74,98.0,41,64.0


Unnamed: 0,Request type,# of requests of this type that appear more than once at a segment,% of segments with this type of request with a crash,# of segments with this type of request with a crash,# of segments with this type of a request total,% of intersections with this type of request with a crash,# of intersections with this type of request with a crash,# of intersections with this type of request total,% of non-intersections with this type of request with a crash,# of non-intersections with this type of request with a crash,# of non-intersections with this type of request total,% of complaint locations that were intersections
0,Other Driving Issue,62,77.0,482,625,78.0,243,310,76.0,239,315,50.0
1,Failure to stop for pedestrians,79,73.0,534,731,75.0,421,562,67.0,113,169,77.0
2,Other Walking Issue,25,67.0,236,353,70.0,122,174,64.0,114,179,49.0
3,Double parking,24,83.0,203,246,81.0,22,27,83.0,181,219,11.0
4,Not enough time to cross,17,87.0,122,140,92.0,103,112,68.0,19,28,80.0
5,Poor visibility,22,61.0,175,287,64.0,122,191,55.0,53,96,67.0
6,Red light running,34,84.0,295,350,89.0,239,270,70.0,56,80,77.0
7,Blocking the crosswalk,6,85.0,111,131,90.0,85,94,70.0,26,37,72.0
8,Cyclist behavior,4,79.0,93,117,85.0,52,61,73.0,41,56,52.0
9,Jaywalking,13,85.0,149,175,92.0,89,97,77.0,60,78,55.0


In [63]:
get_analysis_for_city('osm-cambridge', category_field='issue_type', years=[2016])

len concern data:8686
Read in 1287 intersection, 0 non-intersection segments
Read in 0 intersection, 2456 non-intersection segments
Total number of crashes:				562
Total number of crash locations:				400
Number of intersections:				1287
Number of non-intersections:				2456
Percentage of segments that are intersections:		34.0

Number of intersections with at least 1 crash:		243
Number of intersections with more than 1 crash:		75
Number of non-intersections with at least 1 crash:	156
Number of non-intersections with more than 1 crash:	27

percent of all segments with crash:			11.0
percent of intersections with crash:			19.0
percentage of non-intersections with crash:		6.0
percentage of intersections with more than 1 crash:	6.0
percentage of non-intersections with more than 1 crash:	1.0


Unnamed: 0,# of concerns at this segment,% of segments w/ v0 complaint w/ crash,total # of segments with this many or more concerns,% of intersections with this many or more concerns with a crash,# of intersections with this many concerns,% of non-intersections with this many concerns with a crash,# of non-intersections with this many concerns,% of these locations that were at an intersection
0,1,14.0,2356,23.0,846,9.0,1510,36.0
1,2,15.0,1566,24.0,529,11.0,1037,34.0
2,3,17.0,1055,29.0,332,12.0,723,31.0
3,4,19.0,699,33.0,208,13.0,491,30.0
4,5,20.0,475,32.0,124,16.0,351,26.0


Unnamed: 0,Request type,# of requests of this type that appear more than once at a segment,% of segments with this type of request with a crash,# of segments with this type of request with a crash,# of segments with this type of a request total,% of intersections with this type of request with a crash,# of intersections with this type of request with a crash,# of intersections with this type of request total,% of non-intersections with this type of request with a crash,# of non-intersections with this type of request with a crash,# of non-intersections with this type of request total,% of complaint locations that were intersections
0,Missed Trash Pickup,40,23.0,63,272,45.0,35,78,14.0,28,194,29.0
1,Sidewalk Defect,103,19.0,140,727,37.0,77,206,12.0,63,521,28.0
2,Traffic Sign Complaint,42,27.0,95,348,32.0,59,182,22.0,36,166,52.0
3,Other,194,27.0,290,1064,43.0,174,408,18.0,116,656,38.0
4,Missed Compost Pickup (Green Bin),34,30.0,67,226,71.0,48,68,12.0,19,158,30.0
5,Streetlight Defect,39,22.0,56,259,34.0,36,105,13.0,20,154,41.0
6,Tree Pruning Request,46,21.0,77,375,45.0,48,106,11.0,29,269,28.0
7,Rodent Sighting,30,21.0,44,207,48.0,30,63,10.0,14,144,30.0
8,Manhole/Street Drain Issue,10,25.0,28,111,38.0,21,55,13.0,7,56,50.0
9,Park Maintenance Issue,68,32.0,149,469,63.0,128,204,8.0,21,265,43.0


In [64]:
# Why are there things like turning complaints at non-intersections?
# Pedestrian vs. bike vs. car
# Compare counts against ATR/TMC volume counts
# Parsing extra field's text?