In [17]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

BASE_DIR = '../../../'

DATA_FP = BASE_DIR + '/data/processed/'
MAP_FP = DATA_FP + 'maps/'


In [18]:
def parse_json(jsonfile, otherfields=[]):
    items = json.load(open(jsonfile))
    locations = {}
    for item in items:
        if str(item['near_id']) not in locations.keys():
            d = {'count': 0}
            for field in otherfields:
                d[field] = []
            locations[str(item['near_id'])] = d
        locations[str(item['near_id'])]['count'] += 1
        for field in otherfields:
            locations[str(item['near_id'])][field].append(item[field])
    return items, locations


In [19]:
def is_inter(id):
    if len(str(id)) > 1 and str(id)[0:2] == '00':
        return False
    return True

In [20]:
crash_data, crashes = parse_json(DATA_FP + 'crash_joined.json')

concern_data, concerns = parse_json(DATA_FP + 'concern_joined.json',
                                        otherfields=['REQUESTTYPE'])


In [71]:
"""                                                                                                                                                                 
Info about intersections vs non intersections and their crash rate                                                                                                  
"""

# Hard code counts in since they don't change (at least not for Boston)                                                                                             
# and it's much faster                                                                                                                                              
inter_count = 8574
non_inter_count = 17388

counts = {
    'inter': 0,
    'non_inter': 0,
    'no_match': 0,
    'inter_plus': 0,
    'non_inter_plus': 0,
}
for k, v in crashes.iteritems():
    if str(k) == '':
        counts['no_match'] += 1
    elif is_inter(k):
        if int(v['count']) > 1:
            counts['inter_plus'] += 1
        counts['inter'] += 1
    else:
        if int(v['count']) > 1:
            counts['non_inter_plus'] += 1
        counts['non_inter'] += 1

print "Number of intersections:" + str(inter_count)
print "Number of non-intersections:" + str(non_inter_count)

print "Number of intersection segments with 1/more than 1 crash:" \
    + str(counts['inter']) + '/' + str(counts['inter_plus'])
print "Number of non-intersection segments with 1/more than 1 crash:" \
    + str(counts['non_inter']) + '/' + str(counts['non_inter_plus'])

total_percent = float(counts['inter'] + counts['non_inter']) / float(inter_count + non_inter_count) * 100
print "percent of all segments with crash:" + str(total_percent)

# Percentage of intersections/non-intersections                                                                                                                     
# that have at least one crash
inter_percent = float(counts['inter'])/float(inter_count) * 100
print "percent of intersections with crash:" + str(inter_percent)

non_inter_percent = float(counts['non_inter'])/float(non_inter_count) * 100
print "percent of non-intersections with crash:" + str(non_inter_percent)

print "percent of intersections with more than 1 crash:" + str(
    float(counts['inter_plus'])/float(inter_count))

print "percent of non-intersections with more than 1 crash:" + str(
    float(counts['non_inter_plus'])/float(non_inter_count))



Number of intersections:8574
Number of non-intersections:17388
Number of intersection segments with 1/more than 1 crash:1877/914
Number of non-intersection segments with 1/more than 1 crash:1552/369
percent of all segments with crash:13.2077651953
percent of intersections with crash:21.8917658036
percent of non-intersections with crash:8.92569588222
percent of intersections with more than 1 crash:0.106601352927
percent of non-intersections with more than 1 crash:0.0212215320911


In [72]:
"""                                                                                                                                                                 
What percentage of intersections with concerns had crashes                                                                                                          
at varying counts of concerns?                                                                                                                                      
"""

matching = {}
for id, d in concerns.iteritems():
    if d['count'] not in matching.keys():
        matching[d['count']] = {
            'inter': [0, 0],
            'non_inter': [0, 0]
        }
    if is_inter(id):
        key = 'inter'
    else:
        key = 'non_inter'

    if id in crashes.keys():
        matching[d['count']][key][0] += 1
    else:
        matching[d['count']][key][1] += 1

sorted_matching = sorted(matching.items())

results = []
for key, value in sorted_matching:

    # Do the 1+,2+ stats as well as 1, 2                                                                                                                            
    # Still need to break it out by int/non-int                                                                                                                     
    counts = {
        'total': value['inter'][0] + value['inter'][1] + value['non_inter'][0] + value['non_inter'][1],
        'crashes': value['inter'][0] + value['non_inter'][0],
        'inters_total': value['inter'][0] + value['inter'][1],
        'inters_crashes': value['inter'][0],
        'non_inters_total': value['non_inter'][0] + value['non_inter'][1],
        'non_inters_crashes':  value['non_inter'][0],
    }
    # Add all the data for segments with more complaints than the current complaint # we're on
    for key2, value2 in sorted_matching:
        if key2 > key:
            counts['total'] += value2['inter'][0] + value2['inter'][1] + value2['non_inter'][0] + value2['non_inter'][1]
            counts['crashes'] += value2['inter'][0] + value2['non_inter'][0]
            counts['inters_total'] += value2['inter'][0] + value2['inter'][1]
            counts['inters_crashes'] += value2['inter'][0]
            counts['non_inters_total'] += value2['non_inter'][0] + value2['non_inter'][1]
            counts['non_inters_crashes'] += value2['non_inter'][0]
    results.append([
        key,
        float(counts['crashes'])/float(counts['total']) * 100, # total percent
        (float(counts['crashes'])/float(counts['total']) * 100)/total_percent,
        counts['total'], # total count
        float(counts['inters_crashes'])/float(counts['inters_total']) * 100, # total percent at intersections
        (float(counts['inters_crashes'])/float(counts['inters_total']) * 100)/inter_percent,
        counts['inters_total'], # total # of intersections with this many or more complaints
        (float(counts['non_inters_crashes'])/float(counts['non_inters_total']) * 100) if counts['non_inters_total'] else 0, # total percent at non-intersections
        ((float(counts['non_inters_crashes'])/float(counts['non_inters_total']) * 100) if counts['non_inters_total'] else 0)/non_inter_percent,
        counts['non_inters_total'] # total # of non-intersections with this many or more complaints
    ])

pd.DataFrame(results, columns=[
    '# of concerns',
    '% of segments w/ crash',
    '# of times more likely than average segment',
    'total # of segments with this many or more concerns',
    '% of intersections with this many or more concerns with a crash',
    '# of times more likely than average intersection',
    '# of intersections with this many concerns',
    '% of non-intersections with this many concerns with a crash',
    '# of times more likely than the average non-intersection',
    '# of non-intersections with this many concerns'
])

Unnamed: 0,# of concerns,% of segments w/ crash,# of times more likely than average segment,total # of segments with this many or more concerns,% of intersections with this many or more concerns with a crash,# of times more likely than average intersection,# of intersections with this many concerns,% of non-intersections with this many concerns with a crash,# of times more likely than the average non-intersection,# of non-intersections with this many concerns
0,1,29.31808,2.219761,4458,43.323727,1.978996,2082,17.045455,1.909706,2376
1,2,42.211055,3.195927,1592,57.949309,2.647082,868,23.342541,2.615207,724
2,3,52.099738,3.944629,762,67.391304,3.078386,460,28.807947,3.22753,302
3,4,58.40708,4.422177,452,73.986486,3.379649,296,28.846154,3.23181,156
4,5,67.687075,5.124794,294,82.296651,3.759251,209,31.764706,3.558793,85
5,6,69.587629,5.268691,194,86.567164,3.954325,134,31.666667,3.547809,60
6,7,75.886525,5.745599,141,89.423077,4.084781,104,37.837838,4.239203,37
7,8,75.471698,5.714191,106,88.75,4.054036,80,34.615385,3.878172,26
8,9,77.5,5.86776,80,90.47619,4.132887,63,29.411765,3.295179,17
9,10,79.104478,5.98924,67,90.740741,4.144971,54,30.769231,3.447264,13


In [55]:
# For intersections with a specific complaint type:                                                                                                                 
#    what percentage had a crash?                                                                                                                                   
requests = {}
for data in concern_data:
    if data['REQUESTTYPE'] not in requests.keys():
        requests[data['REQUESTTYPE']] = 1

requests = {}
all_unique = []

for k, v in concerns.iteritems():
    unique_requests = {}
    for request in v['REQUESTTYPE']:
        # Clean up badly formatted request types                                                                                                                    
        vals = request.split('nbsp;')
        if len(vals) > 1:
            request = vals[1]

        if request not in unique_requests.keys():
            unique_requests[request] = 0
        unique_requests[request] += 1

        if request not in requests.keys():
            requests[request] = {
                'crashes': 0,
                'total': 0,
                'inter_crashes': 0,
                'inter_total': 0,
                'non_inter_crashes': 0,
                'non_inter_total': 0,
            }
        if str(k) in crashes.keys():
            requests[request]['crashes'] += 1
        requests[request]['total'] += 1
        if is_inter(k):
            if str(k) in crashes.keys():
                requests[request]['inter_crashes'] += 1
            requests[request]['inter_total'] += 1
        else:
            if str(k) in crashes.keys():
                requests[request]['non_inter_crashes'] += 1
            requests[request]['non_inter_total'] += 1

    for key, value in unique_requests.iteritems():
        if value > 1:
            all_unique.append([key, value])

by_type = {}
for k, v in all_unique:
    if k not in by_type.keys():
        by_type[k] = 0
    by_type[k] += 1

results = []
for k, v in requests.iteritems():
    results.append([
        k,
        by_type[k],
        float(requests[k]['crashes'])/float(requests[k]['total']) * 100,
        requests[k]['crashes'],
        requests[k]['total'],
        float(requests[k]['inter_crashes'])/float(requests[k]['inter_total']) * 100,
        requests[k]['inter_crashes'],
        requests[k]['inter_total'],
        float(requests[k]['non_inter_crashes'])/float(requests[k]['non_inter_total']) * 100 if requests[k]['non_inter_total'] else 0,
        requests[k]['non_inter_crashes'],
        requests[k]['non_inter_total']
    ])

pd.DataFrame(results, columns=[
    'Request type',
    '# of requests of this type that appear more than once at a segment',
    '% of segments with this type of request with a crash',
    '# of segments with this type of request with a crash',
    '# of segments with this type of a request total',
    '% of intersections with this type of request with a crash',
    '# of intersections with this type of request with a crash',
    '# of intersections with this type of request total',
    '% of non-intersections with this type of request with a crash',
    '# of non-intersections with this type of request with a crash',
    '# of non-intersections with this type of request total'
])

Unnamed: 0,Request type,# of requests of this type that appear more than once at a segment,% of segments with this type of request with a crash,# of segments with this type of request with a crash,# of segments with this type of a request total,% of intersections with this type of request with a crash,# of intersections with this type of request with a crash,# of intersections with this type of request total,% of non-intersections with this type of request with a crash,# of non-intersections with this type of request with a crash,# of non-intersections with this type of request total
0,people speed,118,26.019576,319,1226,39.480519,152,385,19.857313,167,841
1,people double park their vehicles,83,35.023772,221,631,54.444444,98,180,27.272727,123,451
2,there's not enough time to cross the street,19,66.480447,119,179,78.873239,112,142,18.918919,7,37
3,the roadway surface needs maintenance,6,46.938776,23,49,65.0,13,20,34.482759,10,29
4,people are not given enough time to cross the ...,1,75.0,9,12,81.818182,9,11,0.0,0,1
5,there are no bike facilities or they need main...,10,26.573427,38,143,64.864865,24,37,13.207547,14,106
6,people have to cross too many lanes / too far,4,66.666667,20,30,82.608696,19,23,14.285714,1,7
7,bike facilities don't exist or need improvement,88,34.242958,389,1136,60.997067,208,341,22.767296,181,795
8,it’s hard to see / low visibility,28,41.491841,178,429,48.867314,151,309,22.5,27,120
9,people cross away from the crosswalks,30,40.955631,120,293,62.251656,94,151,18.309859,26,142


In [40]:
# Pedestrian vs. bike vs. car
# Compare counts against ATR/TMC volume counts
# Parsing extra field's text?