In [58]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Base directory, relative to the notebook's working directory.
# It already ends with a path separator.
BASE_DIR = '../../../'

# BASE_DIR carries the trailing '/', so the suffix must not start with one
# (otherwise every derived path contains a doubled '//').
DATA_FP = BASE_DIR + 'data/processed/'
MAP_FP = DATA_FP + 'maps/'


In [59]:
def parse_json(jsonfile, otherfields=()):
    """Load a JSON list of records and group them by their 'near_id'.

    Args:
        jsonfile: path of a JSON file containing a list of dicts; every
            dict must have a 'near_id' key and each key in `otherfields`.
        otherfields: names of additional record fields whose values are
            collected, in file order, for each location.

    Returns:
        A tuple (items, locations):
        items -- the decoded list exactly as loaded from the file.
        locations -- dict keyed by str(near_id); each value is a dict with
            'count' (number of records at that id) plus one list per name
            in `otherfields` holding that field's value from every record.

    Raises:
        KeyError: if a record lacks 'near_id' or one of `otherfields`.
    """
    # Context manager guarantees the handle is closed even if decoding fails.
    with open(jsonfile) as json_handle:
        items = json.load(json_handle)

    locations = {}
    for item in items:
        # Ids are normalised to strings so 1 and '1' land in the same bucket.
        near_id = str(item['near_id'])
        if near_id not in locations:
            entry = {'count': 0}
            for field in otherfields:
                entry[field] = []
            locations[near_id] = entry
        entry = locations[near_id]
        entry['count'] += 1
        for field in otherfields:
            entry[field].append(item[field])
    return items, locations


In [60]:
def is_inter(id):
    """Return False when str(id) begins with '00', True otherwise.

    The notebook counts ids for which this returns True as intersections
    and the '00'-prefixed ids as non-intersections.
    """
    return not str(id).startswith('00')

In [61]:
# Load the joined crash records; the per-location dict also collects each
# record's FIRST_EVENT_SUBTYPE.
crash_data, crashes = parse_json(
    DATA_FP + 'crash_joined.json',
    otherfields=['FIRST_EVENT_SUBTYPE']
)

# Load the joined concern records; the per-location dict also collects each
# record's REQUESTTYPE.
concern_data, concerns = parse_json(
    DATA_FP + 'concern_joined.json',
    otherfields=['REQUESTTYPE']
)

In [72]:
"""
Info about intersections vs non intersections and their crash rate
"""

# Hard code counts in since they don't change (at least not for Boston)
# and it's much faster
inter_count = 8574
non_inter_count = 17388
total_count = inter_count + non_inter_count


def _percent(numerator, denominator):
    """Return 100 * numerator / denominator rounded to a whole number.

    The result is always a float so it prints as e.g. '33.0' on both
    Python 2 and Python 3 (where one-argument round() returns an int).
    """
    return float(round(100.0 * numerator / denominator))


# Tallies of crash locations, split by intersection / non-intersection:
#   inter / non_inter           -- locations with at least one crash
#   inter_plus / non_inter_plus -- locations with more than one crash
#   ped_inter / ped_non_inter   -- locations with a pedestrian crash
#   no_match                    -- crash records whose location id is ''
counts = {
    'inter': 0,
    'non_inter': 0,
    'no_match': 0,
    'inter_plus': 0,
    'non_inter_plus': 0,
    'ped_inter': 0,
    'ped_non_inter': 0,
}

for k, v in crashes.items():
    # A location is a "ped" location if any of its crash subtypes
    # mentions PEDESTRIAN.
    ped = any('PEDESTRIAN' in t for t in v['FIRST_EVENT_SUBTYPE'])
    # Add ped crash info to crashes (later cells read crashes[k]['ped'])
    crashes[k]['ped'] = ped

    # The empty id has to be handled first: is_inter('') would be True.
    if str(k) == '':
        counts['no_match'] += 1
        continue

    kind = 'inter' if is_inter(k) else 'non_inter'
    counts[kind] += 1
    if int(v['count']) > 1:
        counts[kind + '_plus'] += 1
    if ped:
        counts['ped_' + kind] += 1

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Number of intersections:\t\t\t\t" + str(inter_count))
print("Number of non-intersections:\t\t\t\t" + str(non_inter_count))
print("Percentage of segments that are intersections:\t\t"
      + str(_percent(inter_count, total_count)))
print("")

print("Number of intersections with at least 1 crash:\t\t" + str(counts['inter']))
print('Number of intersections with more than 1 crash:\t\t' + str(counts['inter_plus']))
print("Number of non-intersections with at least 1 crash:\t" + str(counts['non_inter']))
print("Number of non-intersections with more than 1 crash:\t" + str(counts['non_inter_plus']))
print("")

total_percent = _percent(counts['inter'] + counts['non_inter'], total_count)
print("percent of all segments with crash:\t\t\t" + str(total_percent))

# Percentage of intersections/non-intersections
# that have at least one crash
inter_percent = _percent(counts['inter'], inter_count)
print("percent of intersections with crash:\t\t\t" + str(inter_percent))

non_inter_percent = _percent(counts['non_inter'], non_inter_count)
print("percentage of non-intersections with crash:\t\t" + str(non_inter_percent))

print("percentage of intersections with more than 1 crash:\t"
      + str(_percent(counts['inter_plus'], inter_count)))

print("percentage of non-intersections with more than 1 crash:\t"
      + str(_percent(counts['non_inter_plus'], non_inter_count)))

ped_total = counts['ped_inter'] + counts['ped_non_inter']
ped_percent = _percent(ped_total, total_count)
print("percentage of total segments that had a ped crash:\t" + str(ped_percent))

ped_inter_percent = _percent(counts['ped_inter'], inter_count)
print("percentage of intersections that had a ped crash:\t" + str(ped_inter_percent))

ped_non_inter_percent = _percent(counts['ped_non_inter'], non_inter_count)
print("percentage of non-intersections that had a ped crash:\t" + str(ped_non_inter_percent))



Number of intersections:				8574
Number of non-intersections:				17388
Percentage of segments that are intersections:		33.0

Number of intersections with at least 1 crash:		1877
Number of intersections with more than 1 crash:		914
Number of non-intersections with at least 1 crash:	1552
Number of non-intersections with more than 1 crash:	369

percent of all segments with crash:			13.0
percent of intersections with crash:			22.0
percentage of non-intersections with crash:		9.0
percentage of intersections with more than 1 crash:	11.0
percentage of non-intersections with more than 1 crash:	2.0
percentage of total segments that had a ped crash:	4.0
percentage of intersections that had a ped crash:	7.0
percentage of non-intersections that had a ped crash:	2.0


In [75]:
"""
What percentage of intersections with concerns had crashes
at varying counts of concerns?
"""


def _pct_or_zero(numerator, denominator):
    """Return 100 * numerator / denominator rounded to a whole number (float).

    Returns 0.0 when the denominator is zero, so an empty category shows
    0 instead of raising ZeroDivisionError.
    """
    if not denominator:
        return 0.0
    return float(round(100.0 * numerator / denominator))


# matching[n][kind][outcome] = number of concern locations that have exactly
# n concerns, are of the given kind ('inter' / 'non_inter') and either do
# ('crash') or do not ('no_crash') appear among the crash locations.
matching = {}
for near_id, d in concerns.items():
    if d['count'] not in matching:
        matching[d['count']] = {
            'inter': {'crash': 0, 'no_crash': 0},
            'non_inter': {'crash': 0, 'no_crash': 0}
        }
    kind = 'inter' if is_inter(near_id) else 'non_inter'
    # Dict membership is a hash lookup; no need to scan a key list.
    outcome = 'crash' if near_id in crashes else 'no_crash'
    matching[d['count']][kind][outcome] += 1

sorted_matching = sorted(matching.items())

# Each row reports locations with "this many OR MORE" concerns. Walk the
# buckets from the largest concern count down while keeping running totals,
# so every higher bucket is included even when the concern counts are not
# contiguous. (Slicing the sorted list by the concern count itself is only
# correct if the counts are exactly 1, 2, 3, ... with no gaps.)
at_least = {
    'inter': {'crash': 0, 'no_crash': 0},
    'non_inter': {'crash': 0, 'no_crash': 0}
}
results = []
for n_concerns, value in reversed(sorted_matching):
    for kind in ('inter', 'non_inter'):
        for outcome in ('crash', 'no_crash'):
            at_least[kind][outcome] += value[kind][outcome]

    inters_total = at_least['inter']['crash'] + at_least['inter']['no_crash']
    non_inters_total = (at_least['non_inter']['crash']
                        + at_least['non_inter']['no_crash'])
    total = inters_total + non_inters_total
    with_crash = at_least['inter']['crash'] + at_least['non_inter']['crash']

    results.append([
        n_concerns,
        _pct_or_zero(with_crash, total),
        total,  # total count
        _pct_or_zero(at_least['inter']['crash'], inters_total),
        inters_total,  # total # of intersections with this many or more complaints
        _pct_or_zero(at_least['non_inter']['crash'], non_inters_total),
        non_inters_total,  # total # of non-intersections with this many or more complaints
        _pct_or_zero(inters_total, total)
    ])
# Rows were produced from the largest count down; show them ascending.
results.reverse()

pd.DataFrame(results, columns=[
    '# of concerns at this segment',
    '% of segments w/ v0 complaint w/ crash',
    'total # of segments with this many or more concerns',
    '% of intersections with this many or more concerns with a crash',
    '# of intersections with this many concerns',
    '% of non-intersections with this many concerns with a crash',
    '# of non-intersections with this many concerns',
    '% of these locations that were at an intersection'
])

Unnamed: 0,# of concerns at this segment,% of segments w/ v0 complaint w/ crash,total # of segments with this many or more concerns,% of intersections with this many or more concerns with a crash,# of intersections with this many concerns,% of non-intersections with this many concerns with a crash,# of non-intersections with this many concerns,% of these locations that were at an intersection
0,1,29.0,4458,43.0,2082,17.0,2376,47.0
1,2,42.0,1592,58.0,868,23.0,724,55.0
2,3,52.0,762,67.0,460,29.0,302,60.0
3,4,58.0,452,74.0,296,29.0,156,65.0
4,5,68.0,294,82.0,209,32.0,85,71.0
5,6,70.0,194,87.0,134,32.0,60,69.0
6,7,76.0,141,89.0,104,38.0,37,74.0
7,8,75.0,106,89.0,80,35.0,26,75.0
8,9,78.0,80,90.0,63,29.0,17,79.0
9,10,79.0,67,91.0,54,31.0,13,81.0


In [77]:
# For intersections with a specific complaint type:
#    what percentage had a crash?


def _request_percent(numerator, denominator):
    """Return 100 * numerator / denominator rounded to a whole number (float).

    Returns 0.0 when the denominator is zero (e.g. a request type that was
    never filed at an intersection) instead of raising ZeroDivisionError.
    """
    if not denominator:
        return 0.0
    return float(round(100.0 * numerator / denominator))


# requests[request type] -> counters over every concern of that type.
# NOTE(review): the counters are incremented once per concern, so a segment
# holding the same request type twice is counted twice, although the column
# labels below speak of "segments" -- confirm which one is intended.
requests = {}
# [request type, n] for every (segment, request type) pair with n > 1.
all_unique = []

for k, v in concerns.items():
    unique_requests = {}
    # None when this concern location has no crash. The 'ped' flag read
    # below is attached to each crash entry by the crash-stats cell above,
    # which therefore has to be run first.
    crash = crashes.get(str(k))
    kind = 'inter' if is_inter(k) else 'non_inter'

    for request in v['REQUESTTYPE']:
        # Clean up badly formatted request types
        vals = request.split('nbsp;')
        if len(vals) > 1:
            request = vals[1]

        unique_requests[request] = unique_requests.get(request, 0) + 1

        if request not in requests:
            requests[request] = {
                'crashes': 0,
                'total': 0,
                'inter_crashes': 0,
                'inter_total': 0,
                'non_inter_crashes': 0,
                'non_inter_total': 0,
                'ped_inter_crashes': 0,
                'ped_non_inter_crashes': 0
            }
        stats = requests[request]
        stats['total'] += 1
        stats[kind + '_total'] += 1
        if crash is not None:
            stats['crashes'] += 1
            stats[kind + '_crashes'] += 1
            if crash['ped']:
                stats['ped_' + kind + '_crashes'] += 1

    for key, value in unique_requests.items():
        if value > 1:
            all_unique.append([key, value])

# Number of segments at which each request type appears more than once.
by_type = {}
for k, v in all_unique:
    by_type[k] = by_type.get(k, 0) + 1

results = []
for k, v in requests.items():
    # Only report request types with at least 100 concerns.
    if v['total'] >= 100:
        results.append([
            k,
            # A type may never repeat at any segment; report 0 rather than
            # failing with KeyError.
            by_type.get(k, 0),
            _request_percent(v['crashes'], v['total']),
            v['crashes'],
            v['total'],
            _request_percent(v['inter_crashes'], v['inter_total']),
            v['inter_crashes'],
            v['inter_total'],
            _request_percent(v['non_inter_crashes'], v['non_inter_total']),
            v['non_inter_crashes'],
            v['non_inter_total'],
            _request_percent(v['inter_total'], v['total'])
# Leave ped counts out for now; not sure if there's better ped/bike data
#            round(100 * float(requests[k]['ped_inter_crashes'])/float(requests[k]['inter_total'])),
#            requests[k]['ped_inter_crashes'],
#            requests[k]['ped_non_inter_crashes']
        ])

# Overall number of crash locations with / without a pedestrian crash.
tot_ped = 0
tot_non_ped = 0
for k, v in crashes.items():
    if v['ped'] is True:
        tot_ped += 1
    else:
        tot_non_ped += 1

df = pd.DataFrame(results, columns=[
    'Request type',
    '# of requests of this type that appear more than once at a segment',
    '% of segments with this type of request with a crash',
    '# of segments with this type of request with a crash',
    '# of segments with this type of a request total',
    '% of intersections with this type of request with a crash',
    '# of intersections with this type of request with a crash',
    '# of intersections with this type of request total',
    '% of non-intersections with this type of request with a crash',
    '# of non-intersections with this type of request with a crash',
    '# of non-intersections with this type of request total',
    '% of complaint locations that were intersections'
    #'% of intersections with this type of request with a ped crash',
    #'# of ped crashes in intersection with this request',
    #'# of ped crashes in non-intersection with this request'
])
df.sort_values(by="# of segments with this type of a request total", ascending=False)


Unnamed: 0,Request type,# of requests of this type that appear more than once at a segment,% of segments with this type of request with a crash,# of segments with this type of request with a crash,# of segments with this type of a request total,% of intersections with this type of request with a crash,# of intersections with this type of request with a crash,# of intersections with this type of request total,% of non-intersections with this type of request with a crash,# of non-intersections with this type of request with a crash,# of non-intersections with this type of request total,% of complaint locations that were intersections
14,of something that is not listed here,183,44.0,633,1429,61.0,525,858,19.0,108,571,60.0
0,people speed,118,26.0,319,1226,39.0,152,385,20.0,167,841,31.0
4,bike facilities don't exist or need improvement,88,34.0,389,1136,61.0,208,341,23.0,181,795,30.0
11,people run red lights / stop signs,112,50.0,396,788,56.0,372,670,20.0,24,118,85.0
12,people don't yield while turning,89,63.0,406,644,70.0,391,561,18.0,15,83,87.0
1,people double park their vehicles,83,35.0,221,631,54.0,98,180,27.0,123,451,29.0
5,it’s hard to see / low visibility,28,41.0,178,429,49.0,151,309,23.0,27,120,72.0
7,people don't yield while going straight,37,47.0,168,361,54.0,145,267,24.0,23,94,74.0
10,sidewalks/ramps don't exist or need improvement,34,36.0,118,329,54.0,79,147,21.0,39,182,45.0
13,"the wait for the ""Walk"" signal is too long",49,68.0,204,301,76.0,196,259,19.0,8,42,86.0


In [40]:
# Why are there things like turning complaints at non-inter
# Pedestrian vs. bike vs. car
# Compare counts against ATR/TMC volume counts
# Parsing extra field's text?