In [17]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Path prefix (relative to the notebook's working directory) under which
# the data/ directory is looked up.
BASE_DIR = '../../../'

# BASE_DIR already ends with '/', so the sub-path must not start with one
# (the previous form produced '../../..//data/processed/').
DATA_FP = BASE_DIR + 'data/processed/'
MAP_FP = DATA_FP + 'maps/'


In [18]:
def parse_json(jsonfile, otherfields=None):
    """Load a JSON list of records and group them by their 'near_id'.

    Args:
        jsonfile: path to a JSON file holding a list of dicts; every dict
            must have a 'near_id' key and each key named in otherfields.
        otherfields: optional iterable of record keys whose values are
            collected per location. Defaults to no extra fields.

    Returns:
        A tuple (items, locations). items is the parsed list as-is.
        locations maps str(near_id) to a dict with
        'count' (number of records sharing that near_id) and, for each
        requested field, the list of that field's values in file order.

    Raises:
        KeyError: if a record lacks 'near_id' or one of otherfields.
    """
    # Avoid a shared mutable default; copy so the caller's list is untouched
    fields = list(otherfields) if otherfields else []

    # Context manager so the file handle is closed as soon as parsing is done
    with open(jsonfile) as f:
        items = json.load(f)

    locations = {}
    for item in items:
        near_id = str(item['near_id'])
        if near_id not in locations:
            entry = {'count': 0}
            for field in fields:
                entry[field] = []
            locations[near_id] = entry
        location = locations[near_id]
        location['count'] += 1
        for field in fields:
            location[field].append(item[field])
    return items, locations


In [19]:
def is_inter(id):
    """Tell whether a segment id denotes an intersection.

    The id is converted with str(); ids whose text starts with '00' are
    reported as non-intersections (False), everything else as True.
    """
    return not str(id).startswith('00')

In [20]:
# Load the joined crash records.
# crash_data is the raw list of records; crashes maps str(near_id) to
# {'count': <number of crash records at that id>}.
crash_data, crashes = parse_json(DATA_FP + 'crash_joined.json')

# Load the joined concern records, also collecting each record's
# REQUESTTYPE so concerns[<id>]['REQUESTTYPE'] lists every request type
# filed at that id (in file order), alongside the 'count'.
concern_data, concerns = parse_json(DATA_FP + 'concern_joined.json',
                                        otherfields=['REQUESTTYPE'])


In [21]:
"""                                                                                                                                                                 
Info about intersections vs non intersections and their crash rate                                                                                                  
"""

# Hard code counts in since they don't change (at least not for Boston)
# and it's much faster
inter_count = 8574
non_inter_count = 17388

# Tally of distinct ids in `crashes`:
#   'inter' / 'non_inter'           -> ids with at least one crash
#   'inter_plus' / 'non_inter_plus' -> ids with more than one crash
#   'no_match'                      -> the empty id (no near_id recorded)
counts = {
    'inter': 0,
    'non_inter': 0,
    'no_match': 0,
    'inter_plus': 0,
    'non_inter_plus': 0,
}
# .items() (not .iteritems()) so the cell runs on both Python 2 and 3
for k, v in crashes.items():
    if str(k) == '':
        counts['no_match'] += 1
        continue
    kind = 'inter' if is_inter(k) else 'non_inter'
    counts[kind] += 1
    if int(v['count']) > 1:
        counts[kind + '_plus'] += 1

# Every print below takes one parenthesised expression, which prints the
# same text under the Python 2 print statement and the Python 3 function.
# NOTE(review): the "1/more than 1" labels actually report
# "at least 1 / more than 1", and the "percent" values are fractions in
# [0, 1]; the label text is kept as-is so it matches previously recorded
# output.
print("Number of intersections:" + str(inter_count))
print("Number of non-intersections:" + str(non_inter_count))

print("Number of intersection segments with 1/more than 1 crash:"
      + str(counts['inter']) + '/' + str(counts['inter_plus']))
print("Number of non-intersection segments with 1/more than 1 crash:"
      + str(counts['non_inter']) + '/' + str(counts['non_inter_plus']))

print("percent of all segments with crash:" + str(
    float(counts['inter'] + counts['non_inter']) /
    float(inter_count + non_inter_count)))

# Percentage of intersections/non-intersections
# that have at least one crash
print("percent of intersections with crash:" + str(
    float(counts['inter'])/float(inter_count)))

print("percent of non-intersections with crash:" + str(
    float(counts['non_inter'])/float(non_inter_count)))

print("percent of intersections with more than 1 crash:" + str(
    float(counts['inter_plus'])/float(inter_count)))

print("percent of non-intersections with more than 1 crash:" + str(
    float(counts['non_inter_plus'])/float(non_inter_count)))



Number of intersections:8574
Number of non-intersections:17388
Number of intersection segments with 1/more than 1 crash:1877/914
Number of non-intersection segments with 1/more than 1 crash:1552/369
percent of all segments with crash:0.132077651953
percent of intersections with crash:0.218917658036
percent of non-intersections with crash:0.0892569588222
percent of intersections with more than 1 crash:0.106601352927
percent of non-intersections with more than 1 crash:0.0212215320911


In [33]:
"""                                                                                                                                                                 
What percentage of intersections with concerns had crashes                                                                                                          
at varying counts of concerns?                                                                                                                                      
"""

# matching[n][kind] = [ids with a crash, ids without a crash], counted over
# the ids in `concerns` that have exactly n concerns; kind is 'inter' or
# 'non_inter' as decided by is_inter().
# NOTE(review): unlike the crash-rate cell, an empty id ('' = no near_id)
# is not filtered out here, so if one is present it is tallied as an
# intersection -- confirm whether that is intended.
matching = {}
# .items() and plain `in` work on Python 2 and 3; `in crashes` is a hash
# lookup, whereas `in crashes.keys()` scanned a list on Python 2.
for seg_id, d in concerns.items():
    if d['count'] not in matching:
        matching[d['count']] = {
            'inter': [0, 0],
            'non_inter': [0, 0]
        }
    key = 'inter' if is_inter(seg_id) else 'non_inter'

    if seg_id in crashes:
        matching[d['count']][key][0] += 1
    else:
        matching[d['count']][key][1] += 1

sorted_matching = sorted(matching.items())

results = []
for key, value in sorted_matching:

    # Do the 1+,2+ stats as well as 1, 2
    # Still need to break it out by int/non-int
    total_plus = 0
    crashes_plus = 0
    for key2, value2 in sorted_matching:
        if key2 >= key:
            total_plus += value2['inter'][0] + value2['inter'][1] + \
                value2['non_inter'][0] + value2['non_inter'][1]
            crashes_plus += value2['inter'][0] + value2['non_inter'][0]

    total_inters = value['inter'][0] + value['inter'][1]
    total_non_inters = value['non_inter'][0] + value['non_inter'][1]

    inter_value = value['inter'][0]
    non_inter_value = value['non_inter'][0]

    # A concern count only becomes a key once some id has it, so the
    # combined total is never zero; the per-kind totals can be.
    total_percent = float(inter_value + non_inter_value) / \
        float(total_inters + total_non_inters)
    inters_percent = 0
    if total_inters > 0:
        inters_percent = float(inter_value) / float(total_inters)
    non_inters_percent = 0
    if total_non_inters > 0:
        non_inters_percent = float(non_inter_value) / \
            float(total_non_inters)
    results.append([
        key,
        total_percent * 100,
        total_inters + total_non_inters,
        float(crashes_plus)/(float(total_plus)) * 100,
        inters_percent * 100,
        total_inters,
        non_inters_percent * 100,
        total_non_inters

    ])

# Column order must match the order of values appended to `results` above
pd.DataFrame(results, columns=[
    '# of concerns',
    '% of segments w/ crash',
    'total # of segments with this many concerns',
    '% of segments with this many or more concerns with a crash',
    '% of intersections with this many concerns with a crash',
    '# of intersections with this many concerns',
    '% of non-intersections with this many concerns with a crash',
    '# of non-intersections with this many concerns'])

Unnamed: 0,# of concerns,% of segments w/ crash,total # of segments with this many concerns,% of segments with this many or more concerns with a crash,% of intersections with this many concerns with a crash,# of intersections with this many concerns,% of non-intersections with this many concerns with a crash,# of non-intersections with this many concerns
0,1,22.156315,2866,29.31808,32.866557,1214,14.285714,1652
1,2,33.13253,830,42.211055,47.303922,408,19.43128,422
2,3,42.903226,310,52.099738,55.487805,164,28.767123,146
3,4,41.139241,158,58.40708,54.022989,87,25.352113,71
4,5,64.0,100,67.687075,74.666667,75,32.0,25
5,6,52.830189,53,69.587629,76.666667,30,21.73913,23
6,7,77.142857,35,75.886525,91.666667,24,45.454545,11
7,8,69.230769,26,75.471698,82.352941,17,44.444444,9
8,9,69.230769,13,77.5,88.888889,9,25.0,4
9,10,80.0,10,79.104478,88.888889,9,0.0,1


In [39]:
# For intersections with a specific complaint type:
#    what percentage had a crash?

# requests[type] = {'crash': n, 'no': n, 'count': n}; each counter is
# bumped once per individual request of that type, split by whether the
# id the request was filed at also appears in `crashes`.
requests = {}
# [type, times filed] for every (id, type) pair where the same type was
# filed more than once at that id
all_unique = []

# .items() (not .iteritems()) so the cell runs on both Python 2 and 3
for k, v in concerns.items():
    # Same answer for every request at this id, so look it up once
    has_crash = str(k) in crashes
    unique_requests = {}
    for request in v['REQUESTTYPE']:
        # Clean up badly formatted request types
        vals = request.split('nbsp;')
        if len(vals) > 1:
            request = vals[1]

        unique_requests[request] = unique_requests.get(request, 0) + 1

        if request not in requests:
            requests[request] = {'crash': 0, 'no': 0, 'count': 0}
        if has_crash:
            requests[request]['crash'] += 1
        else:
            requests[request]['no'] += 1
        requests[request]['count'] += 1

    for key, value in unique_requests.items():
        if value > 1:
            all_unique.append([key, value])

# by_type[type] = number of ids at which that type was filed more than once
by_type = {}
for k, v in all_unique:
    by_type[k] = by_type.get(k, 0) + 1

results = []
for k, v in requests.items():
    type_total = v['crash'] + v['no']
    results.append([
        k,
        # A type that never repeats at any single id has no by_type entry;
        # report 0 instead of raising KeyError
        by_type.get(k, 0),
        float(v['crash'])/(float(type_total)) * 100,
        v['crash'],
        type_total
    ])

# NOTE(review): the tallies above are per request, not per id, so the
# "# of segments ..." columns really count requests, while the first
# numeric column counts ids with a repeated type. Labels are left
# unchanged so they match previously recorded output.
pd.DataFrame(results, columns=[
    'Request type',
    '# of requests of this type that appear more than once at a segment',
    '% of segments with this type of request with a crash',
    '# of segments with this type of request with a crash',
    '# of segments with this type of a request total'])

Unnamed: 0,Request type,# of requests of this type that appear more than once at a segment,% of segments with this type of request with a crash,# of segments with this type of request with a crash,# of segments with this type of a request total
0,people speed,118,26.019576,319,1226
1,people double park their vehicles,83,35.023772,221,631
2,there's not enough time to cross the street,19,66.480447,119,179
3,the roadway surface needs maintenance,6,46.938776,23,49
4,people are not given enough time to cross the ...,1,75.0,9,12
5,there are no bike facilities or they need main...,10,26.573427,38,143
6,people have to cross too many lanes / too far,4,66.666667,20,30
7,bike facilities don't exist or need improvement,88,34.242958,389,1136
8,it’s hard to see / low visibility,28,41.491841,178,429
9,people cross away from the crosswalks,30,40.955631,120,293
