In [1]:
## Generate canonical dataset for hackathon
# Developed by: bpben 

In [174]:
import fiona
import json
import os
import pyproj
import rtree
import csv
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point, MultiPoint, shape, mapping
%matplotlib inline

In [175]:
# Relative base paths used by the rest of the notebook:
# DATA_FP holds the JSON inputs and receives the output CSV,
# MAP_FP holds the shapefile inputs.
DATA_FP = '../data'
MAP_FP = '../data/maps'

In [176]:
def read_records(fp, date_col, id_col, agg='week'):
    """Count point records per (id, date part) combination.

    Loads the JSON file at ``fp`` into a DataFrame, parses ``date_col`` as
    datetimes, stores the ``agg`` attribute of every parsed timestamp in a
    column named after ``agg``, and counts the rows in each
    ``(id_col, agg)`` group.

    fp : path of the JSON file to read
    date_col : column name with date information
    id_col : column name with inter/non-inter id (for grouping)
    agg : datepart for aggregation, looked up as an attribute of each
        parsed timestamp (default 'week')

    Returns a Series of row counts indexed by (id_col, agg).
    """
    with open(fp, 'r') as handle:
        records = pd.DataFrame(json.load(handle))

    # Parse the dates once, then pull the requested part off each timestamp.
    records[date_col] = pd.to_datetime(records[date_col])
    records[agg] = records[date_col].apply(lambda stamp: getattr(stamp, agg))

    counts = records.groupby([id_col, agg]).size()
    return counts

def read_shp(fp):
    """Read shp, output a list of (geometry, properties) tuples.

    fp : path passed to ``fiona.open``

    Each feature's 'geometry' entry is converted with ``shape`` and paired
    with that feature's 'properties' entry. The collection is opened in a
    ``with`` block so it is closed as soon as all features have been read,
    rather than being left open until garbage collection.
    """
    with fiona.open(fp) as source:
        out = [(shape(feature['geometry']), feature['properties'])
               for feature in source]
    return out

def road_make(feats, inters_fp, non_inters_fp, agg='max'):
    """ Makes road feature df, intersections + non-intersections

    feats : list of feature column names to keep in the output
    inters_fp : path to a JSON file mapping each intersection id to a list
        of property records
    non_inters_fp : path of the non-intersection shapefile (read with
        read_shp); its properties must include an 'id' field, which
        becomes the index
    agg : aggregation type (default is max), given as the name of a pandas
        groupby method

    Returns a DataFrame with the ``feats`` columns and one row per distinct
    index value, produced by applying ``agg`` to all rows sharing that value.
    """
    # Read in inters data (json), turn into df with inter index
    with open(inters_fp, 'r') as f:
        inters = json.load(f)

    df_index = []
    df_records = []
    # .items() rather than .iteritems(): the latter only exists on
    # Python 2 dicts and raises AttributeError on Python 3.
    for idx, lines in inters.items():
        # One row per record, each labelled with its intersection id
        df_records.extend(lines)
        df_index.extend([idx] * len(lines))
    inters_df = pd.DataFrame(df_records, index=df_index)

    # Read in non_inters data; only the properties (second tuple item) are used
    non_inters = read_shp(non_inters_fp)
    non_inters_df = pd.DataFrame([props for _, props in non_inters])
    non_inters_df = non_inters_df.set_index('id')

    # Combine inter + non_inter, then subset columns
    combined = pd.concat([inters_df, non_inters_df])[feats]

    # Collapse rows sharing an index value with the requested aggregation
    aggregate = getattr(combined.groupby(combined.index), agg)
    return aggregate()

In [177]:
# Read the joined crash and concern records and count them per
# (near_id, week) using read_records' default weekly aggregation
crash = read_records(
    fp=DATA_FP + '/crash_joined.json',
    date_col='CALENDAR_DATE',
    id_col='near_id',
)
concern = read_records(
    fp=DATA_FP + '/concern_joined.json',
    date_col='REQUESTDATE',
    id_col='near_id',
)

In [178]:
# Place the crash and concern counts side by side, aligned on their shared
# index, with one labelled column per source
cr_con = pd.concat([crash, concern], axis=1, keys=['crash', 'concern'])

In [126]:
# Turn the index levels back into columns and treat a missing count as 0
# (no crash/concern for that week); near_id is cast to string so it can be
# matched to segments
cr_con = (
    cr_con
    .reset_index()
    .fillna(0)
    .assign(near_id=lambda frame: frame['near_id'].astype('str'))
)

In [115]:
# Inputs for road_make: the feature columns to keep, then the
# intersection (JSON) and non-intersection (shapefile) sources
feats = [
    'AADT',
    'SPEEDLIMIT',
    'Struct_Cnd',
    'Surface_Tp',
    'F_F_Class',
]
inters_fp = DATA_FP + '/inters_data.json'
non_inters_fp = MAP_FP + '/non_inters_segments.shp'

In [149]:
# Build the combined road feature table and cast every feature column to int
combined = road_make(feats, inters_fp, non_inters_fp).astype('int')

In [None]:
# Every (segment, week) pair: week numbers 1 through 53 for each entry of
# combined's index (a year is about 52.2 weeks, hence a possible 53rd week)
all_weeks = pd.MultiIndex.from_product(
    iterables=[combined.index, range(1, 54)],
    names=['segment_id', 'week'],
)

In [129]:
# Align the counts to the full segment x week grid: pairs with no recorded
# crash/concern get 0, and pairs absent from all_weeks are not kept
cr_con = (
    cr_con
    .set_index(['near_id', 'week'])
    .reindex(all_weeks, fill_value=0)
    .reset_index()
)

In [131]:
# Attach the road features to each segment/week row by matching segment_id
# against combined's index; the outer join keeps keys from both sides
cr_con_roads = pd.merge(
    cr_con,
    combined,
    how='outer',
    left_on='segment_id',
    right_index=True,
)

In [173]:
# Write the canonical dataset as a gzip-compressed CSV, indexed by segment_id
(
    cr_con_roads
    .set_index('segment_id')
    .to_csv(DATA_FP + '/vz_predict_dataset.csv.gz', compression='gzip')
)