In [2]:
## Generate canonical dataset for hackathon
# Developed by: bpben 

In [1]:
import fiona
import json
import os
import pyproj
import rtree
import csv
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point, MultiPoint, shape, mapping
%matplotlib inline

In [2]:
# Root folder for the input/output data files used below
DATA_FP = './data'
# Folder holding the map files
MAP_FP = './data/maps'

In [10]:
def read_records(fp, date_col, id_col, agg='week'):
    """ Load point records from a json file and count them per id / date part

    fp : path to a json file that pandas can turn into a DataFrame
    date_col : name of the column holding the record date
    id_col : name of the column holding the inter/non-inter id (group key)
    agg : name of the timestamp attribute to group on (e.g. 'week', 'month')

    Returns a Series of record counts indexed by (id_col, agg).
    """

    def date_part(stamp):
        # Pull the requested attribute (week, month, ...) off one timestamp
        return getattr(stamp, agg)

    with open(fp, 'r') as handle:
        records = pd.DataFrame(json.load(handle))

    # Parse the dates, then store the requested date part in a column named after it
    records[date_col] = pd.to_datetime(records[date_col])
    records[agg] = records[date_col].map(date_part)

    # Number of records in each (id, date part) group
    counts = records.groupby([id_col, agg]).size()
    return counts

def read_shp(fp):
    """ Read shp, output list of (geometry, properties) tuples

    fp : path handed to fiona.open
    Returns one tuple per feature: the result of shape() applied to the
    feature's 'geometry' entry, and the feature's 'properties' entry.
    """
    # Open through a context manager so the dataset handle is closed as soon
    # as every feature has been read, instead of being left open
    with fiona.open(fp) as src:
        out = [(shape(feat['geometry']), feat['properties']) for feat in src]
    return(out)

def road_make(feats, inters_fp, non_inters_fp):
    """ Makes road feature df, intersections + non-intersections

    feats : list of column names to keep in the output
    inters_fp : path to a json object mapping each inter index to a list
        of records
    non_inters_fp : path to the non-intersection shapefile, read with
        read_shp; its properties must include an 'id' field
    Returns a df with one row per index value; where an index has several
    records, each column holds the max across them.
    """
    #Read in inters data (json)
    with open(inters_fp, 'r') as f:
        inters = json.load(f)

    #Flatten to one row per record, repeating the inter index for each.
    #.items() is used instead of the Python 2-only .iteritems() so the
    #loop runs on both Python 2 and Python 3
    df_index = []
    df_records = []
    for idx, lines in inters.items():
        df_records.extend(lines)
        df_index.extend([idx] * len(lines))
    inters_df = pd.DataFrame(df_records, index=df_index)

    #Read in non_inters data, keeping only the properties, indexed by 'id'
    non_inters = read_shp(non_inters_fp)
    non_inters_df = pd.DataFrame([props for _, props in non_inters]).set_index('id')

    #Combine inter + non_inter
    combined = pd.concat([inters_df, non_inters_df])

    #Subset columns
    combined = combined[feats]

    #Aggregating inters data = max of all properties
    combined = combined.groupby(combined.index).max()

    return(combined)

In [4]:
#Count crash and concern records per (near_id, week)
crash = read_records(
    fp=DATA_FP + '/crash_joined.json',
    date_col='CALENDAR_DATE',
    id_col='near_id',
)
concern = read_records(
    fp=DATA_FP + '/concern_joined.json',
    date_col='REQUESTDATE',
    id_col='near_id',
)

In [5]:
#Put the two count series side by side, aligned on their shared index,
#naming the resulting columns as they are joined
cr_con = pd.concat([crash, concern], axis=1, keys=['crash', 'concern'])

In [6]:
#Move the index levels into ordinary columns, then treat missing values as 0
cr_con = cr_con.reset_index().fillna(0)

In [7]:
#Inputs for the combined road feature dataset
#Columns kept from the road data
feats = ['AADT', 'SPEEDLIMIT', 'Struct_Cnd', 'Surface_Tp', 'F_F_Class']
non_inters_fp = MAP_FP + '/non_inters_segments.shp'
#NOTE(review): unlike the other inputs, this path is not prefixed with
#DATA_FP or MAP_FP, so it resolves against the working directory - confirm
inters_fp = 'inters_data.json'

In [11]:
#Build the road feature table covering inters + non-inters
combined = road_make(
    feats=feats,
    inters_fp=inters_fp,
    non_inters_fp=non_inters_fp,
)

In [12]:
#Spot-check the features of a single entry
combined.loc['0010', :]

AADT           0
SPEEDLIMIT    20
Struct_Cnd     2
Surface_Tp     6
F_F_Class      7
Name: 0010, dtype: object

In [13]:
#Attach road features to each crash/concern row by matching near_id against
#the feature table's index; inner join, so unmatched rows are dropped
cr_con_roads = pd.merge(
    cr_con,
    combined,
    how='inner',
    left_on='near_id',
    right_index=True,
)

In [14]:
#Output canon dataset
#Rename the id column by name rather than by position, so the right column
#is relabelled even if the column order changes (e.g. a cell was re-run)
cr_con_roads = cr_con_roads.rename(columns={'near_id': 'segment_id'})
cr_con_roads.to_csv(DATA_FP + '/vz_predict_dataset.csv', index=False)