<h1> Data Processing for Models</h1>
<h3>Title:</h3> <em>Processing data using Traffic and Weather datasets to prepare train and test sets for models</em> 
<h3>Author:</h3> <em>Uttam Kumar</em>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import pytz
import pygeohash as gh
from haversine import haversine
import time
import pickle
import re
import random
import os
import multiprocessing
from multiprocessing import cpu_count,Pool 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import os.path
from os import path
import warnings; warnings.simplefilter('ignore') #to ignore printing of warnings

<em> Defining a helper class to utilize weather data for feature vector construction </em>

In [2]:
#defining class for weather parameters
class weather:
    date = ''
    temp = 0.0
    windchill = 0.0
    humid = 0.0
    pressure= 0.0
    visib = 0.0
    windspeed = 0.0
    winddir = ''
    precipitation = 0.0
    events = ''
    condition = ''
    
    def __init__(self, date, temp, windchill, humid, pressure, visib, windspeed, winddir, 
                 precipitation, events, condition, zone):
        self.date = datetime.strptime(date, '%Y-%m-%d %I:%M:%S %p')
        self.date = self.date.replace(tzinfo=pytz.timezone(zone))
        self.temp = float(temp)
        self.windchill = float(windchill)
        self.humid = float(humid)
        self.pressure = float(pressure)
        self.visib = float(visib)
        self.windspeed = float(windspeed)
        self.winddir = winddir
        self.precipitation = float(precipitation)
        self.events = events
        self.condition = condition

<em>  Meta data for each city including geo-fence and time-zone  </em> 

In [3]:
cities = {'LosAngeles': [33.700615, 34.353627, -118.683511, -118.074559], 
           'Houston': [29.497907,30.129003,-95.797178,-94.988191],
           'Austin': [30.079327, 30.596764,-97.968881,-97.504838],
           'Dallas': [32.559567,33.083278,-97.036586,-96.428928],
           'Charlotte': [34.970168,35.423667,-81.060925,-80.622687],
           'Atlanta': [33.612410,33.916999,-84.575600,-84.231911]}

time_zones = {'Houston':'US/Central', 'Charlotte':'US/Eastern', 'Dallas':'US/Central',
              'Atlanta':'US/Eastern', 'Austin':'US/Central', 'LosAngeles':'US/Pacific'}

# time interval to take a sample data for 
start = datetime(2018, 6, 1)
finish   = datetime(2018, 9, 2)

begin = datetime.strptime('2018-06-01 00:00:00', '%Y-%m-%d %H:%M:%S')
end   = datetime.strptime('2018-08-31 23:59:59', '%Y-%m-%d %H:%M:%S')

<em>  Sample Traffic Event Data for all Cities </em> 

In [4]:
#loading the additional traffc and weather events dataset
#traffic_df = pd.read_csv('../data_files/TrafficWeatherEvent_Aug16_June19_Publish.csv')
#loading data from newly published dataset
traffic_df = pd.read_csv('../data_files/TrafficEvents_Aug16_Dec20_Publish.csv')
print(len(traffic_df.columns))
print(len(traffic_df))
traffic_df.head(2)

19
31355575


Unnamed: 0,EventId,Type,Severity,TMC,Description,StartTime(UTC),EndTime(UTC),TimeZone,LocationLat,LocationLng,Distance(mi),AirportCode,Number,Street,Side,City,County,State,ZipCode
0,T-38768,Congestion,Moderate,73,Severe delays of 18 minutes on US-101 Redwood ...,2016-08-01 00:03:00,2016-08-01 00:14:28,US/Pacific,38.214657,-122.602669,0.0,KO69,,Redwood Hwy S,R,Petaluma,Sonoma,CA,94952.0
1,T-38772,Congestion,Moderate,72,Delays of eight minutes on CA-92 San Mateo Rd ...,2016-08-01 00:07:00,2016-08-01 00:18:44,US/Pacific,37.477329,-122.415703,0.0,KHAF,12685.0,San Mateo Rd,L,Half Moon Bay,San Mateo,CA,94019.0


In [5]:
traffic_df['StartTime(UTC)'] = traffic_df['StartTime(UTC)'].astype('datetime64[ns]', errors = 'ignore')
traffic_df['EndTime(UTC)'] = traffic_df['EndTime(UTC)'].astype('datetime64[ns]', errors = 'ignore')
print(len(traffic_df.columns))

19


In [6]:
#iterating over all cities for time
for c in cities:
    crds = cities[c]
    #subset of bith classes i.e accidents and non-accidents of each of the six cities
    subset_all = traffic_df[(traffic_df['StartTime(UTC)'] >= start) & (traffic_df['StartTime(UTC)'] < end) & (traffic_df['LocationLat']>crds[0]) & (traffic_df['LocationLat']<crds[1]) & (traffic_df['LocationLng']>crds[2]) & (traffic_df['LocationLng']<crds[3])]
    subset_all.to_csv('../data_files/temporary/TE_{}_20180601_20180609.csv'.format(c), index=False)
    print(len(subset_all))
    subset_all.head(2)

92989
45463
20440
30545
18600
23944


<em>  Setting UTC as the default timezone for each traffic event </em> 

In [7]:
## converting time from UTC to local time zone for each city 
te_city2incidents = {}
for c in cities:
    incidents = []
    z = time_zones[c]
    
    #opening the subset of data containing records of accidents and non-accidents for each of the cities
    with open('../data_files/temporary/TE_{}_20180601_20180609.csv'.format(c), 'r') as file:
        header = False
        for line in file:
            if not header:
                header = True
                continue
            parts = line.replace('\r', '').replace('\n', '').split(',')
            
            '''
            #parts e.g.
            ['T-21832562', 'Broken-Vehicle', '', '211', 
            'Entry ramp to I-605 Northbound from CA-22 Eastbound lane blocked due to stalled vehicle.', 
            '2018-08-31 04:39:48', '2018-08-31 05:24:12', 'US/Pacific', '33.777599', '-118.09063', 
            '0.600000023842', 'KSLI', '', 'San Diego Fwy S', 'R', 'Seal Beach', 'Orange', 'CA', '90740.0']
            '''
            
            ds = datetime.strptime(parts[5].replace('T',' '), '%Y-%m-%d %H:%M:%S')
            ds = ds.replace(tzinfo=pytz.utc)
            ds = ds.astimezone(pytz.timezone(z))
            de = datetime.strptime(parts[6].replace('T',' '), '%Y-%m-%d %H:%M:%S')
            de = de.replace(tzinfo=pytz.utc)
            de = de.astimezone(pytz.timezone(z))
            
            v = [parts[0], parts[2], float(parts[8]), float(parts[9]), ds, de]            
            incidents.append(v)
            
    te_city2incidents[c] = incidents

<em>  Constructing Feature Vectors for pairs of City-Geohash </em> 

In [8]:
zone_to_be = {}
for z in ['US/Eastern', 'US/Central', 'US/Mountain', 'US/Pacific']:
    t_begin = begin.replace(tzinfo=pytz.timezone(z))
    t_end   = end.replace(tzinfo=pytz.timezone(z))
    zone_to_be[z] = [t_begin, t_end]

<em> Getting Traffic Data </em>

In [9]:
#defining a geohash precision for use in codes ahead
geohash_prec = 5 #to represent area of 4.89*4.89 Sq Km

#defining a dict containing new names of some traffic events
name_conversion = {'Broken-Vehicle':'BrokenVehicle', 'Flow-Incident': 'FlowIncident', 'Lane-Blocked':'RoadBlocked'}

def return_interval_index(time_stamp, start, end):
    if time_stamp < start or time_stamp>end: 
        return -1
    index = int(((time_stamp - start).days*24*60 + (time_stamp-start).seconds/60)/15)
    return index

#15 is used to divide the interval each being of 15 minute for each feature vector
diff = int(((end - begin).days*24*60 + (end-begin).seconds/60)/15) # i.e. number of 15 minutes intervals

city_to_geohashes = {}
for c in cities: 
    #geohashes of each city, defining initially as empty
    city_to_geohashes[c] = {}

start_timestamp = time.time()
ccnntt = 0

geocode_to_airport = {}
aiport_to_timezone = {}

for c in cities:
    z = time_zones[c]
    
    #opening the subset data containing records of accidents and non-accidents for each of the cities
    with open('../data_files/temporary/TE_{}_20180601_20180609.csv'.format(c), 'r') as file:
        header = False
        for line in file:
            if not header:
                header = True
                continue
                
            #framing the 12 columns of the dataset into an array    
            parts = line.replace('\r', '').replace('\n', '').split(',')
            
            '''
            #parts e.g.
            ['T-21832562', 'Broken-Vehicle', '', '211', 
            'Entry ramp to I-605 Northbound from CA-22 Eastbound lane blocked due to stalled vehicle.', 
            '2018-08-31 04:39:48', '2018-08-31 05:24:12', 'US/Pacific', '33.777599', '-118.09063', 
            '0.600000023842', 'KSLI', '', 'San Diego Fwy S', 'R', 'Seal Beach', 'Orange', 'CA', '90740.0']
            '''
            
            ds = datetime.strptime(parts[5].replace('T',' '), '%Y-%m-%d %H:%M:%S')
            ds = ds.replace(tzinfo=pytz.utc)
            ds = ds.astimezone(pytz.timezone(z))
            s_interval = return_interval_index(ds, zone_to_be[z][0], zone_to_be[z][1])
            if s_interval==-1: 
                continue
                
            de = datetime.strptime(parts[6].replace('T',' '), '%Y-%m-%d %H:%M:%S')
            de = de.replace(tzinfo=pytz.utc)
            de = de.astimezone(pytz.timezone(z))
            e_interval = return_interval_index(de, zone_to_be[z][0], zone_to_be[z][1])
            if e_interval == -1: 
               e_interval = diff-1    
            
            #converting LocationLat and LocationLng geo coordinates of the traffic event location to geohash
            start_gh = gh.encode(float(parts[8]), float(parts[9]), precision=geohash_prec)
            intervals = []
            #check if this geohash location exist within the city, if not then append 0 values to intervals
            if start_gh not in city_to_geohashes[c]:
                for i in range(diff): 
                    intervals.append({'Construction':0, 'Congestion':0, 'Accident':0, 'FlowIncident':0, 'Event':0, 
                                      'BrokenVehicle':0, 'RoadBlocked':0, 'Other':0})
            else:
                intervals = city_to_geohashes[c][start_gh]
            
            #checking if the traffic event is present in our selected traffic events name conversion dict
            if parts[4] in name_conversion:
                tp = name_conversion[parts[4]]
            else: 
                tp = parts[4].split('-')[0]
            
            #for each of the 15 minutes time interval
            for i in range(s_interval, e_interval+1):                
                v = intervals[i]
                if tp in v: 
                    v[tp] = v[tp] + 1
                else: 
                    v['Other'] = v['Other'] + 1
                intervals[i] = v
                
                if tp == 'Accident': 
                    break # unlike other types of traffic events 
                
            city_to_geohashes[c][start_gh] = intervals
            
            ap = parts[11]
            #i.e. airport code of closest airport station to the location of a traffic event
            if len(ap) > 3:
                if start_gh not in geocode_to_airport:
                    geocode_to_airport[start_gh] = set([ap])
                else:
                    st = geocode_to_airport[start_gh]
                    st.add(ap)
                    geocode_to_airport[start_gh] = st
                aiport_to_timezone[ap] = z
  
        start_timestamp = time.time()

<em>  Getting the weather data </em> 

In [10]:
# load and sort relevant weather data
airports_to_observations = {}
#geocode_to_airport is of the form
# {'9qh00': {'KLGB', 'KSLI'}, '9q5bn': {'KLGB'}, '9q5bw': {'KLGB'}, 
#  '9q5c1': {'KLAX'}, '9q5c5': {'KLAX', 'KHHR'}, ....
for g in geocode_to_airport:
    aps = geocode_to_airport[g]
    for a in aps:
        if a not in airports_to_observations:
           airports_to_observations[a] = []
            
w_path = '../data_files/Sample_Weather/' # weather observation records
airport_to_data = {}
for ap in airports_to_observations:
    data = []
    z = aiport_to_timezone[ap]
    header = ''
    if path.exists(w_path + ap + '.csv'):
      with open(w_path + ap + '.csv', 'r') as file:
          for line in file:
              if 'Airport' in line: 
                  header = line.replace('\r','').replace('\n','').replace(',Hour','')
                  continue
              parts = line.replace('\r', '').replace('\n', '').split(',')
              #parts e.g data is as following
              # ['KLGB', '2016-8-11', '9:53 PM', '69.1', '-1000.0', 
              #  '75.0', '30.01', '10.0', '6.9', 'NW', '-1000.0', 'N/A', 'Clear']
              try:
                  w = weather(parts[1] + ' ' + parts[2].split(' ')[0] + ':00 ' + parts[2].split(' ')[1], parts[3], parts[4], 
                             parts[5], parts[6], parts[7], parts[8], parts[9], parts[10], parts[11], parts[12], z)   
                  data.append(w)
              except:
                  continue
    data.sort(key=lambda x:x.date)
    airport_to_data[ap] = data


<em> Finding missing airports using havershine <br> 
 We find the closest weather station using haversine distance and geo codes. Then, we take the weather observation record whose reported time
was closest to the start time of traffic event.</em> 

In [11]:
for c in city_to_geohashes:
    #e.g. of c value is : LosAngeles
    for g in city_to_geohashes[c]:
        #e.g. of g value is 9qh00 or 9vk0z or 9v6s1 or 9vg4w or dnq83 or dn5bp
        #e.g. of geocode_to_airport value is
        #{'9qh00': {'KLGB', 'KSLI'}, '9q5bn': {'KLGB'}, '9q5bw': {'KLGB'}, '9q5c1': {'KLAX'}, ...
        #now checking if g exists in geocode_to_airport dict or not, if not then add using haversine distance
        if g not in geocode_to_airport:
            gc = gh.decode_exactly(g)[0:2]
            min_dist = 1000000000
            close_g = ''
            for _g in geocode_to_airport:
                _gc = gh.decode_exactly(_g)[0:2]
                dst = haversine(gc, _gc, 'km')
                if dst < min_dist:
                    min_dist = dst
                    close_g = _g
            geocode_to_airport[g] = geocode_to_airport[close_g]

In [12]:
city_to_geohashes_to_weather = {}

for c in city_to_geohashes:
    #e.g. value of c is LosAngeles
    start = time.time()
    geo2weather = {}
    for g in city_to_geohashes[c]:
        #e.g. value of g is 9qh00
        w_data = []
        for i in range(len(city_to_geohashes[c][g])):
            w_data.append({'Temperature':[], 'Humidity':[], 'Pressure':[], 'Visibility':[], 'WindSpeed':[], 
                          'Precipitation':[], 'Condition':set(), 'Event':set()})
            #i.e. for each of the cities, we collect data in format
            #[{'Temperature': [], 'Humidity': [], 'Pressure': [], 'Visibility': [], 'WindSpeed': [],
            #  'Precipitation': [], 'Condition': set(), 'Event': set()}]

        # populate weather data
        aps = geocode_to_airport[g]
        for a in aps:
            #e.g. value of 'a' is KLGB
            z = aiport_to_timezone[a]
            #e.g. value of z is US/Pacific
            a_w_data = airport_to_data[a]
            prev = 0
            for a_w_d in a_w_data:
                idx = return_interval_index(a_w_d.date, zone_to_be[z][0], zone_to_be[z][1])
                if idx >-1:
                    for i in range(prev, min(idx+1, len(w_data))):
                        _w = w_data[i]
                        
                        _tmp = _w['Temperature']
                        if a_w_d.temp > -1000:
                            _tmp.append(a_w_d.temp)
                            _w['Temperature'] = _tmp
                        
                        _hmd = _w['Humidity']
                        if a_w_d.humid > -1000:
                            _hmd.append(a_w_d.humid)
                            _w['Humidity'] = _hmd
                        
                        _prs = _w['Pressure']
                        if a_w_d.pressure > -1000:
                            _prs.append(a_w_d.pressure)
                            _w['Pressure'] = _prs
                        
                        _vis = _w['Visibility']
                        if a_w_d.visib > -1000:
                            _vis.append(a_w_d.visib)
                            _w['Visibility'] = _vis
                            
                        _wspd = _w['WindSpeed']
                        if a_w_d.windspeed > -1000:
                            _wspd.append(a_w_d.windspeed)
                            _w['WindSpeed'] = _wspd
                            
                        _precip = _w['Precipitation']
                        if a_w_d.precipitation > -1000:
                            _precip.append(a_w_d.precipitation)
                            _w['Precipitation'] = _precip
                            
                        _cond = _w['Condition']
                        _cond.add(a_w_d.condition)
                        _w['Condition'] = _cond
                        
                        _evnt = _w['Event']
                        _evnt.add(a_w_d.events)
                        _w['Event'] = _evnt
                        
                        w_data[i] = _w
                        
                    prev = idx+1                                               
        geo2weather[g] = w_data
        #e.g. of a w_data row is:  [{'Temperature': [69.1, 68.0], 'Humidity': [96.0, 100.0], 
        # 'Pressure': [29.93, 29.99], 'Visibility': [9.0, 10.0], 'WindSpeed': [3.5, 3.5], 
        # 'Precipitation': [], 'Condition': {'Clear', 'Mostly Cloudy'}, 'Event': {'N/A'}},
        # {},{},...,{}]
    city_to_geohashes_to_weather[c] = geo2weather

<em>  Adding Daylight Data </em> 

In [13]:
class dayLight:
    sunrise = []
    sunset = []
    def __init__(self, sunrise, sunset):
        self.sunrise = sunrise
        self.sunset = sunset
        
#function that returns hour and minute part of a given time        
def return_time(x):
    try:
        h = int(x.split(':')[0])
        m = int(x.split(':')[1].split(' ')[0])
        if 'pm' in x and h < 12: 
            h = h + 12
        return [h,m]
    except: 
        return [0,0]

#function to return values of 1 or 0 for day or night given a time of a city
def returnDayLight(city, state, dt):
    sc = city + '-' + state
    days = city_days_time[sc]
    d = str(dt.year) + '-' + str(dt.month) + '-' + str(dt.day)
    if d in days:
        r = days[d]
        if ((dt.hour>r.sunrise[0] and dt.hour<r.sunset[0]) or
            (dt.hour>=r.sunrise[0] and dt.minute>=r.sunrise[1] and dt.hour<r.sunset[0]) or
            (dt.hour>r.sunrise[0] and dt.hour<=r.sunset[0] and dt.minute<r.sunset[1]) or 
            (dt.hour>=r.sunrise[0] and dt.minute>=r.sunrise[1] and dt.hour<=r.sunset[0] and dt.minute<r.sunset[1])):
            return '1'
        else: 
            return '0'

In [14]:
city_days_time = {}

days = {}
city = ''
with open('../data_files/sample_daylight.csv', 'r') as file: # you find daylight data for the selected 6 cities in this file
    for ln in file.readlines():
        parts = ln.replace('\r','').replace('\n','').split(',')
        print(parts)
        #e.g. of parts value is ['Charlotte-NC', '2016-1-1', '7:32 am ', '5:22 pm ', '9:49:53', '6:01 am', 
        #                         '6:52 pm', '6:32 am', '6:21 pm', '7:03 am', '5:50 pm', '12:27 pm ']
        
        if parts[0] != city:
            if len(city) > 0:   #checking if city stores some city name or not 
                #i.e. if parts[0] is not equals to a city val
                if city in city_days_time: 
                    _days = city_days_time[city]
                    for _d in _days: 
                        days[_d] = _days[_d]
                city_days_time[city] = days

            city = parts[0]
            days = {}

        sunrise = return_time(parts[2])
        sunset  = return_time(parts[3])
        dl = dayLight(sunrise, sunset)
        days[parts[1]] = dl

if city in city_days_time:
    _days = city_days_time[city]
    for _d in _days: 
        days[_d] = _days[_d]
city_days_time[city] = days

print('Successfully loaded daylight data for {} cities!'.format(len(city_days_time)))

['Charlotte-NC', '2016-1-1', '7:32 am ', '5:22 pm ', '9:49:53', '6:01 am', '6:52 pm', '6:32 am', '6:21 pm', '7:03 am', '5:50 pm', '12:27 pm ']
['Charlotte-NC', '2016-1-2', '7:32 am ', '5:22 pm ', '9:50:28', '6:01 am', '6:53 pm', '6:32 am', '6:22 pm', '7:04 am', '5:50 pm', '12:27 pm ']
['Charlotte-NC', '2016-1-3', '7:32 am ', '5:23 pm ', '9:51:06', '6:01 am', '6:54 pm', '6:32 am', '6:23 pm', '7:04 am', '5:51 pm', '12:27 pm ']
['Charlotte-NC', '2016-1-4', '7:32 am ', '5:24 pm ', '9:51:48', '6:02 am', '6:54 pm', '6:32 am', '6:24 pm', '7:04 am', '5:52 pm', '12:28 pm ']
['Charlotte-NC', '2016-1-5', '7:32 am ', '5:25 pm ', '9:52:32', '6:02 am', '6:55 pm', '6:33 am', '6:24 pm', '7:04 am', '5:53 pm', '12:28 pm ']
['Charlotte-NC', '2016-1-6', '7:32 am ', '5:26 pm ', '9:53:19', '6:02 am', '6:56 pm', '6:33 am', '6:25 pm', '7:04 am', '5:54 pm', '12:29 pm ']
['Charlotte-NC', '2016-1-7', '7:32 am ', '5:26 pm ', '9:54:10', '6:02 am', '6:57 pm', '6:33 am', '6:26 pm', '7:04 am', '5:54 pm', '12:29 pm ']

['Austin-TX', '2018-7-9', '6:36 am ', '8:35 pm ', '13:59:04', '5:01 am', '10:11 pm', '5:36 am', '9:35 pm', '6:09 am', '9:03 pm', '1:36 pm ']
['Austin-TX', '2018-7-10', '6:37 am ', '8:35 pm ', '13:58:20', '5:02 am', '10:10 pm', '5:37 am', '9:35 pm', '6:10 am', '9:02 pm', '1:36 pm ']
['Austin-TX', '2018-7-11', '6:37 am ', '8:35 pm ', '13:57:35', '5:02 am', '10:10 pm', '5:37 am', '9:35 pm', '6:10 am', '9:02 pm', '1:36 pm ']
['Austin-TX', '2018-7-12', '6:38 am ', '8:35 pm ', '13:56:47', '5:03 am', '10:09 pm', '5:38 am', '9:34 pm', '6:11 am', '9:02 pm', '1:36 pm ']
['Austin-TX', '2018-7-13', '6:38 am ', '8:34 pm ', '13:55:58', '5:04 am', '10:09 pm', '5:39 am', '9:34 pm', '6:11 am', '9:01 pm', '1:36 pm ']
['Austin-TX', '2018-7-14', '6:39 am ', '8:34 pm ', '13:55:06', '5:05 am', '10:08 pm', '5:39 am', '9:34 pm', '6:12 am', '9:01 pm', '1:37 pm ']
['Austin-TX', '2018-7-15', '6:39 am ', '8:34 pm ', '13:54:13', '5:05 am', '10:08 pm', '5:40 am', '9:33 pm', '6:12 am', '9:01 pm', '1:37 pm ']
['Austi

['Houston-TX', '2017-10-10', '7:20 am ', '6:55 pm ', '11:35:56', '6:00 am', '8:15 pm', '6:28 am', '7:47 pm', '6:56 am', '7:19 pm', '1:08 pm ']
['Houston-TX', '2017-10-11', '7:20 am ', '6:54 pm ', '11:34:12', '6:01 am', '8:14 pm', '6:29 am', '7:46 pm', '6:56 am', '7:18 pm', '1:07 pm ']
['Houston-TX', '2017-10-12', '7:21 am ', '6:53 pm ', '11:32:28', '6:02 am', '8:12 pm', '6:29 am', '7:45 pm', '6:57 am', '7:17 pm', '1:07 pm ']
['Houston-TX', '2017-10-13', '7:21 am ', '6:52 pm ', '11:30:45', '6:02 am', '8:11 pm', '6:30 am', '7:44 pm', '6:57 am', '7:16 pm', '1:07 pm ']
['Houston-TX', '2017-10-14', '7:22 am ', '6:51 pm ', '11:29:02', '6:03 am', '8:10 pm', '6:30 am', '7:43 pm', '6:58 am', '7:15 pm', '1:07 pm ']
['Houston-TX', '2017-10-15', '7:23 am ', '6:50 pm ', '11:27:19', '6:03 am', '8:09 pm', '6:31 am', '7:42 pm', '6:59 am', '7:14 pm', '1:07 pm ']
['Houston-TX', '2017-10-16', '7:23 am ', '6:49 pm ', '11:25:37', '6:04 am', '8:08 pm', '6:32 am', '7:41 pm', '6:59 am', '7:13 pm', '1:06 pm ']

In [15]:
# pre-loading daylight mapping for different cities
city_to_index_to_daylight = {}
states = {'Houston':'TX', 'Charlotte':'NC', 'Dallas':'TX', 'Atlanta':'GA', 'Austin':'TX', 'LosAngeles':'CA'}
for c in cities:
    d_begin = begin.replace(tzinfo=pytz.timezone(time_zones[c]))
    d_end   = end.replace(tzinfo=pytz.timezone(time_zones[c]))
    index_to_daylight = {}
    index = 0
    while(d_begin < d_end):
        dl = returnDayLight(c, states[c], d_begin)
        index_to_daylight[index] = dl
        index += 1
        d_begin += timedelta(seconds=15*60)
    city_to_index_to_daylight[c] = index_to_daylight

<em> Writing the Feature Vectors <br>
Here each vector represent a 15 minutes time interval for each geohash (geohash: a region of size 4.89km x 4.89km)</em> 

In [16]:
# mapping each time-step to hour of day and day of the week; this should be consistent across different time-zones!
timestep_to_dow_hod = {}
d_begin = begin.replace(tzinfo=pytz.utc)
d_end   = end.replace(tzinfo=pytz.utc)
index = 0

while(d_begin < d_end):
    dow = d_begin.weekday()
    hod = d_begin.hour    
    timestep_to_dow_hod[index] = [dow, hod]
    
    d_begin += timedelta(seconds=15*60)    
    index += 1

In [17]:
# traffic events
traffic_tags = ['Accident', 'BrokenVehicle', 'Congestion', 'Construction', 'Event', 'FlowIncident', \
                'Other', 'RoadBlocked']
# weather events
weather_tags = ['Condition', 'Event', 'Humidity', 'Precipitation', 'Pressure', 'Temperature', \
                'Visibility', 'WindSpeed']
poi_tags = []
start = time.time()
condition_tags = set()

for c in city_to_geohashes:
    #e.g. value of c is LosAngeles
    # creating vector for each geohash region during a 15 minutes time interval. 
    #  Such vector contains time, traffic and weather attributes. 
    writer = open('../data_files/vectors/{}_geo2vec_{}-{}.csv'.format(c, \
                    str(begin.year)+str(begin.month)+str(begin.day), \
                    str(end.year)+str(end.month)+str(end.day)), 'w')
    writer.write('Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,'\
        'T-Event,T-FlowIncident,T-Other,T-RoadBlocked,W-Humidity,W-Precipitation,W-Pressure,'\
        'W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail\n')
    
    traffic = city_to_geohashes[c]
    weather = city_to_geohashes_to_weather[c] 
    
    for g in traffic:
        vectors = []
        for i in range(len(traffic[g])):
            v = []
            for t in traffic_tags: 
                v.append(traffic[g][i][t])
            v_w = [0,0,0,0] # for rain, snow, fog, and hail respectively
            for w in weather_tags:
                if w=='Condition' or w=='Event':      
                    _tgs = weather[g][i][w]
                    for _tg in _tgs: 
                        #merging all rain. drizzle, thunderstorm into one category 
                        if 'rain' in _tg.lower() or 'drizzle' in _tg.lower() or 'thunderstorm' in _tg.lower():
                            v_w[0] = 1
                        elif 'snow' in _tg.lower():
                            v_w[1] = 1
                        #merging all fog. haze, mist, smoke into one category 
                        elif 'fog' in _tg.lower() or 'haze' in _tg.lower() or 'mist' in _tg.lower() \
                              or 'smoke' in _tg.lower():
                            v_w[2] = 1
                        #merging hail, ice pellets into one category 
                        elif 'hail' in _tg.lower() or 'ice pellets' in _tg.lower(): 
                            v_w[3] = 1   
                            
                elif len(weather[g][i][w]) == 0: 
                    v.append(0)
                else: 
                    v.append(np.mean(weather[g][i][w]))
            for _v_w in v_w: 
                v.append(_v_w)
            vectors.append(v)
        
        for i in range(len(vectors)):
            v = vectors[i]
            v = [str(v[j]) for j in range(len(v))]
            v = ','.join(v)
            writer.write(g + ',' + str(i) + ',' + str(timestep_to_dow_hod[i][0]) + ',' \
                         + str(timestep_to_dow_hod[i][1]) \
                         + ',' + city_to_index_to_daylight[c][i] + ',' + v + '\n')
            #print(g + ',' + str(i) + ',' + str(timestep_to_dow_hod[i][0]) + ',' \
            #             + str(timestep_to_dow_hod[i][1]) \
            #             + ',' + city_to_index_to_daylight[c][i] + ',' + v + '\n')
            #e.g. first value as printed for LosAngeles is:
            # 9qh00,0,4,0,0,0,0,0,0,0,0,0,0,80.0,0,29.94,58.85,10.0,0,0,0,0,0
            #of another city, the first value for e.g. is:
            # 9v6s1,0,4,0,0,0,0,0,0,0,0,0,0,87.0,0,29.84,75.0,10.0,4.6,0,0,0,0
            #break
        #break
            
    writer.close()
    start = time.time()

<em> Loading GloVe Word Embedding Vectors </em>

In [18]:
word2vec = {}
#to check how values are stored in the file, we use a counter to get 1 of the values printed
cntr = 0
#opening the GloVe vector file 
with open('../data_files/glove.6B.100d.txt', 'r') as reader:  
    for line in reader:
        cntr += 1
        parts = line.replace('\r', '').replace('\n', '').split(' ')
        v = [float(parts[i]) for i in range(1, len(parts))]
        word2vec[parts[0]] = v
        if cntr == 1:
            print('One of the records of glove file looks as:')
            print(parts)
            print('\nFirst vector formed is: ')
            print(v)
            print('\n one of the values of word2vec formed is')
            print(word2vec)

One of the records of glove file looks as:
['the', '-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172', '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459', '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231', '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336', '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971', '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722', '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397', '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531', '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477', '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205', '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167', '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079', '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044', '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972', '0.15006', '-0.53212', '-0.2055', '-1.2526', '0.071624', '0.70565', '0.49744',

In [19]:
#function to return vector form of a description using the word2vec built above using GloVe file
def return_desc2vec(input):
    #splitting the input description into parts 
    parts = re.split(' - | |\.|\\\|/|;|,|&|!|\?|\(|\)|\[|\]|\{|\}', input)
    parts = [p.lower() for p in parts]
    v = []
    for p in parts:
        if len(p) ==0: 
            continue
        #checking if a word is presnt in word2vec, then the corresponding vector is appended
        if p in word2vec: 
            v.append(word2vec[p])
    if len(v) ==0: 
        None
    v = np.mean(v, axis=0)
    return v

<em> Using Traffic Event Data to Create Embedding Vector </em>

In [20]:
# load valid geohashes
valid_geohashes = set() 
#we only generate data for those regions/geohashes that have valid POI data 
#taking a counter to see one of the data
cntr = 0
with open('../data_files/geohash_to_poi_vec.csv', 'r') as reader:
    for line in reader:
        cntr += 1
        if 'Geohash' in line: 
            continue
        valid_geohashes.add(line.split(',')[0])
        if cntr == 1:
           print(valid_geohashes)

In [21]:
geo_to_vec = {}
start_timestamp = time.time()
#taking a counter to see one of the data
cntr = 0
for c in cities:
    with open('../data_files/temporary/TE_{}_20180601_20180609.csv'.format(c), 'r') as file:
        header = False
        for line in file:
            if not header:
                header = True
                continue
            cntr += 1
            parts = line.replace('\r', '').replace('\n', '').split(',')

            if cntr == 1:
               print('A value stored in parts is: ')
               print(parts)  
                
            start_gh = gh.encode(float(parts[8]), float(parts[9]), precision=geohash_prec)
            
            if cntr == 1:
               print('\nStarting geohash for this record is: ')
               print(start_gh)
            if start_gh not in valid_geohashes: 
                continue
            
            mat = []
            if start_gh in geo_to_vec:
                mat = geo_to_vec[start_gh]
            mat.append(return_desc2vec(parts[16]))
            geo_to_vec[start_gh] = mat     
            #e.g. value of a parts
            if cntr == 1:
               print('\ne.g. value of a geo_to_vec')
               print(mat)

    start_timestamp = time.time()
print('\n geo_to_vec done for all the cities')

A value stored in parts is: 
['T-21832562', 'Broken-Vehicle', '', '211', 'Entry ramp to I-605 Northbound from CA-22 Eastbound lane blocked due to stalled vehicle.', '2018-08-31 04:39:48', '2018-08-31 05:24:12', 'US/Pacific', '33.777599', '-118.09063', '0.600000023842', 'KSLI', '', 'San Diego Fwy S', 'R', 'Seal Beach', 'Orange', 'CA', '90740.0']

Starting geohash for this record is: 
9qh00

e.g. value of a geo_to_vec
[array([-0.14969  ,  0.16477  , -0.35532  , -0.71915  ,  0.6213   ,
        0.7414   ,  0.68959  ,  0.40371  , -0.24239  ,  0.1774   ,
       -0.95079  , -0.18887  , -0.02344  ,  0.49681  ,  0.081903 ,
       -0.36944  ,  1.2257   , -0.119    ,  0.95571  , -0.19501  ,
       -0.27181  , -0.21512  ,  0.33257  ,  0.058012 ,  0.2024   ,
        0.79726  ,  0.17186  , -0.07327  , -0.094389 , -0.23099  ,
        0.74749  ,  0.12781  ,  0.56017  , -0.48495  ,  0.067763 ,
        0.096598 ,  0.36286  ,  0.41255  ,  0.7284   , -0.24565  ,
        0.15594  , -1.6498   , -0.1249   , 

<em> Creating textual Feature Vector for each Geohash </em>

In [22]:
#writing the geohash encoding to a vector file to be used later in model
writer = open('../data_files/geohash_to_text_vec.csv', 'w')
writer.write('Geohash,vec\n')
#taking a counter to see one of the data
cntr = 0
for g in geo_to_vec:
    vec = list(np.mean(geo_to_vec[g], axis=0))
    v = [str(vec[i]) for i in range(len(vec))]
    v = ' '.join(v)
    writer.write(g + ',' + v + '\n')
    cntr+=1
    #printing one of the example values
    if cntr==1:
       print('An example record: ')
       print(g + ',' + v)
writer.close()

An example record: 
9qh00,0.5625524681238586 -0.319340719489982 0.2162985610200333 -0.7053523861566511 0.5580420673952637 0.7660333788706754 0.47271400728597845 0.6818535336976355 -0.07036903460837896 0.4104145992714074 -0.27467880054644794 -0.7631509380692167 -0.27303145719489885 -0.17258974499089333 -0.028334383424408145 -0.7746253005464495 0.7547550819672131 0.47355236794171296 -0.024760564663023874 -0.0081823469945355 0.08031730327868794 0.0716424772313299 -0.4451289617486297 0.2758214681238631 0.3693409016393473 0.005018397085610171 0.3101684790528218 -0.8414004735883472 -0.032836230418943756 -0.04032399544626577 0.16166880783242146 0.12373462750455361 0.3088657103825146 0.46964182149362205 -0.1815603934426225 -0.09915646812386154 0.4406970947176706 -0.20542094717668394 0.5437354826958102 0.1494737795992712 0.12942615755919962 -0.8046030236794205 -0.7160280510018182 -0.7921545992713948 0.7254082786885213 -0.12325613843351607 -0.32787407103825533 -0.4613772950819672 0.0428170491803

<em> Refining POI Vector for each Geohash (or geographical region) </em>

In [23]:
geohash_map = pd.read_csv("../data_files/geohash_to_poi_vec.csv")
geohash_vec = geohash_map[[ u'Amenity', u'Bump', u'Crossing', u'Give_Way',
       u'Junction', u'Noexit', u'Railway', u'Roundabout', u'Station', u'Stop',
       u'Traffic_Calming', u'Traffic_Signal', u'Turning_Circle',
       u'Turning_Loop']]

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(geohash_vec.loc[:,'Amenity':]) 
scaled_values = scaler.transform(geohash_vec.loc[:,'Amenity':]) 
geohash_vec.loc[:,'Amenity':] = scaled_values

geohash_dict={}
for index, row in geohash_map.iterrows():
    geohash_dict[row.Geohash] = np.array(geohash_vec.iloc[index])
    
f = open("../data_files/geo_vect_dict.pkl","wb")
pickle.dump(geohash_dict,f)
f.close()

geo_dict = dict(zip(geohash_map.Geohash.unique(), range(len(geohash_map.Geohash.unique()))))
f = open("../data_files/geo_dict.pkl","wb")
pickle.dump(geo_dict,f)
f.close()

<em> Refining Description2Vector data for each Geohash (or geographical region) </em>

In [24]:
NLP_map = pd.read_csv("../data_files/geohash_to_text_vec.csv")

cntr = 0
NLP_dict={}
print(geohash_map.head(1))
for idx,row in geohash_map.iterrows():
    NLP_dict[row['Geohash']] = geohash_map[geohash_map.columns[1:]].to_numpy()

print('\nValues stored in the NLP dict:')
print(NLP_dict)

f = open("../data_files/NLP_vect_dict.pkl","wb")
pickle.dump(NLP_dict,f)
f.close()

  Geohash  Amenity  Bump  Crossing  Entrance  Give_Way  Junction  Noexit  \
0   9v6mn       47     0         2         0         0         2       1   

   Railway  Roundabout  Station  Stop  Traffic_Calming  Traffic_Signal  \
0        0           0       11     0                0              10   

   Turning_Circle  Turning_Loop  
0             157             0  

Values stored in the NLP dict:
{'9v6mn': array([[ 47,   0,   2, ...,  10, 157,   0],
       [ 45,   0,   0, ...,  42, 314,   0],
       [  3,   0,   0, ...,   1,  12,   0],
       ...,
       [  4,   0,   0, ...,   6,  41,   0],
       [ 17,   0,  20, ...,  30,   9,   0],
       [ 14,   0,  64, ...,  55,  89,   0]]), '9v6mm': array([[ 47,   0,   2, ...,  10, 157,   0],
       [ 45,   0,   0, ...,  42, 314,   0],
       [  3,   0,   0, ...,   1,  12,   0],
       ...,
       [  4,   0,   0, ...,   6,  41,   0],
       [ 17,   0,  20, ...,  30,   9,   0],
       [ 14,   0,  64, ...,  55,  89,   0]]), '9vg7s': array([[ 47,  

<em> Data Clearning by Adding Geoghash Code </em>

In [25]:
#function to clean data
def clean_data(filepath,storename):
    df = pd.read_csv(filepath)
    display (df.head())
    list_ = df.columns

    temp_df = df [[u'TimeStep', u'T-Accident',u'Geohash', u'HOD', u'DOW', u'DayLight',
       u'T-BrokenVehicle', u'T-Congestion', u'T-Construction', u'T-Event',
       u'T-FlowIncident', u'T-Other', u'T-RoadBlocked', u'W-Humidity',
       u'W-Precipitation', u'W-Pressure', u'W-Temperature', u'W-Visibility',
       u'W-WindSpeed', u'W-Rain', u'W-Snow', u'W-Fog', u'W-Hail']]
    temp_df.to_hdf('../data_files/'+storename+'.h5',key='set1')
    display(temp_df.head())
    
    f = open("../data_files/geo_dict.pkl","rb")
    geo_dict = pickle.load(f)
    f.close()
    
    def fun_hash(geohash):
        return geo_dict[geohash]
    df['geohash_code'] = df.apply(lambda row: fun_hash(row['Geohash']), axis=1) 
    temp_df = df [[u'TimeStep', u'T-Accident',u'Geohash',u'geohash_code', u'HOD', u'DOW', u'DayLight',
       u'T-BrokenVehicle', u'T-Congestion', u'T-Construction', u'T-Event',
       u'T-FlowIncident', u'T-Other', u'T-RoadBlocked', u'W-Humidity',
       u'W-Precipitation', u'W-Pressure', u'W-Temperature', u'W-Visibility',
       u'W-WindSpeed', u'W-Rain', u'W-Snow', u'W-Fog', u'W-Hail']]
    temp_df.to_hdf('../data_files/'+storename+'.h5',key='set2')
    
    df = pd.read_hdf('../data_files/'+storename+'.h5',key='set2')
    display(df.head())
    
    def week_day(DOW):
        if DOW < 5:
            return 1
        else:
            return 0
    def shift(group):
        df_list=[]
        for idx,df in group:
            df['predicted_accident'] = df['T-Accident'].shift(-1)
            df.drop(df.tail(1).index,inplace=True)
            df_list.append(df)
        return pd.concat(df_list)

    def time_interval(HOD):
        if HOD >=6 and HOD <10:
            return 0
        if HOD >= 10 and HOD<15:
            return 1
        if HOD >=15 and HOD< 18:
            return 2;
        if HOD >=18 and HOD< 22:
            return 3
        else:
            return 4; 
    def make_binary(d):
        if d > 0:
            return 1
        else:
            return 0    
    df['DOW_cat'] = df.apply(lambda row: week_day(row['DOW']), axis=1)   #day of week
    df['HOD_cat'] = df.apply(lambda row: time_interval(row['HOD']), axis=1) 
    df['T-Accident'] = df.apply(lambda row: make_binary(row['T-Accident']), axis=1) 
    group = df.groupby('Geohash')
    df = shift(group)
    temp_df = df [[u'TimeStep', u'predicted_accident',u'Geohash',u'geohash_code', u'HOD_cat', u'DOW_cat', u'T-Accident',u'DayLight',
       u'T-BrokenVehicle', u'T-Congestion', u'T-Construction', u'T-Event',
       u'T-FlowIncident', u'T-Other', u'T-RoadBlocked', u'W-Humidity',
       u'W-Precipitation', u'W-Pressure', u'W-Temperature', u'W-Visibility',
       u'W-WindSpeed', u'W-Rain', u'W-Snow', u'W-Fog', u'W-Hail']]
    temp_df.to_hdf('../data_files/'+storename+'.h5',key='set3')

In [26]:
cities = ['Atlanta', 'Austin', 'Charlotte', 'Dallas', 'Houston', 'LosAngeles']

for city in cities:
    clean_data("../data_files/vectors/{}_geo2vec_201861-2018831.csv".format(city), city)

Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,dn5bp,0,4,0,0,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
1,dn5bp,1,4,0,0,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
2,dn5bp,2,4,0,0,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
3,dn5bp,3,4,0,0,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
4,dn5bp,4,4,1,0,0,0,0,0,0,...,79.333333,0.0,30.006667,73.033333,10.0,11.166667,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dn5bp,0,4,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
1,1,0,dn5bp,0,4,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
2,2,0,dn5bp,0,4,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
3,3,0,dn5bp,0,4,0,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
4,4,0,dn5bp,1,4,0,0,0,0,0,...,79.333333,0.0,30.006667,73.033333,10.0,11.166667,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dn5bp,361,0,4,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
1,1,0,dn5bp,361,0,4,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
2,2,0,dn5bp,361,0,4,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
3,3,0,dn5bp,361,0,4,0,0,0,0,...,85.333333,0.0,29.976667,73.633333,10.0,3.5,0,0,0,0
4,4,0,dn5bp,361,1,4,0,0,0,0,...,79.333333,0.0,30.006667,73.033333,10.0,11.166667,0,0,0,0


Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9v6s1,0,4,0,0,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
1,9v6s1,1,4,0,0,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
2,9v6s1,2,4,0,0,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
3,9v6s1,3,4,0,0,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
4,9v6s1,4,4,1,0,0,0,0,0,0,...,87.0,0.0,29.87,75.0,10.0,4.6,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9v6s1,0,4,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
1,1,0,9v6s1,0,4,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
2,2,0,9v6s1,0,4,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
3,3,0,9v6s1,0,4,0,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
4,4,0,9v6s1,1,4,0,0,0,0,0,...,87.0,0.0,29.87,75.0,10.0,4.6,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9v6s1,921,0,4,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
1,1,0,9v6s1,921,0,4,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
2,2,0,9v6s1,921,0,4,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
3,3,0,9v6s1,921,0,4,0,0,0,0,...,87.0,0.0,29.84,75.0,10.0,4.6,0,0,0,0
4,4,0,9v6s1,921,1,4,0,0,0,0,...,87.0,0.0,29.87,75.0,10.0,4.6,0,0,0,0


Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,dnq83,0,4,0,0,0,0,0,0,0,...,98.0,0.0,29.96,68.55,9.5,3.5,0,0,0,0
1,dnq83,1,4,0,0,0,0,0,0,0,...,95.0,0.0,29.955,69.45,9.5,3.5,0,0,0,0
2,dnq83,2,4,0,0,0,0,0,0,0,...,95.0,0.0,29.955,69.45,9.5,3.5,0,0,0,0
3,dnq83,3,4,0,0,0,0,0,0,0,...,95.0,0.0,29.95,69.45,9.5,3.5,0,0,0,0
4,dnq83,4,4,1,0,0,0,0,0,0,...,95.0,0.0,29.935,69.45,9.5,3.5,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dnq83,0,4,0,0,0,0,0,...,98.0,0.0,29.96,68.55,9.5,3.5,0,0,0,0
1,1,0,dnq83,0,4,0,0,0,0,0,...,95.0,0.0,29.955,69.45,9.5,3.5,0,0,0,0
2,2,0,dnq83,0,4,0,0,0,0,0,...,95.0,0.0,29.955,69.45,9.5,3.5,0,0,0,0
3,3,0,dnq83,0,4,0,0,0,0,0,...,95.0,0.0,29.95,69.45,9.5,3.5,0,0,0,0
4,4,0,dnq83,1,4,0,0,0,0,0,...,95.0,0.0,29.935,69.45,9.5,3.5,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,dnq83,889,0,4,0,0,0,0,...,98.0,0.0,29.96,68.55,9.5,3.5,0,0,0,0
1,1,0,dnq83,889,0,4,0,0,0,0,...,95.0,0.0,29.955,69.45,9.5,3.5,0,0,0,0
2,2,0,dnq83,889,0,4,0,0,0,0,...,95.0,0.0,29.955,69.45,9.5,3.5,0,0,0,0
3,3,0,dnq83,889,0,4,0,0,0,0,...,95.0,0.0,29.95,69.45,9.5,3.5,0,0,0,0
4,4,0,dnq83,889,1,4,0,0,0,0,...,95.0,0.0,29.935,69.45,9.5,3.5,0,0,0,0


Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9vg4w,0,4,0,0,0,0,0,0,0,...,65.0,0.0,29.815,83.0,10.0,14.4,0,0,0,0
1,9vg4w,1,4,0,0,0,0,0,0,0,...,65.0,0.0,29.815,83.0,10.0,14.4,0,0,0,0
2,9vg4w,2,4,0,0,0,0,0,0,0,...,66.0,0.0,29.815,82.75,10.0,14.4,0,0,0,0
3,9vg4w,3,4,0,0,0,0,0,0,0,...,66.5,0.0,29.815,82.5,10.0,14.4,0,0,0,0
4,9vg4w,4,4,1,0,0,0,0,0,0,...,65.5,0.0,29.815,82.2,10.0,13.25,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vg4w,0,4,0,0,0,0,0,...,65.0,0.0,29.815,83.0,10.0,14.4,0,0,0,0
1,1,0,9vg4w,0,4,0,0,0,0,0,...,65.0,0.0,29.815,83.0,10.0,14.4,0,0,0,0
2,2,0,9vg4w,0,4,0,0,0,0,0,...,66.0,0.0,29.815,82.75,10.0,14.4,0,0,0,0
3,3,0,9vg4w,0,4,0,0,0,0,0,...,66.5,0.0,29.815,82.5,10.0,14.4,0,0,0,0
4,4,0,9vg4w,1,4,0,0,0,0,0,...,65.5,0.0,29.815,82.2,10.0,13.25,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vg4w,229,0,4,0,0,0,0,...,65.0,0.0,29.815,83.0,10.0,14.4,0,0,0,0
1,1,0,9vg4w,229,0,4,0,0,0,0,...,65.0,0.0,29.815,83.0,10.0,14.4,0,0,0,0
2,2,0,9vg4w,229,0,4,0,0,0,0,...,66.0,0.0,29.815,82.75,10.0,14.4,0,0,0,0
3,3,0,9vg4w,229,0,4,0,0,0,0,...,66.5,0.0,29.815,82.5,10.0,14.4,0,0,0,0
4,4,0,9vg4w,229,1,4,0,0,0,0,...,65.5,0.0,29.815,82.2,10.0,13.25,0,0,0,0


Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9vk0z,0,4,0,0,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
1,9vk0z,1,4,0,0,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
2,9vk0z,2,4,0,0,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
3,9vk0z,3,4,0,0,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
4,9vk0z,4,4,1,0,0,0,0,0,0,...,87.0,0.0,29.9,81.15,7.5,5.8,0,0,1,0


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vk0z,0,4,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
1,1,0,9vk0z,0,4,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
2,2,0,9vk0z,0,4,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
3,3,0,9vk0z,0,4,0,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
4,4,0,9vk0z,1,4,0,0,0,0,0,...,87.0,0.0,29.9,81.15,7.5,5.8,0,0,1,0


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9vk0z,257,0,4,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
1,1,0,9vk0z,257,0,4,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
2,2,0,9vk0z,257,0,4,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
3,3,0,9vk0z,257,0,4,0,0,0,0,...,87.0,0.0,29.905,81.15,7.5,6.95,0,0,1,0
4,4,0,9vk0z,257,1,4,0,0,0,0,...,87.0,0.0,29.9,81.15,7.5,5.8,0,0,1,0


Unnamed: 0,Geohash,TimeStep,DOW,HOD,DayLight,T-Accident,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,9qh00,0,4,0,0,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
1,9qh00,1,4,0,0,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
2,9qh00,2,4,0,0,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
3,9qh00,3,4,0,0,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
4,9qh00,4,4,1,0,0,0,0,0,0,...,80.5,0.0,29.94,58.9,10.0,0.0,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,T-Event,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9qh00,0,4,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
1,1,0,9qh00,0,4,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
2,2,0,9qh00,0,4,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
3,3,0,9qh00,0,4,0,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
4,4,0,9qh00,1,4,0,0,0,0,0,...,80.5,0.0,29.94,58.9,10.0,0.0,0,0,0,0


Unnamed: 0,TimeStep,T-Accident,Geohash,geohash_code,HOD,DOW,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
0,0,0,9qh00,906,0,4,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
1,1,0,9qh00,906,0,4,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
2,2,0,9qh00,906,0,4,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
3,3,0,9qh00,906,0,4,0,0,0,0,...,80.0,0.0,29.94,58.85,10.0,0.0,0,0,0,0
4,4,0,9qh00,906,1,4,0,0,0,0,...,80.5,0.0,29.94,58.9,10.0,0.0,0,0,0,0


<em> Creating sample entries for different cities and performing negative sampling </em>

In [27]:
#Loading some important files saved earlier
f = open("../data_files/geo_vect_dict.pkl","rb")
geohash_dict = pickle.load(f)
f.close()

f = open("../data_files/geo_dict.pkl","rb")
geo_dict = pickle.load(f)
f.close()

f = open("../data_files/NLP_vect_dict.pkl","rb")
NLP_dict = pickle.load(f)
f.close()

In [28]:
#to run the jobs in parallel
cores = cpu_count() #Number of CPU cores on system
partitions = cores

class WithExtraArgs(object):
     def __init__(self, func, **args):
         self.func = func
         self.args = args
     def __call__(self, df):
         return self.func(df, **self.args)

def applyParallel(data, func,pool,partition, kwargs):
     data_split = [data[i:i + partition] for i in range(0, len(data), partition)]
     data =pool.map(WithExtraArgs(func, **kwargs), data_split)
     return data

In [29]:
#function to convert the data to onehot vector for each of the records
def onhot_enoceder(train):
     myEncoder = OneHotEncoder(sparse=False)
     myEncoder.fit(train['HOD_cat'].values.reshape(-1, 1))

     onehot_encode = pd.concat([train.reset_index().drop('HOD_cat',1),
                 pd.DataFrame(myEncoder.transform(train['HOD_cat'].values.reshape(-1, 1)),
                              columns=['HOD_en0','HOD_en1','HOD_en2','HOD_en3','HOD_en4'])], axis=1).reindex()
     return onehot_encode.drop('index',1)

In [30]:
#function to create training set
def create_train_set_aug_geo(frame_list,geomap):
    process_name = str(multiprocessing.current_process())
    id = int(process_name.split(',')[0].split('-')[1])
    
    X_train = []
    y_train = []
    for frame in frame_list:
        training_set = frame.values
        #make sure there is unique geohash per frame
        #print frame.Geohash.iloc[0]
        geo_vec = geomap[frame.Geohash.iloc[0]]
        geo_code = geo_dict[frame.Geohash.iloc[0]]
        try:
            NLP_code = NLP_dict[frame.Geohash.iloc[0]]
        except:
            NLP_code = np.zeros(100)
            
        for i in range(8, training_set.shape[0]):
            if training_set[i, 1] > 0 :
                a = np.concatenate((training_set[i-8:i,4:].flatten(),geo_vec),axis=0)
                a = np.concatenate((a,NLP_code),axis=0)
                a = np.append(a, geo_code)
                X_train.append(a)
                y_train.append(1)  #training_set[i, 1])
                
            elif random.uniform(0, 1) > 0.98:  # negative sampling for non-accident cases 
                a = np.concatenate((training_set[i-8:i,4:].flatten(),geo_vec),axis=0)
                a = np.concatenate((a,NLP_code),axis=0)
                a = np.append(a, geo_code)
                X_train.append(a)
                y_train.append(0)  #training_set[i, 1])
    return X_train, y_train

In [31]:
#function to create sequences
def create_sequences(df,geohash_dict):
    frame_list=[]
    for idx, frame in df.groupby(df.Geohash):
        frame_list.append(frame)
    
    pool = Pool(cores)
    partition = int(np.ceil(float(len(frame_list))/partitions))    
    train_set = applyParallel (frame_list,create_train_set_aug_geo,pool,partition,{'geomap':geohash_dict.copy()})
    pool.close()
    pool.join()
    X_train = []
    y_train = []
    for set_ in train_set:
        X_train.extend(set_[0])
        y_train.extend(set_[1])

    X_train, y_train = np.array(X_train), np.array(y_train)   
    return X_train,y_train

In [32]:
#function to create and save train test data to be used by various models
def train_data(filename):
    df = pd.read_hdf('../data_files/'+filename+'.h5',key='set3') # the .h5 file contains raw traffic, weather, time, and POI data 
    display(df.head())
    df_normalize = df.copy()
    train = df_normalize[df_normalize.TimeStep <= df_normalize.TimeStep.max()*5/6] #5/6 of data taken for training
    test = df_normalize[df_normalize.TimeStep > df_normalize.TimeStep.max()*5/6] # 

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(train.loc[:,'T-BrokenVehicle':]) 
    scaled_values = scaler.transform(train.loc[:,'T-BrokenVehicle':]) 
    train.loc[:,'T-BrokenVehicle':] = scaled_values
    scaled_values = scaler.transform(test.loc[:,'T-BrokenVehicle':]) 
    test.loc[:,'T-BrokenVehicle':] = scaled_values
    display(test.head())
    
    train = onhot_enoceder(train)
    test = onhot_enoceder(test)
    
    display(test.head())
    
    X_train, y_train = create_sequences(train,geohash_dict)
    X_test, y_test = create_sequences(test,geohash_dict)

    #creating several files per city to in folder train_set inside data_files
    np.save('../data_files/train_test_set_jan21/X_train_'+filename,X_train) 
    np.save('../data_files/train_test_set_jan21/y_train_'+filename,y_train)
    np.save('../data_files/train_test_set_jan21/X_test_'+filename,X_test)
    np.save('../data_files/train_test_set_jan21/y_test_'+filename,y_test)

In [33]:
cities = ['Atlanta', 'Austin', 'Charlotte', 'Dallas', 'Houston', 'LosAngeles']
for city in cities:
    train_data(city) # creating training and test data for each city
print('Train and test files created for {} cities successfully'.format(str(len(cities))))

Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
529860,0,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
529861,1,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
529862,2,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
529863,3,0.0,djgz7,98,4,1,0,0,0,0,...,84.5,0.0,29.98,74.45,10.0,3.5,0,0,0,0
529864,4,0.0,djgz7,98,4,1,0,0,0,0,...,81.5,0.0,30.005,74.0,10.0,10.4,0,0,0,0


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,HOD_cat,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,...,W-Humidity,W-Precipitation,W-Pressure,W-Temperature,W-Visibility,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail
537218,7358,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.49,0.0,0.993564,0.960837,1.0,0.173653,0.0,0.0,0.0,0.0
537219,7359,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.49,0.0,0.993564,0.960837,1.0,0.173653,0.0,0.0,0.0,0.0
537220,7360,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.52,0.0,0.993234,0.956009,1.0,0.224551,0.0,0.0,0.0,0.0
537221,7361,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.52,0.0,0.993234,0.956009,1.0,0.224551,0.0,0.0,0.0,0.0
537222,7362,0.0,djgz7,98,2,1,0,1,0.0,0.0,...,0.52,0.0,0.993234,0.956009,1.0,0.224551,0.0,0.0,0.0,0.0


Unnamed: 0,TimeStep,predicted_accident,Geohash,geohash_code,DOW_cat,T-Accident,DayLight,T-BrokenVehicle,T-Congestion,T-Construction,...,W-WindSpeed,W-Rain,W-Snow,W-Fog,W-Hail,HOD_en0,HOD_en1,HOD_en2,HOD_en3,HOD_en4
0,7358,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.173653,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,7359,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.173653,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,7360,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.224551,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,7361,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.224551,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,7362,0.0,djgz7,98,1,0,1,0.0,0.0,0.0,...,0.224551,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)