In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Flow: load the traffic-flow readings, keeping only the columns used below.
filepath = 'data/Traffic_Flow_Braga_With_Cordinates_20190228.csv'

cols = [
    'road_num', 'road_name', 'functional_road_class_desc',
    'current_speed', 'free_flow_speed', 'speed_diff',
    'current_travel_time', 'free_flow_travel_time', 'time_diff',
    'creation_date',
    'began_lat', 'end_lat', 'began_long', 'end_long',
]

# Explicit dtypes, grouped by type. 'creation_date' is not listed here because
# it is parsed as a datetime by read_csv (parse_dates) instead.
dtypes = {
    **dict.fromkeys(['road_num'], 'uint8'),
    **dict.fromkeys(['road_name', 'functional_road_class_desc'], 'category'),
    **dict.fromkeys(['current_speed', 'free_flow_speed', 'speed_diff'], 'uint8'),
    'current_travel_time': 'int64',
    'free_flow_travel_time': 'uint16',
    'time_diff': 'int64',
    **dict.fromkeys(['began_lat', 'end_lat', 'began_long', 'end_long'], 'float32'),
}

flow = pd.read_csv(
    filepath,
    usecols=cols,
    dtype=dtypes,
    parse_dates=['creation_date'],
)

In [3]:
# Incidents: load the incident records, keeping only the columns used below.
# NOTE(review): the original note says speedways away from the city center and
# the main flow roads are being removed, but nothing in this cell filters rows;
# presumably the "Only_Route" file is already filtered - confirm.
filepath = 'data/Traffic_Incidents_Braga_Only_Route.csv'

cols = [
    'description', 'cause_of_incident', 'from_road', 'to_road', 'affected_roads',
    'incident_category_desc', 'magnitude_of_delay_desc',
    'length_in_meters', 'delay_in_seconds',
    'incident_date', 'latitude', 'longitude',
]

# Explicit dtypes, grouped by type. 'incident_date' is not listed here because
# it is parsed as a datetime by read_csv (parse_dates) instead.
dtypes = {
    **dict.fromkeys(
        ['description', 'cause_of_incident', 'from_road', 'to_road',
         'affected_roads', 'incident_category_desc', 'magnitude_of_delay_desc'],
        'category',
    ),
    **dict.fromkeys(['length_in_meters', 'delay_in_seconds'], 'uint16'),
    **dict.fromkeys(['latitude', 'longitude'], 'float32'),
}

incidents = pd.read_csv(
    filepath,
    usecols=cols,
    dtype=dtypes,
    parse_dates=['incident_date'],
)

In [4]:
# Data cleaning
# Order flow readings by timestamp (ties broken by road number) and incidents by
# timestamp, then replace each index with a fresh 0..n-1 range.
flow = flow.sort_values(by=['creation_date', 'road_num']).reset_index(drop=True)
incidents = incidents.sort_values(by=['incident_date']).reset_index(drop=True)

In [5]:
# Feature engineering: derive string keys from the timestamps.
# 'date' is day resolution and 'datetime' is hour resolution.
flow = flow.assign(
    date=flow['creation_date'].dt.strftime('%Y-%m-%d'),
    datetime=flow['creation_date'].dt.strftime('%Y-%m-%d %H'),
)

# Both values are computed from the parsed timestamps before the assignment
# happens, so the hour key is not affected by 'incident_date' being replaced
# with its minute-resolution string form.
incidents = incidents.assign(
    datetime=incidents['incident_date'].dt.strftime('%Y-%m-%d %H'),
    incident_date=incidents['incident_date'].dt.strftime('%Y-%m-%d %H:%M'),
)

In [6]:
# Preview the first rows of `incidents` after the derived columns were added.
incidents.head()

Unnamed: 0,description,cause_of_incident,from_road,to_road,affected_roads,incident_category_desc,magnitude_of_delay_desc,length_in_meters,delay_in_seconds,incident_date,latitude,longitude,datetime
0,stationary traffic,,Rua de São Gonçalo,Largo de São Francisco (N101) / Praça da Repúb...,,Jam,Major,200,197,2018-07-24 14:58,41.551788,-8.42116,2018-07-24 14
1,stationary traffic,,Rua 5 De Julho / Rua do Outeiro,Avenida São Lourenço (N14),,Jam,Moderate,350,168,2018-07-24 16:47,41.514069,-8.45438,2018-07-24 16
2,stationary traffic,,Rua do Conselheiro Bento Miguel,Rua Feliciano Ramos,N101,Jam,Moderate,300,141,2018-07-24 16:47,41.557812,-8.41734,2018-07-24 16
3,queuing traffic,,Avenida João Paulo II (N103),Rua Cónego Luciano Afonso dos Santos (N101) / ...,,Jam,Moderate,590,127,2018-07-24 16:47,41.555019,-8.40697,2018-07-24 16
4,stationary traffic,,Avenida de Sequeira / Rua de São Pedro,Avenida de Sequeira / Rua de São Pedro,N103,Jam,Major,630,228,2018-07-24 16:47,41.537361,-8.45229,2018-07-24 16


In [7]:
# Preview the first rows of `flow` after the derived columns were added.
flow.head()

Unnamed: 0,road_num,road_name,functional_road_class_desc,current_speed,free_flow_speed,speed_diff,current_travel_time,free_flow_travel_time,time_diff,creation_date,began_lat,end_lat,began_long,end_long,date,datetime
0,1,Avenida da Liberdade,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:54,41.550884,41.542164,-8.422812,-8.417782,2018-07-24,2018-07-24 14
1,1,Avenida da Liberdade,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:55,41.550884,41.542164,-8.422812,-8.417782,2018-07-24,2018-07-24 14
2,2,Avenida Central,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:55,41.559689,41.553974,-8.44671,-8.438541,2018-07-24,2018-07-24 14
3,3,Rua de Caires,Local Connecting Road,25,37,12,54,36,18,2018-07-24 14:58:55,41.544514,41.549107,-8.433262,-8.433573,2018-07-24,2018-07-24 14
4,4,N14 Bosch,Other Major Road,32,47,15,70,48,22,2018-07-24 14:58:55,41.539219,41.526821,-8.433685,-8.44785,2018-07-24,2018-07-24 14


In [8]:
# Weather: load the weather records, keeping only the columns used below.
filepath = 'data/Weather_Braga_Until_20190228.csv'

# Columns read from the CSV; the last three are parsed as datetimes.
cols = ['weather_description', 'temperature', 'atmospheric_pressure',
        'humidity', 'wind_speed', 'cloudiness', 'current_luminosity',
        'sunrise', 'sunset', 'creation_date']

# NOTE(review): this dtype map is built but NOT passed to read_csv below, so
# pandas infers the column types instead. Unclear whether that is deliberate
# (e.g. the unsigned integer types would reject missing or negative values) or
# an oversight - confirm before wiring it in with dtype=dtypes.
dtypes = {'weather_description': 'category',
 'temperature': 'uint8',
 'atmospheric_pressure': 'uint16',
 'humidity': 'uint8',
 'wind_speed': 'uint8',
 'cloudiness': 'uint8',
 'current_luminosity': 'category'}

weather = pd.read_csv(filepath, usecols=cols, parse_dates=['sunrise', 'sunset', 'creation_date'])

In [9]:
# Canonical weather label -> spelling/wording variants that are folded into it.
canonical_labels = {
    'algumas nuvens': ['nuvens quebrados', 'nuvens quebradas',
                       'nuvens dispersas', 'céu pouco nublado'],
    'névoa': ['neblina'],
    'chuva fraca': ['chuva leve', 'garoa fraca'],
    'chuva moderada': ['chuva'],
    'chuva intensa': ['chuva de intensidade pesada', 'chuva de intensidade pesado'],
    'trovoada': ['trovoada com chuva leve', 'trovoada com chuva'],
}

# Flatten into the variant -> canonical mapping expected by Series.replace.
# Only whole values are matched, so 'chuva' does not touch the longer labels.
m = {
    variant: label
    for label, variants in canonical_labels.items()
    for variant in variants
}
weather['weather_description'] = weather['weather_description'].replace(m)

In [10]:
# Feature engineering: hour-resolution string key ('YYYY-MM-DD HH'), in the same
# format as the 'datetime' column built on `flow` and `incidents`, so the tables
# can be joined on it.
weather['datetime'] = weather['creation_date'].dt.strftime('%Y-%m-%d %H')

In [11]:
# Holiday lookup table, joined to `flow` on the day-resolution 'date' key.
holidays = pd.read_csv('data/holidays.csv')

# Left joins keep every flow/incident row even when no holiday or weather
# record matches. `flow` and `weather` both carry a 'creation_date' column, so
# the merged frame gets 'creation_date_x' (flow) and 'creation_date_y' (weather).
# NOTE(review): if `weather` holds more than one record for the same hour key,
# these joins will repeat the matching flow/incident rows - verify uniqueness.
flow = (
    flow
    .merge(holidays, how='left', on='date')
    .merge(weather, how='left', on='datetime')
)
incidents = incidents.merge(weather, how='left', on='datetime')

In [12]:
# Rows with no holiday match have a missing 'holiday_name' after the left join;
# mark them with the sentinel 0.
flow['holiday_name'] = flow['holiday_name'].fillna(0)

# 1 where a holiday name is present, 0 where the sentinel is. Vectorized
# comparison instead of a Python-level loop over every row; the resulting
# values are the same.
flow['holiday_flag'] = flow['holiday_name'].ne(0).astype('int64')

# Convert the hour-resolution string key back into real timestamps.
# NOTE(review): 'datetime' is dropped in the following cell, so this conversion
# appears to have no lasting effect - confirm before removing it.
flow['datetime'] = pd.to_datetime(flow['datetime'])

In [13]:
# Remove the join keys, the weather-side timestamp and the raw holiday name,
# then give the flow-side timestamp its original column name back.
flow = (
    flow
    .drop(columns=['date', 'datetime', 'creation_date_y', 'holiday_name'])
    .rename(columns={'creation_date_x': 'creation_date'})
)

In [14]:
# Preview the first rows of the merged `flow` table.
flow.head()

Unnamed: 0,road_num,road_name,functional_road_class_desc,current_speed,free_flow_speed,speed_diff,current_travel_time,free_flow_travel_time,time_diff,creation_date,...,weather_description,temperature,atmospheric_pressure,humidity,wind_speed,cloudiness,current_luminosity,sunrise,sunset,holiday_flag
0,1,Avenida da Liberdade,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:54,...,algumas nuvens,21.0,1016.0,77.0,5.0,20.0,LIGHT,2018-07-24 05:21:06,2018-07-24 19:58:50,0
1,1,Avenida da Liberdade,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:55,...,algumas nuvens,21.0,1016.0,77.0,5.0,20.0,LIGHT,2018-07-24 05:21:06,2018-07-24 19:58:50,0
2,2,Avenida Central,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:55,...,algumas nuvens,21.0,1016.0,77.0,5.0,20.0,LIGHT,2018-07-24 05:21:06,2018-07-24 19:58:50,0
3,3,Rua de Caires,Local Connecting Road,25,37,12,54,36,18,2018-07-24 14:58:55,...,algumas nuvens,21.0,1016.0,77.0,5.0,20.0,LIGHT,2018-07-24 05:21:06,2018-07-24 19:58:50,0
4,4,N14 Bosch,Other Major Road,32,47,15,70,48,22,2018-07-24 14:58:55,...,algumas nuvens,21.0,1016.0,77.0,5.0,20.0,LIGHT,2018-07-24 05:21:06,2018-07-24 19:58:50,0


In [15]:
# Write the merged flow table to disk without the row index.
flow.to_csv('data/flow.csv', index=False)

In [16]:
# Write the incidents table (with the joined weather columns) to disk without
# the row index.
incidents.to_csv('data/incidents.csv', index=False)

In [17]:
# Write the weather table (normalised descriptions plus the hour key) to disk
# without the row index.
weather.to_csv('data/weather.csv', index=False)

In [18]:
def cache_dtypes(df, ignored=None):
    """Return a ``{column label: dtype name}`` mapping for a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are collected.
    ignored : single label or list-like of labels, optional
        Columns to leave out of the result. ``None`` (the default) keeps every
        column. The value is handed to ``DataFrame.drop`` unchanged, so a
        single string still names exactly one column.

    Returns
    -------
    dict
        Column labels mapped to dtype names (e.g. ``'uint8'``, ``'category'``),
        in column order.

    Raises
    ------
    KeyError
        Raised by ``DataFrame.drop`` when a label in ``ignored`` is not a
        column of ``df``.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    labels_to_skip = [] if ignored is None else ignored
    kept = df.drop(columns=labels_to_skip)

    return {column: dtype.name for column, dtype in kept.dtypes.items()}


def save_dtypes(dtypes, path):
    """Pickle ``dtypes`` to ``path`` and print a confirmation line.

    Parameters
    ----------
    dtypes : object
        Object to serialise.
    path : str or path-like
        Destination file; it is opened in binary write mode, so an existing
        file at that location is overwritten.

    Returns
    -------
    None
    """
    with open(path, mode='wb') as out_file:
        # Use the newest pickle protocol the running interpreter supports.
        pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL).dump(dtypes)
    print(path, 'created!')