In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium

from geopy.geocoders import Nominatim

%matplotlib inline

# Default (width, height) applied to every figure drawn through seaborn/matplotlib.
figure_size = (12, 8)
sns.set(rc={'figure.figsize': figure_size})

# NOTE(review): presumably the [latitude, longitude] used to centre folium maps;
# it is not referenced by any of the cells visible here - confirm.
origin = [41.5511, -8.428]

# Flow

# Compact storage dtype for every non-date column that is loaded.
dtypes = {
    'road_num': 'uint8',
    'road_name': 'category',
    'functional_road_class_desc': 'category',
    'current_speed': 'uint8',
    'free_flow_speed': 'uint8',
    'speed_diff': 'uint8',
    'current_travel_time': 'int64',
    'free_flow_travel_time': 'uint16',
    'time_diff': 'int64',
}

# Columns to read: every typed column plus the timestamp, which is parsed
# as a date rather than given a dtype.
cols = [*dtypes, 'creation_date']

filepath = 'data/Traffic_Flow_Braga_Until_20190228.csv'

flow = pd.read_csv(
    filepath,
    usecols=cols,
    dtype=dtypes,
    parse_dates=['creation_date'],
)

# Incidents

filepath = 'data/Traffic_Incidents_Braga_Until_20190228.csv'

cols = ['description', 'cause_of_incident', 'from_road', 'to_road',
        'affected_roads', 'incident_category_desc', 'magnitude_of_delay_desc',
        'length_in_meters', 'delay_in_seconds', 'incident_date']

# Storage dtype for every non-date column. 'incident_date' is deliberately
# absent: it is converted by parse_dates below, and requesting 'category' for
# the same column contradicts that (the column is later read through the
# `.dt` accessor, which needs real datetimes).
dtypes = {'description': 'category',
          'cause_of_incident': 'category',
          'from_road': 'category',
          'to_road': 'category',
          'affected_roads': 'category',
          'incident_category_desc': 'category',
          'magnitude_of_delay_desc': 'category',
          'length_in_meters': 'uint16',
          'delay_in_seconds': 'uint16'}

incidents = pd.read_csv(filepath, usecols=cols, dtype=dtypes, parse_dates=['incident_date'])

In [2]:
def typecast_objects(gl_obj):
    """Normalise text columns and store low-cardinality ones as ``category``.

    Every column is whitespace-stripped and lower-cased through the ``.str``
    accessor. A column whose number of distinct values (a missing value counts
    as one distinct value) is below 50 % of its length is converted to
    ``category``; the remaining columns keep the cleaned strings.

    Parameters
    ----------
    gl_obj : pandas.DataFrame
        Frame whose columns all support the ``.str`` accessor. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and column order as ``gl_obj``.
    """
    # Work on a copy so the caller's frame is untouched and the original index
    # is carried over. Filling an initially empty DataFrame column by column
    # does not reliably keep the index, which breaks index-aligned assignment
    # of the result back into the source frame.
    converted_obj = gl_obj.copy()

    for col in converted_obj.columns:
        cleaned = converted_obj[col].str.strip().str.lower()
        num_total_values = len(cleaned)
        # The length check guards zero-row input, where the ratio is undefined.
        if num_total_values and len(cleaned.unique()) / num_total_values < 0.5:
            cleaned = cleaned.astype('category')
        converted_obj[col] = cleaned

    return converted_obj


def downcast(df):
    """Shrink the memory footprint of ``df`` and return it.

    Columns selected by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='unsigned')``; columns selected by
    ``select_dtypes(include=['object'])`` are handed to ``typecast_objects``.
    The converted columns are written back into ``df`` itself, so the frame
    passed in is modified and the same object is returned.
    """
    shrunk_ints = df.select_dtypes(include=['int']).apply(pd.to_numeric, downcast='unsigned')
    tidy_text = typecast_objects(df.select_dtypes(include=['object']))

    # Integers first, then text columns - each assignment replaces the
    # columns of the same name in the caller's frame.
    for converted in (shrunk_ints, tidy_text):
        df[converted.columns] = converted

    return df


def get_dtypes(df):
    """Return a ``{column name: dtype name}`` dictionary describing ``df``.

    The dtype is reported by its ``name`` attribute (for example ``'uint8'``
    or ``'category'``).
    """
    return {column: dtype.name for column, dtype in df.dtypes.items()}

In [3]:
# Data cleaning
# Sort both tables by their timestamp (flow additionally by road number
# within a timestamp) and renumber the rows from 0.
flow = (
    flow
    .sort_values(by=['creation_date', 'road_num'])
    .reset_index(drop=True)
)

incidents = incidents.sort_values(by=['incident_date']).reset_index(drop=True)

In [None]:
# Feature engineering
# Day- and hour-resolution string keys derived from the flow timestamp.
flow_timestamp = flow['creation_date'].dt
flow['date'] = flow_timestamp.strftime('%Y-%m-%d')
flow['datetime'] = flow_timestamp.strftime('%Y-%m-%d %H')

# Both incident columns are derived from the original datetime values.
# NOTE: 'incident_date' is overwritten with formatted strings, so this cell
# cannot be re-run without reloading `incidents` (the `.dt` accessor needs
# datetime values).
incident_timestamp = incidents['incident_date'].dt
incidents['month'] = incident_timestamp.strftime('%Y-%m')
incidents['incident_date'] = incident_timestamp.strftime('%Y-%m-%d %H:%M')

In [None]:
# Preview the first rows of `flow`, including the derived `date` and `datetime` columns.
flow.head()

Unnamed: 0,road_num,road_name,functional_road_class_desc,current_speed,free_flow_speed,speed_diff,current_travel_time,free_flow_travel_time,time_diff,creation_date,date,datetime
0,1,Avenida da Liberdade,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:54,2018-07-24,2018-07-24 14
1,1,Avenida da Liberdade,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:55,2018-07-24,2018-07-24 14
2,2,Avenida Central,Secondary Road,25,25,0,124,124,0,2018-07-24 14:58:55,2018-07-24,2018-07-24 14
3,3,Rua de Caires,Local Connecting Road,25,37,12,54,36,18,2018-07-24 14:58:55,2018-07-24,2018-07-24 14
4,4,N14 Bosch,Other Major Road,32,47,15,70,48,22,2018-07-24 14:58:55,2018-07-24,2018-07-24 14


In [None]:
# Preview the first rows of `incidents`; `incident_date` now holds formatted strings and `month` is the derived column.
incidents.head()

Unnamed: 0,description,cause_of_incident,from_road,to_road,affected_roads,incident_category_desc,magnitude_of_delay_desc,length_in_meters,delay_in_seconds,incident_date,month
0,stationary traffic,,Rua de São Gonçalo,Largo de São Francisco (N101) / Praça da Repúb...,,Jam,Major,200,197,2018-07-24 14:58,2018-07
1,queuing traffic,,A3 exit [4],A11 exit [5],A11/IC14,Jam,Major,240,55,2018-07-24 14:58,2018-07
2,queuing traffic,,Avenida São Lourenço (N14),N14 exit [5A],A11,Jam,Moderate,850,96,2018-07-24 16:01,2018-07
3,stationary traffic,,Rua 5 De Julho / Rua do Outeiro,Avenida São Lourenço (N14),,Jam,Moderate,350,168,2018-07-24 16:47,2018-07
4,queuing traffic,,Rua Damiana Maria da Silva,Autoestrada Esposende-Guimarães,N14,Jam,Moderate,620,118,2018-07-24 16:47,2018-07


In [None]:
# Histogram of the delay magnitude, computed separately for each month.
# NOTE(review): the per-month histograms appear to be drawn without a legend
# or title - confirm the resulting figure is readable.
delay_by_month = incidents.groupby('month')['magnitude_of_delay_desc']
delay_by_month.hist()

In [None]:
# Weather
filepath = 'data/Weather_Braga_Until_20190228.csv'

# NOTE(review): this mapping is NOT passed to read_csv below, so the column
# dtypes are inferred by pandas; it currently only supplies the column list.
# Confirm whether that is intentional.
dtypes = {
    'weather_description': 'category',
    'temperature': 'uint8',
    'atmospheric_pressure': 'uint16',
    'humidity': 'uint8',
    'wind_speed': 'uint8',
    'cloudiness': 'uint8',
    'rain': 'uint8',
    'current_luminosity': 'category',
    'sunrise': 'datetime64[ns]',
    'sunset': 'datetime64[ns]',
    'creation_date': 'datetime64[ns]',
}

# Columns to read - exactly the keys listed above, in the same order.
cols = list(dtypes)

weather = pd.read_csv(
    filepath,
    usecols=cols,
    parse_dates=['sunrise', 'sunset', 'creation_date'],
)

In [None]:
# Feature engineering
# Hour-resolution string key built from the weather timestamp.
hour_key_format = '%Y-%m-%d %H'
weather['datetime'] = weather['creation_date'].dt.strftime(hour_key_format)

In [None]:
holidays = pd.read_csv('data/holidays.csv')

# Left joins: holiday data is attached by day ('date'), weather data by hour
# ('datetime'); flow rows without a match keep missing values.
# NOTE(review): if `weather` holds more than one record for the same hour,
# the second join repeats the matching flow rows - confirm the key is unique.
flow = (
    flow
    .merge(holidays, how='left', on='date')
    .merge(weather, how='left', on='datetime')
)

In [None]:
# Rows with no holiday match carry a missing 'holiday_name'; mark them with 0.
flow['holiday_name'] = flow['holiday_name'].fillna(0)

# 1 when a holiday name is present, 0 otherwise. The comparison is vectorised
# instead of looping over every row in Python; int64 matches the dtype the
# previous list of Python ints produced.
flow['holiday_flag'] = flow['holiday_name'].ne(0).astype('int64')

# 'datetime' holds strings produced with strftime('%Y-%m-%d %H'); stating the
# format avoids per-value format inference and makes the parse unambiguous.
flow['datetime'] = pd.to_datetime(flow['datetime'], format='%Y-%m-%d %H')

In [None]:
# Remove helper and raw columns that are no longer needed: the road name,
# the day key, both suffixed 'creation_date' columns left by the merge, and
# the holiday name (replaced by 'holiday_flag').
unused_columns = ['road_name', 'date', 'creation_date_x', 'creation_date_y', 'holiday_name']
flow = flow.drop(columns=unused_columns)