In [1]:
import os
import re
import glob
import time

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from bs4 import BeautifulSoup
from selenium import webdriver

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# file not available locally on the GitHub repo, must be downloaded from http://www.robesafe.uah.es/personal/eduardo.romera/uah-driveset/#download
data_path = 'UAH-DRIVESET-v1/'

files = glob.glob(data_path+'*/*/SEMANTIC_ONLINE*.txt', recursive=True)


In [3]:
def get_class(drive_class):
    
    drive_class = re.sub(r'[0-9]', '', drive_class)
    
    if drive_class == 'NORMAL':
        return drive_class
    else:
        return 'NOT NORMAL'

In [4]:
df = pd.DataFrame()

for i,file in enumerate(files):
    drive_info = file.split('/')[2].split('-')
    
    drive_date = drive_info[0]
    dist_km = drive_info[1][:-2]
    driver = drive_info[2]
    drive_class = get_class(drive_info[3])
    road_type = drive_info[4]
    
    row = {'date_time_raw':drive_date, 'total_dist':dist_km, 'driver_id':driver, 'road_type':road_type, 'class_normal':drive_class}
    
    df = df.append(row, ignore_index=True)

df['date_time_clean'] = pd.to_datetime(df['date_time_raw']).dt.round('30min')
df['date_time_clean'] = pd.to_datetime(df['date_time_clean']).dt.strftime('%Y-%m-%d %I:%M %p')
df

Unnamed: 0,class_normal,date_time_raw,driver_id,road_type,total_dist,date_time_clean
0,NORMAL,20151110175712,D1,SECONDARY,16,2015-11-10 06:00 PM
1,NOT NORMAL,20151111135612,D1,SECONDARY,13,2015-11-11 02:00 PM
2,NOT NORMAL,20151111134545,D1,SECONDARY,16,2015-11-11 02:00 PM
3,NOT NORMAL,20151111132348,D1,MOTORWAY,25,2015-11-11 01:30 PM
4,NOT NORMAL,20151111125233,D1,MOTORWAY,24,2015-11-11 01:00 PM
5,NORMAL,20151110180824,D1,SECONDARY,16,2015-11-10 06:00 PM
6,NORMAL,20151111123124,D1,MOTORWAY,25,2015-11-11 12:30 PM
7,NORMAL,20151221112434,D6,SECONDARY,17,2015-12-21 11:30 AM
8,NORMAL,20151217162714,D6,MOTORWAY,26,2015-12-17 04:30 PM
9,NOT NORMAL,20151221120051,D6,MOTORWAY,26,2015-12-21 12:00 PM


In [5]:
def rendering(url):
    driver = webdriver.Chrome('/usr/local/bin/chromedriver') # run ChromeDriver
    driver.get(url) # load the web page from the URL
    time.sleep(3) # wait for the web page to load
    render = driver.page_source # get the page source HTML
    driver.quit() # quit ChromeDriver
    return render # return the page source HTML
    

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   class_normal     40 non-null     object
 1   date_time_raw    40 non-null     object
 2   driver_id        40 non-null     object
 3   road_type        40 non-null     object
 4   total_dist       40 non-null     object
 5   date_time_clean  40 non-null     object
dtypes: object(6)
memory usage: 2.0+ KB


In [7]:
cols = ['temp_F', 'dp_F', 'humidity_pct', 'wind_direction', 'wind_speed_mph', 'wind_gust_mph', 'pressure_in', 'precipitation_in', 'condition']
for c in cols:
    df[c] = np.nan
    

In [8]:
def get_wunderground(i, row):
    drive_date = row['date_time_clean'][:10]
    drive_time = row['date_time_clean'][11:].lstrip('0')
    
    search_url = f'http://www.wunderground.com/history/daily/es/madrid/LEMD/date/{drive_date}'

    wunderground_page = rendering(search_url)

    wunderground_soup = BeautifulSoup(wunderground_page, 'html.parser')

    soup_container = wunderground_soup.find('lib-city-history-observation')
    soup_data = soup_container.find_all('tr')
    
    for j,dat in enumerate(soup_data[1:]):
        for k,td in enumerate(dat.find_all('td', class_='ng-star-inserted')):
            tmp = td.text
            if k == 0:
                row_time = tmp
            if (k != 0) and (row_time == drive_time):
                df.loc[i, cols[k-1]] = tmp.split(u'\xa0°')[0]

In [9]:
while len(df[df[cols[0]].isna()]) != 0:
    for i, row in df.iterrows():
        get_wunderground(i, row)
        

In [10]:
df

Unnamed: 0,class_normal,date_time_raw,driver_id,road_type,total_dist,date_time_clean,temp_F,dp_F,humidity_pct,wind_direction,wind_speed_mph,wind_gust_mph,pressure_in,precipitation_in,condition
0,NORMAL,20151110175712,D1,SECONDARY,16,2015-11-10 06:00 PM,72,43,35,VAR,2,0,28.32,0.0,Fair
1,NOT NORMAL,20151111135612,D1,SECONDARY,13,2015-11-11 02:00 PM,64,45,49,S,3,0,28.29,0.0,Fair
2,NOT NORMAL,20151111134545,D1,SECONDARY,16,2015-11-11 02:00 PM,64,45,49,S,3,0,28.29,0.0,Fair
3,NOT NORMAL,20151111132348,D1,MOTORWAY,25,2015-11-11 01:30 PM,63,46,55,S,3,0,28.29,0.0,Fair
4,NOT NORMAL,20151111125233,D1,MOTORWAY,24,2015-11-11 01:00 PM,61,46,59,S,1,0,28.32,0.0,Fair
5,NORMAL,20151110180824,D1,SECONDARY,16,2015-11-10 06:00 PM,72,43,35,VAR,2,0,28.32,0.0,Fair
6,NORMAL,20151111123124,D1,MOTORWAY,25,2015-11-11 12:30 PM,57,45,63,WNW,3,0,28.32,0.0,Fair
7,NORMAL,20151221112434,D6,SECONDARY,17,2015-12-21 11:30 AM,41,37,87,CALM,0,0,28.46,0.0,Fair
8,NORMAL,20151217162714,D6,MOTORWAY,26,2015-12-17 04:30 PM,59,39,48,S,2,0,28.35,0.0,Fair
9,NOT NORMAL,20151221120051,D6,MOTORWAY,26,2015-12-21 12:00 PM,45,37,76,S,1,0,28.46,0.0,Fair


In [12]:
df.to_csv('UAH-DRIVESET-weather.csv')