In [20]:
import glob
import regex as re
import numpy as np
import pandas as pd
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
# Base data directory: the CSV reads and exports in this notebook build their paths from it.
# Relative path, so it resolves against the kernel's current working directory.
path = '../data'

In [10]:
# Read the four reference CSVs that sit directly under ``path``, in the order
# stations, targets, headers_mod, headers_obs.
stations, targets, headers_mod, headers_obs = (
    pd.read_csv('{}/{}.csv'.format(path, table_name))
    for table_name in ('stations', 'targets', 'headers_mod', 'headers_obs')
)

## Build raw data

In [None]:
def _read_header_names(csv_path):
    """Return the whitespace-separated column names found on the first line of a file.

    Parameters
    ----------
    csv_path : str
        Path of the header file to read.

    Returns
    -------
    list of str
        The tokens of the first line, split on any whitespace.

    Raises
    ------
    ValueError
        If the first line holds no tokens (empty file or blank first line).
    """
    with open(csv_path) as f:
        # Only the first line is needed, so do not load the rest of the file.
        names = f.readline().split()
    if not names:
        # An empty name list would only fail later, and less clearly, when the
        # data files are parsed with it.
        raise ValueError('no column names on the first line of {}'.format(csv_path))
    return names


# Build both locations from the shared ``path`` setting instead of repeating '../data'.
mod_headers = _read_header_names(path + '/headers_mod.csv')
obs_headers = _read_header_names(path + '/headers_obs.csv')

mod_headers, obs_headers

In [None]:
# Matches a run of four digits. Raw string so that ``\d`` reaches the regex
# engine untouched instead of being treated as an (invalid) string escape.
four_digits = re.compile(r'\d{4}')

# One (year, station, frame) tuple per observation file.
observations = []
# sorted() gives a stable load order regardless of how the filesystem lists entries.
for folder in sorted(glob.glob(path + '/obs/*')):
    for file in sorted(glob.glob('{}/*'.format(folder))):
        # Scan the path once and reuse the matches for both fields.
        codes = four_digits.findall(file)
        year = codes[0]                       # first four-digit run in the path
        station = 'ES{}A'.format(codes[1])    # second four-digit run, wrapped as ES....A
        data = pd.read_csv(file, sep='\t', names=obs_headers)
        observations.append((year, station, data))

# Random five-row peek at the first loaded frame.
observations[0][2].sample(5)

In [30]:
# Stack every per-file frame in a single pass. Growing a frame with
# DataFrame.append inside a loop re-copies all accumulated rows on each
# iteration, and DataFrame.append no longer exists in pandas 2.x.
obs_frames = [obs[2] for obs in observations]
# pd.concat refuses an empty list, so keep the empty-frame result for that case.
# ignore_index=True renumbers the rows 0..n-1, as the per-step reset_index did.
obs_df = pd.concat(obs_frames, ignore_index=True) if obs_frames else pd.DataFrame()

In [33]:
# Export the combined observations to all_obs.csv under ``path``.
obs_df.to_csv('{}/all_obs.csv'.format(path))

In [None]:
# Matches a run of four digits. Raw string so that ``\d`` reaches the regex
# engine untouched instead of being treated as an (invalid) string escape.
four_digits = re.compile(r'\d{4}')

# Per-file frames are collected under their own name, so ``models`` only ever
# refers to the final concatenated DataFrame.
model_frames = []
# sorted() gives a stable load order regardless of how the filesystem lists entries.
for folder in sorted(glob.glob(path + '/mod/*')):
    for file in sorted(glob.glob('{}/*'.format(folder))):
        # Scan the path once and reuse the matches for both fields.
        codes = four_digits.findall(file)
        year = codes[0]                        # first four-digit run in the path
        station = 'ES{}A'.format(codes[-1])    # last four-digit run, wrapped as ES....A
        # Whitespace-delimited file; raw string keeps ``\s`` intact for the parser.
        data = pd.read_table(file, sep=r'\s+', names=mod_headers)
        data['year'] = year
        data['station'] = station
        model_frames.append(data)

# Each frame keeps its own row index here (no ignore_index).
models = pd.concat(model_frames)
models.sample(5)

In [39]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as an extra column.
models = models.reset_index(drop=True)

In [40]:
# Export the combined model data to all_models.csv under ``path``.
models.to_csv('{}/all_models.csv'.format(path))

## Feature engineering

In [54]:
# Preview the first ten rows only, rather than rendering the whole frame.
# NOTE(review): df_obs is not assigned in any cell visible here — confirm which cell creates it.
df_obs.head(10)

Unnamed: 0,Countrycode,Namespace,AirQualityNetwork,AirQualityStation,AirQualityStationEoICode,SamplingPoint,SamplingProcess,Sample,AirPollutant,AirPollutantCode,AveragingTime,Concentration,UnitOfMeasurement,DatetimeBegin,DatetimeEnd,Validity,Verification,day,month,year
0,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,90,µg/m3,2013-07-05 18:00:00,2013-07-05 19:00:00,1,1,2013-07-05,7,2013
1,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,87,µg/m3,2013-07-05 19:00:00,2013-07-05 20:00:00,1,1,2013-07-05,7,2013
2,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,76,µg/m3,2013-07-05 20:00:00,2013-07-05 21:00:00,1,1,2013-07-05,7,2013
3,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,65,µg/m3,2013-07-05 21:00:00,2013-07-05 22:00:00,1,1,2013-07-05,7,2013
4,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,56,µg/m3,2013-07-05 22:00:00,2013-07-05 23:00:00,1,1,2013-07-05,7,2013
5,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,66,µg/m3,2013-07-05 23:00:00,2013-07-06 00:00:00,1,1,2013-07-05,7,2013
6,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,87,µg/m3,2013-07-06 00:00:00,2013-07-06 01:00:00,1,1,2013-07-06,7,2013
7,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,92,µg/m3,2013-07-06 01:00:00,2013-07-06 02:00:00,1,1,2013-07-06,7,2013
8,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,93,µg/m3,2013-07-06 02:00:00,2013-07-06 03:00:00,1,1,2013-07-06,7,2013
9,in/ES_8_2013-2015_timeseries.csv:ES,ES.BDCA.AQD,NET_ES209A,STA_ES1480A,ES1480A,SP_08019044_8_8,SPP_08019044_8_8.1,SAM_08019044_8_8,NO2,http://dd.eionet.europa.eu/vocabulary/aq/pollu...,hour,85,µg/m3,2013-07-06 03:00:00,2013-07-06 04:00:00,1,1,2013-07-06,7,2013


In [52]:
# Convert both timestamp columns to datetimes in place.
for stamp_col in ('DatetimeBegin', 'DatetimeEnd'):
    df_obs[stamp_col] = pd.to_datetime(df_obs[stamp_col])
# Calendar date of the begin timestamp.
df_obs['day'] = df_obs['DatetimeBegin'].dt.date


In [67]:
# Highest concentration per (day, station) pair, with the group keys turned
# back into ordinary columns.
daily_targets = (
    df_obs
    .groupby(['day', 'AirQualityStation'])['Concentration']
    .max()
    .reset_index()
)
# Binary flag: 1 where the daily maximum is strictly above 130, otherwise 0.
daily_targets['target'] = (daily_targets['Concentration'] > 130).astype('int64')

In [70]:
# Rows whose target flag is set.
daily_targets.loc[daily_targets['target'].eq(1)]

Unnamed: 0,day,AirQualityStation,Concentration,target
24,2013-01-04,STA_ES1480A,187,1
30,2013-01-05,STA_ES1480A,131,1
42,2013-01-07,STA_ES1480A,182,1
61,2013-01-10,STA_ES1438A,146,1
103,2013-01-16,STA_ES1438A,146,1
201,2013-01-30,STA_ES1438A,181,1
215,2013-02-01,STA_ES1438A,143,1
305,2013-02-14,STA_ES1438A,133,1
395,2013-02-27,STA_ES1396A,132,1
396,2013-02-27,STA_ES1438A,192,1


In [72]:
# Percentage of rows in daily_targets whose target flag is 1.
int(daily_targets['target'].eq(1).sum()) / len(daily_targets) * 100

3.5592970019199526