In [2]:
import copy
import glob
import regex as re
import numpy as np
import pandas as pd
import datetime as dt
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline
%load_ext rpy2.ipython
%run ../airquality/data/gen_daily_targets.py
%run ../airquality/data/prepare_data.py

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [3]:
test = pd.read_csv('../data/targets.csv')
stations = pd.read_csv('../data/stations.csv')

In [66]:
observations = pd.read_csv('../data/observations.csv', index_col=0)
models = pd.read_csv('../data/models.csv', index_col=0)
obs_and_mods = pd.merge(models, observations, how='left',
                    on=['station', 'day', 'time', 'datetime' ,'year'],
                    suffixes=('_mod', '')).sort_values('datetime', ascending=True)
obs_and_mods.to_csv('../data/obs_and_mod.csv')
obs_and_mods_cols = ['pred_0_days', 'pred_1_days', 'Concentration', 'target', 'day']
obs_and_mods.head()

Unnamed: 0,pred_0_days,pred_1_days,day,lon,lat,year,station,datetime,time,Concentration,target
0,38.79,34.11,2013-01-01,2.15,41.39,2013,ES1438A,2013-01-01 00:00:00,00:00:00,,
1,28.53,27.48,2013-01-01,2.13,41.38,2013,ES1396A,2013-01-01 00:00:00,00:00:00,,
2,35.85,42.57,2013-01-01,2.2,41.4,2013,ES0691A,2013-01-01 00:00:00,00:00:00,,
3,31.81,31.59,2013-01-01,2.15,41.4,2013,ES1480A,2013-01-01 00:00:00,00:00:00,,
4,31.81,31.59,2013-01-01,2.15,41.43,2013,ES1856A,2013-01-01 00:00:00,00:00:00,,


In [67]:
agg_types = ['max']
tall_series = obs_and_mods.fillna(0).groupby(['day', 'station']).agg({
    'Concentration': agg_types
})['Concentration'].reset_index()
aggs = [tall_series.pivot(index='day', columns='station', values=agg) for agg in agg_types]
aggs = [df.rename(columns={c: c + '_' + agg for c in df.columns}) for df, agg in zip(aggs, agg_types)]
wide_series = pd.concat(aggs, axis=1)
wide_series.head()

station,ES0691A_max,ES1396A_max,ES1438A_max,ES1480A_max,ES1679A_max,ES1856A_max,ES1992A_max
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,72.0,98.0,112.0,94.0,69.0,39.0,68.0
2013-01-02,87.0,86.0,119.0,79.0,76.0,77.0,85.0
2013-01-03,119.0,85.0,117.0,102.0,78.0,94.0,110.0
2013-01-04,80.0,113.0,123.0,187.0,108.0,60.0,99.0
2013-01-05,60.0,114.0,,131.0,114.0,16.0,85.0


In [64]:
tall_series_mod = obs_and_mods.groupby(['day', 'station']).agg({
    'pred_0_days': agg_types
})['pred_0_days'].reset_index().rename(columns={
    'pred_0_days': 'Concentration'
})
wide_series_mod = tall_series_mod\
    .pivot(index='day', columns='station', values='max')\
    .rename(columns={c: c + '_max' for c in tall_series_mod.columns})
wide_series_mod.head()

station,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,84.45,66.88,84.33,66.18,84.33,66.18,66.88
2013-01-02,94.56,82.88,120.52,109.39,120.52,109.39,82.88
2013-01-03,77.65,66.58,76.64,79.88,76.64,79.88,66.58
2013-01-04,102.79,62.68,48.29,80.1,79.81,80.1,62.74
2013-01-05,89.21,53.53,,64.29,73.18,64.29,53.53


In [65]:
obs_and_mods.set_index('day').loc['2015-01-03'].tail()

KeyError: 'the label [2015-01-03] is not in the [index]'

In [55]:
to_impute

station,ES0691A_max,ES1396A_max,ES1438A_max,ES1480A_max,ES1679A_max,ES1856A_max,ES1992A_max
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-03,70.36,40.63,66.81,44.46,66.81,44.46,40.63
2015-01-06,122.30,121.64,144.60,130.53,144.60,130.53,121.64
2015-01-09,95.57,78.34,101.46,54.66,101.46,54.66,78.34
2015-01-12,101.67,97.10,104.45,70.22,104.45,70.22,97.10
2015-01-15,105.26,88.25,101.26,97.05,101.26,97.05,88.25
2015-01-18,68.32,73.48,86.26,84.30,86.26,84.30,73.48
2015-01-21,98.79,90.51,102.58,93.53,102.58,93.53,90.51
2015-01-24,95.76,46.33,91.05,78.67,91.05,78.67,46.33
2015-01-27,102.36,105.18,93.92,111.85,93.92,111.85,105.18
2015-01-30,67.94,33.95,64.80,47.61,64.80,47.61,33.95


In [56]:
wide_series.head()

station,ES0691A_max,ES1396A_max,ES1438A_max,ES1480A_max,ES1679A_max,ES1856A_max,ES1992A_max
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,72.0,98.0,112.0,94.0,69.0,39.0,68.0
2013-01-02,87.0,86.0,119.0,79.0,76.0,77.0,85.0
2013-01-03,119.0,85.0,117.0,102.0,78.0,94.0,110.0
2013-01-04,80.0,113.0,123.0,187.0,108.0,60.0,99.0
2013-01-05,60.0,114.0,,131.0,114.0,16.0,85.0


In [53]:
to_impute = wide_series.loc[test['date'].unique()]
for s in obs_and_mods.station.unique():
    to_impute[s + '_max'] = wide_series_mod.loc[to_impute.reset_index().day, s]
originals = wide_series[list(~wide_series.reset_index().day.isin(test['date'].unique()))]
wide_series_imputed = pd.concat([to_impute, originals]).sort_index()
wide_series_imputed.head()

station,ES0691A_max,ES1396A_max,ES1438A_max,ES1480A_max,ES1679A_max,ES1856A_max,ES1992A_max
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,72.0,98.0,112.0,94.0,69.0,39.0,68.0
2013-01-02,87.0,86.0,119.0,79.0,76.0,77.0,85.0
2013-01-03,119.0,85.0,117.0,102.0,78.0,94.0,110.0
2013-01-04,80.0,113.0,123.0,187.0,108.0,60.0,99.0
2013-01-05,60.0,114.0,,131.0,114.0,16.0,85.0
