In [44]:
import copy
import glob
import time
import random
import regex as re
import numpy as np
import pandas as pd
import datetime as dt
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline
%load_ext rpy2.ipython
%run ../airquality/data/gen_daily_targets.py
%run ../airquality/data/prepare_data.py

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [2]:
# Prediction targets and station metadata.
test = pd.read_csv('../data/targets.csv')
stations = pd.read_csv('../data/stations.csv')

# Externally prepared feature table; 'holiday' and 'target' are dropped on load.
# NOTE(review): this is an absolute path on one developer's machine -- the
# notebook cannot run elsewhere until the file lives under a shared data folder.
extra_features = pd.read_csv(
    '/Users/jonaspaulwestermann/Downloads/dataset_v1.csv',
    index_col=0,
).drop(['holiday', 'target'], axis=1)

# Pre-computed rolling-mean features.
rolling_mean_features = pd.read_csv('../data/data_roll_day_conc_obs.csv')

# Discard every extra feature whose column name contains 'dist'.
extra_features = extra_features[
    [col for col in extra_features.columns if 'dist' not in col]
]

In [3]:
# Station observations and model forecasts; the first CSV column is the index.
observations = pd.read_csv('../data/observations.csv', index_col=0)
models = pd.read_csv('../data/models.csv', index_col=0)

# Left join: every model row is kept. Non-key columns present in both tables
# get the '_mod' suffix on the model side; observation columns keep their name.
obs_and_mods = models.merge(
    observations,
    how='left',
    on=['station', 'day', 'time', 'datetime', 'year'],
    suffixes=('_mod', ''),
).sort_values(by='datetime', ascending=True)

# Persist the merged table for reuse outside this notebook.
obs_and_mods.to_csv('../data/obs_and_mod.csv')
obs_and_mods_cols = ['pred_0_days', 'pred_1_days', 'Concentration', 'target', 'day']
obs_and_mods.head()

Unnamed: 0,pred_0_days,pred_1_days,day,lon,lat,year,station,datetime,time,Concentration,target
0,38.79,34.11,2013-01-01,2.15,41.39,2013,ES1438A,2013-01-01 00:00:00,00:00:00,,
1,28.53,27.48,2013-01-01,2.13,41.38,2013,ES1396A,2013-01-01 00:00:00,00:00:00,,
2,35.85,42.57,2013-01-01,2.2,41.4,2013,ES0691A,2013-01-01 00:00:00,00:00:00,,
3,31.81,31.59,2013-01-01,2.15,41.4,2013,ES1480A,2013-01-01 00:00:00,00:00:00,,
4,31.81,31.59,2013-01-01,2.15,41.43,2013,ES1856A,2013-01-01 00:00:00,00:00:00,,


In [4]:
agg_types = ['mean', 'max', 'std']

# Daily statistics of 'Concentration', one row per (day, station).
tall_series = (
    obs_and_mods
    .groupby(['day', 'station'])['Concentration']
    .agg(agg_types)
    .reset_index()
)

# One (day x station) table per statistic, columns tagged '<station>_<statistic>'.
aggs = [
    tall_series
    .pivot(index='day', columns='station', values=agg)
    .rename(columns=lambda station: station + '_' + agg)
    for agg in agg_types
]

# Side-by-side concatenation: one row per day, one column per station/statistic.
wide_series = pd.concat(aggs, axis=1)
wide_series.head()

station,ES0691A_mean,ES1396A_mean,ES1438A_mean,ES1480A_mean,ES1679A_mean,ES1856A_mean,ES1992A_mean,ES0691A_max,ES1396A_max,ES1438A_max,...,ES1679A_max,ES1856A_max,ES1992A_max,ES0691A_std,ES1396A_std,ES1438A_std,ES1480A_std,ES1679A_std,ES1856A_std,ES1992A_std
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,42.39,38.61,51.39,43.87,37.13,14.78,23.35,72.0,98.0,112.0,...,69.0,39.0,68.0,18.13,26.67,25.73,24.63,20.29,10.99,21.15
2013-01-02,52.5,47.58,63.71,54.5,48.75,32.08,43.29,87.0,86.0,119.0,...,76.0,77.0,85.0,16.61,24.72,26.48,19.18,16.91,25.43,26.53
2013-01-03,64.62,54.62,74.04,64.17,49.25,38.88,49.04,119.0,85.0,117.0,...,78.0,94.0,110.0,22.69,19.05,22.87,21.7,14.87,22.17,31.43
2013-01-04,54.12,44.3,44.5,89.13,61.92,22.18,39.04,80.0,113.0,123.0,...,108.0,60.0,99.0,16.58,28.35,32.46,45.41,25.83,16.98,28.2
2013-01-05,36.88,39.17,,89.0,64.0,12.25,33.54,60.0,114.0,,...,114.0,16.0,85.0,11.0,22.21,,21.63,17.04,2.56,17.75


In [5]:
# Stations that appear in the targets file.
test['station'].unique()

array(['ES0691A', 'ES1396A', 'ES1438A', 'ES1480A', 'ES1679A', 'ES1856A',
       'ES1992A'], dtype=object)

## Cross Validation
Changes from `Jonas_5`:
* Add Jose's rolling mean features

In [35]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error, make_scorer

# Random hyper-parameter search: for every station, sample candidate XGBoost
# settings, score a handful of them by 3-fold cross-validated MSE on the data
# before 2015, and write the table to a numbered report file.
# NOTE(review): the sampling is unseeded, so every run draws new candidates;
# the numbered report files are the only record of what was tried.

lags = 25

# The training window does not depend on the station, so slice it once.
data = wide_series[wide_series.index < '2015-01-01']

# KFold without shuffling is stateless; one splitter serves every candidate.
kf = KFold(n_splits=3)

for s in test.station.unique():
    # Lag every column except the target station's own aggregates.
    own_columns = ['{}_{}'.format(s, agg) for agg in agg_types]
    to_lag = data[[c for c in data.columns if c not in own_columns]]
    features = create_lagged_features(to_lag, lags)\
        .join(extra_features[extra_features.station == s].set_index('date'))\
        .join(rolling_mean_features[rolling_mean_features.station == s]
              .set_index('date').drop(['station', 'max_conc_obs'], axis=1))\
        .join(obs_and_mods[obs_and_mods.station == s][['Concentration', 'day']].groupby('day').max())
    non_feature_columns = ['time', 'datetime', 'Concentration', 'target', 'station']
    X = features[[c for c in features.columns if c not in non_feature_columns]].values
    # Target: the station's daily maximum concentration, gaps forward-filled.
    y = features['Concentration'].ffill().values

    # 200 candidate rows. max_depth draws 25 distinct depths and repeats them
    # 8 times so that the column has the same length as the others.
    params = pd.DataFrame({
        'eta': random.sample(list(np.linspace(0.01, 2, 201)), 200),
        'n_estimators': random.sample(list(range(1, 250)), 200),
        'reg_alpha': random.sample(list(np.linspace(0.01, 1, 201)), 200),
        'max_depth': random.sample(list(range(1, 35)), 25) * 8,
        'score': [np.nan] * 200
    })
    # Only the first 10 candidates are evaluated; the others keep a NaN score.
    for i, r in params[['eta', 'n_estimators', 'reg_alpha', 'max_depth']].head(10).iterrows():
        # The row comes back as floats, so the integer parameters are cast back.
        p = dict(r)
        p['n_estimators'] = int(p['n_estimators'])
        p['max_depth'] = int(p['max_depth'])

        metric = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model = xgb.XGBRegressor(**p)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            metric.append(mean_squared_error(y_test, y_pred))

        # `i` is an index label from iterrows(), hence .loc for both accesses.
        params.loc[i, 'score'] = np.mean(metric)
        print(params.loc[i])

    # The suffix is the number of reports already on disk for this station,
    # so repeated runs accumulate instead of overwriting each other.
    params.to_csv('../reports/cv_xgb_Jonas_6_{}_{}.csv'
                  .format(s, len(glob.glob('../reports/cv_xgb_Jonas_6_{}_*'.format(s)))))

eta              0.84
max_depth       13.00
n_estimators    21.00
reg_alpha        0.34
score          109.41
Name: 0, dtype: float64
eta             0.59
max_depth       4.00
n_estimators   72.00
reg_alpha       0.25
score          19.04
Name: 1, dtype: float64
eta              1.90
max_depth        1.00
n_estimators    44.00
reg_alpha        0.97
score          137.54
Name: 2, dtype: float64
eta             0.09
max_depth      21.00
n_estimators   48.00
reg_alpha       0.89
score          20.12
Name: 3, dtype: float64
eta              0.93
max_depth       22.00
n_estimators   243.00
reg_alpha        0.11
score           19.16
Name: 4, dtype: float64
eta             1.49
max_depth      19.00
n_estimators   37.00
reg_alpha       0.61
score          26.86
Name: 5, dtype: float64
eta             0.51
max_depth      18.00
n_estimators   70.00
reg_alpha       0.19
score          18.54
Name: 6, dtype: float64
eta             1.38
max_depth      26.00
n_estimators   71.00
reg_alpha       0.8

eta             0.49
max_depth      27.00
n_estimators   54.00
reg_alpha       0.43
score          39.67
Name: 2, dtype: float64
eta             0.56
max_depth       3.00
n_estimators   85.00
reg_alpha       0.69
score          34.96
Name: 3, dtype: float64
eta              1.43
max_depth       10.00
n_estimators   172.00
reg_alpha        0.10
score           41.10
Name: 4, dtype: float64
eta              1.27
max_depth       11.00
n_estimators   175.00
reg_alpha        0.76
score           41.44
Name: 5, dtype: float64
eta              0.58
max_depth        9.00
n_estimators   149.00
reg_alpha        0.25
score           40.35
Name: 6, dtype: float64
eta             0.05
max_depth      28.00
n_estimators   96.00
reg_alpha       0.14
score          40.76
Name: 7, dtype: float64
eta              0.42
max_depth       26.00
n_estimators   184.00
reg_alpha        0.15
score           41.30
Name: 8, dtype: float64
eta             0.48
max_depth      33.00
n_estimators   63.00
reg_alpha     

In [39]:
# Collect every saved cross-validation report and show the best-scoring
# parameter set for each station.
all_frames = []
for s in test.station.unique():
    report_paths = glob.glob('../reports/cv_xgb_Jonas_6*{}*'.format(s))
    if not report_paths:
        # No report has been written for this station yet; skip it explicitly
        # rather than hiding every ValueError raised by pd.concat.
        continue
    frame = pd.concat([pd.read_csv(f, index_col=0) for f in report_paths])
    frame['station'] = s
    all_frames.append(frame)

if not all_frames:
    raise ValueError("No cross-validation reports matching "
                     "'../reports/cv_xgb_Jonas_6*' were found for any station")

cv_results = pd.concat(all_frames)
# Candidates that were never evaluated carry a NaN score and fail this test.
cv_results = cv_results[cv_results['score'] > 0]
# After sorting by score, the first row kept per station is its lowest MSE.
cv_results.sort_values(['score', 'station']).drop_duplicates('station')

Unnamed: 0,eta,max_depth,n_estimators,reg_alpha,score,station
6,0.51,18,70,0.19,18.54,ES0691A
9,1.72,2,222,0.81,24.07,ES1856A
2,1.06,6,40,0.59,30.97,ES1679A
3,0.56,3,85,0.69,34.96,ES1992A
4,1.64,4,107,0.06,55.69,ES1396A
9,0.46,6,227,0.69,70.77,ES1480A
9,0.78,22,189,0.09,97.92,ES1438A


In [40]:
# Convert the best row per station into (station, model keyword arguments).
test_params = []
for _, best_row in cv_results.sort_values(['score', 'station']).drop_duplicates('station').iterrows():
    # Everything except the bookkeeping columns is a model hyper-parameter.
    model_kwargs = dict(best_row.drop(['score', 'station']))
    test_params.append((best_row['station'], model_kwargs))

test_params

[('ES0691A',
  {'eta': 0.5075000000000001,
   'max_depth': 18,
   'n_estimators': 70,
   'reg_alpha': 0.1882}),
 ('ES1856A',
  {'eta': 1.7214, 'max_depth': 2, 'n_estimators': 222, 'reg_alpha': 0.8119}),
 ('ES1679A',
  {'eta': 1.0647, 'max_depth': 6, 'n_estimators': 40, 'reg_alpha': 0.5941}),
 ('ES1992A',
  {'eta': 0.55725,
   'max_depth': 3,
   'n_estimators': 85,
   'reg_alpha': 0.6930999999999999}),
 ('ES1396A',
  {'eta': 1.6418, 'max_depth': 4, 'n_estimators': 107, 'reg_alpha': 0.0595}),
 ('ES1480A',
  {'eta': 0.4577500000000001,
   'max_depth': 6,
   'n_estimators': 227,
   'reg_alpha': 0.6930999999999999}),
 ('ES1438A',
  {'eta': 0.77615,
   'max_depth': 22,
   'n_estimators': 189,
   'reg_alpha': 0.08919999999999999})]

In [None]:
%%time
lags = 25
preds_by_station = []
edit_wide_series = copy.deepcopy(wide_series)

for d in test['date'].unique():
    data = edit_wide_series[list(pd.to_datetime(edit_wide_series.reset_index().day) <= d)]
    for s, p in test_params:
        to_lag = data[[c for c in data.columns if not c in ['{}_{}'.format(s, agg) for agg in agg_types]]]
        X = create_lagged_features(to_lag, lags)
        X_train = np.nan_to_num(X.iloc[:-1].values)
        for agg in agg_types:
            start = time.time()
            print('Predicting {} for station {} day {}'.format(agg, s, d))
            y_train = np.nan_to_num(data['{}_{}'.format(s, agg)][:-1])
            X_test = np.nan_to_num(X.iloc[-1].values.reshape(1, X.shape[1]))
            p['n_jobs'] = 4
            model = xgb.XGBRegressor(**p)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            edit_wide_series.loc[d, '{}_{}'.format(s, agg)] = y_pred
            print('Predicted {} in {:,.2f}s'.format(y_pred, time.time() - start))

Predicting mean for station ES0691A day 2015-01-03
Predicted [ 55.21148682] in 6.81s
Predicting max for station ES0691A day 2015-01-03
Predicted [ 91.05612183] in 7.04s
Predicting std for station ES0691A day 2015-01-03
Predicted [ 18.69669724] in 7.63s
Predicting mean for station ES1856A day 2015-01-03
Predicted [ 34.20509338] in 3.42s
Predicting max for station ES1856A day 2015-01-03
