In [3]:
import copy
import glob
import random
import regex as re
import numpy as np
import pandas as pd
import datetime as dt
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline
%load_ext rpy2.ipython
%run ../airquality/data/gen_daily_targets.py
%run ../airquality/data/prepare_data.py

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [4]:
test = pd.read_csv('../data/targets.csv')
stations = pd.read_csv('../data/stations.csv')
extra_features = pd.read_csv('../data/dataset_v1.csv', index_col=0).drop([
    'holiday', 'target', 
], axis=1)
rolling_mean_features = pd.read_csv('../data/data_roll_day_dist.csv')
extra_features = extra_features[[c for c in extra_features.columns if not 'dist' in c]]

In [5]:
observations = pd.read_csv('../data/observations.csv', index_col=0)
models = pd.read_csv('../data/models.csv', index_col=0)
obs_and_mods = pd.merge(models, observations, how='left',
                    on=['station', 'day', 'time', 'datetime' ,'year'],
                    suffixes=('_mod', '')).sort_values('datetime', ascending=True)
obs_and_mods.to_csv('../data/obs_and_mod.csv')
obs_and_mods_cols = ['pred_0_days', 'pred_1_days', 'Concentration', 'target', 'day']
obs_and_mods.head()

Unnamed: 0,pred_0_days,pred_1_days,day,lon,lat,year,station,datetime,time,Concentration,target
0,38.79,34.11,2013-01-01,2.15,41.39,2013,ES1438A,2013-01-01 00:00:00,00:00:00,,
1,28.53,27.48,2013-01-01,2.13,41.38,2013,ES1396A,2013-01-01 00:00:00,00:00:00,,
2,35.85,42.57,2013-01-01,2.2,41.4,2013,ES0691A,2013-01-01 00:00:00,00:00:00,,
3,31.81,31.59,2013-01-01,2.15,41.4,2013,ES1480A,2013-01-01 00:00:00,00:00:00,,
4,31.81,31.59,2013-01-01,2.15,41.43,2013,ES1856A,2013-01-01 00:00:00,00:00:00,,


In [6]:
agg_types = ['mean', 'max', 'std']
tall_series = obs_and_mods.groupby(['day', 'station']).agg({
    'Concentration': agg_types
})['Concentration'].reset_index()
aggs = [tall_series.pivot(index='day', columns='station', values=agg) for agg in agg_types]
aggs = [df.rename(columns={c: c + '_' + agg for c in df.columns}) for df, agg in zip(aggs, agg_types)]
wide_series = pd.concat(aggs, axis=1)
wide_series.head()

station,ES0691A_mean,ES1396A_mean,ES1438A_mean,ES1480A_mean,ES1679A_mean,ES1856A_mean,ES1992A_mean,ES0691A_max,ES1396A_max,ES1438A_max,...,ES1679A_max,ES1856A_max,ES1992A_max,ES0691A_std,ES1396A_std,ES1438A_std,ES1480A_std,ES1679A_std,ES1856A_std,ES1992A_std
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,42.39,38.61,51.39,43.87,37.13,14.78,23.35,72.0,98.0,112.0,...,69.0,39.0,68.0,18.13,26.67,25.73,24.63,20.29,10.99,21.15
2013-01-02,52.5,47.58,63.71,54.5,48.75,32.08,43.29,87.0,86.0,119.0,...,76.0,77.0,85.0,16.61,24.72,26.48,19.18,16.91,25.43,26.53
2013-01-03,64.62,54.62,74.04,64.17,49.25,38.88,49.04,119.0,85.0,117.0,...,78.0,94.0,110.0,22.69,19.05,22.87,21.7,14.87,22.17,31.43
2013-01-04,54.12,44.3,44.5,89.13,61.92,22.18,39.04,80.0,113.0,123.0,...,108.0,60.0,99.0,16.58,28.35,32.46,45.41,25.83,16.98,28.2
2013-01-05,36.88,39.17,,89.0,64.0,12.25,33.54,60.0,114.0,,...,114.0,16.0,85.0,11.0,22.21,,21.63,17.04,2.56,17.75


## Cross-Validation SVM

In [90]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import log_loss, make_scorer

lags = 100
for s in test.station.unique():
    data = wide_series[list(wide_series.reset_index().day < '2015-01-01')]
    to_lag = data[[c for c in data.columns if not c in ['{}_{}'.format(s, agg) for agg in agg_types]]]
    features = create_lagged_features(to_lag, lags)\
        .join(extra_features[extra_features.station == s].set_index('date'))\
        .join(rolling_mean_features[rolling_mean_features.station == s]
              .set_index('date').drop(['station', 'max_conc_obs'], axis=1))\
        .join(obs_and_mods[obs_and_mods.station == s][['Concentration', 'day']].groupby('day').max())\
        .fillna(method='ffill').fillna(0)
    X = features[[c for c in features.columns if not c in [
        'time', 'datetime', 'Concentration', 'target', 'station'
    ]]].values[:, np.where(X[0] != 98)[0]]
    y = (features['Concentration'].fillna(method='ffill').values > 100).astype(int)

    params = pd.DataFrame({
        'C': np.random.choice(np.linspace(0.01, .99, 100), 15),
        'score': [np.nan] * 15
    })
    for i, r in params[['C']].iterrows():
        kf = KFold(n_splits=3)
        metric = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            p = dict(r)
            p['probability'] = True
            try:
                model = SVC(**p)
                model.fit(X_train, y_train)
                y_pred = model.predict_proba(X_test)
                metric.append(
                    log_loss(y_test, y_pred, labels=(0, 1))
                ) 
            except ValueError as e: 
                print(e)
                metric.append(np.nan)
                pass

        params.loc[i, 'score'] = np.mean(metric)
        print(params.iloc[i])

    params.to_csv('../reports/cv_ssvm_Jonas_8_{}_{}.csv'
                  .format(s, len(glob.glob('../reports/cv_ssvm_Jonas_8_{}_*'.format(s)))))
    params.head()

C       0.09
score   0.28
Name: 0, dtype: float64
C       0.87
score   0.28
Name: 1, dtype: float64
C       0.76
score   0.28
Name: 2, dtype: float64
C       0.37
score   0.28
Name: 3, dtype: float64
C       0.46
score   0.28
Name: 4, dtype: float64
C       0.89
score   0.28
Name: 5, dtype: float64
C       0.56
score   0.28
Name: 6, dtype: float64
C       0.81
score   0.28
Name: 7, dtype: float64
C       0.02
score   0.28
Name: 8, dtype: float64
C       0.03
score   0.28
Name: 9, dtype: float64
C       0.45
score   0.28
Name: 10, dtype: float64
C       0.49
score   0.28
Name: 11, dtype: float64
C       0.37
score   0.28
Name: 12, dtype: float64
C       0.56
score   0.28
Name: 13, dtype: float64
C       0.86
score   0.28
Name: 14, dtype: float64
C       0.68
score   0.31
Name: 0, dtype: float64
C       0.76
score   0.31
Name: 1, dtype: float64
C       0.11
score   0.31
Name: 2, dtype: float64
C       0.18
score   0.31
Name: 3, dtype: float64
C       0.50
score   0.31
Name: 4, dtype: flo

## SVM Classification per Station

* Targeting `target`
* Using imputed for 2015

In [7]:
tall_series_mod = obs_and_mods.fillna(0).groupby(['day', 'station']).agg({
    'pred_0_days': agg_types
})['pred_0_days'].reset_index().rename(columns={
    'pred_0_days': 'Concentration'
})
aggs = [tall_series_mod.pivot(index='day', columns='station', values=agg) for agg in agg_types]
aggs = [df.rename(columns={c: c + '_' + agg for c in df.columns}) for df, agg in zip(aggs, agg_types)]
wide_series_mod = pd.concat(aggs, axis=1)
wide_series_mod.tail()

station,ES0691A_mean,ES1396A_mean,ES1438A_mean,ES1480A_mean,ES1679A_mean,ES1856A_mean,ES1992A_mean,ES0691A_max,ES1396A_max,ES1438A_max,...,ES1679A_max,ES1856A_max,ES1992A_max,ES0691A_std,ES1396A_std,ES1438A_std,ES1480A_std,ES1679A_std,ES1856A_std,ES1992A_std
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-12-27,51.54,50.02,57.79,56.64,57.79,56.64,50.02,104.21,96.41,104.76,...,104.76,108.53,96.41,25.52,28.88,23.64,31.42,23.64,31.42,28.88
2015-12-28,69.5,59.83,74.54,70.96,74.54,70.96,59.83,123.89,116.91,135.5,...,135.5,131.59,116.91,28.95,26.06,31.27,28.83,31.27,28.83,26.06
2015-12-29,48.35,44.61,53.75,46.21,53.75,46.21,44.61,93.35,113.39,106.2,...,106.2,113.48,113.39,23.33,28.05,22.57,28.15,22.57,28.15,28.05
2015-12-30,55.49,43.79,54.16,49.88,54.16,49.88,43.79,103.2,91.71,95.34,...,95.34,105.85,91.71,27.95,24.97,23.91,29.4,23.91,29.4,24.97
2015-12-31,64.1,51.87,64.55,60.12,64.55,60.12,51.87,103.42,97.83,109.93,...,109.93,107.9,97.83,22.91,22.19,23.95,25.89,23.95,25.89,22.19


In [8]:
to_impute = wide_series.loc[test['date'].unique()]
for s in obs_and_mods.station.unique():
    columns = [s + '_' + agg for agg in agg_types]
    to_impute[columns] = wide_series_mod.loc[to_impute.reset_index().day, columns]
    
originals = wide_series[list(~wide_series.reset_index().day.isin(test['date'].unique()))]
wide_series_imputed = pd.concat([to_impute, originals]).sort_index()
wide_series_imputed.head()

station,ES0691A_mean,ES1396A_mean,ES1438A_mean,ES1480A_mean,ES1679A_mean,ES1856A_mean,ES1992A_mean,ES0691A_max,ES1396A_max,ES1438A_max,...,ES1679A_max,ES1856A_max,ES1992A_max,ES0691A_std,ES1396A_std,ES1438A_std,ES1480A_std,ES1679A_std,ES1856A_std,ES1992A_std
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,42.39,38.61,51.39,43.87,37.13,14.78,23.35,72.0,98.0,112.0,...,69.0,39.0,68.0,18.13,26.67,25.73,24.63,20.29,10.99,21.15
2013-01-02,52.5,47.58,63.71,54.5,48.75,32.08,43.29,87.0,86.0,119.0,...,76.0,77.0,85.0,16.61,24.72,26.48,19.18,16.91,25.43,26.53
2013-01-03,64.62,54.62,74.04,64.17,49.25,38.88,49.04,119.0,85.0,117.0,...,78.0,94.0,110.0,22.69,19.05,22.87,21.7,14.87,22.17,31.43
2013-01-04,54.12,44.3,44.5,89.13,61.92,22.18,39.04,80.0,113.0,123.0,...,108.0,60.0,99.0,16.58,28.35,32.46,45.41,25.83,16.98,28.2
2013-01-05,36.88,39.17,,89.0,64.0,12.25,33.54,60.0,114.0,,...,114.0,16.0,85.0,11.0,22.21,,21.63,17.04,2.56,17.75


In [9]:
all_frames = []
for s in test.station.unique():
    frames = [pd.read_csv(f, index_col=0) for f in glob.glob('../reports/cv_ssvm_Jonas_8*{}*'.format(s))]
    try:
        frame = pd.concat(frames)
        frame['station'] = s
        all_frames.append(frame)
    except ValueError:
        pass
    
cv_results = pd.concat(all_frames)
cv_results = cv_results[cv_results['score'] > 0]
cv_results.sort_values(['score', 'station']).drop_duplicates('station')

Unnamed: 0,C,score,station
14,0.04,0.27,ES1679A
13,0.51,0.28,ES0691A
4,0.5,0.31,ES1396A
3,0.48,0.35,ES1856A
12,0.76,0.52,ES1992A
1,0.25,0.66,ES1480A
1,0.15,0.67,ES1438A


In [10]:
test_params = []
for i, r in cv_results.sort_values(['score', 'station']).drop_duplicates('station').iterrows():
    p = dict(r)
    del p['score']
    del p['station']
    test_params.append((r['station'], p))
    
test_params

[('ES1679A', {'C': 0.039696969696969696}),
 ('ES0691A', {'C': 0.5148484848484849}),
 ('ES1396A', {'C': 0.495050505050505}),
 ('ES1856A', {'C': 0.4752525252525253}),
 ('ES1992A', {'C': 0.7623232323232323}),
 ('ES1480A', {'C': 0.2475757575757576}),
 ('ES1438A', {'C': 0.1485858585858586})]

In [18]:
# from sklearn.svm import SVC
# from sklearn.model_selection import  KFold
# from sklearn.metrics import log_loss

# lags = 100
# preds_by_station = []
# for s, p in test_params:
#     data = pd.concat([
#         wide_series[list(wide_series.reset_index().day < '2015-01-01')],
#         wide_series_imputed[list(wide_series_imputed.reset_index().day >= '2015-01-01')]
#     ])
#     to_lag = data[[c for c in data.columns if not c in ['{}_{}'.format(s, agg) for agg in agg_types]]]
#     features = create_lagged_features(to_lag, lags)\
#         .join(extra_features[extra_features.station == s].set_index('date'))\
#         .join(rolling_mean_features[rolling_mean_features.station == s]
#               .set_index('date').drop(['station', 'conc_obs', 'weekend', 'week_day', 'month'], axis=1))\
#         .join(obs_and_mods[obs_and_mods.station == s][['Concentration', 'day']].groupby('day').max())\
#         .fillna(method='ffill').fillna(0)
#     X = features[[c for c in features.columns if not c in [
#         'time', 'datetime', 'Concentration', 'target', 'station'
#     ]]].values
#     y = (features['Concentration'].values > 100).astype(int)
#     y_pred_all = copy.deepcopy(y).astype(float)
#     metrics_all = []
    
#     kf = KFold(n_splits=2)
#     for train_index, test_index in kf.split(X):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]
#         p['probability'] = True
#         model = SVC(**p)
#         model.fit(X_train, y_train)
#         y_pred = model.predict_proba(X_test)[:, 1]
#         y_pred_all[test_index] = y_pred
#         metrics_all.append(log_loss(y_test, y_pred, labels=(0, 1)))
        
#     preds_by_station.append((s, y_pred_all))

predictions = pd.DataFrame(np.column_stack([e[1] for e in preds_by_station]), columns=test.station.unique())
predictions['date'] = features.drop('day', axis=1).reset_index()['day']
predictions.to_csv('../reports/pred_ssvm_Jonas_8_{}.csv'.format(len(glob.glob('../reports/pred_ssvm_Jonas_8_*'))))
print('CV Metric: {}'.format(np.mean(metrics_all)))
predictions.tail()

CV Metric: 0.6679334765440583


Unnamed: 0,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A,date
1090,0.07,0.09,0.11,0.11,0.23,0.38,0.41,2015-12-27
1091,0.07,0.09,0.11,0.11,0.23,0.38,0.41,2015-12-28
1092,0.07,0.09,0.11,0.11,0.23,0.38,0.41,2015-12-29
1093,0.07,0.09,0.11,0.11,0.23,0.38,0.41,2015-12-30
1094,0.07,0.09,0.11,0.11,0.23,0.38,0.41,2015-12-31
