In [1]:
import copy
import glob
import random
import regex as re
import numpy as np
import pandas as pd
import datetime as dt
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
# Render floats in DataFrame output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the rpy2 IPython extension.
%load_ext rpy2.ipython
# Execute the project's data scripts from this notebook.
# NOTE(review): create_lagged_features, called later in this notebook, is not
# defined in any visible cell -- presumably it comes from one of these scripts;
# confirm.
%run ../airquality/data/gen_daily_targets.py
%run ../airquality/data/prepare_data.py








In [2]:
# Project-relative inputs: prediction targets, station metadata and the
# rolling-mean feature table.
test = pd.read_csv('../data/targets.csv')
stations = pd.read_csv('../data/stations.csv')
rolling_mean_features = pd.read_csv('../data/data_roll_day_conc_obs.csv')

# Extra per-station features.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where this exact file exists.
extra_features = pd.read_csv(
    '/Users/jonaspaulwestermann/Downloads/dataset_v1.csv',
    index_col=0,
)
# Drop the 'holiday' and 'target' columns, then every column whose name
# contains 'dist'.
extra_features = extra_features.drop(columns=['holiday', 'target'])
extra_features = extra_features.loc[
    :, [name for name in extra_features.columns if 'dist' not in name]
]

In [3]:
# Load station observations and model output; both files carry their row
# index in the first CSV column.
observations = pd.read_csv('../data/observations.csv', index_col=0)
models = pd.read_csv('../data/models.csv', index_col=0)

# Left-join observations onto the model rows (every model row is kept),
# ordered chronologically. Overlapping non-key columns coming from `models`
# get the '_mod' suffix; those from `observations` keep their name.
obs_and_mods = (
    models
    .merge(
        observations,
        how='left',
        on=['station', 'day', 'time', 'datetime' ,'year'],
        suffixes=('_mod', ''),
    )
    .sort_values('datetime')
)

# Persist the merged table for reuse outside this notebook.
obs_and_mods.to_csv('../data/obs_and_mod.csv')

obs_and_mods_cols = ['pred_0_days', 'pred_1_days', 'Concentration', 'target', 'day']
obs_and_mods.head()

Unnamed: 0,pred_0_days,pred_1_days,day,lon,lat,year,station,datetime,time,Concentration,target
0,38.79,34.11,2013-01-01,2.15,41.39,2013,ES1438A,2013-01-01 00:00:00,00:00:00,,
1,28.53,27.48,2013-01-01,2.13,41.38,2013,ES1396A,2013-01-01 00:00:00,00:00:00,,
2,35.85,42.57,2013-01-01,2.2,41.4,2013,ES0691A,2013-01-01 00:00:00,00:00:00,,
3,31.81,31.59,2013-01-01,2.15,41.4,2013,ES1480A,2013-01-01 00:00:00,00:00:00,,
4,31.81,31.59,2013-01-01,2.15,41.43,2013,ES1856A,2013-01-01 00:00:00,00:00:00,,


In [4]:
# Statistics of 'Concentration' computed per (day, station).
agg_types = ['mean', 'max', 'std']

# Tall layout: one row per (day, station), one column per statistic.
tall_series = (
    obs_and_mods
    .groupby(['day', 'station'])
    .agg({'Concentration': agg_types})
    ['Concentration']
    .reset_index()
)

# Wide layout: one row per day and one '<station>_<stat>' column per
# station/statistic pair, with the statistics laid out side by side.
aggs = [
    tall_series
    .pivot(index='day', columns='station', values=agg)
    .rename(columns=lambda station, agg=agg: station + '_' + agg)
    for agg in agg_types
]
wide_series = pd.concat(aggs, axis='columns')
wide_series.head()

station,ES0691A_mean,ES1396A_mean,ES1438A_mean,ES1480A_mean,ES1679A_mean,ES1856A_mean,ES1992A_mean,ES0691A_max,ES1396A_max,ES1438A_max,...,ES1679A_max,ES1856A_max,ES1992A_max,ES0691A_std,ES1396A_std,ES1438A_std,ES1480A_std,ES1679A_std,ES1856A_std,ES1992A_std
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,42.39,38.61,51.39,43.87,37.13,14.78,23.35,72.0,98.0,112.0,...,69.0,39.0,68.0,18.13,26.67,25.73,24.63,20.29,10.99,21.15
2013-01-02,52.5,47.58,63.71,54.5,48.75,32.08,43.29,87.0,86.0,119.0,...,76.0,77.0,85.0,16.61,24.72,26.48,19.18,16.91,25.43,26.53
2013-01-03,64.62,54.62,74.04,64.17,49.25,38.88,49.04,119.0,85.0,117.0,...,78.0,94.0,110.0,22.69,19.05,22.87,21.7,14.87,22.17,31.43
2013-01-04,54.12,44.3,44.5,89.13,61.92,22.18,39.04,80.0,113.0,123.0,...,108.0,60.0,99.0,16.58,28.35,32.46,45.41,25.83,16.98,28.2
2013-01-05,36.88,39.17,,89.0,64.0,12.25,33.54,60.0,114.0,,...,114.0,16.0,85.0,11.0,22.21,,21.63,17.04,2.56,17.75


## Cross-Validation KNN

In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import log_loss, make_scorer


def _aligned_proba(model, X_eval, labels=(0, 1)):
    """Return class probabilities with exactly one column per entry of `labels`.

    `predict_proba` only emits columns for the classes listed in
    `model.classes_`. If a training fold contained a single class, the result
    has one column, which `log_loss` would interpret as the probability of the
    positive class. Classes the model never saw get probability 0 here, so
    column j always refers to labels[j].
    """
    raw = model.predict_proba(X_eval)
    aligned = np.zeros((raw.shape[0], len(labels)))
    position = {label: j for j, label in enumerate(labels)}
    for col, cls in enumerate(model.classes_):
        aligned[:, position[int(cls)]] = raw[:, col]
    return aligned


# Per-station grid search over n_neighbors for a KNN classifier predicting
# whether the daily maximum 'Concentration' exceeds 100, scored by mean
# log-loss over 3 unshuffled folds. One CSV report is written per station.
lags = 100
n_splits = 3
neighbor_grid = list(range(1, 25))

# Rows dated before 2015 only; this does not depend on the station, so it is
# computed once. The index holds the 'day' strings, compared lexically.
data = wide_series[wide_series.index < '2015-01-01']
kf = KFold(n_splits=n_splits)

for s in test.station.unique():
    # Lag every column except the ones that belong to the station being predicted.
    own_columns = {'{}_{}'.format(s, agg) for agg in agg_types}
    to_lag = data[[c for c in data.columns if c not in own_columns]]

    # NOTE(review): gaps are back-filled before they are forward-filled, i.e.
    # missing values are taken from later rows first. Order kept as in the
    # original; confirm this is intended.
    features = create_lagged_features(to_lag, lags)\
        .join(extra_features[extra_features.station == s].set_index('date'))\
        .join(rolling_mean_features[rolling_mean_features.station == s]
              .set_index('date').drop(['station', 'max_conc_obs'], axis=1))\
        .join(obs_and_mods[obs_and_mods.station == s][['Concentration', 'day']].groupby('day').max())\
        .bfill().ffill()

    excluded = ['time', 'datetime', 'Concentration', 'target', 'station']
    X = features[[c for c in features.columns if c not in excluded]].values
    # Binary target: 1 when the station's daily maximum exceeds 100.
    y = (features['Concentration'].values > 100).astype(int)

    params = pd.DataFrame({
        'n_neighbors': neighbor_grid,
        'score': [np.nan] * len(neighbor_grid)
    })
    for i, n_neighbors in enumerate(neighbor_grid):
        metric = []
        for train_index, test_index in kf.split(X):
            model = KNeighborsClassifier(n_neighbors=n_neighbors)
            model.fit(X[train_index], y[train_index])
            # Align columns to (0, 1) so folds whose training part holds a
            # single class are still scored against the right class.
            y_pred = _aligned_proba(model, X[test_index], labels=(0, 1))
            metric.append(
                log_loss(y[test_index], y_pred, labels=(0, 1))
            )

        params.loc[i, 'score'] = np.mean(metric)
        print(params.iloc[i])

    # The trailing number is the count of existing reports for this station,
    # so earlier runs are not overwritten.
    params.to_csv('../reports/cv_knn_Jonas_7_{}_{}.csv'
                  .format(s, len(glob.glob('../reports/cv_knn_Jonas_7_{}_*'.format(s)))))

n_neighbors   1.00
score         5.01
Name: 0, dtype: float64
n_neighbors   2.00
score         2.65
Name: 1, dtype: float64
n_neighbors   3.00
score         1.93
Name: 2, dtype: float64
n_neighbors   4.00
score         1.61
Name: 3, dtype: float64
n_neighbors   5.00
score         1.43
Name: 4, dtype: float64
n_neighbors   6.00
score         1.30
Name: 5, dtype: float64
n_neighbors   7.00
score         1.12
Name: 6, dtype: float64
n_neighbors   8.00
score         0.99
Name: 7, dtype: float64
n_neighbors   9.00
score         0.94
Name: 8, dtype: float64
n_neighbors   10.00
score          0.95
Name: 9, dtype: float64
n_neighbors   11.00
score          0.77
Name: 10, dtype: float64
n_neighbors   12.00
score          0.64
Name: 11, dtype: float64
n_neighbors   13.00
score          0.56
Name: 12, dtype: float64
n_neighbors   14.00
score          0.47
Name: 13, dtype: float64
n_neighbors   15.00
score          0.47
Name: 14, dtype: float64
n_neighbors   16.00
score          0.47
Name: 15, dty

n_neighbors   10.00
score          0.77
Name: 9, dtype: float64
n_neighbors   11.00
score          0.64
Name: 10, dtype: float64
n_neighbors   12.00
score          0.55
Name: 11, dtype: float64
n_neighbors   13.00
score          0.56
Name: 12, dtype: float64
n_neighbors   14.00
score          0.56
Name: 13, dtype: float64
n_neighbors   15.00
score          0.55
Name: 14, dtype: float64
n_neighbors   16.00
score          0.51
Name: 15, dtype: float64
n_neighbors   17.00
score          0.47
Name: 16, dtype: float64
n_neighbors   18.00
score          0.47
Name: 17, dtype: float64
n_neighbors   19.00
score          0.38
Name: 18, dtype: float64
n_neighbors   20.00
score          0.30
Name: 19, dtype: float64
n_neighbors   21.00
score          0.30
Name: 20, dtype: float64
n_neighbors   22.00
score          0.30
Name: 21, dtype: float64
n_neighbors   23.00
score          0.30
Name: 22, dtype: float64
n_neighbors   24.00
score          0.30
Name: 23, dtype: float64
n_neighbors    1.00
score 