In [27]:
import glob
import regex as re
import numpy as np
import pandas as pd
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline
%load_ext rpy2.ipython
%run ../airquality/data/gen_daily_targets.py
%run ../airquality/data/prepare_data.py

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [69]:
test = pd.read_csv('../data/targets.csv')
stations = pd.read_csv('../data/stations.csv')

In [184]:
observations = pd.read_csv('../data/observations.csv', index_col=0)
models = pd.read_csv('../data/models.csv', index_col=0)
features = pd.merge(models, observations, how='right',
                    on=['station', 'day', 'time', 'datetime' ,'year'],
                    suffixes=('_mod', '')).sort_values('datetime', ascending=True)
features.head()

Unnamed: 0,pred_0_days,pred_1_days,day,lon,lat,year,station,datetime,time,Concentration,target
0,23.01,28.7,2013-01-01,2.12,41.39,2013,ES1992A,2013-01-01 01:00:00,01:00:00,19,0
1,26.07,30.44,2013-01-01,2.2,41.4,2013,ES0691A,2013-01-01 01:00:00,01:00:00,55,0
2,30.5,36.76,2013-01-01,2.19,41.39,2013,ES1679A,2013-01-01 01:00:00,01:00:00,48,0
3,23.01,28.7,2013-01-01,2.13,41.38,2013,ES1396A,2013-01-01 01:00:00,01:00:00,39,0
4,24.72,32.98,2013-01-01,2.15,41.43,2013,ES1856A,2013-01-01 01:00:00,01:00:00,11,0


In [113]:
observations.shape

(159733, 7)

In [114]:
models.shape

(367906, 9)

In [115]:
features.shape

(726, 700)

In [175]:
by_station = observations\
    .drop_duplicates(['datetime', 'station'])\
    .pivot(index='datetime', columns='station', values='Concentration')
lagged_features = create_lagged_features(by_station, lags=100).fillna(0)
features = lagged_features[list(lagged_features.reset_index()['datetime'].apply(lambda d: '00:00:00' in d))]
features.head()

station,ES0691A_lag_1,ES1396A_lag_1,ES1438A_lag_1,ES1480A_lag_1,ES1679A_lag_1,ES1856A_lag_1,ES1992A_lag_1,ES0691A_lag_2,ES1396A_lag_2,ES1438A_lag_2,...,ES1679A_lag_99,ES1856A_lag_99,ES1992A_lag_99,ES0691A_lag_100,ES1396A_lag_100,ES1438A_lag_100,ES1480A_lag_100,ES1679A_lag_100,ES1856A_lag_100,ES1992A_lag_100
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02 00:00:00,59.0,84.0,100.0,89.0,69.0,31.0,68.0,63.0,98.0,112.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-03 00:00:00,59.0,80.0,83.0,61.0,56.0,56.0,65.0,65.0,81.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-04 00:00:00,88.0,58.0,58.0,40.0,44.0,36.0,34.0,85.0,60.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-05 00:00:00,65.0,113.0,0.0,144.0,80.0,31.0,97.0,68.0,112.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-06 00:00:00,27.0,27.0,0.0,100.0,73.0,11.0,47.0,27.0,28.0,0.0,...,68.0,31.0,61.0,71.0,68.0,69.0,73.0,62.0,15.0,46.0


In [176]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

kf = KFold(n_splits=3)
metric_by_station = []
for s in observations.station.unique():
    data = pd.merge(features, observations[observations.station == s],
                    left_index=True, right_on='datetime', how='left').dropna()
    data = data[data.year != 2015]
    print(data.shape)
    X = data[[c for c in data.columns if not c in ['datetime', 'day', 'time', 'station', 'year', 'target']]].values
    y = data[['target']].values.ravel()
    metric = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = xgb.XGBClassifier(n_estimators=100)
        model.fit(X_train, y_train)
        y_pred = [int(e) for e in model.predict(X_test)]
        score = log_loss(y_test, y_pred, labels=(0, 1))
        metric.append(score) 
        
    metric_by_station.append((s, np.mean(metric)))
    
metric_by_station

(721, 707)
(723, 707)
(720, 707)
(722, 707)
(720, 707)
(711, 707)
(724, 707)


[('ES1992A', 0.95841521635255333),
 ('ES1480A', 2.4841189097236742),
 ('ES1856A', 0.33579476994695229),
 ('ES1396A', 0.62162631789595568),
 ('ES0691A', 0.19188209108283807),
 ('ES1438A', 2.9632491669115795),
 ('ES1679A', 0.095345547127682118)]

In [177]:
preds_by_station = []
for s in observations.station.unique():
    data = pd.merge(features, observations[observations.station == s],
                    left_index=True, right_on='datetime', how='left').dropna()
    data_train = data[data.year != 2015]
    data_test = data.set_index('day').loc[test['date'].values[0]]
    raise Exception()
    X_train = data[[
        c for c in data.columns if not c in ['datetime', 'day', 'time', 'station', 'year', 'target']
    ]][data.year != 2015].values
    X_test = data[[
        c for c in data.columns if not c in ['datetime', 'day', 'time', 'station', 'year', 'target']
    ]].values
    y = data[['target']].values.ravel()
    model = xgb.XGBClassifier(n_estimators=100)
    model.fit(X, y)

KeyError: 'the label [2015-01-03] is not in the [index]'