In [204]:
import copy
import glob
import random
import regex as re
import numpy as np
import pandas as pd
import datetime as dt
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline
%load_ext rpy2.ipython
%run ../airquality/data/gen_daily_targets.py
%run ../airquality/data/prepare_data.py

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [205]:
test = pd.read_csv('../data/targets.csv')
stations = pd.read_csv('../data/stations.csv')

In [206]:
test = pd.read_csv('../data/targets.csv')
stations = pd.read_csv('../data/stations.csv')
extra_features = pd.read_csv('../data/dataset_v1.csv', index_col=0).drop([
    'holiday', 'target', 
], axis=1)
rolling_mean_features = pd.read_csv('../data/data_roll_day_dist.csv')
extra_features = extra_features[[c for c in extra_features.columns if not 'dist' in c]]

In [207]:
observations = pd.read_csv('../data/observations.csv', index_col=0)
models = pd.read_csv('../data/models.csv', index_col=0)
obs_and_mods = pd.merge(models, observations, how='left',
                    on=['station', 'day', 'time', 'datetime' ,'year'],
                    suffixes=('_mod', '')).sort_values('datetime', ascending=True)
obs_and_mods.to_csv('../data/obs_and_mod.csv')
obs_and_mods_cols = ['pred_0_days', 'pred_1_days', 'Concentration', 'target', 'day']
obs_and_mods.head()

Unnamed: 0,pred_0_days,pred_1_days,day,lon,lat,year,station,datetime,time,Concentration,target
0,38.79,34.11,2013-01-01,2.15,41.39,2013,ES1438A,2013-01-01 00:00:00,00:00:00,,
1,28.53,27.48,2013-01-01,2.13,41.38,2013,ES1396A,2013-01-01 00:00:00,00:00:00,,
2,35.85,42.57,2013-01-01,2.2,41.4,2013,ES0691A,2013-01-01 00:00:00,00:00:00,,
3,31.81,31.59,2013-01-01,2.15,41.4,2013,ES1480A,2013-01-01 00:00:00,00:00:00,,
4,31.81,31.59,2013-01-01,2.15,41.43,2013,ES1856A,2013-01-01 00:00:00,00:00:00,,


In [208]:
pred_svm = pd.read_csv('../reports/pred_ssvm_Jonas_8_1.csv', index_col=0).set_index('date')
pred_svm.head()

Unnamed: 0_level_0,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,0.16,0.16,0.1,0.14,0.17,0.34,0.31
2013-01-02,0.16,0.16,0.1,0.14,0.17,0.34,0.31
2013-01-03,0.16,0.16,0.1,0.14,0.17,0.34,0.31
2013-01-04,0.16,0.16,0.1,0.14,0.17,0.34,0.31
2013-01-05,0.16,0.16,0.1,0.14,0.17,0.34,0.31


In [209]:
pred_knn = pd.read_csv('../reports/pred_knn_Jonas_7_3.csv', index_col=0).set_index('date')
pred_knn.head()

Unnamed: 0_level_0,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,0.3,0.14,0.09,0.17,0.07,0.58,0.68
2013-01-02,0.3,0.19,0.09,0.22,0.13,0.62,0.64
2013-01-03,0.3,0.19,0.17,0.22,0.13,0.71,0.68
2013-01-04,0.45,0.14,0.22,0.22,0.13,0.83,0.68
2013-01-05,0.55,0.14,0.17,0.22,0.13,0.88,0.64


In [210]:
pred_xgb = pd.read_csv('../reports/pred_xgb_Jonas_6_0.csv', index_col=0).set_index('date')
pred_xgb.head()

Unnamed: 0_level_0,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-02,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2013-01-03,1.0,0.0,0.0,1.0,0.0,1.0,1.0
2013-01-04,0.0,0.0,1.0,0.0,1.0,1.0,1.0
2013-01-05,0.0,0.0,1.0,0.0,1.0,1.0,1.0


In [211]:
features = pred_xgb\
    .join(pred_knn, lsuffix='_xgb', rsuffix='_knn')\
    .join(pred_svm, rsuffix='_svm')
    
features = features.reset_index().melt(id_vars='date')
features['station'] = features['variable'].apply(lambda v: v[:7])
features = features.pivot(index='date', columns='variable', values='value')
tall_features = []
for s in test.station.unique():
    cols = features[[c for c in features.columns if s in c]]
    cols.columns = cols.columns.map(lambda s: s[s.find('_')+1:])
    tall_features.append(cols)
    
tall_features = pd.concat(tall_features, axis=0).fillna(0)
for s in test.station.unique():
    tall_features[s] = (tall_features[s] > 0).astype(int)
    
tall_features.head()

Unnamed: 0_level_0,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A,knn,xgb
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-01-01,1,0,0,0,0,0,0,0.3,0.0
2013-01-02,1,0,0,0,0,0,0,0.3,0.0
2013-01-03,1,0,0,0,0,0,0,0.3,1.0
2013-01-04,1,0,0,0,0,0,0,0.45,0.0
2013-01-05,1,0,0,0,0,0,0,0.55,0.0


In [212]:
stations = []
for i, r in tall_features.iterrows():
    for s in test.station.unique():
        if r[s] > 0:
            stations.append(s)
            break
            
tall_features['station'] =stations
tall_features.head()

Unnamed: 0_level_0,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A,knn,xgb,station
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2013-01-01,1,0,0,0,0,0,0,0.3,0.0,ES0691A
2013-01-02,1,0,0,0,0,0,0,0.3,0.0,ES0691A
2013-01-03,1,0,0,0,0,0,0,0.3,1.0,ES0691A
2013-01-04,1,0,0,0,0,0,0,0.45,0.0,ES0691A
2013-01-05,1,0,0,0,0,0,0,0.55,0.0,ES0691A


In [213]:
master = pd.merge(target, tall_features.reset_index().rename(columns={'date': 'day'}), on=['day', 'station'])\
    .set_index(['day', 'station'])
master.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target,ES0691A,ES1396A,ES1438A,ES1480A,ES1679A,ES1856A,ES1992A,knn,xgb
day,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-01,ES0691A,0.0,1,0,0,0,0,0,0,0.3,0.0
2013-01-01,ES1396A,0.0,0,1,0,0,0,0,0,0.14,0.0
2013-01-01,ES1438A,1.0,0,0,1,0,0,0,0,0.09,0.0
2013-01-01,ES1480A,0.0,0,0,0,1,0,0,0,0.17,0.0
2013-01-01,ES1679A,0.0,0,0,0,0,1,0,0,0.07,0.0


In [214]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
metric = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
    metric.append(
        log_loss(y_test, y_pred, labels=(0, 1))
    ) 
    
print(np.mean(metric))

KeyError: '[2555 2556 2557 ..., 7662 7663 7664] not in index'

In [234]:
X = master.drop('target', axis=1)
y = master[['target']]

X_train = X.loc[list(~X.reset_index()['day'].isin(test['date'].values))]
y_train = y.loc[list(~X.reset_index()['day'].isin(test['date'].values))].fillna(0)
X_test = X.loc[list(X.reset_index()['day'].isin(test['date'].values))]
y_test = y.loc[list(X.reset_index()['day'].isin(test['date'].values))].fillna(0)

model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
y_pred

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([[ 0.02150446,  0.97849554],
       [ 0.92710483,  0.07289516],
       [ 0.23170882,  0.76829118],
       ..., 
       [ 0.60271442,  0.39728561],
       [ 0.70929956,  0.29070044],
       [ 0.64739084,  0.35260913]], dtype=float32)

In [255]:
test.head()

Unnamed: 0,date,station,target
0,2015-01-03,ES0691A,0
1,2015-01-03,ES1396A,0
2,2015-01-03,ES1438A,0
3,2015-01-03,ES1480A,0
4,2015-01-03,ES1679A,0


In [253]:
y_test['pred'] = y_pred[:, 1]
submission = pd.merge(test, y_test[['pred']].reset_index().rename(columns={'day': 'date'}), on=['date', 'station'])\
    .drop('target', axis=1)
                      
submission[['pred']].to_csv('../reports/submission1.csv', header=True, index=False)
submission.head()

In [254]:
%cat ../reports/submission1.csv

pred
0.9784955382347107
0.07289516180753708
0.768291175365448
0.2723965048789978
0.03478242829442024
0.21929146349430084
0.6432662010192871
0.007857130840420723
0.05543694645166397
0.4935362637042999
0.8628844618797302
0.5083826184272766
0.4356672763824463
0.677058756351471
0.0076334127224981785
0.35420575737953186
0.39222782850265503
0.8895682096481323
0.5083826184272766
0.29070043563842773
0.17258630692958832
0.007857130840420723
0.07289516180753708
0.39222782850265503
0.8040553331375122
0.0785490944981575
0.31986910104751587
0.6087238788604736
0.007857130840420723
0.05543694645166397
0.32117849588394165
0.8895682096481323
0.0785490944981575
0.29070043563842773
0.6432662010192871
0.007104177493602037
0.05543694645166397
0.32117849588394165
0.2723965048789978
0.07414732873439789
0.013499737717211246
0.0786505863070488
0.003322285832837224
0.012142645195126534
0.32117849588394165
0.20981541275978088
0.03478242829442024
0.02264479175209999