In [1]:
import os.path as op
import warnings

import numpy as np
import pandas as pd
from scipy.signal import correlate, hilbert
import scipy.ndimage as nd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import get_scorer, make_scorer, r2_score, mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor

from utilities import reduce_df_mem_usage, print_score

from matplotlib import pyplot as plt
import seaborn as sns

import gc

In [2]:
# Notebook-wide configuration: garbage collection, plotting style, RNG seed,
# warning filter and data location.
# Earlier styling attempts, left disabled:
#sns.set("dark_background")
#sns.set(context="talk")
gc.enable()

# Apply the seaborn theme first, then the matplotlib dark style sheet.
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
%matplotlib inline

# One seeded RandomState instance; make_mods passes it to every BaggingRegressor.
rand_seed = 1234
rand_state = np.random.RandomState(rand_seed)

# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Directory that the train_XX / test_XX / submission CSV files are read from and written to.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere
# without editing this line.
PATH = '/home/dhren/Documents/000_flatiron/002_projects/006_1028_kaggle_ASHRAE/data'

# Not referenced in the visible cells; presumably a default figure size — TODO confirm.
FIGSIZE = (28, 20)

In [3]:
def apply_mega_est(mega_est, df, cols, stat = 'mean'):
    """Combine the predictions of several fitted estimators into one vector.

    Parameters
    ----------
    mega_est : sequence of estimators
        Each element must expose ``predict``.
    df : DataFrame
        Rows to predict on.
    cols : list of str
        Columns of ``df`` handed to every estimator.
    stat : str
        ``'median'`` takes the element-wise median across estimators;
        any other value takes the element-wise mean.

    Returns
    -------
    ndarray of float32 with one entry per row of ``df``.
    """
    # One row per estimator, one column per sample; float32 storage.
    stacked = np.zeros((len(mega_est), len(df)), dtype=np.float32)

    for row, model in zip(stacked, mega_est):
        # `row` is a view into `stacked`, so this fills the matrix in place.
        row[:] = model.predict(df[cols])

    reducer = np.median if stat == 'median' else np.mean
    return reducer(stacked, axis=0)

In [4]:
def med_nojunk(x):
    """Return the median of ``x`` after dropping missing and exactly-zero entries."""
    usable = pd.notna(x) & (x != 0)
    return np.median(x[usable])

def junk(x):
    """Return a boolean mask that is True where ``x`` is missing or exactly zero."""
    is_missing = pd.isna(x)
    is_zero = x == 0
    return is_missing | is_zero


def make_mods(x):
    """Fit the model ensemble for one (building_id, meter) group and predict on it.

    Intended to be used as the function of ``groupby([...]).apply``.

    A pipeline of RobustScaler -> bagged LinearRegression is cross-validated
    with 5 folds on the three median features. The target wrapper clips
    predictions to the [min, max] range of ``meter_reading`` observed in this
    group. The fitted estimator of every fold is stored in the module-level
    ``mods`` dict under the key ``(building_id, meter)``.

    Parameters
    ----------
    x : DataFrame
        Rows of a single group. Must contain ``building_id``, ``meter``,
        ``meter_reading``, ``pred_week``, ``pred_dayofweek`` and
        ``pred_hourofday``.

    Returns
    -------
    DataFrame
        A copy of ``x`` with an added ``Prediction`` column holding the
        element-wise median of the fold estimators' predictions. The input
        frame is left untouched, because mutating the group passed in by
        ``groupby.apply`` is not supported by pandas.
    """
    features = ["pred_week", "pred_dayofweek", "pred_hourofday"]

    bid = x.building_id.unique()[0]
    mid = x.meter.unique()[0]

    # Clipping bounds for the predictions of this group.
    m_min, m_max = x.meter_reading.min(), x.meter_reading.max()

    encode = RobustScaler()
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to
    # `estimator` — confirm against the installed version before upgrading.
    lr = BaggingRegressor(base_estimator = LinearRegression(), n_estimators = 10, random_state = rand_state)
    # No forward transform of the target; only the inverse step clips the output.
    clipped_lr = TransformedTargetRegressor(regressor=lr, inverse_func = lambda y : np.clip(y, m_min, m_max), check_inverse=True)
    pipe = make_pipeline(encode, clipped_lr)

    scores = cross_validate(pipe, x[features], x["meter_reading"], cv = 5, n_jobs = -1, scoring = {'log_mse' : log_mse, 'r2' : get_scorer('r2'), 'mae' : mae}, return_estimator = True, return_train_score = True)

    # Keep only the fitted per-fold estimators; the score arrays are discarded.
    mods[(bid, mid)] = scores.pop('estimator')

    prediction = apply_mega_est(mods[(bid, mid)], x, features, stat = 'median')

    # assign() returns a new frame instead of writing into the group object.
    return x.assign(Prediction = prediction)

def apply_mods(x):
    """Predict for one (building_id, meter) group using its stored estimators.

    Gaps in the group are filled first (nearest-value interpolation, then
    backward and forward fill). The ``Prediction`` column is the element-wise
    median over the estimators kept in ``mods`` for this group.

    Raises
    ------
    RuntimeError
        If the group has no rows (the frame is displayed before raising).
    Exception
        Any error from the gap filling is re-raised after printing the
        offending building and meter ids.
    """
    if not len(x):
        display(x)
        raise RuntimeError("Dataframe is empty.")

    key = (x.building_id.unique()[0], x.meter.unique()[0])

    try:
        filled = x.interpolate(method = 'nearest')
        filled = filled.bfill().ffill()
    except Exception as e:
        # Identify the failing group before propagating the error.
        print("=" * 25, key[0], key[1])
        raise e

    feature_cols = ["pred_week", "pred_dayofweek", "pred_hourofday"]
    filled['Prediction'] = apply_mega_est(mods[key], filled, feature_cols, stat = 'median')

    return filled

# Scorer objects used in the `scoring` dict that make_mods passes to cross_validate.
# NOTE(review): both wrap error metrics but make_scorer is called with its
# defaults, so the reported values are presumably raw (un-negated) errors —
# confirm this is intended before using them for model selection.
log_mse = make_scorer(mean_squared_log_error)
mae = make_scorer(mean_absolute_error)

In [6]:
# Train per-(building_id, meter) models on a data chunk and write that chunk's
# test predictions to a submission CSV.
# The full run would be `for i in range(10):`; only chunk 0 is processed here.
for i in [0]:
    
    # Progress banner: "<chunk number>/10".
    print("{:s} {:2d}/{:2d} {:s}".format("="*15, i+1, 10, "="*15))

    # --- Training chunk: load and derive calendar features ---
    df_tmp = pd.read_csv(op.join(PATH, 'train_%02d.csv'%(i)))

    df_tmp['timestamp'] = pd.to_datetime(df_tmp['timestamp'])
    # NOTE(review): `Series.dt.week` is deprecated in newer pandas in favour of
    # `dt.isocalendar().week` — confirm against the installed version.
    df_tmp['week'] = pd.Series(df_tmp.timestamp).dt.week.values
    df_tmp['dayofweek'] = pd.Series(df_tmp.timestamp).dt.dayofweek.values
    df_tmp['hourofday'] = pd.Series(df_tmp.timestamp).dt.hour.values

    # reduce_df_mem_usage comes from the local `utilities` module; presumably it
    # downcasts column dtypes and returns the frame — TODO confirm.
    df_train = reduce_df_mem_usage(df_tmp)
    del df_tmp; gc.collect()

    group = ['building_id', 'meter']
    target = 'meter_reading'

    # Replace missing / zero readings with the median of the remaining readings
    # of the same (building_id, meter) group.
    grp = df_train.groupby(group)[[target]]
    idx = grp.transform(junk)
    df_train.loc[idx.values.ravel(), target] = grp.transform(med_nojunk).loc[idx.values.ravel()]

    del idx
    gc.collect()

    # Median reading per group for each week, day of week and hour of day.
    # These three lookups become the model features (pred_*).
    preds_week = reduce_df_mem_usage(df_train.groupby(['building_id', 'meter', 'week'])[[target]].median().rename({'meter_reading': 'pred_week'},axis=1))
    preds_dayofweek = reduce_df_mem_usage(df_train.groupby(['building_id', 'meter', 'dayofweek'])[[target]].median().rename({'meter_reading': 'pred_dayofweek'},axis=1))
    preds_hourofday = reduce_df_mem_usage(df_train.groupby(['building_id', 'meter', 'hourofday'])[[target]].median().rename({'meter_reading': 'pred_hourofday'},axis=1))

    # Left-join the three median features onto every training row.
    df_tmp = df_train.merge(preds_week, on=['building_id', 'meter', 'week'], how='left')
    df_tmp = df_tmp.merge(preds_dayofweek, on=['building_id', 'meter', 'dayofweek'], how='left')
    df_tmp = df_tmp.merge(preds_hourofday, on=['building_id', 'meter', 'hourofday'], how='left')

    df_train = reduce_df_mem_usage(df_tmp)

    del df_tmp
    gc.collect()

    # Registry filled by make_mods as a side effect, keyed by (building_id, meter).
    # Reset for every chunk.
    mods = {}

    df_train = df_train.groupby(['building_id', 'meter']).apply(make_mods)

    # First line: error of the weekly-median feature used directly as prediction.
    # Second line: error of the fitted models, evaluated on the training rows.
    print("Mean Sqrd Log Error", mean_squared_log_error(df_train.meter_reading, df_train.pred_week))
    print("Mean Sqrd Log Error", mean_squared_log_error(df_train.meter_reading, df_train.Prediction))

    del df_train
    gc.collect()

    # --- Test chunk: same calendar features as for training ---
    df_tmp = pd.read_csv(op.join(PATH, 'test_%02d.csv'%(i)))

    df_tmp['timestamp'] = pd.to_datetime(df_tmp['timestamp'])
    df_tmp['week'] = pd.Series(df_tmp.timestamp).dt.week.values
    df_tmp['dayofweek'] = pd.Series(df_tmp.timestamp).dt.dayofweek.values
    df_tmp['hourofday'] = pd.Series(df_tmp.timestamp).dt.hour.values

    df_test = reduce_df_mem_usage(df_tmp)

    del df_tmp
    gc.collect()

    # Features for the test rows are the medians computed from the training chunk.
    # Left joins leave NaN where a key combination was not seen in training.
    df_tmp = df_test.merge(preds_week, on=['building_id', 'meter', 'week'], how='left')
    df_tmp = df_tmp.merge(preds_dayofweek, on=['building_id', 'meter', 'dayofweek'], how='left')
    df_tmp = df_tmp.merge(preds_hourofday, on=['building_id', 'meter', 'hourofday'], how='left')

    df_test = reduce_df_mem_usage(df_tmp)

    del df_tmp
    gc.collect()

    # apply_mods fills gaps per group and predicts with the estimators stored in `mods`.
    df_test = df_test.groupby(['building_id', 'meter']).apply(apply_mods)
    
    # One submission file per chunk; `row_id` is presumably a column of the test CSV.
    df_test[['row_id', 'Prediction']].to_csv(op.join(PATH, 'submission_1029_periodic_linReg_%02d.csv'%(i)), index = False)

Mean Sqrd Log Error 0.13190043
Mean Sqrd Log Error 0.11123194


In [None]:
# Debugging subset: the test rows of building 0 / meter 0.
# .copy() makes `aha` an independent frame, so the in-place edit made while
# inspecting it (`aha.loc[0, 'pred_week'] = np.nan`) does not assign into a
# slice of df_test and does not trigger SettingWithCopyWarning.
aha = df_test[(df_test.building_id == 0) & (df_test.meter == 0)].copy()

In [None]:
# Scratch check: largest median-ensemble prediction for the (0, 0) subset.
apply_mega_est(mods[(0, 0)], aha, ["pred_week", "pred_dayofweek", "pred_hourofday"], stat = 'median').max()

In [None]:
# Scratch checks of the gap-filling steps used in apply_mods.
# The result of this call is not assigned and is not the cell's last expression.
aha.interpolate(method = 'nearest')#.bfill().ffill()

# Inject a missing value at index label 0 to see how the fill behaves.
aha.loc[0, 'pred_week'] = np.nan

# Last expression of the cell: bfill() returns a new frame, `aha` is not modified.
aha.bfill()

In [None]:
# Scratch: show the current value of the chunk index `i`.
i

In [None]:
# Scratch: re-read the raw test CSV for chunk `i` (no derived columns) into df_tmp.
df_tmp = pd.read_csv(op.join(PATH, 'test_%02d.csv'%(i)))

In [None]:
# Scratch: builds the grouped object only; nothing is aggregated or applied here.
df_test.groupby(['building_id', 'meter'])

In [None]:
%%time 

# Stitch the ten per-chunk submission files (00..09) into the final submission CSV.
# All ten files must already exist in PATH.
li = []

for i in range(10):
    li.append(pd.read_csv(op.join(PATH, 'submission_1029_periodic_linReg_%02d.csv'%(i))))

# `index = False` belongs to to_csv (not op.join), so no index column is written.
pd.concat(li, axis=0, ignore_index=True).to_csv(op.join(PATH, 'submission_1029_periodic_linReg.csv'), index = False)