In [1]:
import os.path as op
import warnings

import numpy as np
import pandas as pd
from scipy.signal import correlate, hilbert
import scipy.ndimage as nd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import get_scorer, make_scorer, r2_score, mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor

from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import FileLink

import gc

In [2]:
#sns.set("dark_background")
#sns.set(context="talk")
gc.enable()

sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
%matplotlib inline

rand_seed = 1234
rand_state = np.random.RandomState(rand_seed)

warnings.simplefilter(action='ignore', category=FutureWarning)

PATH = '/kaggle/input/ashrae-energy-prediction'

FIGSIZE = (28, 20)

In [3]:
def apply_mega_est(mega_est, df, cols, stat = 'mean'):
    """stat = mean, median"""
    y_pred = np.zeros((len(mega_est), len(df)), dtype = 'f4')

    for i, est in enumerate(mega_est):
        y_pred[i] = est.predict(df[cols])
        
    if stat == 'median':
        y_pred = np.median(y_pred, axis = 0)
    else:
        y_pred = np.mean(y_pred, axis = 0)

    return y_pred

def partition_idx(l, n):
    q,r = divmod(l, n) 
    return [i * q + min(i,r) for i in range(n+1)] 

def partition_lst(lst, n):
    idx = partition_idx(len(lst), n)
    return [lst[idx[i]: idx[i+1]] for i in range(n)]

In [4]:
def reduce_df_mem_usage(df, verbose=False):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def print_score(score_dict):
    for k in sorted(score_dict):
        print("{:25s} : {:0.4f}  +/- {:0.4f}".format(k, score_dict[k].mean(), score_dict[k].std()))

In [5]:
def med_nojunk(val):
    bol = pd.isna(val) | (val == 0)
    val = np.median(val[~bol])

    return val

def denoise(val):
    vl = val.size

    bol = pd.isna(val) | (val <= 0)
    val[bol] = np.median(val[~bol])
    
    val = np.pad(val, (0,vl % 2), mode = 'constant')
    val2 = val
    val_fft = np.fft.rfft(val)
    
    val_fft *= np.interp(np.fft.rfftfreq(val.size, 0.001), [0., 50., 70., 500.], [1. ,1., 0., 0.])
    val = np.fft.irfft(val_fft)
    val = val[:vl]

    bol = val <= 0
    val[bol] = np.median(val[~bol])

    return val

def junk(x):
    bol = pd.isna(x) | (x == 0)
    return bol

def make_mods(x):
    bid = x.building_id.unique()[0]
    mid = x.meter.unique()[0]
    m_min, m_max = x.meter_reading.min(), x.meter_reading.max()
        
    encode = RobustScaler()
    #lr = BaggingRegressor(base_estimator = LinearRegression(), n_estimators = 10, random_state = rand_state)
    lr = LinearRegression()
    clipped_lr = TransformedTargetRegressor(regressor=lr, func = np.log1p, inverse_func = lambda x : np.clip(np.expm1(x), m_min, m_max), check_inverse=False)
    pipe = make_pipeline(encode, clipped_lr)
    
    scores = cross_validate(pipe, x[["pred_week", "pred_dayofweek", "pred_hourofday"]], x["meter_reading"], cv = 3, n_jobs = -1, scoring = {'log_mse' : log_mse, 'r2' : get_scorer('r2'), 'mae' : mae}, return_estimator = True, return_train_score = True)

    mods[(bid, mid)] = scores.pop('estimator')
    x['Prediction'] = apply_mega_est(mods[(bid, mid)], x, ["pred_week", "pred_dayofweek", "pred_hourofday"], stat = 'median')
    
    return x

def apply_mods(x):
    bid = x.building_id.unique()[0]
    mid = x.meter.unique()[0]
    x = x.interpolate(method = 'nearest').bfill().ffill()
    x['Prediction'] = apply_mega_est(mods[(bid, mid)], x, ["pred_week", "pred_dayofweek", "pred_hourofday"], stat = 'median')

    return x

log_mse = make_scorer(mean_squared_log_error)
mae = make_scorer(mean_absolute_error)

In [6]:
num_iter = 10

In [7]:
%%time

df_tmp = pd.read_csv(op.join(PATH, 'train.csv'))

bids = df_tmp.building_id.sort_values().unique()

for i, batch in enumerate(partition_lst(bids, num_iter)):
    df_tmp[df_tmp.building_id.isin(batch)].to_csv('train_{:02d}.csv'.format(i), index = False)

CPU times: user 3min 22s, sys: 16.8 s, total: 3min 39s
Wall time: 3min 38s


In [8]:
%%time

df_tmp = pd.read_csv(op.join(PATH, 'test.csv'))

for i, batch in enumerate(partition_lst(bids, num_iter)):
    df_tmp[df_tmp.building_id.isin(batch)].to_csv('test_{:02d}.csv'.format(i), index = False)

CPU times: user 6min 33s, sys: 20.4 s, total: 6min 54s
Wall time: 6min 52s


In [9]:
%%time

for i in range(num_iter):
# for i in [0]:
    
    print("{:s} {:2d}/{:2d} {:s}".format("="*15, i+1, 10, "="*15))
    
    print("Loading training dataset")

    df_tmp = pd.read_csv('train_%02d.csv'%(i))

    df_tmp['timestamp'] = pd.to_datetime(df_tmp['timestamp'])
    df_tmp['week'] = pd.Series(df_tmp.timestamp).dt.week.values
    df_tmp['dayofweek'] = pd.Series(df_tmp.timestamp).dt.dayofweek.values
    df_tmp['hourofday'] = pd.Series(df_tmp.timestamp).dt.hour.values

    df_train = reduce_df_mem_usage(df_tmp)
    del df_tmp; gc.collect()

    group = ['building_id', 'meter']
    target = 'meter_reading'

    #idx = grp.transform(junk)
    #df_train.loc[idx.values.ravel(), target] = grp.transform(med_nojunk).loc[idx.values.ravel()]

    #del idx
    #gc.collect()

    print("Cleaning training dataset")
    
    grp = df_train.groupby(group)[[target]]
    df_train[target] = grp.transform(denoise)

    print("Creating period predictions")
    
    preds_week = reduce_df_mem_usage(df_train.groupby(['building_id', 'meter', 'week'])[[target]].median().rename({'meter_reading': 'pred_week'},axis=1))
    preds_dayofweek = reduce_df_mem_usage(df_train.groupby(['building_id', 'meter', 'dayofweek'])[[target]].median().rename({'meter_reading': 'pred_dayofweek'},axis=1))
    preds_hourofday = reduce_df_mem_usage(df_train.groupby(['building_id', 'meter', 'hourofday'])[[target]].median().rename({'meter_reading': 'pred_hourofday'},axis=1))

    df_tmp = df_train.merge(preds_week, on=['building_id', 'meter', 'week'], how='left')
    df_tmp = df_tmp.merge(preds_dayofweek, on=['building_id', 'meter', 'dayofweek'], how='left')
    df_tmp = df_tmp.merge(preds_hourofday, on=['building_id', 'meter', 'hourofday'], how='left')

    df_train = reduce_df_mem_usage(df_tmp)

    del df_tmp
    gc.collect()

    print("Computing local weighted sums")
    mods = {}

    df_train = df_train.groupby(['building_id', 'meter']).apply(make_mods)
    
    print("Mean Sqrd Log Error (weekly)", mean_squared_log_error(df_train.meter_reading, df_train.pred_week))
    print("Mean Sqrd Log Error (weighted)", mean_squared_log_error(df_train.meter_reading, df_train.Prediction))

    del df_train
    gc.collect()

    print("Loading test dataset")
    df_tmp = pd.read_csv('test_%02d.csv'%(i))

    df_tmp['timestamp'] = pd.to_datetime(df_tmp['timestamp'])
    df_tmp['week'] = pd.Series(df_tmp.timestamp).dt.week.values
    df_tmp['dayofweek'] = pd.Series(df_tmp.timestamp).dt.dayofweek.values
    df_tmp['hourofday'] = pd.Series(df_tmp.timestamp).dt.hour.values

    df_test = reduce_df_mem_usage(df_tmp)

    del df_tmp
    gc.collect()

    print("Creating period predictions")
    df_tmp = df_test.merge(preds_week, on=['building_id', 'meter', 'week'], how='left')
    df_tmp = df_tmp.merge(preds_dayofweek, on=['building_id', 'meter', 'dayofweek'], how='left')
    df_tmp = df_tmp.merge(preds_hourofday, on=['building_id', 'meter', 'hourofday'], how='left')

    df_test = reduce_df_mem_usage(df_tmp)

    del df_tmp
    gc.collect()

    print("Applying local weighted sums")
    df_test = df_test.groupby(['building_id', 'meter']).apply(apply_mods)
    
    print("Saving test predictions")
    df_test[['row_id', 'Prediction']].to_csv('submission_1029_periodic_hicut_linReg_%02d.csv'%(i), index = False)

Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.06445387
Mean Sqrd Log Error (weighted) 0.048947483
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.17045408
Mean Sqrd Log Error (weighted) 0.105825946
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.09707176
Mean Sqrd Log Error (weighted) 0.058195964
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.078217104
Mean Sqrd Log Error (weighted) 0.040918317
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.2331397
Mean Sqrd Log Error (weighted) 0.107988894
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.1966032
Mean Sqrd Log Error (weighted) 0.1659514
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.120877564
Mean Sqrd Log Error (weighted) 0.101763524
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.21889761
Mean Sqrd Log Error (weighted) 0.2154123
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.17891495
Mean Sqrd Log Error (weighted) 0.16951269
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
Loading training dataset
Cleaning training dataset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Creating period predictions
Computing local weighted sums
Mean Sqrd Log Error (weekly) 0.12656912
Mean Sqrd Log Error (weighted) 0.11205386
Loading test dataset
Creating period predictions
Applying local weighted sums
Saving test predictions
CPU times: user 1h 34min 38s, sys: 33.6 s, total: 1h 35min 12s
Wall time: 1h 36min 7s


In [10]:
%%time 

li = []

for i in range(num_iter):
    li.append(pd.read_csv('submission_1029_periodic_hicut_linReg_%02d.csv'%(i)))

CPU times: user 10.4 s, sys: 808 ms, total: 11.2 s
Wall time: 10.8 s


In [11]:
%%time 

df_final = pd.concat(li, axis=0, ignore_index=True).sort_values('row_id')
df_final.columns = ['row_id', 'meter_reading']

df_final.to_csv('submission_1029_periodic_hicut_linReg.csv', index = False)
FileLink(r'submission_1029_periodic_linReg.csv')

CPU times: user 3min 48s, sys: 5.4 s, total: 3min 53s
Wall time: 3min 52s
