In [1]:
import os.path as op
import warnings

import numpy as np
import pandas as pd
from scipy.signal import correlate, hilbert
import scipy.ndimage as nd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

from utilities import reduce_df_mem_usage

from matplotlib import pyplot as plt
import seaborn as sns

import gc

In [2]:
# Notebook-wide configuration: plotting theme, random seed, warning filter,
# data location and default figure size.
gc.enable()

# Seaborn theme first, then matplotlib's dark style on top.
# NOTE(review): keep this order -- presumably the later call overrides any
# overlapping settings made by the earlier one; confirm before reordering.
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
%matplotlib inline

# Fixed seed and a RandomState built from it (neither is referenced by the
# cells visible in this part of the notebook).
rand_seed = 1234
rand_state = np.random.RandomState(rand_seed)

# Silence every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Directory holding train.csv / test.csv; the submission file is written here too.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere until this is pointed at a local copy of the data.
PATH = '/home/dhren/Documents/000_flatiron/002_projects/006_1028_kaggle_ASHRAE/data'

# Figure size constant (not referenced by the cells visible here).
FIGSIZE = (28, 20)

In [3]:
def med_nojunk(x):
    """Return the median of the entries of ``x`` that are neither NaN nor 0.

    ``x`` is anything that supports element-wise comparison and boolean-mask
    indexing (e.g. a pandas Series or a NumPy array).
    """
    # Keep an entry only if it is present AND non-zero.
    usable = pd.notna(x) & (x != 0)
    return np.median(x[usable])

def junk(x):
    """Return an element-wise boolean mask: True where ``x`` is 0 or missing."""
    return (x == 0) | pd.isna(x)

In [4]:
# Load the training data and pass it through the project's memory-reduction helper.
df_tmp = pd.read_csv(op.join(PATH, 'train.csv'))
df_train = reduce_df_mem_usage(df_tmp)

# Drop the temporary name for the raw frame and collect garbage.
del df_tmp
gc.collect()

df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
# ISO week of the year. `Series.dt.week` was deprecated in pandas 1.1 and
# removed in 2.0; `dt.isocalendar().week` yields the same numbers. It returns
# UInt32, so cast back to int64 to keep the dtype the old accessor produced
# (the cast assumes no NaT timestamps).
df_train['week'] = df_train['timestamp'].dt.isocalendar().week.astype('int64')
display(df_train.head())

Mem. usage decreased to 289.19 Mb (53.1% reduction)


Unnamed: 0,building_id,meter,timestamp,meter_reading,week
0,0,0,2016-01-01,0.0,53
1,1,0,2016-01-01,0.0,53
2,2,0,2016-01-01,0.0,53
3,3,0,2016-01-01,0.0,53
4,4,0,2016-01-01,0.0,53


In [5]:
group = ['building_id', 'meter']
target = 'meter_reading'

# One group per (building_id, meter) pair, restricted to the target column.
grp = df_train.groupby(group)[[target]]

# Element-wise flags from junk(): True where a reading is NaN or 0.
idx = grp.transform(junk)
junk_rows = idx.values.ravel()  # flat boolean row mask, computed once

# Overwrite the flagged readings with the per-group value from med_nojunk(),
# i.e. the median of that group's non-missing, non-zero readings.
df_train.loc[junk_rows, target] = (
    grp.transform(med_nojunk)
       .loc[junk_rows]
)

In [6]:
pred = 'prediction'
group = ['building_id', 'meter', 'week']

# The forecast is the median reading of each (building_id, meter, week) group.
weekly_median = df_train.groupby(group)[[target]].median()
preds = weekly_median.rename(columns={'meter_reading': pred})

# Attach each row's group median next to its actual reading.
df_train = df_train.merge(preds, on=group, how='left')

# Note: this score is computed on the same rows the medians were fitted on.
msle = mean_squared_log_error(df_train.meter_reading, df_train.prediction)
print("Mean Sqrd Log Error", msle)

# The training frame is no longer needed once `preds` exists.
del idx, df_train
gc.collect()

0.281774


22

In [9]:
# Load the test data and pass it through the project's memory-reduction helper.
df_tmp = pd.read_csv(op.join(PATH, 'test.csv'))
df_test = reduce_df_mem_usage(df_tmp)

# Drop the temporary name for the raw frame and collect garbage.
del df_tmp
gc.collect()

df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])
# ISO week of the year. `Series.dt.week` was deprecated in pandas 1.1 and
# removed in 2.0; `dt.isocalendar().week` yields the same numbers. It returns
# UInt32, so cast back to int64 to keep the dtype the old accessor produced
# (the cast assumes no NaT timestamps).
df_test['week'] = df_test['timestamp'].dt.isocalendar().week.astype('int64')

# Look up the prediction for each test row by the keys in `group`; the left
# join keeps every test row, leaving NaN where `preds` has no matching key.
df_test = df_test.merge(preds, on=group, how='left')
display(df_test.head())

Mem. usage decreased to 596.49 Mb (53.1% reduction)


Unnamed: 0,row_id,building_id,meter,timestamp,week,prediction
0,0,0,0,2017-01-01,52,207.156998
1,1,1,0,2017-01-01,52,97.196548
2,2,2,0,2017-01-01,52,5.93825
3,3,3,0,2017-01-01,52,141.630997
4,4,4,0,2017-01-01,52,1161.715088


In [10]:
# df_test has no 'meter_reading' column -- the merged values live in
# 'prediction' -- so selecting 'meter_reading' directly raises KeyError.
# Select the prediction column and rename it to the header the original
# selection was aiming for.
# NOTE(review): rows whose merge keys were absent from `preds` carry NaN here
# and are written as empty fields; check whether that is acceptable.
submission = df_test[['row_id', 'prediction']].rename(columns={'prediction': 'meter_reading'})
submission.to_csv(op.join(PATH, 'submission_1028_weekly_median_forloop.csv'), index=False)

Unnamed: 0,row_id,building_id,meter,timestamp,week,prediction
41697595,41697595,1444,0,2018-05-09 07:00:00,19,6.075
41697596,41697596,1445,0,2018-05-09 07:00:00,19,4.8
41697597,41697597,1446,0,2018-05-09 07:00:00,19,9.475
41697598,41697598,1447,0,2018-05-09 07:00:00,19,195.862503
41697599,41697599,1448,0,2018-05-09 07:00:00,19,2.9125
