In [None]:
#Notes
#meter - The meter id code. 
#Read as {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}. Not every building has all meter types.

#Sites: 0, 1, 2, ..., 15

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
import time, datetime, warnings, os, random, gc, pickle

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from sklearn.base import clone

# Silence every warning for the rest of the session (this also hides pandas
# deprecation / SettingWithCopy warnings raised by later cells).
warnings.filterwarnings("ignore")
# Seeds Python's built-in `random` module only.
random.seed(0)

In [None]:
def get_df_name(df):
    """Return the name of a module-level (notebook global) variable bound to ``df``.

    The lookup is by object identity (``is``), not by equality, so a copy of a
    frame does not match the original's name.

    When several globals reference the same object, names that do not start
    with an underscore are preferred, so that automatic aliases (for example
    the ``_`` output cache of an interactive session) do not shadow the name
    the user chose. Among equally eligible names the first one in ``globals()``
    order is returned.

    Raises:
        IndexError: if no global variable is bound to ``df``.
    """
    # Snapshot the items so the scan is not affected by concurrent namespace changes.
    matches = [name for name, value in list(globals().items()) if value is df]
    if not matches:
        raise IndexError(
            "get_df_name: the object is not bound to any global variable; "
            "assign it to a global name before calling"
        )
    public_matches = [name for name in matches if not name.startswith('_')]
    return (public_matches or matches)[0]

def compress_df(bigdf, exception=()):
    """Downcast the columns of ``bigdf`` in place to reduce memory and return it.

    Per column (except those listed in ``exception``):
      * ``object`` columns are replaced by their integer category codes,
      * ``bool`` columns become ``int8``,
      * integer columns, and columns whose values all equal their rounded
        value, are downcast to the smallest integer type,
      * everything else is downcast to the smallest float type.

    Args:
        bigdf: DataFrame to compress; it is modified in place.
        exception: iterable of column names to leave untouched. Every name
            must be a column of ``bigdf`` (``ValueError`` otherwise).

    Returns:
        The same ``bigdf`` object, after compression.
    """
    initial_mem = bigdf.memory_usage().sum() / 1024**2
    # The name is only used for logging, so a frame that is not bound to a
    # global variable must not abort the compression.
    try:
        frame_name = get_df_name(bigdf)
    except IndexError:
        frame_name = 'DataFrame'
    print("Memory usage of {}: {} MB".format(frame_name, initial_mem))

    cols = list(bigdf.columns)
    for x in exception:
        cols.remove(x)

    for col in cols:
        col_data = bigdf[col]
        col_type = col_data.dtype.name
        if col_type == 'object':
            bigdf[col] = pd.to_numeric(col_data.astype('category').cat.codes, downcast="integer")
        elif col_type == 'bool':
            bigdf[col] = col_data.astype('int8')
        elif col_type.startswith('int') or (col_data.round() == col_data).all():
            # Float columns holding only whole numbers are stored as integers too.
            bigdf[col] = pd.to_numeric(col_data, downcast='integer')
        else:
            bigdf[col] = pd.to_numeric(col_data, downcast='float')

    final_mem = bigdf.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting footprint.
    reduction = 100 * (initial_mem - final_mem) / initial_mem if initial_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(final_mem, reduction))

    return bigdf

def adjust_weather(df, hours=None):
    """Shift weather timestamps per site and fill gaps on a complete hourly grid.

    Steps:
      1. Add a fixed per-site hour offset to ``timestamp`` (presumably
         converting UTC to each site's local time -- confirm the offsets).
      2. Re-index every site onto the hourly grid ``hours`` (hours elapsed
         since 2016-01-01 00:00). Hours outside the grid are dropped, missing
         hours are inserted.
      3. For every weather column add a boolean ``had_<col>`` flag telling
         whether the value was present before filling, then fill gaps by
         linear interpolation (edges included) and finally by the column
         median computed over all sites.
      4. Rebuild ``timestamp`` for the inserted hours so that those rows can
         be joined on ``(site_id, timestamp)``.

    Args:
        df: weather frame with ``site_id``, ``timestamp`` (datetime64) and
            numeric weather columns. It is not modified.
        hours: optional sequence of hour numbers defining the grid. When
            omitted, the grid is chosen from the global variable name of
            ``df``: ``range(8784)`` (the 366 days of 2016) if that name is
            ``'train_weather_df'``, otherwise ``range(8784, 26304)`` (the two
            following years).

    Returns:
        A new DataFrame with one row per (site, hour of the grid).

    Raises:
        KeyError: if ``df`` contains a ``site_id`` without a known offset.
        IndexError: if ``hours`` is omitted and ``df`` is not bound to a
            global variable.
    """
    site_offsets = [-5, 0, -7, -5, -8, 0, -5, -5, -5, -6, -7, -5, 0, -6, -5, -5]
    offset_map = dict(enumerate(site_offsets))
    origin = pd.to_datetime("2016-01-01")

    if hours is None:
        # Must happen before the copy below: the name lookup is by object identity.
        if get_df_name(df) == 'train_weather_df':
            hours = range(8784)
        else:
            hours = range(8784, 26304)

    unknown_sites = set(df['site_id'].unique()) - set(offset_map)
    if unknown_sites:
        raise KeyError(f"No hour offset defined for site_id(s): {unknown_sites}")

    # Work on a copy so the caller's frame keeps its original timestamps.
    df = df.copy()
    # Vectorised shift (one timedelta per row) instead of a row-wise apply.
    df['timestamp'] = df['timestamp'] + pd.to_timedelta(df['site_id'].map(offset_map), unit='h')
    df['timestamp2'] = (df['timestamp'] - origin).dt.total_seconds() // 3600

    value_cols = [c for c in df.columns if c not in ('timestamp', 'site_id', 'timestamp2')]
    # Fallback fill values, computed once over all sites.
    col_medians = {col: df[col].median() for col in value_cols}

    site_dfs = []
    for site_id in df['site_id'].unique():
        site_df = df[df['site_id'] == site_id].set_index('timestamp2').reindex(hours)
        site_df['site_id'] = site_id
        for col in value_cols:
            site_df[f"had_{col}"] = site_df[col].notna()
            site_df[col] = site_df[col].interpolate(limit_direction='both', method='linear')
            site_df[col] = site_df[col].fillna(col_medians[col])
        site_dfs.append(site_df)

    result = pd.concat(site_dfs).rename_axis('timestamp2').reset_index()

    # Rows inserted by the re-index have no timestamp yet; derive it from the
    # hour number, otherwise they could never match in a (site_id, timestamp) join.
    rebuilt = pd.to_timedelta(result['timestamp2'], unit='h') + origin
    result['timestamp'] = result['timestamp'].fillna(rebuilt)

    return result.drop(columns='timestamp2')

In [None]:
# Load the training meter readings, the building metadata and the training
# weather, then merge everything into a single `train_df`.
print('Loading train data')
train_df = pd.read_csv('../input/ashrae-energy-prediction/train.csv', parse_dates=['timestamp'])
# Missing building metadata is replaced by the sentinel -1 before compression.
building_df = pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv').fillna(-1)
building_df = compress_df(building_df)
# NOTE: this variable has to be called `train_weather_df`: adjust_weather()
# looks up the global name of its argument to select the training hour range.
train_weather_df = pd.read_csv('../input/ashrae-energy-prediction/weather_train.csv', parse_dates=['timestamp'])

print ('Fix train weather data')
train_weather_df = adjust_weather(train_weather_df)
# 'timestamp' is excluded from compression; (site_id, timestamp) becomes the join key.
train_weather_df = compress_df(train_weather_df, ['timestamp']).set_index(['site_id', 'timestamp'])

print('Merging train data')
train_df = pd.merge(train_df, building_df, on='building_id', how='left')
# Join the weather onto the readings; any value still missing in the merged
# frame afterwards is replaced by -1.
train_df = train_df.set_index(['site_id', 'timestamp']).join(train_weather_df, on = ['site_id', 'timestamp']).fillna(-1)
train_df = compress_df(train_df).reset_index()

# The weather frame is no longer needed once merged.
del train_weather_df
gc.collect()

In [None]:
# Load the test rows and the test weather, then merge them with the building
# metadata loaded earlier (`building_df`) into a single `test_df`.
print('Loading test data')
test_df = pd.read_csv('../input/ashrae-energy-prediction/test.csv', parse_dates=['timestamp'])
test_weather_df = pd.read_csv('../input/ashrae-energy-prediction/weather_test.csv', parse_dates=['timestamp'])

print ('Fix test weather data')
# adjust_weather() treats any frame whose global name is not 'train_weather_df'
# as test weather and uses the test hour range.
test_weather_df = adjust_weather(test_weather_df)
# 'timestamp' is excluded from compression; (site_id, timestamp) becomes the join key.
test_weather_df = compress_df(test_weather_df, ['timestamp']).set_index(['site_id', 'timestamp'])

print('Merging test data')
test_df = pd.merge(test_df, building_df, on='building_id', how='left')
# Join the weather onto the test rows; any value still missing in the merged
# frame afterwards is replaced by -1.
test_df = test_df.set_index(['site_id', 'timestamp']).join(test_weather_df, on = ['site_id', 'timestamp']).fillna(-1)
test_df = compress_df(test_df).reset_index()

# Neither the weather nor the building metadata is used after this point.
del test_weather_df, building_df
gc.collect()

In [None]:
# Report (rows, columns) of the merged train and test frames.
print('Shape of Data')
print(f'Train:{train_df.shape}')
print(f'Test:{test_df.shape}')

In [None]:
def make_is_bad_zero(Xy_subset, min_interval=48, summer_start=3000, summer_end=7500):
    """Flag the zero readings of one (building, meter) series that should be dropped.

    Args:
        Xy_subset: rows of a single series, in time order, with columns
            ``meter_id``, ``meter_reading`` and ``timestamp`` (numeric; the
            thresholds below are compared against it directly).
        min_interval: minimum length of a run of consecutive zeros for the
            run to be considered bad (meters 1, 2 and 3).
        summer_start, summer_end: ``timestamp`` bounds of the summer period;
            for meters 2 and 3 a zero run lying entirely inside these bounds
            is never flagged.

    Returns:
        A boolean Series indexed like ``Xy_subset``; True marks a bad zero.

    Raises:
        Exception: if the meter id is not one of 0, 1, 2, 3.
    """
    meter = Xy_subset.meter_id.iloc[0]
    is_zero = Xy_subset.meter_reading == 0
    if meter == 0:
        # Electrical meters should never be zero. Keep all zero-readings in this table so that
        # they will all be dropped in the train set.
        return is_zero

    # Number the runs of identical is_zero values, then keep the run id of each zero row.
    transitions = (is_zero != is_zero.shift(1))
    all_sequence_ids = transitions.cumsum()
    ids = all_sequence_ids[is_zero].rename("ids")
    # True for zero rows that belong to a run of at least `min_interval` rows.
    run_is_long = ids.map(ids.value_counts()) >= min_interval

    if meter in [2, 3]:
        # It's normal for steam and hotwater to be turned off during the summer:
        # only runs that reach outside the summer window can be bad.
        outside_summer = ((Xy_subset.timestamp < summer_start) |
                          (Xy_subset.timestamp > summer_end))
        keep = set(ids[outside_summer].unique())
        is_bad = ids.isin(keep) & run_is_long
    elif meter == 1:
        time_ids = ids.to_frame().join(Xy_subset.timestamp).set_index("timestamp").ids
        is_bad = run_is_long

        # Cold water may be turned off during the winter: if one zero run spans
        # timestamps 0..500 and another spans 8283..8783, both are accepted.
        jan_id = time_ids.get(0, False)
        dec_id = time_ids.get(8283, False)
        if (jan_id and dec_id and jan_id == time_ids.get(500, False) and
                dec_id == time_ids.get(8783, False)):
            is_bad = is_bad & (~(ids.isin(set([jan_id, dec_id]))))
    else:
        raise Exception(f"Unexpected meter type: {meter}")

    result = is_zero.copy()
    if len(is_bad):
        # Direct assignment keeps the boolean dtype; Series.update can upcast a
        # bool Series to object on some pandas versions.
        result.loc[is_bad.index] = is_bad.to_numpy(dtype=bool)
    return result

def find_bad_zeros(X):
    """Returns an Index object of the rows which should be deleted.

    ``X`` must contain ``timestamp``, ``building_id``, ``meter`` and
    ``meter_reading``. Each (building_id, meter) series is checked
    independently with ``make_is_bad_zero`` and the index labels of the rows
    it flags are collected, group after group.
    """
    # make_is_bad_zero reads the meter type from a `meter_id` column.
    Xy = X.assign(meter_id=X.meter)

    # Iterate over the groups explicitly: groupby.apply returns a wide
    # DataFrame instead of a Series when there is only one group, which
    # breaks the boolean filtering that follows.
    bad_labels = []
    for _, group in Xy.groupby(["building_id", "meter"]):
        flags = make_is_bad_zero(group)
        bad_labels.append(flags.index[flags.to_numpy(dtype=bool)])

    if not bad_labels:
        return X.index[:0]
    return bad_labels[0].append(bad_labels[1:])

In [None]:
# For each of the 16 sites, plot the mean reading per timestamp, one panel per meter type.
print('Outlier detection & Meter comparisons')
for i in range(16):
    train_df2 = train_df.loc[train_df['site_id'] == i]
    temp_df = train_df2.groupby(['meter', 'timestamp'])['meter_reading'].mean().reset_index()
    ax = sns.FacetGrid(temp_df, col='meter', col_wrap=2, height=4, aspect=2, sharey=False)
    ax.map(plt.plot, 'timestamp', 'meter_reading', color="teal", linewidth=3)
    ax.set_titles(f'Site {i}' + ' Meter {col_name}')
    plt.subplots_adjust(hspace=0.45)
    plt.show()

In [None]:
def prepdata(df, is_train=None):
    """Add model features to ``df`` in place and return it.

    Changes applied:
      * ``primary_use`` is replaced by integer codes: the rank of each
        distinct value in sorted order (the same codes a label encoder
        fitted on this frame would produce),
      * ``hour`` and ``weekday`` (int8) are derived from ``timestamp``,
      * ``year_built`` is shifted so that its minimum becomes 0,
      * ``square_feet`` is replaced by ``log1p(square_feet)``,
      * for the training frame only, rows are put in ``timestamp`` order and
        the index is reset to 0..n-1.

    Args:
        df: merged train or test frame; modified in place.
        is_train: True/False to state explicitly whether ``df`` is the
            training frame. When omitted, ``df`` is considered the training
            frame if it is bound to the global variable ``train_df``.

    Returns:
        The same ``df`` object.
    """
    if is_train is None:
        is_train = get_df_name(df) == 'train_df'

    #Categorical
    df['primary_use'] = pd.factorize(df['primary_use'], sort=True)[0]

    gc.collect()

    #Convert to datetime
    df['hour'] = df['timestamp'].dt.hour.astype(np.int8)
    df['weekday'] = df['timestamp'].dt.weekday.astype(np.int8)

    #Rescaling
    # NOTE(review): if missing years were filled with a sentinel upstream,
    # that sentinel takes part in the minimum -- confirm this is intended.
    minyear = df['year_built'].min()
    df['year_built'] = df['year_built'] - minyear
    df['square_feet'] = np.log1p(df['square_feet'])

    #Train_df
    if is_train:
        # Applied in place: the results of reset_index/sort_values must not be
        # discarded. A stable sort keeps the order of rows sharing a timestamp,
        # and already-ordered data is left untouched.
        if not df['timestamp'].is_monotonic_increasing:
            df.sort_values('timestamp', kind='mergesort', inplace=True)
        df.reset_index(drop=True, inplace=True)

    return df

# prepdata() recognises the training frame by its global name `train_df`.
train_df = prepdata(train_df)

test_df = prepdata(test_df)
# Keep the submission ids aside and remove them from the feature frame in one step.
row_ids = test_df.pop('row_id')

In [None]:
#Removing bad records
# NOTE: this cell is not re-runnable on its own -- it replaces `timestamp`
# with an hour count, so the first filter only works on the datetime column.
print('Removing initial observations for site_id=0')
# Compare against a pandas timestamp: ordering comparisons between a
# datetime64 column and a plain datetime.date raise TypeError on recent pandas.
train_df = train_df[~((train_df['site_id'] == 0) & (train_df['meter'] == 0) & (train_df['timestamp'] < pd.to_datetime("2016-05-01")))]

print('Removing uneasonably large observations for meter 2 of building 1099')
train_df = train_df[~((train_df['building_id'] == 1099) & (train_df['meter'] == 2) & (train_df['meter_reading'] > 3e4))]

print('Removing bad zeroes')
# From here on `timestamp` is the number of hours elapsed since 2016-01-01 00:00.
train_df['timestamp'] = (train_df.timestamp - pd.to_datetime("2016-01-01")).dt.total_seconds() // 3600
badrows = find_bad_zeros(train_df[['timestamp', 'building_id', 'meter', 'meter_reading']])
# The Index of labels can be passed as is; no intermediate Python list is needed.
train_df.drop(index=badrows, inplace=True)
del badrows
gc.collect()

In [None]:
# The model is trained on log1p(meter_reading); the timestamp itself is not a feature.
train_y = np.log1p(train_df['meter_reading'])
train_df.drop(columns=['meter_reading', 'timestamp'], inplace=True)
test_df.drop(columns='timestamp', inplace=True)

In [None]:
# Train one LightGBM regressor per meter type and predict the matching test rows.
print('Fitting LGBM regressor for meters 0, 1, 2 and 3')
fitted = {}        # meter id -> fitted model
importances = []   # feature importances, in the order the meters were fitted
cat = ['building_id', 'site_id', 'primary_use', 'hour', 'weekday', 'had_air_temperature', 'had_cloud_coverage', 
       'had_dew_temperature', 'had_precip_depth_1_hr', 'had_sea_level_pressure', 'had_wind_direction', 'had_wind_speed'] #'meter',
model = LGBMRegressor(boosting_type='gbdt', num_leaves=100, max_depth=-1, learning_rate=0.05, n_estimators=100,
                      colsample_bytree=0.85, reg_lambda=2, metric='rmse', random_state=0)
# Predictions in log1p space, positionally aligned with the rows of test_df.
result = np.zeros(len(test_df))

for val in train_df.meter.unique():
    X1 = train_df[train_df.meter == val].drop('meter', axis=1)
    feature_cols = list(X1.columns)
    fitted[val] = clone(model).fit(X1, train_y.loc[X1.index], categorical_feature=cat)
    importances.append(fitted[val].feature_importances_)
    del X1
    ix = np.nonzero((test_df['meter'] == val).to_numpy())[0]
    if len(ix):
        # Select the test features by name, in training order, rather than
        # relying on train and test frames sharing the same column order.
        result[ix] = fitted[val].predict(test_df.iloc[ix][feature_cols])

del train_df, train_y
gc.collect()


In [None]:
# Convert the log1p-space predictions back to meter readings and write the submission.
print('Model predictions')
predict = np.expm1(result)

del test_df

print('Averaging')
# Negative readings are not meaningful: floor the predictions at zero.
pred = predict.clip(min=0)

submission = pd.DataFrame({'row_id': row_ids, 'meter_reading': pred})
submission.to_csv('submission.csv.gz', index=False, compression='gzip')

In [None]:
# One importance chart per fitted model. Iterating over the models that were
# actually trained avoids a KeyError when a meter type is absent from the data.
for meter_id in sorted(fitted):
    lgb.plot_importance(fitted[meter_id], title=f'Feature importance - meter {meter_id}')
    plt.show()