In [20]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
from lightgbm import plot_tree
from graphviz import Digraph
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold,GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score,mean_squared_error
import category_encoders as ce
import time
import pickle
import gc

%matplotlib inline

# Load Data

In [21]:
# Load the train and test frames from their pickle files (presumably written
# by an earlier preprocessing step -- verify against the notebook that
# produces ./data/df_*_total.pickle).
# NOTE(review): pickle.load can execute code embedded in the file, so these
# paths must only ever point at files this project created itself.
with open('./data/df_train_total.pickle', 'rb') as handle:
    df_train_total = pickle.load(handle)
    
with open('./data/df_test_total.pickle', 'rb') as handle:
    df_test_total = pickle.load(handle)

# Add Features

In [22]:
def label_encoder(df, categorical_columns=None):
    """Replace categorical columns with integer codes from ``pandas.factorize``.

    Codes are handed out in order of first appearance (0, 1, 2, ...) and
    missing values become -1. The frame is modified in place and also
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are encoded; it is mutated.
    categorical_columns : list of str, optional
        Columns to encode. When omitted (or empty), every column whose dtype
        is ``object`` is encoded.

    Returns
    -------
    tuple
        ``(df, categorical_columns)`` -- the same frame object and the list
        of column names that were encoded.
    """
    # Nothing specified: fall back to every object-typed column.
    if not categorical_columns:
        categorical_columns = []
        for name in df.columns:
            if df[name].dtype == 'object':
                categorical_columns.append(name)

    for name in categorical_columns:
        # factorize returns (codes, uniques); only the codes are kept.
        df[name] = pd.factorize(df[name])[0]

    return df, categorical_columns

In [23]:
# Add the hour component of each row's timestamp as a new "hour" column on
# both frames. The .dt accessor requires "timestamp" to be datetime-like.
df_train_total["hour"] = df_train_total["timestamp"].dt.hour
df_test_total["hour"] = df_test_total["timestamp"].dt.hour

In [24]:
# Learn the integer code for each primary_use value from the training frame
# only, then apply that same mapping to the test frame. Factorizing the two
# frames independently numbers categories by order of first appearance within
# each frame, so the same category could receive different codes in train and
# test and the feature would be silently misread at prediction time.
primary_use_codes, primary_use_categories = pd.factorize(df_train_total['primary_use'])
df_train_total['primary_use'] = primary_use_codes

# Missing values and categories never seen in training both map to -1, the
# same sentinel pd.factorize uses for missing values.
df_test_total['primary_use'] = pd.Categorical(
    df_test_total['primary_use'], categories=primary_use_categories
).codes.astype(primary_use_codes.dtype)

# Kept so later cells that read `colname` (the list of encoded column names
# the original cell bound) continue to work.
colname = ['primary_use']

# Log-Transform Target

In [25]:
# Replace the target with log(1 + meter_reading). np.expm1 is the inverse,
# should values on the original scale be needed again.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time on the same kernel applies the log twice.
df_train_total['meter_reading'] = np.log1p(df_train_total['meter_reading'])

In [26]:
# Persist the training frame in its current state (hour column added,
# primary_use encoded, meter_reading log-transformed) for later notebooks.
with open('./data/df_train_total_feat.pickle', 'wb') as handle:
    pickle.dump(df_train_total, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the test frame in its current state (hour column added,
# primary_use encoded) alongside the training features.
with open('./data/df_test_total_feat.pickle', 'wb') as handle:
    pickle.dump(df_test_total, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Preview the first rows of the training frame as a sanity check.
df_train_total.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
0,0,0,2016-01-01,0.0,0,0,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,0
1,1,0,2016-01-01,0.0,0,0,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,0
2,2,0,2016-01-01,0.0,0,0,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,0
3,3,0,2016-01-01,0.0,0,0,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,0
4,4,0,2016-01-01,0.0,0,0,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0,0


In [18]:
# Preview the first rows of the test frame as a sanity check.
df_test_total.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
0,0,0,0,2017-01-01,0,0,7432,2008.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
1,1,1,0,2017-01-01,0,0,2720,2004.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
2,2,2,0,2017-01-01,0,0,5376,1991.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
3,3,3,0,2017-01-01,0,0,23685,2002.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
4,4,4,0,2017-01-01,0,0,116607,1975.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
