In [50]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
from lightgbm import plot_tree
from graphviz import Digraph
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import roc_auc_score,mean_squared_error
import time
import pickle

%matplotlib inline

# Load Data

In [2]:
# Load the train and test frames from local pickle files.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
# NOTE(review): presumably written by an earlier preprocessing step - confirm
# where these pickles are produced.
with open('./data/df_train_total.pickle', 'rb') as handle:
    df_train_total = pickle.load(handle)
    
with open('./data/df_test_total.pickle', 'rb') as handle:
    df_test_total = pickle.load(handle)

In [3]:
df_train_total.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


In [9]:
df_test_total.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,2017-01-01,0,Education,7432,2008.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
1,1,1,0,2017-01-01,0,Education,2720,2004.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
2,2,2,0,2017-01-01,0,Education,5376,1991.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
3,3,3,0,2017-01-01,0,Education,23685,2002.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
4,4,4,0,2017-01-01,0,Education,116607,1975.0,,17.8,4.0,11.7,,1021.4,100.0,3.6


# Helper Function

In [32]:
def label_encoder(df, categorical_columns=None):
    """Replace categorical columns with integer codes from ``pandas.factorize``.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are encoded.
    categorical_columns : list of str, optional
        Columns to encode. When omitted (or empty), every column whose dtype
        is ``object`` is encoded.

    Returns
    -------
    tuple
        ``(df, categorical_columns)`` - the same frame object and the list of
        columns that were encoded.
    """
    # No explicit list given: fall back to every object-typed column.
    if not categorical_columns:
        categorical_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in categorical_columns:
        codes, _ = pd.factorize(df[name])
        df[name] = codes
    return df, categorical_columns

In [33]:
# Encode `primary_use` with ONE vocabulary shared by train and test.
# pd.factorize numbers categories by order of first appearance, so factorizing
# the two frames independently could give the same label different integer
# codes in train and test, silently corrupting the categorical feature.
train_codes, primary_use_levels = pd.factorize(df_train_total['primary_use'])
df_train_total['primary_use'] = train_codes
# Map test labels through the training vocabulary; labels that never occur in
# train (and missing values) get -1, the same sentinel factorize uses for NaN.
df_test_total['primary_use'] = pd.Categorical(
    df_test_total['primary_use'], categories=primary_use_levels
).codes.astype('int64')
colname = ['primary_use']

# Train with Separate Meter Type

## General Settings

In [65]:
# LightGBM training configuration read by fold_train_model.
params = dict(
    objective='regression',
    boosting_type='gbdt',
    metric='rmse',
    learning_rate=0.1,
    num_leaves=256,        # 2**8
    max_depth=-1,
    colsample_bytree=0.5,  # LightGBM alias of feature_fraction
    subsample_freq=1,      # LightGBM alias of bagging_freq
    subsample=0.7,         # LightGBM alias of bagging_fraction
    verbose=-1,
    num_threads=5,
    seed=47,
)

In [6]:
# Columns passed to the training helper as LightGBM categorical features.
category_cols = [
    'building_id',
    'site_id',
    'primary_use',
]

In [48]:
def fold_train_model(splits_num,features_train,labels_train,features_test,categorical):
    """Train one LightGBM model per K-fold split and average the fold results.

    Parameters
    ----------
    splits_num : int
        Number of folds.
    features_train : pandas.DataFrame
        Training features; rows are selected by position with ``.iloc``.
    labels_train : pandas.Series
        Training target aligned row-for-row with ``features_train``.
    features_test : pandas.DataFrame
        Features every fold model predicts on.
    categorical : list of str
        Column names passed to ``lgb.Dataset`` as ``categorical_feature``.

    Returns
    -------
    tuple
        ``(ave_score, predictions)`` - the mean validation RMSE over the folds
        and the fold-averaged predictions for ``features_test``.

    Notes
    -----
    The LightGBM configuration is read from the module-level ``params`` dict.
    """
    # Contiguous, unshuffled folds - the same splits as before. `random_state`
    # is intentionally not passed: without shuffle=True it has no effect, and
    # current scikit-learn raises ValueError for that combination.
    folds = KFold(n_splits=splits_num)
    predictions = np.zeros(len(features_test))
    ave_score = 0

    # KFold.split only needs the number of rows, so pass the frame itself
    # rather than materialising a full `.values` copy of the feature matrix.
    for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train)):
        print("Fold {}".format(fold_num))
        train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
        valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]

        trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
        val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)

        # Up to 10000 boosting rounds, stopping once the validation metric has
        # not improved for 500 rounds; progress is logged every 500 rounds.
        clf = lgb.train(params,
                        trn_data,
                        10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=500)

        # Out-of-fold RMSE for this split, accumulated into the running mean.
        pred = clf.predict(valid_df)
        score = np.sqrt(mean_squared_error(y_valid_df, pred))
        ave_score += score / splits_num
        # Each fold model contributes an equal share of the test prediction.
        predictions += clf.predict(features_test) / splits_num
    return ave_score,predictions

In [66]:
def train_meter_type(meter_type,df_train_total,df_test_total):
    """Train and predict for a single meter type.

    Parameters
    ----------
    meter_type : int
        Value of the ``meter`` column selecting the rows to use.
    df_train_total : pandas.DataFrame
        Full training frame; must contain ``meter``, ``meter_reading`` and
        ``timestamp`` columns.
    df_test_total : pandas.DataFrame
        Full test frame; must contain ``meter``, ``row_id`` and ``timestamp``
        columns.

    Returns
    -------
    pandas.DataFrame
        Frame with ``row_id`` and ``meter_reading`` (fold-averaged prediction)
        for the test rows of this meter type.

    Notes
    -----
    Uses the module-level ``fold_train_model`` and ``category_cols``. The input
    frames are not modified.
    """
    # prepare data
    # `drop` without inplace returns a new frame, so the timestamp assignment
    # below never writes into a slice of the caller's frame.
    train_rows = df_train_total[df_train_total.meter == meter_type]
    df_type_label = train_rows['meter_reading']
    df_type_train = train_rows.drop(columns = ['meter','meter_reading'])
    # datetime -> integer epoch seconds (assumes nanosecond-resolution datetimes)
    df_type_train['timestamp'] = df_type_train['timestamp'].astype('int64') // 10**9

    test_rows = df_test_total[df_test_total.meter == meter_type]
    df_type_row_id = test_rows['row_id']
    df_type_test = test_rows.drop(columns = ['row_id','meter'])
    df_type_test['timestamp'] = df_type_test['timestamp'].astype('int64') // 10**9

    # train model
    print('train model')
    # Train on THIS meter type's frames (previously the global electricity
    # frames were passed here regardless of `meter_type`).
    ave_score,predictions_type = fold_train_model(3,df_type_train,df_type_label,df_type_test,category_cols)
    print('ave socre is %s'%(ave_score))

    # get prediction
    print('get prediction')
    sub_type = pd.DataFrame({'row_id': df_type_row_id, 'meter_reading': predictions_type})
    return sub_type

## Train electricity

In [44]:
# Build the electricity (meter == 0) train/test feature frames.
# `drop` is used without inplace so the results are independent frames; the
# in-place drop / column assignment on a boolean-filtered slice raised
# SettingWithCopyWarning.
df_ele_train = df_train_total[df_train_total.meter == 0]
df_ele_label = df_ele_train['meter_reading']
df_ele_train = df_ele_train.drop(columns = ['meter','meter_reading'])
# datetime -> integer epoch seconds (assumes nanosecond-resolution datetimes)
df_ele_train['timestamp'] = df_ele_train['timestamp'].astype('int64') // 10**9

df_ele_test = df_test_total[df_test_total.meter == 0]
df_ele_row_id = df_ele_test['row_id']
df_ele_test = df_ele_test.drop(columns = ['row_id','meter'])
df_ele_test['timestamp'] = df_ele_test['timestamp'].astype('int64') // 10**9

In [45]:
df_ele_train.head()

Unnamed: 0,building_id,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,1451606400,0,0,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
1,1,1451606400,0,0,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,1451606400,0,0,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
3,3,1451606400,0,0,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
4,4,1451606400,0,0,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


In [46]:
df_ele_test.head()

Unnamed: 0,building_id,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,1483228800,0,0,7432,2008.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
1,1,1483228800,0,0,2720,2004.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
2,2,1483228800,0,0,5376,1991.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
3,3,1483228800,0,0,23685,2002.0,,17.8,4.0,11.7,,1021.4,100.0,3.6
4,4,1483228800,0,0,116607,1975.0,,17.8,4.0,11.7,,1021.4,100.0,3.6


In [51]:
# 3-fold training on the electricity subset: `ave_score` is the mean validation
# RMSE over the folds, `predictions_ele` the fold-averaged test predictions.
# NOTE(review): the recorded ave_score (~32640) is far above the validation
# RMSEs logged for folds 1 and 2 (~129 and ~254), which implies fold 0 scored
# extremely badly - worth investigating before trusting the average.
ave_score,predictions_ele = fold_train_model(3,df_ele_train,df_ele_label,df_ele_test,category_cols)

Fold 0
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 61.5223	valid_1's rmse: 130.004
Early stopping, best iteration is:
[220]	training's rmse: 74.2884	valid_1's rmse: 128.788
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 57.3464	valid_1's rmse: 256.484
Early stopping, best iteration is:
[21]	training's rmse: 115.913	valid_1's rmse: 253.955


In [52]:
ave_score

32639.97757681098

In [59]:
# Pair each electricity test row id with its fold-averaged prediction.
sub_ele = pd.DataFrame(
    {
        'row_id': df_ele_row_id,
        'meter_reading': predictions_ele,
    }
)

In [67]:
# Run the per-meter-type helper for meter type 0; it returns a frame with
# `row_id` and `meter_reading` columns.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt).
sub_ele_f = train_meter_type(0,df_train_total,df_test_total)

train model
Fold 0




Training until validation scores don't improve for 500 rounds.


KeyboardInterrupt: 