In [31]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
from lightgbm import plot_tree
from graphviz import Digraph
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold,GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score,mean_squared_error
import time
import pickle
import gc

%matplotlib inline

# Load Data

In [3]:
def _read_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


df_train_total = _read_pickle('./data/df_train_total.pickle')
df_test_total = _read_pickle('./data/df_test_total.pickle')

In [32]:
# Preview the first rows of the training frame.
df_train_total.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
103,105,0,2016-01-01,23.3036,1,0,50623,,5.0,3.8,,2.4,,1020.9,240.0,3.1,0
104,106,0,2016-01-01,0.3746,1,0,5374,,4.0,3.8,,2.4,,1020.9,240.0,3.1,0
105,106,3,2016-01-01,0.0,1,0,5374,,4.0,3.8,,2.4,,1020.9,240.0,3.1,0
106,107,0,2016-01-01,175.184,1,0,97532,2005.0,10.0,3.8,,2.4,,1020.9,240.0,3.1,0
107,108,0,2016-01-01,91.2653,1,0,81580,1913.0,5.0,3.8,,2.4,,1020.9,240.0,3.1,0


In [33]:
# Preview the first rows of the test frame.
df_test_total.head(n=5)

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
0,0,0,0,2017-01-01,0,0,7432,2008.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
1,1,1,0,2017-01-01,0,0,2720,2004.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
2,2,2,0,2017-01-01,0,0,5376,1991.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
3,3,3,0,2017-01-01,0,0,23685,2002.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0
4,4,4,0,2017-01-01,0,0,116607,1975.0,,17.8,4.0,11.7,,1021.4,100.0,3.6,0


# Remove Strange Data

In [6]:
# Rows matching all three conditions are treated as "strange" and removed:
# building_id <= 104, meter 0, and timestamp on or before 2016-05-20.
is_strange_row = (
    (df_train_total['building_id'] <= 104)
    & (df_train_total['meter'] == 0)
    & (df_train_total['timestamp'] <= "2016-05-20")
)
df_train_total = df_train_total[~is_strange_row]

# Add Feature

In [7]:
# Derive the hour of day from the timestamp, for train and test alike.
for frame in (df_train_total, df_test_total):
    frame["hour"] = frame["timestamp"].dt.hour

In [55]:
# log transform square feet
#df_train_total["square_feet"] = np.log(df_train_total["square_feet"])
#df_test_total["square_feet"] = np.log(df_test_total["square_feet"])

In [38]:
# add weekday feature
# 3-fold CV result: improved electricity (ele), worse on the other meters
#df_train_total["weekend"] = df_train_total["timestamp"].dt.weekday
#df_test_total["weekend"] = df_test_total["timestamp"].dt.weekday

# Helper Function

In [8]:
def label_encoder(df, categorical_columns=None):
    """Replace categorical columns with integer codes from ``pandas.factorize``.

    Codes are assigned in order of first appearance (0, 1, 2, ...); missing
    values receive -1. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are encoded.
    categorical_columns : list of str, optional
        Columns to encode. When omitted (or empty), every column with
        ``object`` dtype is encoded.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The same frame object and the list of encoded column names.
    """
    if categorical_columns:
        targets = categorical_columns
    else:
        # No explicit list given: fall back to all object-dtype columns.
        targets = [name for name in df.columns if df[name].dtype == 'object']

    for name in targets:
        codes, _ = pd.factorize(df[name])
        df[name] = codes

    return df, targets

In [9]:
# Encode `primary_use` with ONE mapping shared by train and test.
# pd.factorize numbers categories by order of first appearance, so encoding the
# two frames separately can give the same category different integer codes in
# train and test. Encoding the concatenated column once keeps them consistent.
n_train_rows = len(df_train_total)
primary_use_all = pd.concat(
    [df_train_total[['primary_use']], df_test_total[['primary_use']]],
    ignore_index=True,
)
primary_use_all, colname = label_encoder(primary_use_all, categorical_columns=['primary_use'])
primary_use_codes = primary_use_all['primary_use'].values
# Positional split: the first n_train_rows codes belong to train, the rest to test.
df_train_total['primary_use'] = primary_use_codes[:n_train_rows]
df_test_total['primary_use'] = primary_use_codes[n_train_rows:]

# Train a Separate Model per Meter Type

## General Settings

In [34]:
# LightGBM hyper-parameters shared by every per-meter model.
params = dict(
    objective='regression',
    boosting_type='gbdt',
    metric='rmse',
    learning_rate=0.1,
    num_leaves=256,  # 2**8
    max_depth=-1,
    colsample_bytree=0.5,  # feature_fraction 0.7
    subsample_freq=1,
    subsample=0.7,
    verbose=-1,
    num_threads=8,
    seed=47,  # 42
)

In [35]:
# Columns passed to LightGBM as categorical features.
category_cols = [
    'building_id',
    'site_id',
    'primary_use',
]
category_cols

['building_id', 'site_id', 'primary_use']

In [13]:
def fold_train_model(splits_num,features_train,labels_train,features_test,categorical,model_params=None):
    """Train one LightGBM regressor per K-fold split and average the test predictions.

    Parameters
    ----------
    splits_num : int
        Number of cross-validation folds.
    features_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    labels_train : pandas.Series
        Training target, row-aligned with ``features_train``.
    features_test : pandas.DataFrame
        Features to predict with each fold's model.
    categorical : list of str
        Column names handed to LightGBM as categorical features.
    model_params : dict, optional
        LightGBM training parameters. Defaults to the notebook-level
        ``params`` dict, looked up when the function is called.

    Returns
    -------
    (float, numpy.ndarray)
        Mean validation RMSE over the folds, and the test predictions
        averaged over the fold models.
    """
    if model_params is None:
        model_params = params

    # Contiguous, unshuffled folds (same splits as before). `random_state` is
    # intentionally not passed: it has no effect unless shuffle=True, and recent
    # scikit-learn versions raise ValueError for that combination.
    folds = KFold(n_splits=splits_num)
    predictions = np.zeros(len(features_test))
    ave_score = 0

    # KFold.split only needs the number of rows, so the frame is passed directly
    # instead of materialising `.values` for the whole feature matrix.
    for fold_num, (trn_idx, val_idx) in enumerate(folds.split(features_train)):
        print("Fold {}".format(fold_num))
        train_df, y_train_df = features_train.iloc[trn_idx], labels_train.iloc[trn_idx]
        valid_df, y_valid_df = features_train.iloc[val_idx], labels_train.iloc[val_idx]

        trn_data = lgb.Dataset(train_df, label=y_train_df,categorical_feature=categorical)
        val_data = lgb.Dataset(valid_df, label=y_valid_df,categorical_feature=categorical)

        valid_results = {}
        # NOTE(review): `verbose_eval`, `early_stopping_rounds` and `evals_result`
        # are the pre-4.0 LightGBM keyword API; newer releases expect callbacks
        # instead. Kept as-is to match the LightGBM version this notebook ran on.
        clf = lgb.train(model_params,
                        trn_data,
                        2000,#10000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=500,
                        evals_result=valid_results)

        # Score this fold on its held-out rows and accumulate the fold averages.
        pred = clf.predict(valid_df)
        score = np.sqrt(mean_squared_error(y_valid_df, pred))
        ave_score += score / splits_num
        predictions += clf.predict(features_test) / splits_num
    return ave_score,predictions

In [39]:
def train_meter_type(meter_type,df_train_total,df_test_total,category_cols,splits_num):
    """Train K-fold models for one meter type and build its submission rows.

    Parameters
    ----------
    meter_type : int
        Value of the ``meter`` column to select in both frames.
    df_train_total : pandas.DataFrame
        Training frame; must contain ``meter``, ``meter_reading`` and ``timestamp``.
    df_test_total : pandas.DataFrame
        Test frame; must contain ``meter``, ``row_id``, ``timestamp`` and every
        feature column present in the training frame.
    category_cols : list of str
        Column names passed to LightGBM as categorical features.
    splits_num : int
        Number of cross-validation folds.

    Returns
    -------
    (pandas.DataFrame, float)
        A frame with ``row_id`` and ``meter_reading`` (predictions mapped back
        with ``expm1``), and the mean validation RMSE measured on the
        ``log1p``-transformed target.

    Neither input frame is modified.
    """
    # prepare data
    # One feature list drives both selections, so train and test features are
    # guaranteed to share the same columns in the same order.
    feature_cols = [col for col in df_train_total.columns
                    if col not in ('meter', 'meter_reading')]

    train_mask = df_train_total['meter'] == meter_type
    # transfer label with log
    df_type_label = np.log1p(df_train_total.loc[train_mask, 'meter_reading'])
    # `.assign` returns a new frame, so no in-place edits happen on a slice of
    # the caller's data (avoids the SettingWithCopy pattern).
    df_type_train = df_train_total.loc[train_mask, feature_cols].assign(
        # datetime -> integer seconds (assumes nanosecond-resolution timestamps)
        timestamp=lambda frame: frame['timestamp'].astype('int64') // 10**9
    )

    test_mask = df_test_total['meter'] == meter_type
    df_type_row_id = df_test_total.loc[test_mask, 'row_id']
    df_type_test = df_test_total.loc[test_mask, feature_cols].assign(
        timestamp=lambda frame: frame['timestamp'].astype('int64') // 10**9
    )

    # train model
    print('train model')
    ave_score,predictions_type = fold_train_model(splits_num,df_type_train,df_type_label,df_type_test,category_cols)
    print('ave socre is %s'%(ave_score))

    # get prediction
    print('get prediction')
    # Undo the log1p applied to the target so readings are on the original scale.
    sub_type = pd.DataFrame({'row_id': df_type_row_id, 'meter_reading': np.expm1(predictions_type)})
    return sub_type,ave_score

## Train electricity

In [37]:
# Meter type 0 (electricity), 5-fold CV.
sub_ele_f, ave_score = train_meter_type(
    meter_type=0,
    df_train_total=df_train_total,
    df_test_total=df_test_total,
    category_cols=category_cols,
    splits_num=5,
)

train model
Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.266748	valid_1's rmse: 0.559504
[1000]	training's rmse: 0.237133	valid_1's rmse: 0.560137
Early stopping, best iteration is:
[688]	training's rmse: 0.251473	valid_1's rmse: 0.558608
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.269794	valid_1's rmse: 0.470216
[1000]	training's rmse: 0.24067	valid_1's rmse: 0.470578
Early stopping, best iteration is:
[614]	training's rmse: 0.260061	valid_1's rmse: 0.469193
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.260278	valid_1's rmse: 0.639227
[1000]	training's rmse: 0.232711	valid_1's rmse: 0.637414
[1500]	training's rmse: 0.215577	valid_1's rmse: 0.637362
Early stopping, best iteration is:
[1130]	training's rmse: 0.227936	valid_1's rmse: 0.63719
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.269433	valid_1's rmse: 0.645725
Early stopping, best iteration is:
[364]	training's rmse: 0.28454	valid_1's rmse: 0.644422
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.26859	valid_1's rmse: 0.536366
Early stopping, best iteration is:
[127]	training's rmse: 0.344066	valid_1's rmse: 0.525367
ave socre is 0.566955909866192
get prediction


In [40]:
# Meter type 1 (presumably chilled water, per the `cw` suffix), 5-fold CV.
sub_cw_f, ave_score_cw = train_meter_type(
    meter_type=1,
    df_train_total=df_train_total,
    df_test_total=df_test_total,
    category_cols=category_cols,
    splits_num=5,
)

train model
Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.629557	valid_1's rmse: 1.21655
[1000]	training's rmse: 0.558434	valid_1's rmse: 1.21666
Early stopping, best iteration is:
[748]	training's rmse: 0.588338	valid_1's rmse: 1.21384
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.617786	valid_1's rmse: 1.16296
[1000]	training's rmse: 0.547017	valid_1's rmse: 1.15972
[1500]	training's rmse: 0.500941	valid_1's rmse: 1.15906
[2000]	training's rmse: 0.468833	valid_1's rmse: 1.15847
Did not meet early stopping. Best iteration is:
[2000]	training's rmse: 0.468833	valid_1's rmse: 1.15847
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.635005	valid_1's rmse: 1.25812
Early stopping, best iteration is:
[106]	training's rmse: 0.827763	valid_1's rmse: 1.24709
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.619028	valid_1's rmse: 1.45423
Early stopping, best iteration is:
[261]	training's rmse: 0.691677	valid_1's rmse: 1.44713
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.622213	valid_1's rmse: 1.37951
Early stopping, best iteration is:
[81]	training's rmse: 0.862303	valid_1's rmse: 1.32142
ave socre is 1.2775907752308326
get prediction


In [41]:
# Meter type 2 (presumably steam, per the `stm` suffix), 5-fold CV.
sub_stm_f, ave_score_stm = train_meter_type(
    meter_type=2,
    df_train_total=df_train_total,
    df_test_total=df_test_total,
    category_cols=category_cols,
    splits_num=5,
)

train model
Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.787484	valid_1's rmse: 1.38646
Early stopping, best iteration is:
[191]	training's rmse: 0.876957	valid_1's rmse: 1.38117
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.755858	valid_1's rmse: 1.46344
Early stopping, best iteration is:
[48]	training's rmse: 1.02621	valid_1's rmse: 1.41208
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.741598	valid_1's rmse: 1.70662
Early stopping, best iteration is:
[48]	training's rmse: 1.03007	valid_1's rmse: 1.50799
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.746122	valid_1's rmse: 1.4308
Early stopping, best iteration is:
[350]	training's rmse: 0.775329	valid_1's rmse: 1.42992
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.787313	valid_1's rmse: 1.33257
Early stopping, best iteration is:
[156]	training's rmse: 0.896347	valid_1's rmse: 1.29294
ave socre is 1.4048192273505766
get prediction


In [42]:
# Meter type 3 (presumably hot water, per the `hw` suffix), 5-fold CV.
sub_hw_f, ave_score_hw = train_meter_type(
    meter_type=3,
    df_train_total=df_train_total,
    df_test_total=df_test_total,
    category_cols=category_cols,
    splits_num=5,
)

train model
Fold 0




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.828352	valid_1's rmse: 1.48754
Early stopping, best iteration is:
[86]	training's rmse: 1.02956	valid_1's rmse: 1.41845
Fold 1




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.802518	valid_1's rmse: 1.51392
Early stopping, best iteration is:
[323]	training's rmse: 0.852504	valid_1's rmse: 1.51069
Fold 2




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.845101	valid_1's rmse: 1.14539
Early stopping, best iteration is:
[438]	training's rmse: 0.861143	valid_1's rmse: 1.14299
Fold 3




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.827446	valid_1's rmse: 1.48363
Early stopping, best iteration is:
[247]	training's rmse: 0.910644	valid_1's rmse: 1.48091
Fold 4




Training until validation scores don't improve for 500 rounds.
[500]	training's rmse: 0.785977	valid_1's rmse: 2.06125
Early stopping, best iteration is:
[26]	training's rmse: 1.22155	valid_1's rmse: 1.78732
ave socre is 1.4680710590541086
get prediction


In [43]:
# five fold: unweighted mean of the four per-meter validation RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

1.1793592428754274

In [19]:
# remove strange data and add hour feature: mean of the four per-meter RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

1.284522580244981

In [65]:
# hour + log square feet feature: mean of the four per-meter RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

1.306757096342972

In [45]:
# hour + weekday feature: mean of the four per-meter RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

1.3276492832153732

In [35]:
# add hour feature: mean of the four per-meter RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

1.3065904018094043

In [15]:
# StratifiedKFold: mean of the four per-meter RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

0.7076236766552024

In [61]:
# huber: mean of the four per-meter RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

1.3604628852421023

In [51]:
# rmse: mean of the four per-meter RMSEs
sum((ave_score, ave_score_cw, ave_score_stm, ave_score_hw)) / 4

1.352893197767166

In [44]:
# Stack the four per-meter submissions and keep the row_id ordering.
# Previously the sorted frame was only displayed and then discarded, so the
# file written later was grouped by meter type instead of ordered by row_id.
sub_all = pd.concat([sub_ele_f, sub_cw_f, sub_stm_f, sub_hw_f]).sort_values(by='row_id')
sub_all

Unnamed: 0,row_id,meter_reading
0,0,146.095742
1,1,75.434724
2,2,5.436937
3,3,197.271365
4,4,1088.405789
...,...,...
41697595,41697595,9.793027
41697596,41697596,4.544902
41697597,41697597,3.035976
41697598,41697598,145.007739


In [48]:
# Show any predictions that came out negative.
sub_all.loc[sub_all['meter_reading'] < 0]

Unnamed: 0,row_id,meter_reading


In [47]:
# Replace negative predictions with zero.
negative_mask = sub_all['meter_reading'].lt(0)
sub_all.loc[negative_mask, 'meter_reading'] = 0

In [49]:
# Write the combined submission without the index column.
submission_path = './data/baseline_log_five.csv'
sub_all.to_csv(submission_path, index=False)

In [27]:
#del sub_all
#gc.collect()

76