In [98]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import math
import os

In [99]:
building_path = os.path.join(os.path.dirname("ashrae.ipynb"), "data/building_metadata.csv")
train_path = os.path.join(os.path.dirname("ashrae.ipynb"), "data/train.csv")
test_path = os.path.join(os.path.dirname("ashrae.ipynb"), "data/test.csv")
weather_test_path = os.path.join(os.path.dirname("ashrae.ipynb"), "data/weather_test.csv")
weather_train_path = os.path.join(os.path.dirname("ashrae.ipynb"), "data/weather_train.csv")

building_meta = pd.read_csv(building_path)
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
weather_test = pd.read_csv(weather_test_path)
weather_train = pd.read_csv(weather_train_path)

In [100]:
df = pd.DataFrame(test.row_id)

In [101]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])
weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'])
weather_test['timestamp'] = pd.to_datetime(weather_test['timestamp'])
building_meta['primary_use'] = building_meta['primary_use'].astype('category')
train = pd.merge(train, building_meta, how='inner', on='building_id')
test = pd.merge(test, building_meta, how='inner', on='building_id')

In [102]:
train = pd.merge(train, weather_train, how='inner', on=['site_id', 'timestamp'])
test = pd.merge(test, weather_test, how='inner', on=['site_id', 'timestamp'])

In [103]:
train['age'] = train['year_built'].max() - train['year_built'] + 1
test['age'] = test['year_built'].max() - test['year_built'] + 1

In [104]:
le = LabelEncoder()
train['primary_use'] = le.fit_transform(train['primary_use']).astype(np.int8)
test['primary_use'] = le.fit_transform(test['primary_use']).astype(np.int8)

In [105]:
train['floor_count'] = train['floor_count'].fillna(-999).astype(np.int16)
test['floor_count'] = test['floor_count'].fillna(-999).astype(np.int16)

train['year_built'] = train['year_built'].fillna(-999).astype(np.int16)
test['year_built'] = test['year_built'].fillna(-999).astype(np.int16)

train['age'] = train['age'].fillna(-999).astype(np.int16)
test['age'] = test['age'].fillna(-999).astype(np.int16)

train['cloud_coverage'] = train['cloud_coverage'].fillna(-999).astype(np.int16)
test['cloud_coverage'] = test['cloud_coverage'].fillna(-999).astype(np.int16) 

In [106]:
train['month'] = train['timestamp'].dt.month.astype(np.int8)
train['weekofyear'] = train['timestamp'].dt.weekofyear.astype(np.int8)
train['dayofyear'] = train['timestamp'].dt.dayofyear.astype(np.int16)
train['hour'] = train['timestamp'].dt.hour.astype(np.int8)  
train['dayofweek'] = train['timestamp'].dt.dayofweek.astype(np.int8)
train['dayofmonth'] = train['timestamp'].dt.day.astype(np.int8)
train['weekofmonth'] = train['timestamp'].dt.day/7
train['weekofmonth'] = train['weekofmonth'].apply(lambda x: math.ceil(x)).astype(np.int8)

train['year_built'] = train['year_built']-1900
train['square_feet'] = np.log(train['square_feet'])
    
test['month'] = test['timestamp'].dt.month.astype(np.int8)
test['weekofyear'] = test['timestamp'].dt.weekofyear.astype(np.int8)
test['dayofyear'] = test['timestamp'].dt.dayofyear.astype(np.int16)   
test['hour'] = test['timestamp'].dt.hour.astype(np.int8)
test['dayofweek'] = test['timestamp'].dt.dayofweek.astype(np.int8)
test['dayofmonth'] = test['timestamp'].dt.day.astype(np.int8)
test['weekofmonth'] = test['timestamp'].dt.day/7
test['weekofmonth'] = test['weekofmonth'].apply(lambda x: math.ceil(x)).astype(np.int8)
    
test['year_built'] = test['year_built']-1900
test['square_feet'] = np.log(test['square_feet'])

In [107]:
y_train = np.log1p(train["meter_reading"])  
x_train = train.drop(["meter_reading", "precip_depth_1_hr", "sea_level_pressure", "wind_direction", "wind_speed","timestamp"], axis=1)

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.33, random_state=42)

  


In [110]:
train_data = lgb.Dataset(X_train, label=Y_train)
validation_data = train_data.create_valid(X_test, label=Y_test)

In [111]:
param = {'metric': 'rmse'}
num_round = 200
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data], early_stopping_rounds=5)

[1]	valid_0's rmse: 2.07707
Training until validation scores don't improve for 5 rounds
[2]	valid_0's rmse: 2.02038
[3]	valid_0's rmse: 1.97405
[4]	valid_0's rmse: 1.92687
[5]	valid_0's rmse: 1.89109
[6]	valid_0's rmse: 1.85961
[7]	valid_0's rmse: 1.83061
[8]	valid_0's rmse: 1.79822
[9]	valid_0's rmse: 1.77242
[10]	valid_0's rmse: 1.74953
[11]	valid_0's rmse: 1.72899
[12]	valid_0's rmse: 1.70991
[13]	valid_0's rmse: 1.68942
[14]	valid_0's rmse: 1.67389
[15]	valid_0's rmse: 1.6607
[16]	valid_0's rmse: 1.64279
[17]	valid_0's rmse: 1.6304
[18]	valid_0's rmse: 1.61687
[19]	valid_0's rmse: 1.60837
[20]	valid_0's rmse: 1.59992
[21]	valid_0's rmse: 1.58541
[22]	valid_0's rmse: 1.5758
[23]	valid_0's rmse: 1.56838
[24]	valid_0's rmse: 1.56006
[25]	valid_0's rmse: 1.5517
[26]	valid_0's rmse: 1.54446
[27]	valid_0's rmse: 1.53654
[28]	valid_0's rmse: 1.52608
[29]	valid_0's rmse: 1.51911
[30]	valid_0's rmse: 1.50903
[31]	valid_0's rmse: 1.50366
[32]	valid_0's rmse: 1.49538
[33]	valid_0's rmse: 1.48

In [112]:
test = test.drop(["row_id"], axis=1)

In [113]:
ypred = bst.predict(test)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: timestamp

In [None]:
ypred

In [None]:
df['meter_reading'] = 0

In [114]:
test

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,...,wind_direction,wind_speed,age,month,weekofyear,dayofyear,hour,dayofweek,dayofmonth,weekofmonth
0,0,0,2017-01-01 00:00:00,0,0,8.913550,108,-999,17.8,4,...,100.0,3.6,10,1,52,1,0,6,1,1
1,1,0,2017-01-01 00:00:00,0,0,7.908387,104,-999,17.8,4,...,100.0,3.6,14,1,52,1,0,6,1,1
2,2,0,2017-01-01 00:00:00,0,0,8.589700,91,-999,17.8,4,...,100.0,3.6,27,1,52,1,0,6,1,1
3,3,0,2017-01-01 00:00:00,0,0,10.072597,102,-999,17.8,4,...,100.0,3.6,16,1,52,1,0,6,1,1
4,4,0,2017-01-01 00:00:00,0,0,11.666565,75,-999,17.8,4,...,100.0,3.6,43,1,52,1,0,6,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41498566,1444,0,2018-12-31 23:00:00,15,1,9.884254,14,-999,3.3,-999,...,140.0,5.1,104,12,1,365,23,0,31,5
41498567,1445,0,2018-12-31 23:00:00,15,0,8.365905,-2899,-999,3.3,-999,...,140.0,5.1,-999,12,1,365,23,0,31,5
41498568,1446,0,2018-12-31 23:00:00,15,1,9.329456,97,-999,3.3,-999,...,140.0,5.1,21,12,1,365,23,0,31,5
41498569,1447,0,2018-12-31 23:00:00,15,4,10.301424,101,-999,3.3,-999,...,140.0,5.1,17,12,1,365,23,0,31,5


In [115]:
df

Unnamed: 0,row_id
0,0
1,1
2,2
3,3
4,4
...,...
41697595,41697595
41697596,41697596
41697597,41697597
41697598,41697598
