# Importing Libraries

*The following code has been modified by myself. It's original code is from Kaggle user Koustav Banerjee*

In [84]:
#Data Cleaning and Organizing
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')
import gc


#pre processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

#Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import r2_score, mean_squared_error, f1_score, plot_confusion_matrix, accuracy_score, precision_score, recall_score

# Reading and understanding our data

In [85]:
#each data set, not included in github
buildMetData = pd.read_csv('energyCSV/building_metadata.csv')
test = pd.read_csv('energyCSV/test.csv')
train = pd.read_csv('energyCSV/train.csv')
weatherTest = pd.read_csv('energyCSV/weather_test.csv')
weatherTrain = pd.read_csv('energyCSV/weather_train.csv')

train

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.000
1,1,0,2016-01-01 00:00:00,0.000
2,2,0,2016-01-01 00:00:00,0.000
3,3,0,2016-01-01 00:00:00,0.000
4,4,0,2016-01-01 00:00:00,0.000
...,...,...,...,...
20216095,1444,0,2016-12-31 23:00:00,8.750
20216096,1445,0,2016-12-31 23:00:00,4.825
20216097,1446,0,2016-12-31 23:00:00,0.000
20216098,1447,0,2016-12-31 23:00:00,159.575


# Reduce The Memory For Faster Processing

In [86]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [87]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

weatherTrain = reduce_mem_usage(weatherTrain)
weatherTest = reduce_mem_usage(weatherTest)
buildMetData = reduce_mem_usage(buildMetData)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


## Pre-processing

In [88]:
print(train.info(), test.info(), weatherTrain.info(), weatherTest.info(), buildMetData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   building_id    int16  
 1   meter          int8   
 2   timestamp      object 
 3   meter_reading  float32
dtypes: float32(1), int16(1), int8(1), object(1)
memory usage: 289.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   row_id       int32 
 1   building_id  int16 
 2   meter        int8  
 3   timestamp    object
dtypes: int16(1), int32(1), int8(1), object(1)
memory usage: 596.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   site_id             139773 non-null  int8   
 1   timestamp           139773 non-null  object 


The 'timestamp' coloumns in 'train', 'test', 'weatherTrain', 'weatherTest' - as well as the column 'primary_use' in BuildMetData - are currently an onject type. Need to change those formats into something more usable.

In [89]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

weatherTrain['timestamp'] = pd.to_datetime(weatherTrain['timestamp'])
weatherTest['timestamp'] = pd.to_datetime(weatherTest['timestamp'])

In [90]:
print(train.info(), test.info(), weatherTrain.info(), weatherTest.info(), buildMetData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    int16         
 1   meter          int8          
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
dtypes: datetime64[ns](1), float32(1), int16(1), int8(1)
memory usage: 289.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   row_id       int32         
 1   building_id  int16         
 2   meter        int8          
 3   timestamp    datetime64[ns]
dtypes: datetime64[ns](1), int16(1), int32(1), int8(1)
memory usage: 596.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------

Much better. Now, as the data is currently 'timestamp presents itself as a single moment in time. A single hour for every day. this can be broken into further cartegories: 'hour', 'day', 'weekday', & 'month'. This recategorizing could help us find trends in specific, months, days, or even hours for the data to find trends in.

In [91]:
train

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.000000
1,1,0,2016-01-01 00:00:00,0.000000
2,2,0,2016-01-01 00:00:00,0.000000
3,3,0,2016-01-01 00:00:00,0.000000
4,4,0,2016-01-01 00:00:00,0.000000
...,...,...,...,...
20216095,1444,0,2016-12-31 23:00:00,8.750000
20216096,1445,0,2016-12-31 23:00:00,4.825000
20216097,1446,0,2016-12-31 23:00:00,0.000000
20216098,1447,0,2016-12-31 23:00:00,159.574997


In [92]:
train["hour"] = train["timestamp"].dt.hour
train["day"] = train["timestamp"].dt.day
train["weekend"] = train["timestamp"].dt.weekday
train["month"] = train["timestamp"].dt.month

test["hour"] = test["timestamp"].dt.hour
test["day"] = test["timestamp"].dt.day
test["weekend"] = test["timestamp"].dt.weekday
test["month"] = test["timestamp"].dt.month

In [93]:
train

Unnamed: 0,building_id,meter,timestamp,meter_reading,hour,day,weekend,month
0,0,0,2016-01-01 00:00:00,0.000000,0,1,4,1
1,1,0,2016-01-01 00:00:00,0.000000,0,1,4,1
2,2,0,2016-01-01 00:00:00,0.000000,0,1,4,1
3,3,0,2016-01-01 00:00:00,0.000000,0,1,4,1
4,4,0,2016-01-01 00:00:00,0.000000,0,1,4,1
...,...,...,...,...,...,...,...,...
20216095,1444,0,2016-12-31 23:00:00,8.750000,23,31,5,12
20216096,1445,0,2016-12-31 23:00:00,4.825000,23,31,5,12
20216097,1446,0,2016-12-31 23:00:00,0.000000,23,31,5,12
20216098,1447,0,2016-12-31 23:00:00,159.574997,23,31,5,12


In [94]:
#Merging Train, BuildMetData, weatherTrain
trainBuild = train.merge(buildMetData, on=['building_id'], how='left')
train = trainBuild.merge(weatherTrain, on=['site_id','timestamp'], how='left')

#Merging Test, BuildMetData, weatherTest
testBuild = test.merge(buildMetData, on=['building_id'], how='left')
test = testBuild.merge(weatherTest, on=['site_id','timestamp'], how='left')

# Featuring Engineering

There are a couple things that should be considered as me move forward.
- a lot of the data in this set is still left as null, inf, or empty. This can be delt with later but the biggest offender is floor_count
- This data was given pre train-test split
- Data is to big to merge train and test to create a train-test split via sklearn
- Some of the data needs to be label encoded because models don't like strings
- Timestamp is no longer needed since we have split it into deeper, separate columns

In [95]:
#Label encoding primary_use so that the model can run
le = LabelEncoder()

train['primary_use'] = le.fit_transform(train['primary_use']).astype(np.int8)
test['primary_use'] = le.fit_transform(test['primary_use']).astype(np.int8)

In [96]:
train

Unnamed: 0,building_id,meter,timestamp,meter_reading,hour,day,weekend,month,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00,0.000000,0,1,4,1,0,0,7432,2008.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
1,1,0,2016-01-01 00:00:00,0.000000,0,1,4,1,0,0,2720,2004.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
2,2,0,2016-01-01 00:00:00,0.000000,0,1,4,1,0,0,5376,1991.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
3,3,0,2016-01-01 00:00:00,0.000000,0,1,4,1,0,0,23685,2002.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
4,4,0,2016-01-01 00:00:00,0.000000,0,1,4,1,0,0,116607,1975.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20216095,1444,0,2016-12-31 23:00:00,8.750000,23,31,5,12,15,1,19619,1914.0,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875
20216096,1445,0,2016-12-31 23:00:00,4.825000,23,31,5,12,15,0,4298,,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875
20216097,1446,0,2016-12-31 23:00:00,0.000000,23,31,5,12,15,1,11265,1997.0,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875
20216098,1447,0,2016-12-31 23:00:00,159.574997,23,31,5,12,15,4,29775,2001.0,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875


In [97]:
#We don't need time stamp anymore
train = train.drop("timestamp", axis = 1)
test = test.drop("timestamp", axis = 1)

In [98]:
train

Unnamed: 0,building_id,meter,meter_reading,hour,day,weekend,month,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0.000000,0,1,4,1,0,0,7432,2008.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
1,1,0,0.000000,0,1,4,1,0,0,2720,2004.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
2,2,0,0.000000,0,1,4,1,0,0,5376,1991.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
3,3,0,0.000000,0,1,4,1,0,0,23685,2002.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
4,4,0,0.000000,0,1,4,1,0,0,116607,1975.0,,25.000000,6.0,20.000000,,1019.5,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20216095,1444,0,8.750000,23,31,5,12,15,1,19619,1914.0,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875
20216096,1445,0,4.825000,23,31,5,12,15,0,4298,,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875
20216097,1446,0,0.000000,23,31,5,12,15,1,11265,1997.0,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875
20216098,1447,0,159.574997,23,31,5,12,15,4,29775,2001.0,,1.700195,,-5.601562,-1.0,1008.5,180.0,8.796875


# Dealing With Cat,Num Features And Dropping Unnecessary Features

In [99]:
# creating a list if feat columns and drop columns for later indexing in train-test split
featCols = ['building_id', 'primary_use', 'hour', 'day', 'weekend', 'month', 'meter', 'square_feet', 'year_built', 'air_temperature', 'cloud_coverage', 'dew_temperature']

dropCols = ['precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']

In [100]:
valTarget = np.log1p(train["meter_reading"])
del train["meter_reading"]

In [101]:
train = train.drop(dropCols + ["site_id", "floor_count"], axis = 1)
test = test.drop(dropCols + ["site_id", "floor_count","row_id"], axis = 1)

In [102]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20216100 entries, 0 to 20216099
Data columns (total 12 columns):
 #   Column           Dtype  
---  ------           -----  
 0   building_id      int16  
 1   meter            int8   
 2   hour             int64  
 3   day              int64  
 4   weekend          int64  
 5   month            int64  
 6   primary_use      int8   
 7   square_feet      int32  
 8   year_built       float16
 9   air_temperature  float16
 10  cloud_coverage   float16
 11  dew_temperature  float16
dtypes: float16(4), int16(1), int32(1), int64(4), int8(2)
memory usage: 1.1 GB


# Modelling

Using KFolds and cross validating with LightGBM. There's a protion of the script that is commented out. That's because the original coder only trained on the first 80% of the data and then validated on the last 20%. This was his way of keeping the script quicker and easier to manager.

For sure this is a valid way to go, and I urge anyone with a less computationally powerful computer uncomment lines 8 and 9 these ones at the top

    if i + 1 < num_folds:
        continue

My submission isn't going towards Kaggle where the hardware is limited and my computer runs stronger hardware (thanks mom and dad). This isn't just for show, the output is a bit more accurate. Though negligable, not a worthless amount.

*The scores go from an RMSE score of:* **0.488646(train)** & **0.685545(validate)**


*...to an RMSE score of:* **0.447411(train)** & **0.591119(validate)**


I kept the code in while I was tuning parameters for ease of running the script multiple times. Then, when I was happy with the result, I commented that part of the code out to run it all in the slower but more powerful version.

In [127]:
num_folds = 5
kf = KFold(n_splits = num_folds, shuffle = True, random_state = 42)
error = 0
models = []

for i, (train_index, val_index) in enumerate(kf.split(train)):
    #uncomment the bellow secton to run the data faster
    #if i + 1 < num_folds:
        #continue
    print(train_index.max(), val_index.min())
    
    #Splitting The DF Into Train Test Split
    xtrain = train[featCols].iloc[train_index]
    xTest = train[featCols].iloc[val_index]
    yTrain = valTarget.iloc[train_index]
    yTest = valTarget.iloc[val_index]
    
    #Training The Model
    lgb_train = lgb.Dataset(xTrain, yTrain > 0)
    lgb_eval = lgb.Dataset(xTest, yTest > 0)
    params = {
            'objective': 'binary',
            'boosting': 'gbdt',
            'learning_rate': 0.1,
            'metric': {'binary_logloss'},
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq' : 5
            }
    gbm_class = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=(lgb_train, lgb_eval),
               early_stopping_rounds=20,
               verbose_eval = 20)
    
    #validating on the above data
    lgb_train = lgb.Dataset(xTrain[yTrain > 0], yTrain[yTrain > 0]) 
    lgb_eval = lgb.Dataset(xTest[yTest > 0] , yTest[yTest > 0])
    params = {
            'boosting': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'learning_rate': 0.5,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq' : 5
            }
    gbm_regress = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=(lgb_train, lgb_eval),
               early_stopping_rounds=20,
               verbose_eval = 20)
#     models.append(gbm)

    yPred = (gbm_class.predict(xTest, num_iteration=gbm_class.best_iteration) > .5) *\
    (gbm_regress.predict(xTest, num_iteration=gbm_regress.best_iteration))
    error += np.sqrt(mean_squared_error(yPred, (yTest)))/num_folds
    print(np.sqrt(mean_squared_error(yPred, (yTest))))
    break
print(error)

20216098 4
[LightGBM] [Info] Number of positive: 14674161, number of negative: 1498719
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1171
[LightGBM] [Info] Number of data points in the train set: 16172880, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.907331 -> initscore=2.281477
[LightGBM] [Info] Start training from score 2.281477
Training until validation scores don't improve for 20 rounds
[20]	training's binary_logloss: 0.175663	valid_1's binary_logloss: 0.175722
[40]	training's binary_logloss: 0.155254	valid_1's binary_logloss: 0.155318
[60]	training's binary_logloss: 0.143971	valid_1's binary_logloss: 0.144039
[80]	training's binary_logloss: 0.136291	valid_1's binary_logloss: 0.136401
[100]	training's binary_logloss: 0.130267	valid_1's binary_logloss: 0.130382
[120]	training's binary_logloss: 0.125483	valid_1's binary_logloss: 0.125673
[140]	trai

[1420]	training's rmse: 0.389612	valid_1's rmse: 0.394289
[1440]	training's rmse: 0.388831	valid_1's rmse: 0.393593
[1460]	training's rmse: 0.388172	valid_1's rmse: 0.392997
[1480]	training's rmse: 0.387362	valid_1's rmse: 0.392256
[1500]	training's rmse: 0.38624	valid_1's rmse: 0.391177
[1520]	training's rmse: 0.385453	valid_1's rmse: 0.390487
[1540]	training's rmse: 0.384619	valid_1's rmse: 0.389734
[1560]	training's rmse: 0.383948	valid_1's rmse: 0.389105
[1580]	training's rmse: 0.383184	valid_1's rmse: 0.388403
[1600]	training's rmse: 0.382439	valid_1's rmse: 0.387719
[1620]	training's rmse: 0.381761	valid_1's rmse: 0.38712
[1640]	training's rmse: 0.380831	valid_1's rmse: 0.386301
[1660]	training's rmse: 0.380093	valid_1's rmse: 0.385629
[1680]	training's rmse: 0.379359	valid_1's rmse: 0.385009
[1700]	training's rmse: 0.378589	valid_1's rmse: 0.384244
[1720]	training's rmse: 0.377774	valid_1's rmse: 0.383508
[1740]	training's rmse: 0.377135	valid_1's rmse: 0.382941
[1760]	training'

In [104]:
sorted(zip(gbm_regress.feature_importance(), gbm_regress.feature_name()),reverse = True)

[(2404, 'building_id'),
 (2157, 'square_feet'),
 (1057, 'meter'),
 (851, 'year_built'),
 (790, 'primary_use'),
 (698, 'month'),
 (668, 'hour'),
 (545, 'air_temperature'),
 (323, 'dew_temperature'),
 (195, 'weekend'),
 (150, 'day'),
 (32, 'cloud_coverage')]

In [105]:
from tqdm import tqdm
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test.shape[0]/50000)))):
    
    res.append(np.expm1
                ((gbm_class.predict
                (test.iloc[i:i+step_size], num_iteration=gbm_class.best_iteration) > .5) *\
                (gbm_regress.predict(test.iloc[i:i+step_size], num_iteration=gbm_regress.best_iteration))))
    i+=step_size

100%|██████████| 834/834 [02:06<00:00,  6.57it/s]


In [106]:
res = np.concatenate(res)
pd.DataFrame(res).describe()

Unnamed: 0,0
count,41697600.0
mean,312.713
std,1971.325
min,-0.8630281
25%,17.11868
50%,78.33743
75%,253.4396
max,250186.2


In [107]:
print(res)

[  0.           0.           0.         ...   7.74453153 139.13351683
  20.30705139]


In [108]:
sub = pd.read_csv("energyCSV/sample_submission.csv")

In [109]:
sub["meter_reading"] = res

In [110]:
res

array([  0.        ,   0.        ,   0.        , ...,   7.74453153,
       139.13351683,  20.30705139])

In [111]:
sub

Unnamed: 0,row_id,meter_reading
0,0,0.000000
1,1,0.000000
2,2,0.000000
3,3,0.000000
4,4,0.000000
...,...,...
41697595,41697595,19.091084
41697596,41697596,3.053052
41697597,41697597,7.744532
41697598,41697598,139.133517


In [112]:
sub['meter_reading'].sum()

13039380415.315474

In [113]:
train

Unnamed: 0,building_id,meter,hour,day,weekend,month,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature
0,0,0,0,1,4,1,0,7432,2008.0,25.000000,6.0,20.000000
1,1,0,0,1,4,1,0,2720,2004.0,25.000000,6.0,20.000000
2,2,0,0,1,4,1,0,5376,1991.0,25.000000,6.0,20.000000
3,3,0,0,1,4,1,0,23685,2002.0,25.000000,6.0,20.000000
4,4,0,0,1,4,1,0,116607,1975.0,25.000000,6.0,20.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
20216095,1444,0,23,31,5,12,1,19619,1914.0,1.700195,,-5.601562
20216096,1445,0,23,31,5,12,0,4298,,1.700195,,-5.601562
20216097,1446,0,23,31,5,12,1,11265,1997.0,1.700195,,-5.601562
20216098,1447,0,23,31,5,12,4,29775,2001.0,1.700195,,-5.601562


In [114]:
valTarget.sum()

83811130.0