In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import math
import time
# import xgboost as xgb
import gc
import pickle
from numpy import array

In [2]:
# Load the pre-built training and test frames from local pickle files.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that were produced by a trusted source.
with open('train_LGBM_1209', 'rb') as f:
    train_all = pickle.load(f)
with open('test_LGBM_1209', 'rb') as f:
    test_all = pickle.load(f)

In [3]:
# display(train_all)

In [4]:
# Build the submission frame from the raw test CSV and prepend two zero-filled
# columns: 'predicted' (set to 1 by the prediction loops for every row they
# score) and 'meter_reading' (overwritten with the model output).
submission = pd.read_csv('files/test.csv')
submission.insert(0, 'predicted', 0)
submission.insert(0, 'meter_reading', 0)

In [5]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes, in place.

    For every numeric column:

    * missing values are replaced by a sentinel equal to ``column minimum - 1``
      (integer dtypes cannot represent NaN) and the column name is recorded;
    * if every value is a whole number the column is cast to the narrowest
      unsigned/signed integer dtype whose range strictly contains the
      column's min and max;
    * otherwise the column is cast to ``float32``.

    Non-numeric columns (strings, objects, ...) are left untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame, and the list of columns that contained non-finite
        values (and therefore had NaN replaced by the sentinel).
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        # Only numeric columns can be downcast; skip strings and other dtypes.
        if not pd.api.types.is_numeric_dtype(props[col]):
            continue

        # Integer does not support NA, therefore NA needs to be filled.
        # Assign the result back instead of using inplace=True on a column
        # selection, which does not reliably write through to the frame.
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            props[col] = props[col].fillna(props[col].min() - 1)

        values = props[col]
        # Measure the range AFTER filling: the sentinel lowers the minimum, and
        # using the pre-fill minimum could select an unsigned dtype that cannot
        # hold the sentinel.
        mx = values.max()
        mn = values.min()

        # Decide whether the column holds whole numbers only. The comparison is
        # element-wise so positive and negative fractions cannot cancel out.
        if pd.api.types.is_integer_dtype(values) or pd.api.types.is_bool_dtype(values):
            IsInt = True
        elif bool(np.isfinite(values).all()):
            IsInt = bool((values == values.astype(np.int64)).all())
        else:
            # Still contains inf (or is entirely NaN): cannot be an integer column.
            IsInt = False

        if IsInt:
            if mn >= 0:
                for candidate in unsigned_types:
                    if mx < np.iinfo(candidate).max:
                        props[col] = values.astype(candidate)
                        break
                else:
                    props[col] = values.astype(np.uint64)
            else:
                # If no signed type fits the column is left as it is.
                for candidate in signed_types:
                    limits = np.iinfo(candidate)
                    if mn > limits.min and mx < limits.max:
                        props[col] = values.astype(candidate)
                        break
        else:
            # Make float datatypes 32 bit
            props[col] = values.astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist


In [6]:
# Downcast the submission frame; NAlist receives the columns in which
# non-finite values were found.
submission, NAlist = reduce_mem_usage(submission)

Memory usage of properties dataframe is : 1908.7647705078125  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  676.0209350585938  MB
This is  35.416670796933424 % of the initial size


In [7]:
# Row/column counts of the training and test frames.
print(train_all.shape)
print(test_all.shape)

(19770715, 24)
(41498571, 24)


In [8]:
# Print the dtype of every training column; max_rows=None stops pandas from
# truncating the listing.
with pd.option_context('display.max_rows', None):
    print(train_all.dtypes)

building_id            uint16
meter                   uint8
timestamp              object
meter_reading         float32
site_id                 uint8
primary_use             uint8
square_feet            uint32
floor_count             uint8
age                     uint8
weekday                 uint8
hour                    uint8
month                   uint8
air_temperature       float32
cloud_coverage         uint16
dew_temperature       float32
precip_depth_1_hr     float32
sea_level_pressure    float32
wind_direction          uint8
wind_speed            float32
humidity              float32
apparent_temp         float32
min_temperature       float32
max_temperature       float32
mean_temperature      float32
dtype: object


In [9]:
# Print the dtype of every test column; max_rows=None stops pandas from
# truncating the listing.
with pd.option_context('display.max_rows', None):
    print(test_all.dtypes)

row_id                 uint32
building_id            uint16
meter                   uint8
timestamp              object
site_id                 uint8
primary_use             uint8
square_feet            uint32
floor_count             uint8
age                     uint8
weekday                 uint8
hour                    uint8
month                   uint8
air_temperature       float32
cloud_coverage         uint16
dew_temperature       float32
precip_depth_1_hr     float32
sea_level_pressure    float32
wind_direction          uint8
wind_speed            float32
humidity              float32
apparent_temp         float32
min_temperature       float32
max_temperature       float32
mean_temperature      float32
dtype: object


In [10]:
def evalerror(preds, actual):
    """Return the root mean squared logarithmic error between two value sets.

    Negative predictions are treated as 0 before the logarithm is taken. The
    clipping is applied to a copy, so the caller's ``preds`` is not modified.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    actual : array-like
        Observed values; expected to be greater than -1 so that the logarithm
        is defined.

    Returns
    -------
    float
        sqrt(mean((log(1 + preds) - log(1 + actual)) ** 2)).
    """
    clipped = np.maximum(preds, 0)
    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    log_error = np.log1p(clipped) - np.log1p(actual)
    return math.sqrt(np.square(log_error).mean())

In [11]:
# Buildings and sites that are excluded from the per-meter models; later cells
# train a separate model for each of them.
split_building = [778,1099]
split_siteid = [7]

In [12]:
# Columns removed from each test subset before it is passed to learn_valid.
drop_col_test = [
    'row_id','meter','timestamp'
    ,'cloud_coverage'
]

# Columns removed from each training subset before it is passed to
# learn_valid. The commented-out names are columns that are currently kept.
drop_col_valid = [
# 'building_id',
'meter',
# 'timestamp',
# 'meter_reading',
# 'site_id',
'cloud_coverage',
# 'precip_depth_1_hr',
# 'sea_level_pressure'
]

# Categorical feature names read by learn_valid: the first list is used for
# the validation-stage model, the second (which additionally contains 'month')
# for the final model. learn_valid keeps only the names present in its frame.
categorical_features_valid = ['building_id','primary_use','site_id','age','weekday','hour','wind_direction']
categorical_features = ['building_id','primary_use','site_id','age','month','weekday','hour','wind_direction']
# 'month'
# 'month'

In [13]:
def learn_valid(X,test):
    """Train a LightGBM regressor in two stages and predict ``test``.

    Stage 1 fits a model on the rows with ``timestamp <= "2016-05-01"`` and
    validates it on the rows with ``timestamp >= "2016-09-01"`` (up to 400
    rounds, early stopping after 50 rounds without improvement) in order to
    find the best iteration count. Stage 2 refits on all rows of ``X`` for
    that many rounds and predicts ``test``. The target is modelled as
    ``log(meter_reading + 1)`` and predictions are transformed back.

    Parameters
    ----------
    X : DataFrame
        Training rows. The code drops 'min_temperature', 'max_temperature',
        'meter_reading', 'timestamp', 'site_id' and 'month' by name, so all
        of them must be present.
    test : DataFrame
        Rows to predict. Must contain 'min_temperature' and 'max_temperature'
        and, after those are dropped, the same columns in the same order as
        ``X`` without 'meter_reading' and 'timestamp'.

    Returns
    -------
    numpy.ndarray of predictions clipped at 0 in the normal case.
    NOTE(review): two other paths return different types -- a tuple of two
    empty arrays when either validation split is empty, and the integer 0
    when a column check fails. The callers in this notebook write the return
    value straight into the submission frame; confirm that is intended.

    Side effects: appends the stage-1 model to the module-level ``models``
    list, reads the module-level ``categorical_features_valid`` and
    ``categorical_features`` lists, and prints column listings and the elapsed
    time in seconds.
    """
    tick = time.time()
    X = X.drop(['min_temperature','max_temperature'],axis=1)
    test = test.drop(['min_temperature','max_temperature'],axis=1)
    # Validation split by date. NOTE(review): 'timestamp' is listed as an
    # object column in the dtype output, so this is presumably a lexicographic
    # string comparison -- confirm the timestamp format sorts chronologically.
    # Rows strictly between the two dates are not used in stage 1.
    X1 = X.query('(timestamp <= "2016-05-01")')
    X2 = X.query('(timestamp >= "2016-09-01")')
    if X1.empty or X2.empty:
        # Left over from the cross-validation mode (see the commented block
        # further down): returns a pair of empty arrays, not predictions.
        a = np.array([])
        b = np.array([])
        return (a,b)
    y1 = X1['meter_reading']
    y2 = X2['meter_reading']
    # The stage-1 model is trained without 'site_id' and 'month'; the stage-2
    # model below keeps both.
    X1 = X1.drop(['meter_reading','timestamp','site_id'],axis=1)
    X2 = X2.drop(['meter_reading','timestamp','site_id'],axis=1)
    X1 = X1.drop(['month'],axis=1)
    X2 = X2.drop(['month'],axis=1)
    # Train on log(1 + y); inverted with exp(.) - 1 after prediction.
    y1 = np.log(y1+1)
    y2 = np.log(y2+1)

    # Parameters shared by both training stages.
    params = {
        "objective": "regression",
        "boosting": "gbdt",
        "num_leaves": 1280,
        "learning_rate": 0.05,
        "feature_fraction": 0.85,
        "reg_lambda": 2,
        "metric": "rmse",
#         "device": "gpu",
    }

#     Disabled K-fold variant, kept for reference:
#     kf = KFold(n_splits=3)
#     models = []
#     for train_index,test_index in kf.split(features):
#         train_features = features.loc[train_index]
#         train_target = target.loc[train_index]

#         test_features = features.loc[test_index]
#         test_target = target.loc[test_index]
#     print(X1.columns)
#     print(X2.columns)
    # Both splits come from the same frame, so this only guards against an
    # inconsistent edit of the drop lists above.
    if list(X1.columns) != list(X2.columns):
        print('error in valid')
        print(X1.columns)
        print(X2.columns)
        return 0
    categorical_features_valid2 = []
    categorical_features2 = []
    # Keep only the categorical names that exist in the stage-1 frame.
    for cat in categorical_features_valid:
        if cat in X1.columns:
            categorical_features_valid2.append(cat)
    
    d_training = lgb.Dataset(X1, label=y1,categorical_feature=categorical_features_valid2, free_raw_data=False)
    d_test = lgb.Dataset(X2, label=y2,categorical_feature=categorical_features_valid2, free_raw_data=False)

    # Stage 1: find the best number of boosting rounds on the held-out period.
    model = lgb.train(params, train_set=d_training, num_boost_round=400, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    # 'models' is not defined in this function; it must exist at module level.
    models.append(model)
#     del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()
#     return 0
#     Disabled cross-validation scoring path (returns actuals and predictions
#     for the validation period instead of test predictions):
#     y2_preds = model.predict(X2, num_iteration=model.best_iteration)
#     y2 = np.exp(y2) - 1
#     y2_preds = np.exp(y2_preds) - 1
#     y2_preds[y2_preds<0] = 0
#     tock = time.time()
#     print(tock-tick)
#     return (y2,y2_preds) 

    # Stage 2: refit on every row of X for the round count found in stage 1.
    best_iteration = model.best_iteration
    y = X['meter_reading']
    y = np.log(y+1)
    X = X.drop(['meter_reading','timestamp'],axis=1)
    for cat in categorical_features:
        if cat in X.columns:
            categorical_features2.append(cat)
    print(X.columns)
    print(test.columns)
#     return 0
    d_training = lgb.Dataset(X, label=y,categorical_feature=categorical_features2, free_raw_data=False)
    # Train and test must expose identical columns in identical order.
    if list(X.columns) != list(test.columns):
        print('error in real train')
        return 0
    model = lgb.train(params, train_set=d_training, num_boost_round=best_iteration)
    preds = model.predict(test)
    # Undo the log(1 + y) transform and clip negative readings to zero.
    preds = np.exp(preds) - 1
    preds[preds<0] = 0
    tock = time.time()
    print(tock-tick)
    return preds 

In [14]:
def learn2(train_site):
    """Drop the columns that are constant in ``train_site`` from train and test.

    The columns to remove are those with exactly one distinct value in
    ``train_site``. They are removed from the module-level ``train`` and
    ``test`` frames, which are read but not modified; the reduced copies are
    returned. (Previously the results of both ``drop`` calls were discarded,
    which made the function a no-op.)

    Parameters
    ----------
    train_site : DataFrame
        Frame whose per-column distinct-value counts decide what is dropped.

    Returns
    -------
    (DataFrame, DataFrame)
        Copies of the module-level ``train`` and ``test`` without the
        constant columns.

    Raises
    ------
    KeyError
        If a constant column is missing from ``train`` or ``test``.
    """
    distinct_counts = train_site.apply(pd.Series.nunique)
    cols_to_drop = distinct_counts[distinct_counts == 1].index
    train_reduced = train.drop(cols_to_drop, axis=1)
    test_reduced = test.drop(cols_to_drop, axis=1)
    return train_reduced, test_reduced
    

In [15]:
# Run a garbage-collection pass before the training loops start.
gc.collect()

40

In [16]:
# test

In [17]:
# Accumulators for the (currently disabled) cross-validation scoring path.
y2_all = np.array([])
y2_allpreds = np.array([])
preds_all1 = []
preds_all2 = []
# learn_valid appends every validation-stage model to this module-level list.
models = []

# Predictions are floating point, but reduce_mem_usage downcast the zero-filled
# 'meter_reading' column to an integer dtype; convert it once so the
# assignments below do not rely on implicit upcasting.
if submission['meter_reading'].dtype != np.float64:
    submission['meter_reading'] = submission['meter_reading'].astype(np.float64)

# One model per meter type, trained on everything except the buildings and
# sites that get their own dedicated models in later cells.
for meter in range(0,4):
    print('Meter {}'.format(meter))
    train_meter = train_all[train_all['meter'] == meter]
    train = train_meter[~train_meter['building_id'].isin(split_building)]
    train = train[~train['site_id'].isin(split_siteid)]

    test_meter = test_all[test_all['meter'] == meter]
    test = test_meter[~test_meter['building_id'].isin(split_building)]
    test = test[~test['site_id'].isin(split_siteid)]
    del train_meter
    del test_meter
    gc.collect()

    # Reassign instead of dropping in place: train/test are filtered slices,
    # and in-place mutation of a slice triggers SettingWithCopy warnings.
    train = train.drop(columns=drop_col_valid)
    # Remember which submission rows these predictions belong to before the
    # 'row_id' column is removed from the feature frame.
    row = test['row_id'].values.astype(int)
    test = test.drop(columns=drop_col_test)
    preds = learn_valid(train,test)

    # learn_valid returns something other than a prediction array on its
    # error / empty-split paths; do not flag those rows as predicted.
    if isinstance(preds, np.ndarray) and len(preds) == len(row):
        # .loc is the label-based accessor for many rows at once; .at is
        # meant for a single scalar cell.
        submission.loc[row,'predicted'] = 1
        submission.loc[row,'meter_reading'] = preds
    else:
        print('Meter {}: learn_valid returned no usable predictions, rows left unscored'.format(meter))
    del row, preds
    gc.collect()

Meter 0




Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.573696	valid_1's rmse: 0.952961
[50]	training's rmse: 0.35632	valid_1's rmse: 0.924503
[75]	training's rmse: 0.297427	valid_1's rmse: 0.932728
Early stopping, best iteration is:
[42]	training's rmse: 0.393695	valid_1's rmse: 0.922186
Index(['building_id', 'site_id', 'primary_use', 'square_feet', 'floor_count',
       'age', 'weekday', 'hour', 'month', 'air_temperature', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'humidity', 'apparent_temp', 'mean_temperature'],
      dtype='object')
Index(['building_id', 'site_id', 'primary_use', 'square_feet', 'floor_count',
       'age', 'weekday', 'hour', 'month', 'air_temperature', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'humidity', 'apparent_temp', 'mean_temperature'],
      dtype='object')
110.94637823104858
Meter 1
Training until v

In [20]:
# # y2_all = np.array([])
# # y2_allpreds = np.array([])
# models = []
# for meter in range(0,4):
#     print('Meter {}'.format(meter))
#     train_meter = train_all[train_all['meter'] == meter]
#     train = train_meter[~train_meter['building_id'].isin(split_building)]
#     train = train[~train['site_id'].isin(split_siteid)]
    
#     test_meter = test_all[test_all['meter'] == meter]
#     test = test_meter[~test_meter['building_id'].isin(split_building)]
#     test = test[~test['site_id'].isin(split_siteid)]
#     del train_meter
#     del test_meter
#     gc.collect()
#     s = train['building_id'].unique()
#     for build in s:
#         print(build)
#         t1 = train[train['building_id'] == build]
#         t2 = test[test['building_id'] == build]
#         t1.drop(drop_col_valid,axis=1,inplace=True)
#         t2.drop(drop_col_test,axis=1,inplace=True)
#         (y2,y2_preds) = learn_valid(t1,t2)
#         y2_all = np.concatenate((y2_all, y2), axis=None) 
#         y2_allpreds = np.concatenate((y2_allpreds, y2_preds), axis=None)

In [None]:
# evalerror(y2_all,y2_allpreds)

In [None]:
# import matplotlib.pyplot as plt
# for model in models:
#     lgb.plot_importance(model)
#     plt.show()

In [21]:
# for split_id
# Predictions are floating point; make sure the target column can hold them
# (reduce_mem_usage downcast the zero-filled column to an integer dtype).
if submission['meter_reading'].dtype != np.float64:
    submission['meter_reading'] = submission['meter_reading'].astype(np.float64)

# Dedicated model for every (building, meter) pair of the buildings that were
# held out of the per-meter models.
for building in split_building:
    # Filter by building once; the meter loop only narrows these frames.
    train_building = train_all[train_all['building_id'] == building]
    test_building = test_all[test_all['building_id'] == building]
    for meter in range(0,4):
        train = train_building[train_building['meter'] == meter]
        test = test_building[test_building['meter'] == meter]
        if train.shape[0] == 0:
            continue
        print(building,meter)
        print(train.shape[0])

        # Reassign instead of dropping in place on a filtered slice.
        train = train.drop(columns=drop_col_valid)
        # Keep the submission row labels before 'row_id' is removed.
        row = test['row_id'].values.astype(int)
        test = test.drop(columns=drop_col_test)
        # The former "remove columns with one value" step discarded the result
        # of both drop() calls, so it never changed the frames and could only
        # raise KeyError (e.g. a constant 'meter_reading' is absent from
        # test). It is removed rather than activated because learn_valid
        # drops 'site_id' and 'month' by name and needs them to be present.

        preds = learn_valid(train,test)

        # Skip the write when learn_valid took one of its non-array return
        # paths, so those rows are not flagged as predicted.
        if isinstance(preds, np.ndarray) and len(preds) == len(row):
            # .loc handles an array of row labels; .at is scalar-only.
            submission.loc[row,'predicted'] = 1
            submission.loc[row,'meter_reading'] = preds
        else:
            print('Building {} meter {}: learn_valid returned no usable predictions, rows left unscored'.format(building, meter))
        del row, preds
        gc.collect()

778 0
8773
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.125685	valid_1's rmse: 0.193453
[50]	training's rmse: 0.076283	valid_1's rmse: 0.178241
[75]	training's rmse: 0.054845	valid_1's rmse: 0.175922
[100]	training's rmse: 0.0430767	valid_1's rmse: 0.17533
[125]	training's rmse: 0.0356037	valid_1's rmse: 0.175203
[150]	training's rmse: 0.0303244	valid_1's rmse: 0.175673
Early stopping, best iteration is:
[122]	training's rmse: 0.0363096	valid_1's rmse: 0.174998
Index(['building_id', 'site_id', 'primary_use', 'square_feet', 'floor_count',
       'age', 'weekday', 'hour', 'month', 'air_temperature', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'humidity', 'apparent_temp', 'mean_temperature'],
      dtype='object')
Index(['building_id', 'site_id', 'primary_use', 'square_feet', 'floor_count',
       'age', 'weekday', 'hour', 'month', 'air_temperature', 'dew_temperature',
       'precip_de

In [None]:
# for split_siteid
# Predictions are floating point; make sure the target column can hold them
# (reduce_mem_usage downcast the zero-filled column to an integer dtype).
if submission['meter_reading'].dtype != np.float64:
    submission['meter_reading'] = submission['meter_reading'].astype(np.float64)

# Dedicated model for every (site, meter) pair of the sites that were held out
# of the per-meter models.
for site_id in split_siteid:
    # Filter by site once; the meter loop only narrows these frames.
    train_site = train_all[train_all['site_id'] == site_id]
    test_site = test_all[test_all['site_id'] == site_id]
    for meter in range(0,4):
        train = train_site[train_site['meter'] == meter]
        test = test_site[test_site['meter'] == meter]
        if train.shape[0] == 0:
            continue
        print(meter,train.shape[0])

        # Reassign instead of dropping in place on a filtered slice.
        train = train.drop(columns=drop_col_valid)
        # Keep the submission row labels before 'row_id' is removed.
        row = test['row_id'].values.astype(int)
        test = test.drop(columns=drop_col_test)
        # The former "remove columns with one value" step discarded the result
        # of both drop() calls, so it never changed the frames and could only
        # raise KeyError (e.g. a constant 'meter_reading' is absent from
        # test). It is removed rather than activated because learn_valid
        # drops 'site_id' and 'month' by name and needs them to be present.

        preds = learn_valid(train,test)

        # Skip the write when learn_valid took one of its non-array return
        # paths, so those rows are not flagged as predicted.
        if isinstance(preds, np.ndarray) and len(preds) == len(row):
            # .loc handles an array of row labels; .at is scalar-only.
            submission.loc[row,'predicted'] = 1
            submission.loc[row,'meter_reading'] = preds
        else:
            print('Site {} meter {}: learn_valid returned no usable predictions, rows left unscored'.format(site_id, meter))
        del row, preds
        gc.collect()

In [23]:
# Persist the submission frame (including the 'predicted' flag column) as a
# pickle file named 'sub2' in the working directory.
with open('sub2', 'wb') as f:
    pickle.dump(submission, f)