<a href="https://colab.research.google.com/github/laurence-lin/Kaggle_competition/blob/master/Ashrae_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

import sklearn
import lightgbm as lgb


import matplotlib.pyplot as plt
import os

import gc
from google.colab import files

In [4]:
# load data from Cloud Storage
# Authenticate this Colab session; presumably required so the gcloud/gsutil
# shell commands below can access the bucket with the user's credentials.
from google.colab import auth
auth.authenticate_user()

# Configure GCP project and use gsutil to copy the file from storage
# NOTE(review): the project id and bucket name are hard-coded; re-running this
# notebook requires access to both (or substituting your own).
!gcloud config set project 'blind-detection'
# Copies every CSV in the bucket into sample_data/. The recorded gsutil output
# for this cell notes that `gsutil -m cp` may run significantly faster.
!gsutil cp -r gs://ashare_dataset/*.csv  sample_data/

Updated property [core/project].
Copying gs://ashare_dataset/building_metadata.csv...
Copying gs://ashare_dataset/sample_submission.csv...
Copying gs://ashare_dataset/test.csv...
Copying gs://ashare_dataset/train.csv...
\ [4 files][  2.4 GiB/  2.4 GiB]  111.0 MiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://ashare_dataset/weather_test.csv...
Copying gs://ashare_dataset/weather_train.csv...
/ [6 files][  2.4 GiB/  2.4 GiB]   22.1 MiB/s                                   
Operation completed over 6 objects/2.4 GiB.                                      


In [0]:


# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.

    The frame is modified in place and the same object is returned.

    Per-column rules:
      * datetime and categorical columns are left untouched;
      * object columns are converted to ``category``;
      * signed / unsigned integer columns are downcast to the narrowest
        integer type of the same signedness that holds [min, max]
        (bounds are inclusive, so e.g. a max of exactly 127 still fits int8);
      * float columns are downcast to float16 (only if ``use_float16``) or
        float32 when the observed range fits, otherwise stored as float64;
      * every other dtype (bool, timedelta, complex, pandas extension dtypes)
        is left untouched rather than being coerced to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Mutated in place.
    use_float16 : bool, default False
        Allow float16 for float columns. float16 keeps only ~3 significant
        decimal digits, so enable it only where that precision loss is acceptable.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the dtype changes.
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            continue

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        # Only plain numpy numeric dtypes are downcast. Checking dtype.kind
        # (instead of the dtype's name prefix) keeps uint/bool columns from
        # being mistaken for floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        # min()/max() skip NaN; for an empty or all-NaN column they are NaN,
        # every range comparison below is False and the widest type is kept.
        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ("i", "u"):
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard the ratio: a frame that reports zero bytes has nothing to shrink.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print("Decreased by {:.1f}%".format(saved_pct))

    return df

In [6]:
# Show what was copied into the data directory, then read the competition CSVs.
data_path = 'sample_data/'
print(os.listdir('sample_data/'))

train, test, building, weather_test, weather_train = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'building_metadata.csv',
                     'weather_test.csv', 'weather_train.csv')
)
#submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))

# Downcast dtypes right after loading to keep the kernel's memory footprint low.
# NOTE(review): the training-side frames allow float16 while the test-side frames
# do not, so weather features are stored at different precision for train vs.
# test -- confirm this is intentional.
train, building, weather_train = (
    reduce_mem_usage(frame, use_float16 = True)
    for frame in (train, building, weather_train)
)
test, weather_test = (reduce_mem_usage(frame) for frame in (test, weather_test))

['README.md', 'anscombe.json', 'train.csv', 'weather_train.csv', 'building_metadata.csv', 'weather_test.csv', 'test.csv', 'sample_submission.csv', 'mnist_train_small.csv', 'mnist_test.csv', 'california_housing_train.csv', 'california_housing_test.csv']
Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 2.65 MB
Decreased by 72.4%
Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.65 MB
Decreased by 71.8%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 8.96 MB
Decreased by 53.0%


In [7]:
# Quick look at the three training-side tables (first rows, plus the shape of train).
print(train.head(), train.shape, sep = '\n')
print('Building meta data: ', building.head(), sep = '\n')
print('Weather meta data for building: ', weather_train.head(), sep = '\n')

gc.collect()

   building_id  meter            timestamp  meter_reading
0            0      0  2016-01-01 00:00:00            0.0
1            1      0  2016-01-01 00:00:00            0.0
2            2      0  2016-01-01 00:00:00            0.0
3            3      0  2016-01-01 00:00:00            0.0
4            4      0  2016-01-01 00:00:00            0.0
(20216100, 4)
Building meta data: 
   site_id  building_id primary_use  square_feet  year_built  floor_count
0        0            0   Education         7432      2008.0          NaN
1        0            1   Education         2720      2004.0          NaN
2        0            2   Education         5376      1991.0          NaN
3        0            3   Education        23685      2002.0          NaN
4        0            4   Education       116607      1975.0          NaN
Weather meta data for building: 
   site_id            timestamp  ...  wind_direction  wind_speed
0        0  2016-01-01 00:00:00  ...             0.0    0.000000
1        0

33

The training data contains about 1,448 buildings to predict.
The time series spans 3 years, with each day as one time step.

In [8]:
# Data engineering

from sklearn.preprocessing import LabelEncoder

# Categorical feature encoding: replace each primary_use label with an integer code
le = LabelEncoder()
building['primary_use'] = le.fit_transform(building['primary_use'])

# Delete weather features judged not useful.
# The drop is non-inplace with errors='ignore' so this cell can be re-run
# safely: the original in-place drop raised KeyError on a second execution
# because the columns were already gone.
UNUSED_WEATHER_COLS = ['sea_level_pressure', 'wind_direction', 'wind_speed']
weather_train = weather_train.drop(columns = UNUSED_WEATHER_COLS, errors = 'ignore')
weather_test = weather_test.drop(columns = UNUSED_WEATHER_COLS, errors = 'ignore')

gc.collect()

0

Should we now rearrange the data? Each building is located in a single site.
Proposed rearrangement:
1. Sort by building_id
2. Sort by timestamp

Feature engineering plan:
- Data cleaning: delete useless columns
- Data processing: standardization
- Feature engineering: create new features

In [9]:
'''
# Reduce memory usage
print(train.memory_usage().sum())
print(building.memory_usage().sum())
print(weather_train.memory_usage().sum())
'''
# Data Preprocessing
def preprocessing(x, building_data, weather_data, test = False, add_holiday = False):
    '''
    Preprocessing for training and testing data:
    Merge data with other two feature dataset
    Create new useful features
    Sort data by timestamp (training data only; test rows keep their input order)

    The input frames are not modified; all work happens on the merged copy.

    Parameters:
    x: meter readings frame; must contain building_id and timestamp, plus
       meter_reading (training) or row_id (testing)
    building_data: building metadata, joined on building_id; must provide
       site_id and square_feet
    weather_data: weather observations, joined on site_id + timestamp
    test: truthy -> treat x as test data (no sorting, return row_id instead of y)
    add_holiday: if True, add a 0/1 is_holiday column based on a hand-written
       list of dates. Off by default (feature not validated yet); use the same
       setting for training and testing so both have the same columns.

    Return:
    training: X & y   (y is log1p(meter_reading))
    testing: X & row_id
    '''
    # Merge all feature data
    X = x.merge(building_data, on = 'building_id', how = 'left')
    X = X.merge(weather_data, on = ['site_id', 'timestamp'], how = 'left')

    # Parse timestamps so time features can be derived from them below
    X['timestamp'] = pd.to_datetime(X['timestamp'], format = '%Y-%m-%d %H:%M:%S')
    X['square_feet'] = np.log1p(X['square_feet'])

    # Sort training data by timestamp and reset the index scrambled by sorting
    if not test:
        X = X.sort_values(by = 'timestamp').reset_index(drop = True)

    # Time features assumed useful: hour of day and day of week
    # (could validate this later)
    X['hour'] = X['timestamp'].dt.hour
    X['weekday'] = X['timestamp'].dt.dayofweek

    if add_holiday:
        # holiday? Should I set holiday by hand? How to validate the association?
        holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                    "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                    "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                    "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                    "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                    "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                    "2019-01-01"]
        X['is_holiday'] = X['timestamp'].dt.strftime('%Y-%m-%d').isin(holidays).astype(int)

    # After creating the features derived from timestamp, remove it
    X = X.drop(columns = 'timestamp')

    # Any truthy flag selects the test branch; the original `elif test == True`
    # fell through and returned None for truthy values that are not equal to True.
    if test:
        row_id = X['row_id']
        return X.drop(columns = 'row_id'), row_id

    # Training: the target is modelled on the log1p scale
    y = np.log1p(X['meter_reading'])
    return X.drop(columns = 'meter_reading'), y
    
## Q1: Should I do log transformation?
## Q2: Should I set holiday by hand?
## Q3: Some columns still contain many missing values

# Build the training features and log1p target, then free the raw frames
x_data, y_data = preprocessing(train, building, weather_train, test = False)
del train
del weather_train
gc.collect()

57

In [10]:
# Chronological hold-out split: the first TRAIN_FRACTION of the rows is used for
# training and the remainder for validation.
# NOTE: the comment and log message used to claim a 70/30 split while the code
# actually split 50/50. The executed behaviour (0.5) is kept and the message is
# now derived from this constant so the two cannot drift apart again.
TRAIN_FRACTION = 0.5
train_end = int(len(x_data) * TRAIN_FRACTION)
x_train = x_data.iloc[:train_end]
x_valid = x_data.iloc[train_end:]
y_train = y_data.iloc[:train_end]
y_valid = y_data.iloc[train_end:]

# Specify categorical features for LightGBM
cat_feature = ['building_id', 'site_id', 'meter', 'primary_use', 'hour', 'weekday']
train_set = lgb.Dataset(x_train, label = y_train,
                        categorical_feature = cat_feature,
                        free_raw_data = False
                       )
valid_set = lgb.Dataset(x_valid, label = y_valid,
                       categorical_feature = cat_feature,
                       free_raw_data = False)

print('Create LGBM dataset, training on {:.0%} of data and validate on {:.0%} of data'
      .format(TRAIN_FRACTION, 1 - TRAIN_FRACTION))
params = {'task':'train',
          'objective':'regression',
          'boosting':'gbdt',
          'num_iterations':1000,
          'learning_rate':0.05,
          'num_leaves':45,
          'metric':'rmse',
          'feature_fraction':0.8,
          'reg_lambda':2
         }
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# removed in LightGBM 4.0 (callbacks below require LightGBM >= 3.3).
train_booster = lgb.train(params,
                          train_set = train_set,
                          #num_boost_round = 1000,
                          valid_sets = [train_set, valid_set],
                          callbacks = [lgb.early_stopping(stopping_rounds = 200),
                                       lgb.log_evaluation(period = 200)]
                         )

gc.collect()

Create LGBM dataset, training on 70% of data and validate on 30% of data




Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 0.918024	valid_1's rmse: 1.33676
[400]	training's rmse: 0.8642	valid_1's rmse: 1.33928
Early stopping, best iteration is:
[313]	training's rmse: 0.882021	valid_1's rmse: 1.33565


604

In [11]:
# Free the training-time objects before building the (much larger) test matrix
del x_train, x_valid, y_train, y_valid
del train_set, valid_set
gc.collect()

test_data, row_ids = preprocessing(test, building, weather_test, test = True)
print(row_ids.head())

del test, weather_test, building
gc.collect()

# Score the test data (num_iteration is left at its default, as before), undo
# the log1p transformation applied to y_data, and floor the result at zero so
# no reading is negative.
prediction = np.clip(np.expm1(train_booster.predict(test_data)), 0, None)

gc.collect()

0    0
1    1
2    2
3    3
4    4
Name: row_id, dtype: int32


4

In [13]:
# save model for later comparison
# Writes the trained booster to a file named 'lgbm_1' (relative path, so it
# lands in the notebook's current working directory).
train_booster.save_model('lgbm_1')

<lightgbm.basic.Booster at 0x7f1eabcbb320>

In [14]:
# List the working directory to confirm the saved model file ('lgbm_1') exists.
!ls

adc.json  lgbm_1  sample_data


In [15]:
# Download the saved model file 'lgbm_1' from the Colab runtime (presumably to
# the user's local machine via the browser).
# NOTE(review): the recorded output of this cell is "MessageError: ignored",
# so the download did not complete in the saved run -- re-run to retry.
from google.colab import files
files.download('lgbm_1')

MessageError: ignored