## TEAM FRUIT GARDEN
### https://www.kaggle.com/c/ashrae-energy-prediction

Б16-513
Аккад О.А.
Концов А.М.
Туровский И.А.
Уваров М.П

#### Importing libraries

In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import gc


path_data = "./Data/"
path_train = path_data + "train.csv"
path_test = path_data + "test.csv"
path_building = path_data + "building_metadata.csv"
path_weather_train = path_data + "weather_train.csv"
path_weather_test = path_data + "weather_test.csv"

#### Memmory reduce
fairly stolen from some guy

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype


def reduce_mem_usage(df, use_float16=False):
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    return df

#### Read train values

In [3]:
building = pd.read_csv(path_building)
building = reduce_mem_usage(building, use_float16=True)

df_train = pd.read_csv(path_train)
df_train = reduce_mem_usage(df_train, use_float16=True)

weather_train = pd.read_csv(path_weather_train)
weather_train = reduce_mem_usage(weather_train, use_float16=True)

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 2.65 MB
Decreased by 72.4%


#### Prepare data method

In [4]:
def merge_and_process_data(X, building_data, weather_data, is_test):
    building_data.year_built[building_data.year_built.isnull()] = building_data.year_built.median()
    
    X = X.merge(building_data, on="building_id", how="left")
    X = X.merge(weather_data,  on=["site_id", "timestamp"], how="left")
    
    X.timestamp = pd.to_datetime(X.timestamp, format="%Y-%m-%d %H:%M:%S")
    gc.collect()
    X["month"] = X.timestamp.dt.month
    X["day"]   = X.timestamp.dt.day
    X["hour"]  = X.timestamp.dt.hour
    
    drop_features = ["timestamp", "wind_direction", "wind_speed", "precip_depth_1_hr"]

    X.drop(drop_features, axis=1, inplace=True)

    if is_test:
        Y = X.row_id
        X.drop("row_id", axis=1, inplace=True)
    else:
        Y = X.meter_reading
        X.drop("meter_reading", axis=1, inplace=True)
    return X, Y

In [5]:
#df_train = df_train[df_train['meter_reading'] > 0]

X_train, Y_train = merge_and_process_data(df_train, building, weather_train, is_test=False)
gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


78

In [6]:
X_train.tail()
Y_train = Y_train.loc[X_train.air_temperature.notna()]
X_train = X_train.loc[X_train.air_temperature.notna()]

In [7]:
pd.DataFrame(Y_train).tail()

Unnamed: 0,meter_reading
20216095,8.75
20216096,4.825
20216097,0.0
20216098,159.574997
20216099,2.85


#### Add categories

In [8]:
categories = ["building_id", "site_id", "meter", "primary_use", "month", "day", "hour"]

dataset = lgb.Dataset(X_train, label=Y_train, categorical_feature=categories, free_raw_data=False)

#### Build model

In [9]:
params = {
    "objective": "regression",
    "metric": "rmsle"
}

model = lgb.train(params, train_set=dataset)



#### Read & Process test values

In [10]:
df_test = pd.read_csv(path_test)
weather_test = pd.read_csv(path_weather_test)

df_test = reduce_mem_usage(df_test)
weather_test = reduce_mem_usage(weather_test)

Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.65 MB
Decreased by 71.8%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 8.96 MB
Decreased by 53.0%


In [11]:
X_test, row_ids = merge_and_process_data(df_test, building, weather_test, is_test=True)
gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


78

#### Finally Predict

In [12]:
pred = model.predict(X_test)
gc.collect()
pred = np.clip(pred, 0, a_max=None)

In [None]:
X_test.tail()

In [13]:
pd.DataFrame(pred).tail()

Unnamed: 0,0
41697595,62.685221
41697596,62.685221
41697597,62.685221
41697598,62.685221
41697599,109.859188


In [14]:
submission = pd.DataFrame({"row_id": row_ids, "meter_reading": pred})
submission.to_csv("new_submission_2.csv", index=False)
print("DONE")

DONE


In [None]:
#### Plots
import matplotlib.pyplot as plt
# plt.scatter(X_test, row_ids)
# plt.plot(X_test, pred)
# plt.show()
# lgb.plot(pred)
ax = lgb.plot_importance(model, max_num_features=10)
plt.show()