In [1]:
!pip -q install lightgbm

In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from lightgbm import LGBMRegressor

In [6]:
data = pd.read_csv("data/FIN_DATA_v4.csv")

In [7]:
data1 = data.copy()

In [11]:
denom = data1["PRICE_MEDIAN_LAG_3"].replace(0, np.nan)

In [12]:
data1["VARIABLED"] = (data1["PRICE_MEDIAN"] - data1["PRICE_MEDIAN_LAG_3"]) / denom

In [13]:
data1 = data1.replace([np.inf, -np.inf], np.nan).dropna(subset=["VARIABLED"])

In [14]:
data1 = pd.get_dummies(data1).sort_values("DATE_YM")

In [15]:
X = data1.drop("VARIABLED", axis=1)
y = data1["VARIABLED"]

In [17]:
DATE_SPLIT = 202012

In [19]:
train_mask = X["DATE_YM"] <= DATE_SPLIT
test_mask  = X["DATE_YM"] >  DATE_SPLIT

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test,  y_test  = X.loc[test_mask],  y.loc[test_mask]

In [21]:
model_LGBM = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [22]:
model_LGBM.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2331
[LightGBM] [Info] Number of data points in the train set: 2484, number of used features: 41
[LightGBM] [Info] Start training from score 0.050330


In [23]:
y_train_pred = model_LGBM.predict(X_train)
y_test_pred  = model_LGBM.predict(X_test)

In [27]:
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse  = np.sqrt(mean_squared_error(y_test, y_test_pred))

train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
test_mape  = mean_absolute_percentage_error(y_test, y_test_pred)

train_r2 = r2_score(y_train, y_train_pred)
test_r2  = r2_score(y_test, y_test_pred)

In [29]:
result_LGBM = pd.DataFrame(
    {
        "R2":   [train_r2, test_r2],
        "RMSE": [train_rmse, test_rmse],
        "MAPE": [train_mape, test_mape],
    },
    index=["train", "test"]
)

result_LGBM

Unnamed: 0,R2,RMSE,MAPE
train,0.996002,0.012906,360824700000.0
test,0.777756,0.177391,1319552000000.0
