In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import pickle
from urllib.request import urlopen
import matplotlib.pyplot as plt
from nose.tools import *
import time
import datetime as dt
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, skew, kurtosis
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV,cross_val_score 
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
import joblib
import lightgbm as lgb
from lightgbm import LGBMRegressor

In [None]:
# Read Ext_Data/external_data_no_cyc.csv (path relative to the working directory).
fin_ext_data = pd.read_csv(Path("Ext_Data").joinpath("external_data_no_cyc.csv"))

In [None]:
# Read the training split from data/train.parquet (path relative to the working directory).
data = pd.read_parquet(Path("data").joinpath("train.parquet"))

In [None]:
# Read the test split from data/test.parquet (path relative to the working directory).
data1 = pd.read_parquet(Path("data").joinpath("test.parquet"))

In [None]:
# Bare expression: the notebook renders the external-data frame exactly as it was read from disk.
fin_ext_data

In [None]:
# Drop the stray 'Unnamed: 0' column and parse `date` into datetimes so it can
# be used as a merge key.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
fin_ext_data = (
    fin_ext_data
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

In [None]:
# Attach the external columns to every training row sharing the same `date`.
# Inner join (the pandas default): training rows whose `date` has no match in
# fin_ext_data are dropped from the result.
train_merged = pd.merge(data, fin_ext_data, how="inner", on="date")

In [None]:
# Attach the external columns to every test row sharing the same `date`.
# Inner join (the pandas default): test rows whose `date` has no match in
# fin_ext_data are dropped from the result.
test_merged = pd.merge(data1, fin_ext_data, how="inner", on="date")

In [None]:
# Order the training rows by timestamp, then by counter, before splitting off the target.
data_tr = train_merged.sort_values(by=["date", "counter_name"])
# Features: everything except the raw count and its log-transformed version.
X_train = data_tr.drop(columns=["log_bike_count", "bike_count"])
# Target: the log-transformed count as a plain array.
y_train = data_tr["log_bike_count"].values

In [None]:
# Order the test rows by timestamp, then by counter, before splitting off the target.
data_ts = test_merged.sort_values(by=["date", "counter_name"])
# Features: everything except the raw count and its log-transformed version.
X_test = data_ts.drop(columns=["log_bike_count", "bike_count"])
# Target: the log-transformed count as a plain array.
y_test = data_ts["log_bike_count"].values

In [None]:
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
import time

# Fixed seed so the randomized search draws the same 40 candidates on every
# Restart & Run All; without it best_params_ and all reported scores drift
# between runs.
RANDOM_STATE = 42

# First pipeline step: replace `date` with year/month/day/weekday/hour columns.
date_encoder = FunctionTransformer(_encode_dates)
# Names of the columns _encode_dates produces from a lone `date` column.
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()
# Numeric columns that get standardised.
num_features = ['temp', 'dwpt', 'rhum', 'prcp', 'wdir', 'wspd', 'pres']

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name", "season"]

# Columns forwarded to the model unchanged.
rest_cols = ['holiday', 'weekend', 'is_night', 'lockdown1', 'lockdown2']

# Columns not listed in any transformer below are dropped (ColumnTransformer's
# default `remainder`).
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("numf", StandardScaler(), num_features),
        ("rem", 'passthrough', rest_cols)
    ]
)

# Search space; the "lgbmregressor__" prefix is the step name make_pipeline
# derives from the LGBMRegressor class.
# NOTE(review): min_data_in_leaf / lambda_l2 look like LightGBM's native
# parameter names rather than LGBMRegressor constructor arguments; presumably
# they are forwarded as extra keyword parameters -- confirm.
params = {"lgbmregressor__learning_rate" : [0.01, 0.05, 0.1, 0.2],
          "lgbmregressor__max_depth" : [5, 6, 8, 9, 10, 12],
          "lgbmregressor__num_leaves" : [50, 100, 200, 400, 800, 1000],
          "lgbmregressor__min_data_in_leaf" : [50, 100, 200, 500],
          "lgbmregressor__lambda_l2": [1, 3, 5, 7, 9],
          "lgbmregressor__n_estimators" : [1000, 5000, 10000]
}

regressor = LGBMRegressor(random_state=RANDOM_STATE)
pipe = make_pipeline(date_encoder, preprocessor, regressor)

# NOTE(review): cv=5 gives unshuffled K-fold splits, and X_train is sorted by
# date, so some folds are validated on rows older than their training rows.
# TimeSeriesSplit is imported at the top of the notebook and may be the
# intended splitter -- confirm before trusting the CV scores.
rscv = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    n_iter=40,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=RANDOM_STATE,
)
rscv.fit(X_train, y_train)

In [None]:
# Parameter setting of the best-scoring candidate found by the randomized search.
bst = rscv.best_params_
bst  # bare expression so the notebook renders the dict

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict once per split and reuse the result for both metrics; each
# rscv.predict call pushes the whole frame through the refitted best pipeline.
y_pred_train = rscv.predict(X_train)
y_pred_test = rscv.predict(X_test)

# RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which recent
# scikit-learn releases deprecate and later remove from mean_squared_error.
print(
    f"Train set, RMSE={np.sqrt(mean_squared_error(y_train, y_pred_train)):.7f}"
)
print(
    f"Test set, RMSE={np.sqrt(mean_squared_error(y_test, y_pred_test)):.7f}"
)

# These lines report R^2; they were previously mislabelled as RMSE.
print(
    f"Train set, R2={r2_score(y_train, y_pred_train):.7f}"
)
print(
    f"Test set, R2={r2_score(y_test, y_pred_test):.7f}"
)