# Libraries and data import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
import plotly.express as px
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modelling and Forecasting
# ==============================================================================
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from rich.progress import track
from datetime import date
from joblib import dump, load

# Configuration
# ==============================================================================
import warnings
import holidays
warnings.filterwarnings('ignore')



In [None]:
# Load the preprocessed training data and discard the 'Unnamed: 0' column
# (presumably an index column saved with the CSV -- confirm against the file).
data_train = pd.read_csv(
    '/home/jerzy/Documents/IndustrialML/data/train_preprocessed.csv'
).drop(columns='Unnamed: 0')


In [None]:
# Load test.csv from a machine-specific absolute path (adjust when running elsewhere).
data_test = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/test.csv')

In [None]:
# Load valid.csv, then keep every row whose 'point' differs from the all-zero
# coordinate literal below (it appears to decode as EWKB POINT(0 0), SRID 4326).
data_val = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/valid.csv')
data_val = data_val.loc[
    data_val['point'].ne('0101000020E610000000000000000000000000000000000000')
]

In [None]:
# Distinct 'point' values of the train, test and validation frames,
# each in order of first appearance.
data_unique_points, test_unique_points, valid_unique_points = (
    frame['point'].unique() for frame in (data_train, data_test, data_val)
)

# Derive time features: hour of day, weekday and holiday flag

Add a datetime column to the train, test and validation datasets

In [None]:
# test/valid keep their time in 'hour'; unit='s' makes pandas read the values
# as seconds since the Unix epoch.
data_test['datetime'] = pd.to_datetime(data_test['hour'], unit='s')
data_val['datetime'] = pd.to_datetime(data_val['hour'], unit='s')
# train keeps its time in 'time', parsed with pandas' default format inference.
data_train['datetime'] = pd.to_datetime(data_train['time'])

In [None]:
# Display the training frame as the cell output.
data_train

Add hour-of-day (`dayhour`) and `weekday` columns to the train, test and validation datasets

In [None]:
# Derive the hour of day and the weekday number from 'datetime' for every
# dataset, in the same order as before: train, test, validation.
for _frame in (data_train, data_test, data_val):
    _moments = _frame["datetime"].dt
    _frame["dayhour"] = _moments.hour
    _frame["weekday"] = _moments.weekday

Add holidays

In [None]:
# Russian public-holiday calendar, pre-populated for 2019 and 2020.
holidays_russia = holidays.country_holidays('RU', years=[2019, 2020])


def _holiday_flag(moment):
    """Return 1 when ``moment`` is found in ``holidays_russia``, otherwise 0."""
    return int(moment in holidays_russia)


# Flag every row of each dataset, one membership lookup per timestamp.
for _frame in (data_train, data_test, data_val):
    _frame['is_holiday'] = _frame['datetime'].apply(_holiday_flag)

In [None]:
# The raw 'time' column has already been converted into 'datetime', so drop it.
data_train = data_train.drop(columns=['time'])

# Regressions: catboost, lightgbm and xgboost

In [None]:
# fig, ax = plt.subplots(figsize=(11, 4))

# data_train[data_train['point'] == '0101000020E6100000DBC1F19351543E4006FC5DE561F84D40']['num_posts'].plot(ax=ax, label='train')
# data_val[data_val['point'] == '0101000020E6100000DBC1F19351543E4006FC5DE561F84D40']['sum'].plot(ax=ax, label='validation')
# data_test[data_test['point'] == '0101000020E6100000DBC1F19351543E4006FC5DE561F84D40']['sum'].plot(ax=ax, label='test')
# ax.set_title('Number of users')
# ax.legend();

In [None]:
def custom_metric(y_true, y_pred):
    """Return the absolute error scaled by the true value.

    Computes ``abs(y_true - y_pred) / y_true``. Works on scalars and on any
    operands that support elementwise ``-``, ``abs`` and ``/``.
    """
    absolute_error = abs(y_true - y_pred)
    return absolute_error / y_true

def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list.

    Only one level of nesting is removed; order is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def learn_regressor(data_train: pd.DataFrame, data_test: pd.DataFrame, model):
    """Fit ``model`` separately for every point and collect per-row relative errors.

    For each distinct value of ``data_test['point']`` the model is fitted on the
    training rows of that point and evaluated on the test rows of that point.

    Parameters
    ----------
    data_train : pd.DataFrame
        Needs the columns ``point``, ``timestamp``, ``dayhour``, ``weekday``,
        ``is_holiday`` (features) and ``num_posts`` (target).
    data_test : pd.DataFrame
        Needs the columns ``point``, ``hour``, ``dayhour``, ``weekday``,
        ``is_holiday`` (features) and ``sum`` (observed target). The frame is
        not modified.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        refitted for every point, so this assumes ``fit`` discards any earlier
        fit -- confirm for the estimator in use.

    Returns
    -------
    list of list
        One inner list per distinct test point, in order of first appearance in
        ``data_test['point']``. Each inner list holds ``custom_metric`` for that
        point's rows in their original row order. Flattening the result matches
        the row order of ``data_test`` only if its rows are already grouped by
        point in that same order.
    """
    train_features = ['timestamp', 'dayhour', 'weekday', 'is_holiday']
    # The time feature is called 'hour' in the test frames, so predictions are
    # made from a plain array rather than from a frame with different names.
    test_features = ['hour', 'dayhour', 'weekday', 'is_holiday']

    errors_per_point = []
    for point in track(data_test['point'].unique(), description='Learning'):
        train_rows = data_train[data_train['point'] == point]
        test_rows = data_test[data_test['point'] == point]
        model.fit(train_rows[train_features], train_rows['num_posts'])
        # Keep predictions and errors in local variables instead of writing
        # scratch columns into a filtered slice of the caller's frame.
        predicted = np.round(model.predict(test_rows[test_features].values))
        point_errors = custom_metric(test_rows['sum'], predicted)
        errors_per_point.append(list(point_errors))
    return errors_per_point


## Mutual info score calculation

In [None]:
# Mutual information matrix
# =================================================================================
from sklearn.metrics import mutual_info_score

def mutual_info_matrix(data):
    """Return the pairwise mutual-information table of the columns of ``data``.

    Every pair of columns is scored with ``mutual_info_score`` and rounded to
    two decimals. The score is symmetric in its two arguments, so each pair is
    computed once and mirrored across the diagonal.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are compared with each other.

    Returns
    -------
    pd.DataFrame
        Square float frame indexed and labelled by ``data.columns``.
    """
    columns = list(data.columns)
    scores = np.empty((len(columns), len(columns)), dtype=float)
    for row, row_name in enumerate(track(columns, description='Calculating MI matrix')):
        # Only the diagonal and the upper triangle are computed.
        for col, col_name in enumerate(columns[row:], start=row):
            score = round(float(mutual_info_score(data[row_name], data[col_name])), 2)
            scores[row, col] = score
            scores[col, row] = score
    return pd.DataFrame(scores, index=data.columns, columns=data.columns)

# Pairwise MI table of the training frame, with all columns converted to float
# so that it can be drawn as a heatmap.
mi_train = mutual_info_matrix(data_train).astype(float)

In [None]:
# Annotated heatmap of the mutual-information table, drawn on its own 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(mi_train, annot=True, cmap='Blues', ax=ax)

In [None]:
# correlation heatmap
# =================================================================================
# Correlate numeric and boolean columns only: data_train still holds the string
# 'point' column and the 'datetime' column, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 (older versions dropped it silently).
corr = data_train.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, fmt='.2g', cmap='Blues', ax=ax)

## XGBoost

### test data

time = 36.2 s

error = 0.86883

In [None]:
%%time
# Fit an XGBRegressor per test point; the result is one list of relative errors per point.
data_test_xgboost_error = learn_regressor(data_train, data_test, XGBRegressor())

In [None]:
data_test_xgboost_error = flatten(data_test_xgboost_error)
# The errors come back grouped by point (points in order of first appearance,
# rows in original order within a point). Rebuild that row order so each error
# is written to the row it was computed for, even if data_test is not sorted
# by point.
_rows_by_point = np.argsort(pd.factorize(data_test['point'])[0], kind='stable')
_aligned_error = np.empty(len(data_test))
_aligned_error[_rows_by_point] = data_test_xgboost_error
data_test['xgboost_error'] = _aligned_error

In [None]:
# Mean relative error over all test rows (shown as the cell output).
data_test['xgboost_error'].mean()

### validation data

time = 37.2 s

error = 0.86883

In [None]:
%%time
# Fit an XGBRegressor per validation point; the result is one list of relative errors per point.
data_val_xgboost_error = learn_regressor(data_train, data_val, XGBRegressor())

In [None]:
# Collapse the per-point error lists into one flat list (point-grouped order).
data_val_xgboost_error = flatten(data_val_xgboost_error)

In [None]:
# The flat error list is grouped by point (points in order of first appearance,
# rows in original order within a point). Rebuild that row order so each error
# is written to the row it was computed for, even if data_val is not sorted by
# point.
_rows_by_point = np.argsort(pd.factorize(data_val['point'])[0], kind='stable')
_aligned_error = np.empty(len(data_val))
_aligned_error[_rows_by_point] = data_val_xgboost_error
data_val['xgboost_error'] = _aligned_error
data_val['xgboost_error'].mean()

## CatBoost 

In [None]:
# 

### test data

time = 121.6 s

error = 0.86764

In [None]:
%%time
# Fit a CatBoostRegressor per test point and flatten the per-point error lists.
data_test_catboost_error = learn_regressor(data_train, data_test, CatBoostRegressor())
data_test_catboost_error = flatten(data_test_catboost_error)
# The errors are grouped by point (points in order of first appearance, rows in
# original order within a point). Rebuild that row order so each error is
# written to the row it was computed for, even if data_test is not sorted by
# point.
_rows_by_point = np.argsort(pd.factorize(data_test['point'])[0], kind='stable')
_aligned_error = np.empty(len(data_test))
_aligned_error[_rows_by_point] = data_test_catboost_error
data_test['catboost_error'] = _aligned_error
data_test['catboost_error'].mean()


### validation data

time = 124.9 s

error = 0.87005

In [None]:
%%time
# Fit a CatBoostRegressor per validation point and flatten the per-point error lists.
data_val_catboost_error = learn_regressor(data_train, data_val, CatBoostRegressor())
data_val_catboost_error = flatten(data_val_catboost_error)
# The errors are grouped by point (points in order of first appearance, rows in
# original order within a point). Rebuild that row order so each error is
# written to the row it was computed for, even if data_val is not sorted by
# point.
_rows_by_point = np.argsort(pd.factorize(data_val['point'])[0], kind='stable')
_aligned_error = np.empty(len(data_val))
_aligned_error[_rows_by_point] = data_val_catboost_error
data_val['catboost_error'] = _aligned_error
data_val['catboost_error'].mean()

## LightGBM

### test data

time = 30.2 s

error = 0.866203

In [None]:
%%time
# Fit an LGBMRegressor per test point and flatten the per-point error lists.
data_test_lgdmboost_error = learn_regressor(data_train, data_test, LGBMRegressor())
data_test_lgdmboost_error = flatten(data_test_lgdmboost_error)
# The errors are grouped by point (points in order of first appearance, rows in
# original order within a point). Rebuild that row order so each error is
# written to the row it was computed for, even if data_test is not sorted by
# point.
_rows_by_point = np.argsort(pd.factorize(data_test['point'])[0], kind='stable')
_aligned_error = np.empty(len(data_test))
_aligned_error[_rows_by_point] = data_test_lgdmboost_error
data_test['lgdmboost_error'] = _aligned_error
data_test['lgdmboost_error'].mean()

### validation data

time = 30.5 s

error = 0.868838

In [None]:
%%time
# Fit an LGBMRegressor per validation point and flatten the per-point error lists.
data_val_lgdmboost_error = learn_regressor(data_train, data_val, LGBMRegressor())
data_val_lgdmboost_error = flatten(data_val_lgdmboost_error)
# The errors are grouped by point (points in order of first appearance, rows in
# original order within a point). Rebuild that row order so each error is
# written to the row it was computed for, even if data_val is not sorted by
# point.
_rows_by_point = np.argsort(pd.factorize(data_val['point'])[0], kind='stable')
_aligned_error = np.empty(len(data_val))
_aligned_error[_rows_by_point] = data_val_lgdmboost_error
data_val['lgdmboost_error'] = _aligned_error
data_val['lgdmboost_error'].mean()