# Libraries and data import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
import plotly.express as px
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modelling and Forecasting
# ==============================================================================
from sklearn.ensemble import RandomForestRegressor


from datetime import date
from joblib import dump, load
from rich.progress import track

# Configuration
# ==============================================================================
import warnings
import holidays
warnings.filterwarnings('ignore')



In [None]:
# Load the preprocessed training set and discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier export -- confirm).
data_train = pd.read_csv(
    '/home/jerzy/Documents/IndustrialML/data/train_preprocessed.csv'
).drop(columns='Unnamed: 0')


In [None]:
# Load the test set from its CSV file.
data_test = pd.read_csv(
    filepath_or_buffer='/home/jerzy/Documents/IndustrialML/data/test.csv',
)

In [None]:
# Load the validation set from its CSV file.
data_val = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/valid.csv')
# Exclude rows whose `point` equals the hex string below (it looks like the
# EWKB encoding of a point at (0, 0) -- TODO confirm).
# The explicit .copy() makes the filtered frame an independent object, so the
# column assignments performed on `data_val` later in the notebook do not run
# into pandas' SettingWithCopyWarning ambiguity.
data_val = data_val[data_val['point'] != '0101000020E610000000000000000000000000000000000000'].copy()

In [None]:
# Distinct `point` values present in each of the three datasets.
data_unique_points = pd.unique(data_train['point'])
test_unique_points = pd.unique(data_test['point'])
valid_unique_points = pd.unique(data_val['point'])

# Adding datetime features: holidays, weekends, weekdays

Add a `datetime` column to the train, test and validation datasets

In [None]:
# Test and validation store time in `hour`, parsed here as seconds since the epoch.
for frame in (data_test, data_val):
    frame['datetime'] = pd.to_datetime(frame['hour'], unit='s')

# The training set stores time in a `time` column that pandas parses directly.
data_train['datetime'] = pd.to_datetime(data_train['time'])

In [None]:
# Bare expression: in a notebook cell this renders the training frame for inspection.
data_train

Add hour-of-day (`dayhour`) and `weekday` columns to the train, test and validation datasets

In [None]:
# Derive the hour of day and the day of week from `datetime` in every dataset.
for frame in (data_train, data_test, data_val):
    stamps = frame["datetime"].dt
    frame["dayhour"] = stamps.hour
    frame["weekday"] = stamps.weekday

Add a holiday flag (`is_holiday`) based on the Russian holiday calendar

In [None]:
# Holiday calendar for country code 'RU', requested for the years 2019 and 2020.
holidays_russia = holidays.country_holidays('RU', years = [2019, 2020])

# Flag each row: 1 when its timestamp is found in the calendar, otherwise 0.
for frame in (data_train, data_test, data_val):
    frame['is_holiday'] = frame['datetime'].apply(
        lambda stamp: int(stamp in holidays_russia)
    )

In [None]:
# The raw `time` column is no longer needed once `datetime` has been derived from it.
data_train = data_train.drop(columns=['time'])

# Random Forest solution

In [None]:
def custom_metric(y_true, y_pred):
    """Return the absolute error divided by the true value.

    Works element-wise for anything that supports ``-``, ``abs`` and ``/``
    (scalars, NumPy arrays, pandas Series). There is no guard for
    ``y_true == 0``; the division is performed as-is.
    """
    absolute_error = abs(y_true - y_pred)
    return absolute_error / y_true

def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def rfr_model_fit_predict(data_train: pd.DataFrame, data_test: pd.DataFrame, model):
    """Fit ``model`` separately for every point in ``data_test`` and score it.

    For each distinct value of ``data_test['point']`` the model is re-fitted on
    the training rows of that point and used to predict the evaluation rows of
    the same point. Predictions are rounded to the nearest integer before the
    error is computed with ``custom_metric``.

    Parameters
    ----------
    data_train : pd.DataFrame
        Must contain the columns ``point``, ``timestamp``, ``dayhour``,
        ``weekday``, ``is_holiday`` and the target ``num_posts``.
    data_test : pd.DataFrame
        Must contain the columns ``point``, ``hour``, ``dayhour``, ``weekday``,
        ``is_holiday`` and the target ``sum``.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is re-fitted for every point.

    Returns
    -------
    list
        One ``custom_metric`` result per point, in the order the points first
        appear in ``data_test``. Each entry is computed from a column of
        ``data_test``, so it covers exactly the rows of that point.

    Raises
    ------
    ValueError
        If a point of ``data_test`` has no rows in ``data_train``.
    """
    train_features = ['timestamp', 'dayhour', 'weekday', 'is_holiday']
    test_features = ['hour', 'dayhour', 'weekday', 'is_holiday']

    data_test_rf_error = []
    test_unique_points = data_test['point'].unique()

    for point in track(test_unique_points, description='Fitting and predicting'):
        data_train_point = data_train[data_train['point'] == point]
        data_test_point = data_test[data_test['point'] == point]

        if data_train_point.empty:
            # Fail with a message that names the point instead of an opaque
            # "0 samples" error from inside the estimator.
            raise ValueError(f"No training rows for point {point!r}")

        # The time feature is called 'timestamp' in the training frame and
        # 'hour' in the evaluation frame. scikit-learn checks that DataFrame
        # column names seen at predict time match those seen at fit time, so
        # the features are handed over as plain arrays (same column order).
        X_train = data_train_point[train_features].to_numpy()
        y_train = data_train_point['num_posts']
        X_test = data_test_point[test_features].to_numpy()
        y_test = data_test_point['sum']

        model.fit(X_train, y_train)
        y_pred = np.round(model.predict(X_test))
        data_test_rf_error.append(custom_metric(y_test, y_pred))

    return data_test_rf_error

### test data

Run time: 46 s

Mean relative error: 0.8662

In [None]:
%%time

# Fit one random forest per point and collect the per-point error Series.
data_test_rfr_error = rfr_model_fit_predict(
    data_train,
    data_test,
    RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
)
# The errors come back grouped by point, not in row order. Each per-point
# result is computed from a column of `data_test`, so it still carries the
# index labels of its rows; assigning the concatenated Series aligns every
# error with its own row instead of relying on position.
data_test['rfr_error'] = pd.concat(data_test_rfr_error)
# Keep the flat list of error values available under the original name.
data_test_rfr_error = flatten(data_test_rfr_error)
data_test['rfr_error'].mean()

### validation data

Run time: 48 s

Mean relative error: 0.86883

In [None]:
%%time
# Fit one random forest per point and collect the per-point error Series.
data_val_rfr_error = rfr_model_fit_predict(
    data_train,
    data_val,
    RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
)
# The errors come back grouped by point, not in row order. Each per-point
# result is computed from a column of `data_val`, so it still carries the
# index labels of its rows; assigning the concatenated Series aligns every
# error with its own row instead of relying on position.
data_val['rfr_error'] = pd.concat(data_val_rfr_error)
# Keep the flat list of error values available under the original name.
data_val_rfr_error = flatten(data_val_rfr_error)
data_val['rfr_error'].mean()