# Libs, data importing

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
import plotly.express as px
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
%matplotlib inline

# Modelling and Forecasting
# ==============================================================================
from sklearn.ensemble import RandomForestRegressor


from datetime import date
from joblib import dump, load
from rich.progress import track

# Configuration
# ==============================================================================
import warnings
import holidays
warnings.filterwarnings('ignore')



In [36]:
data_train = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/train_preprocessed.csv')
data_train.drop('Unnamed: 0', axis=1, inplace=True)


In [37]:
data_test = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/test.csv')

In [38]:
data_val = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/valid.csv')
# drop row 0101000020E610000000000000000000000000000000000000
data_val = data_val[data_val['point'] != '0101000020E610000000000000000000000000000000000000']

In [39]:
data_unique_points = data_train['point'].unique()
test_unique_points = data_test['point'].unique()
valid_unique_points = data_val['point'].unique()

# adding datetime holidays, weekends, weekdays

Add datetime column to test and valid datasets

In [40]:
data_test['datetime'] = pd.to_datetime(data_test['hour'], unit='s')
data_val['datetime'] = pd.to_datetime(data_val['hour'], unit='s')
data_train['datetime'] = pd.to_datetime(data_train['time'])

In [41]:
data_train

Unnamed: 0,point,lon,lat,time,timestamp,num_posts,datetime
0,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-22 11:00:00,1574420400,1,2019-11-22 11:00:00
1,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-22 12:00:00,1574424000,1,2019-11-22 12:00:00
2,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-23 13:00:00,1574514000,1,2019-11-23 13:00:00
3,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-25 16:00:00,1574697600,1,2019-11-25 16:00:00
4,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-27 10:00:00,1574848800,1,2019-11-27 10:00:00
...,...,...,...,...,...,...,...
3625651,0101000020E6100000FF9D4C0EC3853E4094782B2D0DF3...,30.522508,59.898840,2019-12-06 04:00:00,3151209600,2,2019-12-06 04:00:00
3625652,0101000020E6100000FF9D4C0EC3853E4094782B2D0DF3...,30.522508,59.898840,2019-12-23 08:00:00,1577088000,1,2019-12-23 08:00:00
3625653,0101000020E6100000FF9D4C0EC3853E40ABD94A1972EF...,30.522508,59.870670,2019-06-10 09:00:00,1560157200,1,2019-06-10 09:00:00
3625654,0101000020E6100000FF9D4C0EC3853E40ABD94A1972EF...,30.522508,59.870670,2019-10-06 14:00:00,1570370400,1,2019-10-06 14:00:00


Add weekday to test and valid

In [42]:
data_train["dayhour"] = data_train["datetime"].dt.hour
data_train["weekday"] = data_train["datetime"].dt.weekday

data_test["dayhour"] = data_test["datetime"].dt.hour
data_test["weekday"] = data_test["datetime"].dt.weekday

data_val["dayhour"] = data_val["datetime"].dt.hour
data_val["weekday"] = data_val["datetime"].dt.weekday

Add holidays

In [43]:
holidays_russia = holidays.country_holidays('RU', years = [2019, 2020])

data_train['is_holiday'] = data_train['datetime'].apply(lambda x: 1 if x in holidays_russia else 0)
data_test['is_holiday'] = data_test['datetime'].apply(lambda x: 1 if x in holidays_russia else 0)
data_val['is_holiday'] = data_val['datetime'].apply(lambda x: 1 if x in holidays_russia else 0)

In [44]:
data_train = data_train.drop(['time'], axis=1)

# Random Forest solution

In [45]:
def custom_metric(y_true, y_pred):
    return abs(y_true - y_pred) / y_pred

def flatten(l):
    return [item for sublist in l for item in sublist]

def rfr_model_fit_predict(data_train: pd.DataFrame, data_test: pd.DataFrame, model):

    data_test_predictions =[]
    data_test_rf_error = []
    test_unique_points = data_test['point'].unique()

    for point in track(test_unique_points, description='Fitting and predicting'):
        data_train_point = data_train[data_train['point'] == point]
        data_test_point = data_test[data_test['point'] == point]

        X_train = data_train_point[['timestamp', 'dayhour', 'weekday', 'is_holiday']]
        y_train = data_train_point['num_posts']
        X_test = data_test_point[['hour', 'dayhour', 'weekday', 'is_holiday']]
        y_test = data_test_point['sum']

        model.fit(X_train, y_train)
        y_pred = np.round(model.predict(X_test))
        data_test_predictions.append(y_pred)
        data_test_rf_error.append(custom_metric(y_test, y_pred))
    
    return data_test_rf_error, data_test_predictions

### test data

time = 46 s

error = 0.8662

In [46]:
%%time

data_test_rfr_error, data_test_predictions = rfr_model_fit_predict(data_train, data_test, RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))
data_test_rfr_error = flatten(data_test_rfr_error)
data_test_predictions = flatten(data_test_predictions)
data_test['rfr_predictions'] = data_test_predictions
data_test['rfr_error'] = data_test_rfr_error
data_test['rfr_error'].mean()

Output()

CPU times: user 46.2 s, sys: 112 ms, total: 46.3 s
Wall time: 46.2 s


8.682857142857143

In [48]:
data_test

Unnamed: 0,hour,lat,lon,point,sum,error,datetime,dayhour,weekday,is_holiday,rfr_predictions,rfr_error
0,1582711200,59.934863,30.331616,0101000020E61000009BAC04C2E4543E40DB251193A9F7...,7,0.370265,2020-02-26 10:00:00,10,2,0,1.0,6.0
1,1581937200,59.940488,30.329370,0101000020E6100000DBC1F19351543E4006FC5DE561F8...,6,0.754735,2020-02-17 11:00:00,11,0,0,1.0,5.0
2,1581523200,59.905597,30.297929,0101000020E61000006AEBE80E454C3E407F614299EAF3...,5,3.754735,2020-02-12 16:00:00,16,2,0,1.0,4.0
3,1581512400,59.921359,30.356319,0101000020E6100000CFC2D4BC375B3E401FBF4913EFF5...,16,4.088069,2020-02-12 13:00:00,13,2,0,1.0,4.0
4,1581771600,59.939363,30.315895,0101000020E61000006141807FDE503E40A554BF083DF8...,10,2.088069,2020-02-15 13:00:00,13,5,0,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1582059600,59.926986,30.331616,0101000020E61000009BAC04C2E4543E4070B5CC78A7F6...,6,0.754735,2020-02-18 21:00:00,21,1,0,1.0,4.0
696,1582038000,59.925860,30.295683,0101000020E6100000AB00D6E0B14B3E406379569882F6...,15,1.215399,2020-02-18 15:00:00,15,1,0,1.0,5.0
697,1581926400,59.937113,30.282208,0101000020E6100000328064CC3E483E400C288C4EF3F7...,20,18.754735,2020-02-17 08:00:00,8,0,0,1.0,6.0
698,1582099200,59.933737,30.322632,0101000020E61000009E01B90998523E407AB3D8B484F7...,8,2.754735,2020-02-19 08:00:00,8,2,0,1.0,4.0


### validation data

time = 48 s

error = 0.86883

In [49]:
%%time
data_val_rfr_error, data_val_predictions = rfr_model_fit_predict(data_train, data_val, RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42))
data_val_rfr_error = flatten(data_val_rfr_error)
data_val_predictions = flatten(data_val_predictions)
data_val['rfr_predictions'] = data_val_predictions
data_val['rfr_error'] = data_val_rfr_error
data_val['rfr_error'].mean()

Output()

CPU times: user 47.8 s, sys: 63.1 ms, total: 47.9 s
Wall time: 47.7 s


8.848396501457726

In [50]:
data_test.to_csv('data/test_rfr_predictions.csv')
data_val.to_csv('data/val_rfr_predictions.csv')