In [1]:
import pandas as pd

In [2]:
# Load the hourly tweet aggregates from disk and preview the first five rows.
tweets_hour_pd = pd.read_csv('tweets_hour.csv')
tweets_hour_pd.head()

Unnamed: 0,year,month,day,hour,sentiment,hourly_score,hourly_count,datetime
0,2009,4,7,5,1,0.647632,64,2009-04-07 05:00:00
1,2009,4,7,5,0,-0.404465,17,2009-04-07 05:00:00
2,2009,4,7,6,0,-0.432473,20,2009-04-07 06:00:00
3,2009,4,7,6,1,0.601063,85,2009-04-07 06:00:00
4,2009,4,7,7,1,0.604386,74,2009-04-07 07:00:00


In [3]:
# Show column dtypes and non-null counts of the loaded frame.
tweets_hour_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177 entries, 0 to 1176
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          1177 non-null   int64  
 1   month         1177 non-null   int64  
 2   day           1177 non-null   int64  
 3   hour          1177 non-null   int64  
 4   sentiment     1177 non-null   int64  
 5   hourly_score  1177 non-null   float64
 6   hourly_count  1177 non-null   int64  
 7   datetime      1177 non-null   object 
dtypes: float64(1), int64(6), object(1)
memory usage: 73.7+ KB


In [4]:
# Keep a single row per (datetime, sentiment) pair and report how many rows remain.
tweets_hour_pd = tweets_hour_pd.drop_duplicates(subset=['datetime', 'sentiment'])
print(tweets_hour_pd.shape[0])

# Look up the rows recorded for one specific hour.
tweets_hour_pd.loc[tweets_hour_pd['datetime'] == '2009-04-07 01:00:00']

1177


Unnamed: 0,year,month,day,hour,sentiment,hourly_score,hourly_count,datetime


In [5]:
# multiprocessing
from joblib import Parallel, delayed

# data manipulation
import numpy as np
import pandas as pd
from darts import TimeSeries
from darts.utils.timeseries_generation import datetime_attribute_timeseries

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# transformers and preprocessing
from darts.dataprocessing.transformers import Scaler

# models
#from darts.models import NaiveSeasonal, StatsForecastAutoARIMA, ExponentialSmoothing, Prophet #local
#from darts.models import LightGBMModel, RNNModel, NBEATSModel, TFTModel #global

# likelihood
from darts.utils.likelihood_models import GaussianLikelihood

# evaluation
from darts.metrics import mape, coefficient_of_variation, mae 

# settings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings — confirm that is intended.
import warnings
warnings.filterwarnings("ignore")
# Disable all logging calls of severity CRITICAL and below, for every logger.
import logging
logging.disable(logging.CRITICAL)

In [6]:
# Work on a copy so the hourly frame loaded earlier is left untouched.
dataset_scaled_EDA = tweets_hour_pd.copy()

# Per-(hour, sentiment) minimum and maximum of the hourly tweet count,
# broadcast back onto every row of the group.
hourly_count_by_group = dataset_scaled_EDA.groupby(['hour', 'sentiment'])['hourly_count']
group_min = hourly_count_by_group.transform('min')
group_max = hourly_count_by_group.transform('max')
dataset_scaled_EDA['min_hour'] = group_min
dataset_scaled_EDA['max_hour'] = group_max

# Min-max scale the count inside its group.
# A group whose min equals its max divides 0 by 0 and therefore yields NaN.
dataset_scaled_EDA['hour_scaled'] = (
    (dataset_scaled_EDA['hourly_count'] - dataset_scaled_EDA['min_hour'])
    / (dataset_scaled_EDA['max_hour'] - dataset_scaled_EDA['min_hour'])
)

# Parse the timestamp and add the day of week as an ordered categorical.
# `.dt.day_name()` returns English names regardless of the process locale, so the
# values always match the English categories below (strftime('%A') is locale-dependent).
dataset_scaled_EDA['datetime'] = pd.to_datetime(dataset_scaled_EDA['datetime'])
dataset_scaled_EDA['day_of_week'] = pd.Categorical(
    dataset_scaled_EDA['datetime'].dt.day_name(),
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ordered=True,
)

dataset_scaled_EDA.head(10)

Unnamed: 0,year,month,day,hour,sentiment,hourly_score,hourly_count,datetime,min_hour,max_hour,hour_scaled,day_of_week
0,2009,4,7,5,1,0.647632,64,2009-04-07 05:00:00,8,278,0.207407,Tuesday
1,2009,4,7,5,0,-0.404465,17,2009-04-07 05:00:00,3,54,0.27451,Tuesday
2,2009,4,7,6,0,-0.432473,20,2009-04-07 06:00:00,10,59,0.204082,Tuesday
3,2009,4,7,6,1,0.601063,85,2009-04-07 06:00:00,44,249,0.2,Tuesday
4,2009,4,7,7,1,0.604386,74,2009-04-07 07:00:00,34,230,0.204082,Tuesday
5,2009,4,7,7,0,-0.374728,15,2009-04-07 07:00:00,11,63,0.076923,Tuesday
6,2009,4,7,8,0,-0.354589,21,2009-04-07 08:00:00,14,47,0.212121,Tuesday
7,2009,4,7,8,1,0.613716,65,2009-04-07 08:00:00,26,200,0.224138,Tuesday
8,2009,4,7,9,1,0.600345,72,2009-04-07 09:00:00,24,193,0.284024,Tuesday
9,2009,4,7,9,0,-0.401946,6,2009-04-07 09:00:00,6,50,0.0,Tuesday


In [7]:
def set_df_sent(df_send, sent=False, target_col='hourly_count', rename_target_col='target', new_index=False,
                split_col='datetime', split_date='2009-04-01', freq_timestamp='H', x_col=None):
    """Filter a frame by sentiment, rename the target, split it by date and plot the split.

    Parameters
    ----------
    df_send : pandas.DataFrame
        Input frame; it is copied, never modified. Must contain `target_col`,
        `split_col` and, when `sent` is given, a `sentiment` column.
    sent : False, None, int, str or bool
        Sentiment value to keep. ``False`` (default) or ``None`` keeps every row.
        Values that can be converted with ``int()`` are compared as integers, so
        ``1``, ``'1'`` and ``True`` select the rows where ``sentiment == 1`` and
        ``0`` / ``'0'`` select the rows where ``sentiment == 0``.
    target_col, rename_target_col : str
        `target_col` is renamed to `rename_target_col` in the result.
    new_index : bool
        When not equal to ``False``, `split_col` is parsed as a timestamp, used as
        the index, re-sampled with ``asfreq(freq_timestamp)`` and the gaps that
        creates are filled with 0.
    split_col : str
        Timestamp column used for the train/test split (and as index, see above).
    split_date : str
        ``'%Y-%m-%d'`` date; rows strictly before it go to train, the rest to test.
    freq_timestamp : str
        Frequency alias handed to ``DataFrame.asfreq``.
    x_col : list of str, optional
        Extra columns to keep in front of the target. The list is copied, so
        neither the caller's list nor a shared default is ever modified.

    Returns
    -------
    tuple
        ``(df, df_train, df_test)`` restricted to the selected columns.
    """
    df = df_send.copy()

    # `0` is a legitimate sentiment value, so only the sentinels False/None mean "no filter".
    filter_by_sentiment = sent is not False and sent is not None
    sent_desc = 'All'  # used in the plot title when no sentiment filter is applied
    if filter_by_sentiment:
        try:
            sent_value = int(sent)
        except (TypeError, ValueError):
            sent_value = sent
        sent_desc = 'Positive' if sent_value == 1 else 'Negative'
        print(f'Dataframe Sentiment: {sent_desc}')

        # Keep only the requested sentiment; copy so later column writes hit a real frame.
        df = df[df['sentiment'] == sent_value].copy()

    n_row = len(df)
    print(f'Total Number of rows: {n_row}')

    # Fresh list on every call: appending below must not leak into the caller or the next call.
    cols = list(x_col) if x_col is not None else []
    use_time_index = new_index != False  # noqa: E712 - keeps the original truthiness rule

    if use_time_index:
        # Use the parsed timestamp as index.
        df[split_col] = pd.to_datetime(df[split_col], format='%Y-%m-%d %H:%M:%S')
        df = df.set_index(split_col)

        # Insert the timestamps missing at the requested frequency and fill them with 0.
        df = df.asfreq(freq_timestamp)
        print(f"NA's rows Number: {np.abs(n_row - len(df))}")
        df = df.fillna(0)

    df = df.rename(columns={target_col: rename_target_col})
    cols.append(rename_target_col)
    if not use_time_index:
        cols.append(split_col)
    df = df[cols]

    # Splitting dataframes into train-test
    split_point = pd.to_datetime(split_date, format='%Y-%m-%d')
    total_rows = len(df)

    def _share(part):
        # Guard against an empty frame instead of dividing by zero.
        return len(part) / total_rows if total_rows else 0.0

    if not use_time_index:
        stamps = pd.to_datetime(df[split_col], format='%Y-%m-%d %H:%M:%S')
        df_train = df[stamps < split_point]
        df_test = df[stamps >= split_point]
        train_span, test_span = df_train[split_col], df_test[split_col]
    else:
        df_train = df.loc[df.index < split_point]
        df_test = df.loc[df.index >= split_point]
        train_span, test_span = df_train.index, df_test.index

    print(f"Train dates : {train_span.min()} --- {train_span.max()}  (n={len(df_train)} -> {_share(df_train):.2%})")
    print(f"Test dates  : {test_span.min()} --- {test_span.max()}  (n={len(df_test)} -> {_share(df_test):.2%})")

    fig, ax = plt.subplots(figsize=(11, 4))
    # Skip empty parts so an empty split does not abort the whole call while plotting.
    if not df_train.empty:
        df_train[rename_target_col].plot(ax=ax, label='train', color='blue')
    if not df_test.empty:
        df_test[rename_target_col].plot(ax=ax, label='test', color='black')
    ax.legend()
    ax.set_title(f"Train-Test dataframes {sent_desc} Sentiment")

    return df, df_train, df_test

In [8]:
# Build one hourly TimeSeries of `hourly_count` per `sentiment` group,
# working from a copy of the hourly frame.
dataset_ts = TimeSeries.from_group_dataframe(
    df=tweets_hour_pd.copy(),
    group_cols='sentiment',
    time_col='datetime',
    value_cols='hourly_count',
    freq='H',
)

In [9]:
# First timestamp that belongs to the test part of every series.
first_test_date = pd.Timestamp('2009-04-08')

# Cut each series right before the split date, then collect both halves.
split_pairs = [single_ts.split_before(first_test_date) for single_ts in tqdm(dataset_ts)]
train_dataset_ts = [train_part for train_part, _ in split_pairs]
test_dataset_ts = [test_part for _, test_part in split_pairs]

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 333.40it/s]


In [10]:
# Forecast horizon = number of time steps in the test part of the series at index 0.
forecast_horizons = len(test_dataset_ts[0])

def _backtests_local_estimator(_estimator, _ts_set, _split_date, _horizons, _single_forecast):
    model = _estimator
    if _single_forecast:
        model.fit(_ts_set.split_before(_split_date)[0])
        backtests_single_ts = model.predict(_horizons)
    
    else:
        backtests_single_ts = model.historical_forecasts(series=_ts_set, 
                                                         start=_split_date - np.timedelta64(_horizons-1, 'D'), 
                                                         verbose=False, 
                                                         overlap_end=False,
                                                         last_points_only=True, 
                                                         forecast_horizon=_horizons,
                                                         retrain=True)
    
    return backtests_single_ts

def backtests_multiple_local_estimators(estimator, multiple_ts_sets=dataset_ts, split_date=first_test_date, horizons=forecast_horizons, single_forecast=True):
    """Run `_backtests_local_estimator` for every series in parallel.

    The same `estimator`, `split_date`, `horizons` and `single_forecast` are handed
    to each job; one job is created per element of `multiple_ts_sets`.
    Returns the list of per-series results produced by joblib.
    """
    worker_pool = Parallel(n_jobs=-1,
                           verbose=5,
                           backend='multiprocessing',
                           pre_dispatch='1.5*n_jobs')

    # Lazy generator of jobs; arguments are passed in the worker's parameter order.
    jobs = (
        delayed(_backtests_local_estimator)(estimator, one_series, split_date, horizons, single_forecast)
        for one_series in multiple_ts_sets
    )

    return worker_pool(jobs)


def darts_kpi(prediction_series, test_series=test_dataset_ts):
    """Print the mean of three forecast metrics across all series, rounded to 2 decimals.

    Parameters
    ----------
    prediction_series : sequence of forecast series
        Forecasts, aligned one-to-one with `test_series`.
    test_series : sequence of actual series
        Ground truth; defaults to the notebook's test split.

    The second metric is darts' ``coefficient_of_variation``, so it is reported as
    CV(RMSE) rather than as a plain RMSE.
    """
    kpis = (
        ('MAPE', mape),
        ('CV(RMSE)', coefficient_of_variation),
        ('MAE', mae),
    )
    for label, metric in kpis:
        per_series = metric(actual_series=test_series,
                            pred_series=prediction_series,
                            n_jobs=-1)
        score = np.round(np.mean(per_series), 2)
        print(f' {label}: {score}')

In [None]:
from darts.models import NaiveSeasonal

# Baseline: seasonal-naive forecast for every series, followed by the aggregate KPIs.
# NOTE(review): K=365 is a season of 365 time steps; the series are built with
# freq='H', so this looks like 365 hours rather than a year — confirm the intended period
# and that the training part is long enough for it.
backtests_baseline_model = backtests_multiple_local_estimators(estimator=NaiveSeasonal(K=365))
darts_kpi(backtests_baseline_model)

# Plot the held-out values of the series at index 0 against its forecast.
fig, ax = plt.subplots(figsize=(30, 10))
test_dataset_ts[0].plot(label='True value', color='black')
backtests_baseline_model[0].plot(label='Forecast', color='green')
plt.show()

In [None]:
from darts.models import Croston

# Croston model with default settings for every series, followed by the aggregate KPIs.
backtests_croston = backtests_multiple_local_estimators(estimator=Croston())
darts_kpi(backtests_croston)

# Plot the held-out values of the series at index 0 against its forecast.
fig, ax = plt.subplots(figsize=(30, 10))
test_dataset_ts[0].plot(label='True value', color='black')
backtests_croston[0].plot(label='Forecast', color='green')
plt.show()

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 12 concurrent workers.


In [None]:
# The notebook-level import of this model is commented out, so import it here
# (same pattern as the other model cells) to keep the cell runnable on a fresh kernel.
from darts.models import StatsForecastAutoARIMA

# AutoARIMA with default settings for every series, followed by the aggregate KPIs.
backtests_AUTOarima = backtests_multiple_local_estimators(estimator=StatsForecastAutoARIMA())
darts_kpi(backtests_AUTOarima)

# Plot the held-out values of the series at index 0 against its forecast.
fig, ax = plt.subplots(figsize=(30, 10))
test_dataset_ts[0].plot(label='True value', color='black')
backtests_AUTOarima[0].plot(label='Forecast', color='green')
plt.show()