In [11]:
import pandas as pd
from pandas.tseries.offsets import DateOffset
import numpy as np
from dateutil.relativedelta import relativedelta

# import data

In [27]:
# Load the news articles and parse their publication timestamps.
news_df = pd.read_csv("/home/pqian/data/690/News Sentiment/news.csv").assign(
    publication_datetime=lambda frame: pd.to_datetime(frame["publication_datetime"])
)

# Load the prices and add each ticker's close-to-close percentage change.
price_df = pd.read_csv("/home/pqian/data/690/News Sentiment/price.csv")
closes_by_ticker = price_df.groupby("ticker")["close"]
price_df["daily_return"] = closes_by_ticker.transform(
    lambda closes: closes.pct_change()
)

# Drop every row that has a missing value in any column (this includes each
# ticker's first row, which has no previous close), then parse the dates.
price_df = price_df.dropna().assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

In [29]:
# Preview the loaded news articles.
news_df

Unnamed: 0,publication_datetime,title,body,tickers
0,2017-01-03,World News: Police Question Netanyahu Over Gifts,"""We pay attention to publications in the media...",EL
1,2017-01-03,Business News: Nestle Turns to New CEO for Hea...,"Nestle, the world's largest packaged-food comp...",GIS
2,2017-01-03,Business News: Vermont Drug Law Faces Limits -...,"The Vermont law, enacted in June, instructed s...",ABBV
3,2017-01-03,Life & Arts -- Travel: How Hotel Companies Lau...,Travelers are about to see a flurry of new hot...,HLT
4,2017-01-03,Businesses Ready to Ramp Up Investment --- Aft...,The Federal Reserve last month signaled intere...,HD
...,...,...,...,...
20545,2020-12-30,Terms in Google Ad Deal Revealed,"Ten Republican attorneys general, led by Texas...",GOOGL
20546,2020-12-30,'Son of Sam' Law Invoked in Phony Heiress Case,"Last year, the New York state attorney general...",NFLX
20547,2020-12-30,Boeing MAX Returns to U.S. Sky,Daily round-trip flights between Miami and New...,BA
20548,2020-12-30,"To Curb Ma's Empire, China Weighs Taking a Big...","The regulators, led by the central bank, also ...",PYPL


# Data Sampling
Consider two scenarios:
 - time-series split:  
        Train: up to 2019-12-31  
        Validation: 2020-01-01 to 2020-06-30  
        Test: from 2020-07-01 onward
    - The issue with this split is that the validation set has a different distribution from the training set due to COVID
- rolling-window split:  
        Train: 12 months  
        Validation: 4 months  
        Test: 2 months  
        Rolling step: 2 months

In [20]:
# Build rolling train / validation / test splits of `news_df`.
#
# Each split is a dict with the keys "train_set", "val_set", "test_set"
# (slices of `news_df`) and "years" (the "YYYY-MM" labels of the months that
# each window covers). Windows are consecutive and non-overlapping within a
# split; successive splits are shifted forward by `step_months`.

# Define window sizes (in calendar months)
train_months = 12
val_months = 4
test_months = 2
step_months = 2

# Calendar range the windows must stay inside. These bounds are hard-coded,
# not derived from the data.
start_date = pd.to_datetime("2017-01-01")
end_date = pd.to_datetime("2020-12-31")

# Initialize result list
rolling_splits = []

# Loop-invariant helpers, hoisted out of the loop.
one_day = pd.Timedelta(days=1)
published = news_df["publication_datetime"]

current_start = start_date

while True:
    # `*_end` is the last calendar day (inclusive) of each window.
    train_start = current_start
    train_end = train_start + relativedelta(months=train_months) - one_day

    val_start = train_end + one_day
    val_end = val_start + relativedelta(months=val_months) - one_day

    test_start = val_end + one_day
    test_end = test_start + relativedelta(months=test_months) - one_day

    # Stop if test_end exceeds data range
    if test_end > end_date:
        break

    # Select with half-open intervals [start, next_start). Comparing against
    # `<= *_end` would compare against midnight of the last day and silently
    # drop any article published later on that day from every set.
    test_stop = test_end + one_day
    train_set = news_df[(published >= train_start) & (published < val_start)]
    val_set = news_df[(published >= val_start) & (published < test_start)]
    test_set = news_df[(published >= test_start) & (published < test_stop)]

    # Month labels ("YYYY-MM") for every month start inside each window.
    years = {
        "train": pd.date_range(train_start, train_end, freq="MS")
        .strftime("%Y-%m")
        .tolist(),
        "val": pd.date_range(val_start, val_end, freq="MS").strftime("%Y-%m").tolist(),
        "test": pd.date_range(test_start, test_end, freq="MS")
        .strftime("%Y-%m")
        .tolist(),
    }

    rolling_splits.append(
        {
            "train_set": train_set,
            "val_set": val_set,
            "test_set": test_set,
            "years": years,
        }
    )

    # Move forward by rolling step
    current_start += relativedelta(months=step_months)

In [26]:
# Inspect the last rolling split that was generated.
rolling_splits[-1]

{'train_set':       publication_datetime                                              title  \
 13057           2019-07-01  Streetwise: Fund's Ethical Impact Is Difficult...   
 13058           2019-07-01  Markets Review & Outlook: Second Quarter --- U...   
 13059           2019-07-01  Markets Review & Outlook: Second Quarter --- U...   
 13060           2019-07-01  Markets Review & Outlook: Second Quarter --- S...   
 13061           2019-07-01  Jony Ive's Long Drift From Apple --- The desig...   
 ...                    ...                                                ...   
 18034           2020-06-30  Business News: Cloud Arm Zeros In on Space Org...   
 18035           2020-06-30  Business News: Cloud Arm Zeros In on Space Org...   
 18036           2020-06-30             Casinos Are Sued Over Virus Protection   
 18037           2020-06-30  AMC Delays Reopening Theaters As Studios Push ...   
 18038           2020-06-30                                          Overheard   
 
 