# Summary

This notebook calculates price growths within a specified horizon from every week and bins them into discrete levels, to be used as the multi-label target variable for a prediction task.

# Imports and configuration

In [1]:
import pickle
import numpy as np
import pandas as pd

from typing import List, Tuple

In [2]:
INPUT_PATH = "C:/Users/mushj/Downloads/PROCESSED FINANCE DATA/FMP"
OUTPUT_PATH = "C:/Users/mushj/Downloads/CURATED FINANCE DATA/FMP"

In [3]:
# load historical daily prices
df = pd.read_csv(INPUT_PATH+'/FMP_daily_prices_top1k.csv')

# load list of symbols to keep (based on exploratory analysis)
with open(INPUT_PATH+'/top1k_subset', 'rb') as f:
    include_symbols = pickle.load(f)

# Prepare DataFrame

In [4]:
print("Number of symbols to include:", len(include_symbols))
print("Examples:", include_symbols[:5])

Number of symbols to include: 978
Examples: ['A', 'AA', 'AAL', 'AAON', 'AAP']


In [5]:
df = df.query("symbol in @include_symbols")
df['date'] = pd.to_datetime(df['date'])

In [6]:
df.head()

Unnamed: 0,symbol,date,close,volume
0,A,2005-01-03,16.09,3587208.0
1,A,2005-01-04,15.66,3978002.0
2,A,2005-01-05,15.66,4139634.0
3,A,2005-01-06,15.31,3353443.0
4,A,2005-01-07,15.3,2786175.0


# Engineer multi-label target variable

1. The daily stock price data is downsampled to a weekly frequency. The last daily closing price is used as the week's closing price. For example, if the week contains prices for Monday till Friday, then Friday's closing price is used.
2. Given a forecast horizon (e.g. 1 year ahead), price growths are computed for each week, between a week's closing price and EVERY subsequent DAILY closing price within the forecast horizon.
3. Discrete intervals are defined, for example, <-25%, -25to0%, 0to25% >25%. The multi-label target variable for each week is a vector of binary values that indicate whether a stock will experience a certain level of price growth within the given horizon from that week. For example, a vector of [0, 1, 0, 1] for week 50 of stock ABC means that the stock will experience at least one price change in the interval -25to0%, and at least one price change in the interval >25%, relative to the closing price of week 50.

## Get weekly data

In [7]:
%%time
weekly_df = (
    df
    .sort_values(['symbol', 'date'])
    .groupby('symbol')
    .resample('W-MON', on='date', label='left') # mark each week with Monday
    .last() # get last price of the week
    .reset_index(level='date') # set date index as column
    .reset_index(drop=True) # remove other indexes
    .rename({'date': 'week'}, axis=1)
)

CPU times: total: 22.6 s
Wall time: 23.7 s


In [8]:
weekly_df.head()

Unnamed: 0,week,symbol,close,volume
0,2004-12-27,A,16.09,3587208.0
1,2005-01-03,A,15.23,4190098.0
2,2005-01-10,A,14.86,3027509.0
3,2005-01-17,A,14.45,5241370.0
4,2005-01-24,A,14.9,2821500.0


In [33]:
weekly_df.isna().mean()

week      0.0
symbol    0.0
close     0.0
volume    0.0
dtype: float64

In [43]:
weekly_df.dtypes

week      datetime64[ns]
symbol            object
close            float64
volume           float64
dtype: object

## Validate weekly data

In [28]:
%%time
# store the complete sets of weeks between the start and end weeks of each symbol (span)
week_span = {}

for symbol, data in weekly_df.groupby('symbol'):
    weeks = data.query("symbol == @symbol")['week']
    start, end = weeks.min(), weeks.max()
    week_span[symbol] = set(pd.date_range(start, end, freq='W-MON'))

CPU times: total: 21.5 s
Wall time: 21.8 s


In [29]:
%%time
# detect missing weeks for each symbol
missing_weeks = []

for symbol, data in weekly_df.groupby('symbol'):
    # get weeks in symbol span that are not in symbol data
    missing = week_span[symbol].difference(set(data['week']))
    for w in missing:
        missing_weeks.append({'symbol': symbol, 'missing_week': w})
        
missing_weeks = pd.DataFrame(missing_weeks)

CPU times: total: 1.38 s
Wall time: 1.45 s


In [30]:
# check missing weeks
missing_weeks

## Compute price changes - functions

In [37]:
def label_intervals(
    price_changes: np.array, intervals: List[Tuple]
) -> np.array:
    """Returns an array of boolean values, each indicating whether 
    a price change within an interval is detected.
    Arguments:
        price_changes (np.array): array of price changes
        intervals (List[Tuple]): each tuple contains the lower and upper boundaries of the intervals
    """
    labels = np.zeros(len(intervals), dtype=int)
    
    for i, (low, upp) in enumerate(intervals):
        if np.any((price_changes > low) & (price_changes <= upp)):
            labels[i] = 1
            
    return labels

In [38]:
# test label_intervals()
pc_arr = np.array([0.21, 0.14, -0.18, 0.55, -0.05])
intervals = [(-np.inf, -0.20), (-0.20, 0), (0, 0.20), (0.20, np.inf)]
label_intervals(pc_arr, intervals)

array([0, 1, 1, 1])

In [48]:
def engineer_targets(
    weekly_df: pd.DataFrame, daily_df: pd.DataFrame,
    forecast_horizon: int, intervals: List[Tuple]
) -> pd.DataFrame:
    """Returns a DataFrame that maps each stock's weekly data to the multi-label target variable,
    which indicates the discrete levels of price changes actualized within the forecast horizon relative to
    the week's closing price.
    Arguments:
        weekly_df (pd.DataFrame): contains the weekly closing stock prices of each symbol
        daily_df (pd.DataFrame): contains the daily closing stock prices of each symbol
        forecast_horizon (int): number of days in the forecast horizon
        intervals: each tuple contains the lower and upper boundaries of the price change intervals
    Prerequisites:
        1. weekly_df and daily_df must be sorted by symbol and date (ascending)
        2. the 'date' columns of weekly_df and daily_df must be formated as datetime
    """
    # Initialize a list to store results
    results = []

    # Process each stock symbol separately
    for symbol, group in weekly_df.groupby('symbol'):

        # Iterate through each week's closing price
        for i, row in group.iterrows():
            current_date = row['date']
            current_price = row['close']

            # Define the forecast horizon
            horizon_end_date = current_date + pd.DateOffset(days=forecast_horizon)

            # Get daily prices within the forecast horizon
            future_prices = (
                daily_df
                .query("(symbol == @symbol) & (date > @current_date) & (date <= @horizon_end_date)")
                ['close']
                .values
            )

            # Compute price growths
            price_changes = (future_prices - current_price) / current_price

            # Label based on intervals
            labels = label_intervals(price_changes, intervals)

            # Append results
            results.append({
                'symbol': symbol,
                'week_start_date': current_date,
                'labels': labels
            })

    # Convert results to a DataFrame
    target_df = pd.DataFrame(results)

    return target_df

In [52]:
# test engineer_targets()
test_weekly = pd.DataFrame(
    [
        ['ABC', pd.Timestamp('2025-01-01'), 100.00],
        ['ABC', pd.Timestamp('2025-02-01'), 200.00],
        ['XYZ', pd.Timestamp('2025-01-01'), 300.00]
    ], 
    columns=['symbol', 'date', 'close']
)

test_daily = pd.DataFrame(
    [
        ['ABC', pd.Timestamp('2025-01-02'), 105.00], # +5%
        ['ABC', pd.Timestamp('2025-01-08'), 85.00],  # -15%
        ['ABC', pd.Timestamp('2025-01-20'), 200.00], # +100% but outside of horizon
        ['ABC', pd.Timestamp('2025-02-02'), 210.00], # +5%
        ['XYZ', pd.Timestamp('2025-01-02'), 200.00]  # -33.33%
    ], 
    columns=['symbol', 'date', 'close']
)

intervals = [(-np.inf, -0.20), (-0.20, 0), (0, 0.20), (0.20, np.inf)]

engineer_targets(test_weekly, test_daily, forecast_horizon=10, intervals=intervals)

Unnamed: 0,symbol,week_start_date,labels
0,ABC,2025-01-01,"[0, 1, 1, 0]"
1,ABC,2025-02-01,"[0, 0, 1, 0]"
2,XYZ,2025-01-01,"[1, 0, 0, 0]"


## Compute price changes