# Summary

This notebook concatenates the engineered target variables of each stock symbol into one main target dataframe and performs exploratory analysis to reveal the distributions and properties of the target variable.

# Imports and configuration

In [1]:
import pickle
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Folder holding one .csv of weekly target labels per stock symbol (columns: symbol,
# week, labels) together with the pickled `labels` interval list.
# Set the TARGET_DATA_DIR environment variable to point the notebook at another
# location; the original local path is kept as the default.
INPUT_PATH = os.environ.get(
    "TARGET_DATA_DIR",
    r"C:\Users\mushj\Downloads\CURATED FINANCE DATA\FMP\discrete_return_30d",
)
OUTPUT_PATH = INPUT_PATH

In [3]:
# Collect the per-symbol .csv paths. glob returns files in arbitrary (OS-dependent)
# order, so sort them to keep the row order of the concatenated frame reproducible.
dataset_names = sorted(glob.glob(os.path.join(INPUT_PATH, '*.csv'), recursive=False))
if not dataset_names:
    # fail with a clear message instead of an IndexError on dataset_names[0]
    raise FileNotFoundError(f"No .csv datasets found in {INPUT_PATH!r}")
print("Number of stock symbols:", len(dataset_names))
print("Example path:", dataset_names[0])

Number of stock symbols: 978
Example path: C:\Users\mushj\Downloads\CURATED FINANCE DATA\FMP\discrete_return_30d\A.csv


In [4]:
# Load the list of return intervals; these are later used as the column names of the
# expanded label columns.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(os.path.join(INPUT_PATH, 'labels'), 'rb') as f:
    intervals = pickle.load(f)

intervals

[(-inf, -0.15),
 (-0.15, -0.077),
 (-0.077, -0.039),
 (-0.039, -0.02),
 (-0.02, 0),
 (0, 0.02),
 (0.02, 0.039),
 (0.039, 0.077),
 (0.077, 0.15),
 (0.15, inf)]

In [5]:
# read every per-symbol dataset into a list of dataframes (tqdm shows progress)
df_list = [
    pd.read_csv(csv_path)
    for csv_path in tqdm(dataset_names, desc="Loading target variable data")
]

Loading target variable data: 100%|█████████████████████████████████████████████████| 978/978 [00:02<00:00, 420.16it/s]


In [6]:
# stack all per-symbol frames row-wise under a fresh 0..n-1 index
target_df = pd.concat(df_list, axis=0, ignore_index=True)

In [7]:
# preview the stacked frame (pandas truncates the display to the first and last rows)
target_df

Unnamed: 0,symbol,week,labels
0,A,2004-12-27,[0 1 1 1 1 0 0 0 0 0]
1,A,2005-01-03,[0 0 1 1 1 1 1 0 0 0]
2,A,2005-01-10,[0 0 0 1 1 1 1 1 0 0]
3,A,2005-01-17,[0 0 0 0 1 1 1 1 1 0]
4,A,2005-01-24,[0 0 0 1 1 1 1 1 1 0]
...,...,...,...
852794,ZTS,2024-12-02,[0 1 1 0 1 1 0 0 0 0]
852795,ZTS,2024-12-09,
852796,ZTS,2024-12-16,
852797,ZTS,2024-12-23,


In [8]:
# fraction of missing values in each column
target_df.isna().mean()

symbol    0.000000
week      0.000000
labels    0.004586
dtype: float64

In [9]:
# column dtypes as loaded from the .csv files
target_df.dtypes

symbol    object
week      object
labels    object
dtype: object

# Preprocessing

In [10]:
# NAs in labels column are due to insufficient data in forecast horizon to generate conclusive labels.
# Restrict the drop to `labels` so a missing value in any other column is never
# removed silently as a side effect.
target_df = target_df.dropna(subset=['labels']).reset_index(drop=True)

In [11]:
# strip the surrounding square brackets, then split on whitespace,
# giving one list of '0'/'1' strings per row
labels_df = target_df['labels'].str.strip('[]').str.split()
labels_df.head()

0    [0, 1, 1, 1, 1, 0, 0, 0, 0, 0]
1    [0, 0, 1, 1, 1, 1, 1, 0, 0, 0]
2    [0, 0, 0, 1, 1, 1, 1, 1, 0, 0]
3    [0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
4    [0, 0, 0, 1, 1, 1, 1, 1, 1, 0]
Name: labels, dtype: object

In [12]:
# Expand the label strings into one integer column per return interval.
# The frame is parsed straight from target_df['labels'] rather than from the previous
# value of labels_df, so the cell can be re-run safely: reassigning labels_df from a
# Series to a DataFrame made a second run fail, because DataFrame has no .to_list().
labels_df = target_df['labels'].str.strip('[]').str.split(expand=True).astype(int)
# name each column after its interval; a length mismatch with `intervals` raises here
labels_df.columns = intervals
labels_df.head()

Unnamed: 0,"(-inf, -0.15)","(-0.15, -0.077)","(-0.077, -0.039)","(-0.039, -0.02)","(-0.02, 0)","(0, 0.02)","(0.02, 0.039)","(0.039, 0.077)","(0.077, 0.15)","(0.15, inf)"
0,0,1,1,1,1,0,0,0,0,0
1,0,0,1,1,1,1,1,0,0,0
2,0,0,0,1,1,1,1,1,0,0
3,0,0,0,0,1,1,1,1,1,0
4,0,0,0,1,1,1,1,1,1,0


In [13]:
# confirm the expanded label columns contain no missing values
labels_df.isna().mean()

(-inf, -0.15)       0.0
(-0.15, -0.077)     0.0
(-0.077, -0.039)    0.0
(-0.039, -0.02)     0.0
(-0.02, 0)          0.0
(0, 0.02)           0.0
(0.02, 0.039)       0.0
(0.039, 0.077)      0.0
(0.077, 0.15)       0.0
(0.15, inf)         0.0
dtype: float64

In [24]:
# concatenate with symbols and dates data
# pd.concat(axis=1) aligns rows on the index, so a mismatch would silently introduce
# NaN rows. Check alignment and stop if it does not hold, instead of only printing it.
index_aligned = labels_df.index.equals(target_df.index)
print(index_aligned)
assert index_aligned, "labels_df and target_df indexes differ; cannot concatenate column-wise"
combined_df = pd.concat([target_df, labels_df], axis=1)

True


# Dataset-level summary

In [25]:
# (rows, columns) of the combined frame
combined_df.shape

(848888, 13)

In [26]:
# dtypes after concatenation: the original object columns plus the integer label columns
combined_df.dtypes

symbol              object
week                object
labels              object
(-inf, -0.15)        int32
(-0.15, -0.077)      int32
(-0.077, -0.039)     int32
(-0.039, -0.02)      int32
(-0.02, 0)           int32
(0, 0.02)            int32
(0.02, 0.039)        int32
(0.039, 0.077)       int32
(0.077, 0.15)        int32
(0.15, inf)          int32
dtype: object

In [28]:
# proportion of 'positive class' of each label in entire dataset
# NOTE(review): the (-0.02, 0) label is positive in every row (mean of exactly 1.0), so it
# carries no information as a target -- confirm this is intended by the label construction.
combined_df[intervals].mean()

(-inf, -0.15)       0.061925
(-0.15, -0.077)     0.225208
(-0.077, -0.039)    0.482011
(-0.039, -0.02)     0.675147
(-0.02, 0)          1.000000
(0, 0.02)           0.908924
(0.02, 0.039)       0.701833
(0.039, 0.077)      0.536944
(0.077, 0.15)       0.270811
(0.15, inf)         0.083366
dtype: float64

# Symbol-level summary

In [31]:
# per-symbol proportion of rows in which each interval label is positive
df1 = combined_df.groupby('symbol')[intervals].agg('mean')

In [33]:
# distribution of 'proportion of positive class' across stocks (df1 has one row per symbol)
# e.g. stocks experienced price growths of >=15% within a 30-day forecast horizon from the weekly closing price
# 9.7752% of the time, on average (the `mean` row of the (0.15, inf) column)
df1.describe()

Unnamed: 0,"(-inf, -0.15)","(-0.15, -0.077)","(-0.077, -0.039)","(-0.039, -0.02)","(-0.02, 0)","(0, 0.02)","(0.02, 0.039)","(0.039, 0.077)","(0.077, 0.15)","(0.15, inf)"
count,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0,978.0
mean,0.072663,0.245938,0.501165,0.682071,1.0,0.902031,0.70351,0.550934,0.291605,0.097752
std,0.067072,0.123714,0.117906,0.052898,0.0,0.044996,0.039541,0.101291,0.129372,0.086271
min,0.001921,0.044188,0.199808,0.494717,1.0,0.487152,0.533937,0.26609,0.03074,0.000961
25%,0.028946,0.153917,0.41573,0.651258,1.0,0.887608,0.682037,0.483378,0.198847,0.036503
50%,0.050913,0.219981,0.497387,0.691671,1.0,0.912749,0.707973,0.552354,0.271854,0.072019
75%,0.091319,0.312394,0.584389,0.7195,1.0,0.930836,0.731592,0.626321,0.374107,0.126987
max,0.480392,0.72549,0.8125,0.815789,1.0,0.964457,0.79661,0.785714,0.696335,0.52451
