# Summary

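This notebook concatenates the engineered target variables of each stock symbol into one main target dataframe, expands the label arrays into one column per return interval, and saves the result to a single CSV file.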

# Imports and configuration

In [1]:
import pickle
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Name of the engineered target variable; it is also the name of the sub-folder holding its files.
TARGET_VAR = 'discrete_return_90d_5p'

# Root folder of the curated FMP data. Set the FMP_DATA_DIR environment variable to run the
# notebook on another machine; the fallback is the original author's local folder.
FMP_DATA_DIR = os.environ.get(
    'FMP_DATA_DIR',
    r"C:\Users\mushj\Downloads\CURATED FINANCE DATA\FMP",
)

# Folder the per-symbol CSVs and the pickled interval labels are read from.
INPUT_PATH = os.path.join(FMP_DATA_DIR, TARGET_VAR)
# The combined output is written beneath the input folder.
OUTPUT_PATH = INPUT_PATH

In [3]:
# get paths to all datasets; sorted because glob's order depends on the file system and
# the row order of the combined frame follows the order of this list
dataset_names = sorted(glob.glob(os.path.join(INPUT_PATH, '*.csv'), recursive=False))
print("Number of stock symbols:", len(dataset_names))
# fail with a clear message instead of an IndexError when the folder holds no CSV files
if not dataset_names:
    raise FileNotFoundError(f"No CSV files found in {INPUT_PATH}")
print("Example path:", dataset_names[0])

Number of stock symbols: 978
Example path: C:\Users\mushj\Downloads\CURATED FINANCE DATA\FMP/discrete_return_90d_5p\A.csv


In [4]:
# Load the pickled list of intervals that sits in the same folder as the per-symbol CSVs.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
labels_path = INPUT_PATH+'/labels'
with open(labels_path, 'rb') as labels_file:
    intervals = pickle.load(labels_file)

intervals

[(-inf, -0.15),
 (-0.15, -0.077),
 (-0.077, -0.039),
 (-0.039, -0.02),
 (-0.02, 0),
 (0, 0.02),
 (0.02, 0.039),
 (0.039, 0.077),
 (0.077, 0.15),
 (0.15, inf)]

# Combine

In [5]:
# read every per-symbol CSV into its own dataframe, keeping them in a list
df_list = [
    pd.read_csv(csv_path)
    for csv_path in tqdm(dataset_names, desc="Loading target variable data")
]

Loading target variable data: 100%|█████████████████████████████████████████████████| 978/978 [00:03<00:00, 321.71it/s]


In [6]:
# stack the per-symbol frames row-wise under a fresh 0..n-1 index
target_df = pd.concat(df_list, ignore_index=True)

In [7]:
# Preview the concatenated targets; pandas elides the middle rows of a long frame.
target_df

Unnamed: 0,symbol,week,labels
0,A,2004-12-27,[0 1 1 1 1 1 1 0 0 0]
1,A,2005-01-03,[0 0 1 1 1 1 1 1 1 0]
2,A,2005-01-10,[0 0 0 1 1 1 1 1 1 0]
3,A,2005-01-17,[0 0 1 1 1 1 1 1 1 1]
4,A,2005-01-24,[0 1 1 1 1 1 1 1 1 0]
...,...,...,...
852794,ZTS,2024-12-02,
852795,ZTS,2024-12-09,
852796,ZTS,2024-12-16,
852797,ZTS,2024-12-23,


In [8]:
# Inspect the column dtypes of the concatenated frame.
target_df.dtypes

symbol    object
week      object
labels    object
dtype: object

In [9]:
# Fraction of missing values in each column.
target_df.isna().mean()

symbol    0.000000
week      0.000000
labels    0.013762
dtype: float64

# Preprocessing

In [10]:
# Missing labels stem from too little data in the forecast horizon to generate conclusive labels.
# Drop every row that contains a missing value, then renumber the index from zero.
target_df = (
    target_df
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

In [11]:
def _split_label_string(raw):
    """Turn a string such as '[0 1 1]' into the token list ['0', '1', '1']."""
    return raw.strip('[]').split()

# strip the surrounding brackets, then split on whitespace
labels_df = target_df['labels'].map(_split_label_string)
labels_df.head()

0    [0, 1, 1, 1, 1, 1, 1, 0, 0, 0]
1    [0, 0, 1, 1, 1, 1, 1, 1, 1, 0]
2    [0, 0, 0, 1, 1, 1, 1, 1, 1, 0]
3    [0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
4    [0, 1, 1, 1, 1, 1, 1, 1, 1, 0]
Name: labels, dtype: object

In [12]:
# turn each token list into a row of integers, one column per interval
labels_df = (
    pd.DataFrame(labels_df.to_list())
    .astype(int)
    .set_axis(intervals, axis=1)
)
labels_df.head()

Unnamed: 0,"(-inf, -0.15)","(-0.15, -0.077)","(-0.077, -0.039)","(-0.039, -0.02)","(-0.02, 0)","(0, 0.02)","(0.02, 0.039)","(0.039, 0.077)","(0.077, 0.15)","(0.15, inf)"
0,0,1,1,1,1,1,1,0,0,0
1,0,0,1,1,1,1,1,1,1,0
2,0,0,0,1,1,1,1,1,1,0
3,0,0,1,1,1,1,1,1,1,1
4,0,1,1,1,1,1,1,1,1,0


In [13]:
# Fraction of missing values in each interval column after the expansion.
labels_df.isna().mean()

(-inf, -0.15)       0.0
(-0.15, -0.077)     0.0
(-0.077, -0.039)    0.0
(-0.039, -0.02)     0.0
(-0.02, 0)          0.0
(0, 0.02)           0.0
(0.02, 0.039)       0.0
(0.039, 0.077)      0.0
(0.077, 0.15)       0.0
(0.15, inf)         0.0
dtype: float64

In [14]:
# concatenate with symbols and dates data
# pd.concat(axis=1) aligns rows on the index, so a mismatch would silently introduce NaNs;
# enforce the check instead of only printing it. Index.equals also copes with indexes of
# different lengths, where an element-wise == would raise.
indexes_match = labels_df.index.equals(target_df.index)
print("Left indexes match right indexes:", indexes_match)
assert indexes_match, "labels_df and target_df must share the same index before concatenation"
combined_df = pd.concat([target_df, labels_df], axis=1)

Left indexes match right indexes: True


In [15]:
# (rows, columns) of the combined frame.
combined_df.shape

(841063, 13)

In [16]:
# Column dtypes of the combined frame.
combined_df.dtypes

symbol              object
week                object
labels              object
(-inf, -0.15)        int32
(-0.15, -0.077)      int32
(-0.077, -0.039)     int32
(-0.039, -0.02)      int32
(-0.02, 0)           int32
(0, 0.02)            int32
(0.02, 0.039)        int32
(0.039, 0.077)       int32
(0.077, 0.15)        int32
(0.15, inf)          int32
dtype: object

In [17]:
# Write the combined frame to <OUTPUT_PATH>/combined/combined.csv.
directory = os.path.join(OUTPUT_PATH, 'combined')
# remember whether the folder was there so the message below stays accurate
already_exists = os.path.isdir(directory)
# exist_ok=True avoids the check-then-create race; a regular file at this path still raises
os.makedirs(directory, exist_ok=True)
if already_exists:
    print('Directory already exists:', directory)
else:
    print('Created', directory)

combined_df.to_csv(os.path.join(directory, 'combined.csv'), index=False)

Created C:\Users\mushj\Downloads\CURATED FINANCE DATA\FMP/discrete_return_90d_5p/combined
