# How to set up this notebook

```
python3 -m venv venv_make_targets
source venv_make_targets/bin/activate
pip install -r requirements_openbb.txt
pip install -r requirements_make_targets.txt
```

In [None]:
import numpy as np
import pandas as pd
import os
import glob
import pickle
import json
import sys
import argparse
from datetime import datetime, timedelta
from scipy import stats
from openbb_terminal.sdk import openbb

In [None]:
from make_targets import (
    get_all_targets, get_normalized_column, bin_targets)

In [None]:
# Load the notebook configuration from a fixed JSON file.
# config_dict is reset to None first so that a failed load cannot leave a
# stale configuration from an earlier run of this cell in scope.
config_dict = None
config_file = "/app/config.json"
# JSON text is UTF-8 by specification; state it explicitly instead of
# depending on the platform's default locale encoding.
with open(config_file, encoding="utf-8") as json_file:
    config_dict = json.load(json_file)

In [None]:
def make_targets_02(symbol, start_date, end_date, price_data_sp500, all_10k_df):
    '''
    Function to generate target return information for one symbol, with one
    row per period between consecutive annual report dates.

    Args:
        symbol: stock ticker
        start_date: overall historical start date from where price data is to be fetched
        end_date: overall end date up to which price data is to be fetched
        price_data_sp500: Prefetched dataframe for ticker ^GSPC which gives price data for S&P500
        all_10k_df: dataframe with a "ticker" column and a "reportDate" column;
            report dates are parsed as '%Y-%m-%d' strings
    Returns:
        Pandas DF with one row per retained period. The columns are whatever
        get_all_targets returns for the 'target' and 'sp500' prefixes, plus
        'report_date', 'start_date', 'end_date' and 'symbol'. When no period
        is retained the frame has no rows and only the 'symbol' column.
    Raises:
        ValueError: if a report date does not match '%Y-%m-%d'.
    '''
    date_format = '%Y-%m-%d'
    # Start and end dates are offset by 2 days to be conservative and to
    # allow the price to settle.
    settle_offset = timedelta(days=2)
    # Periods shorter than this many days are skipped.
    min_period_days = 200

    price_data = openbb.stocks.load(
        symbol, start_date=start_date, end_date=end_date, verbose=False)
    ar_dates_series = all_10k_df[all_10k_df["ticker"] == symbol]["reportDate"]
    ar_dates = list(ar_dates_series.sort_values(ascending=True))

    # Collect the per-period frames and concatenate once at the end; growing
    # a DataFrame with pd.concat inside the loop re-copies all earlier rows
    # on every iteration.
    period_frames = []
    # Pair each report date with the next one; the last date has no successor.
    for curr_date_str, next_date_str in zip(ar_dates, ar_dates[1:]):
        curr_report_date = datetime.strptime(curr_date_str, date_format)
        curr_start_date = curr_report_date + settle_offset
        curr_end_date_12m = \
            datetime.strptime(next_date_str, date_format) - settle_offset
        num_days_12m = (curr_end_date_12m - curr_start_date).days
        if num_days_12m < min_period_days:
            continue

        target_dict = get_all_targets(
            price_data, curr_start_date, num_days_12m, 'target')
        sp500_dict = get_all_targets(
            price_data_sp500, curr_start_date, num_days_12m, 'sp500')
        target_dict.update(sp500_dict)
        # Turn the flat dict into a single-row frame (keys become columns).
        target_df = pd.DataFrame.from_dict(target_dict, orient='index').T
        target_df['report_date'] = curr_report_date
        target_df['start_date'] = curr_start_date
        target_df['end_date'] = curr_end_date_12m
        period_frames.append(target_df)

    if period_frames:
        df = pd.concat(period_frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    df['symbol'] = symbol
    return df


def make_targets_all_symbols_02(start_date, end_date, all_10k_df):
    '''
    Function to return the complete dataframe for all symbols and all annual report date periods

    Args:
        start_date: overall start date passed to the price data loader
        end_date: overall end date passed to the price data loader
        all_10k_df: dataframe with a "ticker" column; every unique ticker is
            processed with make_targets_02
    Returns:
        Pandas DF holding the rows produced by make_targets_02 for every
        symbol, with a fresh 0..n-1 index. An empty DataFrame is returned
        when all_10k_df contains no tickers.
    '''
    symbol_names = all_10k_df["ticker"].unique()
    # The S&P500 series is the same for every symbol, so fetch it only once.
    price_data_sp500 = openbb.stocks.load(
        '^GSPC', start_date=start_date, end_date=end_date,
        verbose=False)

    # Gather the per-symbol frames and concatenate once; concatenating inside
    # the loop re-copies the accumulated frame for every symbol.
    symbol_frames = []
    total = len(symbol_names)
    for done, symbol in enumerate(symbol_names, start=1):
        symbol_frames.append(make_targets_02(
            symbol, start_date, end_date, price_data_sp500, all_10k_df))
        print(f'Completed: {done}/{total}')

    # pd.concat raises ValueError on an empty list, so handle that explicitly.
    if not symbol_frames:
        return pd.DataFrame()
    return pd.concat(symbol_frames, ignore_index=True)


In [None]:
# Load the pickled dataframe of tickers we care about; its location is taken
# from the "10k_df_pkl_pathfn" entry of the config.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
all_10k_df = None
with open(config_dict["10k_df_pkl_pathfn"], mode="rb") as f:
    all_10k_df = pickle.load(f)

In [None]:
# Print a summary of the loaded dataframe (columns, dtypes, non-null counts)
# as a quick sanity check before building targets.
all_10k_df.info()

In [None]:
# NOTE(review): bare expression; by default Jupyter only echoes the last
# expression of a cell, so this value is not displayed here.
openbb.__version__
start_date = '2002-01-01'
end_date = '2023-12-31'

# Author notes (JPIEN) on the target columns:
# Targets is really "return %" for the given period
# target_min is the min return (0.2 quantile) for the given start / end period
#   NOTE(review): 0.2 may be a typo for 0.02, given the 0.98 below - confirm
#   against make_targets.get_all_targets.
# target_max is the max return (0.98 quantile)
# target_Xn is the return of adj close price at the 3m, 6m, end date compared
#   with the start_date adj close price
targets_df = make_targets_all_symbols_02(start_date, end_date, all_10k_df)

# Keep only the rows without any missing value. The explicit copy makes the
# result an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
targets_df_filtered = targets_df.dropna(axis=0, how='any').copy()

In [None]:

# Era is "year" of annual report filing.
# DataFrame.assign returns a new frame with the extra column, which avoids
# writing into a frame that was obtained by filtering another one (that
# pattern can raise pandas' SettingWithCopyWarning).
targets_df_filtered = targets_df_filtered.assign(
    era=targets_df_filtered['report_date'].apply(lambda x: x.year))

In [None]:

# Keep only the first row for each (era, symbol) pair. Duplicates could occur
# if consecutive annual reports are published in the same year.
targets_df_filtered_dropdup = (
    targets_df_filtered
    .drop_duplicates(subset=['era', 'symbol'])
    .reset_index(drop=True)
)


def remove_single_entries(df):
    """Return df when it has more than one row, otherwise None."""
    return df if len(df) > 1 else None


# Drop eras where there is only one ticker: this relies on groupby.apply
# leaving out the groups for which the function returns None.
targets_df_filtered_dedup = targets_df_filtered_dropdup.groupby(
    'era', group_keys=False).apply(remove_single_entries)

# Names of the target columns, i.e. every column starting with 'target' -
#   target_min, target_max, target_3m, target_6m, target_9m, target_12m
target_cols = [
    col for col in targets_df_filtered_dedup.columns
    if col.startswith('target')
]

In [None]:

# Generate normalised target columns.
# Author note (JPIEN): for each annual report year, determine where each
#   ticker's return falls on a gaussian curve in relation to each other.
#   NOTE(review): the actual normalisation lives in
#   make_targets.get_normalized_column - confirm there.
# The frame is rebound on every pass so each target column is normalised
# per era on top of the previous result.
for target in target_cols:
    targets_df_filtered_dedup = targets_df_filtered_dedup.groupby(
        'era', group_keys=False).apply(
            lambda df: get_normalized_column(df, target)
            )

# NOTE(review): not referenced by any of the visible cells ("Not used?!").
# Uses logical `and` instead of the bitwise `&` on booleans.
target_cols_normalised = [
    c for c in targets_df_filtered_dedup.columns
    if c.startswith('target') and c.endswith('normalised')]

In [None]:

# Build the final target column used for Machine Learning model building.
input_col_target = 'target_12m_normalised'
output_col_target = 'target_ml'

# Author note (JPIEN): bin_targets uses qcut to bin the target_12m_norm
#   values in relationship to each other - 0->0.2, 0.2->0.4, etc - so
#   5 bins total for 6 quantiles.
#   NOTE(review): confirm against make_targets.bin_targets.
quantile_edges = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
bin_labels = ['0.0', '0.25', '0.5', '0.75', '1.0']

# Binning is applied separately within each era.
era_groups = targets_df_filtered_dedup.groupby('era', group_keys=False)
targets_df_filtered_dedup = era_groups.apply(
    lambda df: bin_targets(
        df, input_col_target, output_col_target,
        quantile_edges, bin_labels))

In [None]:
# Persist the final targets dataframe to the path given by the
# 'targets_df_path' config entry, using the newest pickle protocol available.
targets_out_path = config_dict['targets_df_path']
with open(targets_out_path, mode='wb') as handle:
    pickle.dump(
        targets_df_filtered_dedup,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )