In [4]:
''' 
Development Juptyer Notebooks for The Benchmark Calculation Overtime . 
''' 

from abc import abstractmethod, ABC 
import pandas as pd 
from typing import Iterable, List, Optional
from datetime import datetime
import numpy as np

In [7]:
'''  
pulling cleaned data into the notebook 
'''
df = pd.read_csv("bloomberg_data_cleaned.csv")

#set date to datetime object 
df["date"] = pd.to_datetime(df["date"])
df.set_index(["date","ticker"],inplace= True)
pd.set_option('display.float_format', '{:,.4f}'.format)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,market_cap,volume,price_to_book,price_to_sales,pe_ratio,current_cap_share_class,close_price
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-01-03,A,47257.0847,554098.0,8.7692,7.5274,42.0279,47257.0847,156.48
2022-01-03,AA,11293.546,1033749.0,2.3785,0.9249,8.9225,11293.546,60.36
2022-01-03,AAL,12140.8973,8786970.0,,0.4051,,12140.8973,18.75
2022-01-03,AAON,4106.6876,83790.0,8.8262,7.6801,63.5297,4106.6876,52.22
2022-01-03,AAPL,2986128.318,20242137.0,41.3474,7.9614,30.1565,2986128.318,182.01


In [None]:



class Benchmark:
    """_summary_
    """
    def __init__(self, data: pd.DataFrame, benchmark_start_val: float): 
        self.data: pd.DataFrame = data
        self.cur_constituents: List[str] #this is all of current member of the index. 
        self.dates: List[datetime]= self.data.index.get_level_values("date").unique().to_list()
        self.cur_date: datetime  = self.dates[0]
        self._i: int = 0 
        self.price_history = pd.Series(data = 0.0, index = self.dates)
        self.benchmark_divisor: float = 1
        self.quarterly_recon_dates: List[datetime] = []
        self._get_quarterly_reconst_dates() 
        self.random_constitution()
        self._calc_benchmark_divisor(benchmark_start_val) 
    
    #=========================================================
    #          Benchmark Constitution Methods
    #========================================================


    def next_benchmark_price(self) -> float: 
        if self.is_reconstitution_date():
            # reconsitute the benchmark. 
            None 
        next_benchmark = self._calc_numerator() / self.benchmark_divisor
        self.price_history.loc[self.cur_date] = next_benchmark #type: ignore 
        self.next_date() 
        return next_benchmark
    
    def _calc_numerator(self)-> float: 
        numerator: float = 0
        for ticker in self.cur_constituents: 
            key = (self.cur_date,ticker)
            cap_val  = self.data.loc[key]["market_cap"] # type: ignore 
            if np.isnan(cap_val): 
                cap_val = 0.00 
            numerator +=  cap_val 
        return numerator


    def _calc_benchmark_divisor(self, start_val: float) -> None: 
        # numerator / start_vale = divisor. 
        self.benchmark_divisor = self.next_benchmark_price() / start_val
    
    #=========================================================
    #          Benchmark Constitution Methods
    #========================================================

    def random_constitution(self):
        tickers = self.data.index.get_level_values("ticker").unique()
        mask = [] 
        rnd_selection =  np.random.randint(0,tickers.__len__(),282)
        for i, ticker in enumerate(range(tickers.__len__())): 
            if i in rnd_selection: 
                mask += [True]
            else: 
                mask += [False]
        self.cur_constituents = tickers[mask].to_list()

    #========================================================
    #           Date Handling Methods: 
    #========================================================
    
    def _get_quarterly_reconst_dates(self) -> None: 
        """
        Last trading of the months January, April, July, October
        """
        s_dates: pd.Series = pd.Series(self.dates)
        for year in range(self.dates[0].year, self.dates[-1].year + 1):
            for month in [1,4,7,10]:
                year_month_mask = (s_dates.dt.year == year) &  (s_dates.dt.month == month) #type: ignore 
                self.quarterly_recon_dates += [s_dates[year_month_mask].iloc[-1]]

    def _get_annual_reconstitution_dates(self) -> List[datetime]:
        """Maria: method"""
        dates = self.data.index.get_level_values("date").unique()
        recon = []
        years = pd.DatetimeIndex(dates).year.unique()
        for year in years:
            june = [d for d in dates if d.year == year and d.month == 6]
            if not june:
                continue
            fridays = [d for d in june if pd.Timestamp(d).weekday() == 4] # Monday=0 ... Friday=4
            if not fridays:
                continue

            fridays = sorted(fridays)
            if len(fridays) >= 4:
                recon.append(fridays[3]) # 4th Friday (0-indexed)
            else:
                recon.append(fridays[-1]) # fallback: last Friday available
        return recon


    def is_reconstitution_date(self) -> bool:  
        out = False
        annual_recon_dates = self._get_annual_reconstitution_dates()
        if self.cur_date in annual_recon_dates: 
            return True
        if self.cur_date in self.quarterly_recon_dates: 
            return True 
        return False 
              

    def next_date(self) -> Optional[datetime]:
        if self._i < self.dates.__len__(): 
            self.cur_date = self.dates[self._i]
            self._i += 1 
            return self.cur_date
        else:
            return None 
        

In [166]:
bench = Benchmark(df,100)
iter = 0
while bench.cur_date is not None: 
    iter += 1 
    if iter > 100000: break
    #print(bench.cur_date)
    bench.next_benchmark_price()

bench.price_history


KeyboardInterrupt: 

In [98]:
bench = Benchmark(df)
bench._get_quarterly_reconst_dates_2()
bench.quarterly_recon_dates

d = datetime.strptime('2025-10-31 00:00:00', '%Y-%m-%d %H:%M:%S')
d in bench.quarterly_recon_dates

True

In [167]:
bench.price_history


2022-01-03   100.0000
2022-01-04    99.7669
2022-01-05    97.7660
2022-01-06    97.6922
2022-01-07    97.3352
               ...   
2025-12-24   139.9594
2025-12-26   139.7869
2025-12-29   139.2592
2025-12-30   139.2605
2025-12-31   138.0941
Length: 1003, dtype: float64

In [None]:
bench = Benchmark(df,1000000)


In [116]:
tickers = df.index.get_level_values("ticker").unique()
mask = [] 
rnd_selection =  np.random.randint(0,tickers.__len__(),282)
for i, ticker in enumerate(range(tickers.__len__())): 
    if i in rnd_selection: 
        mask += [True]
    else: 
        mask += [False]
mask
tickers[mask]

Index(['AAON', 'AAPL', 'ACI', 'ADT', 'AES', 'AFL', 'AGCO', 'AGO', 'AIZ', 'AJG',
       ...
       'WLK', 'WMB', 'WMS', 'WMT', 'WU', 'WYNN', 'XP', 'XRAY', 'Z', 'ZS'],
      dtype='object', name='ticker', length=250)

In [119]:
rnd_selection.__len__()
mask.__len__()

1010

In [147]:
bench = Benchmark(df,1000000)
len(bench.cur_constituents)
bench.benchmark_divisor
bench._calc_numerator()
bench.next_date()

Timestamp('2022-01-04 00:00:00')

In [148]:
bench.next_benchmark_price()
bench.next_benchmark_price()
bench.next_benchmark_price()
bench.next_benchmark_price()

np.float64(979452.2028593788)

In [157]:
bench.price_history
price_history = pd.Series(
    data = 0.0,
    index = bench.dates 
)
d = datetime.strptime('2025-10-31 00:00:00', '%Y-%m-%d %H:%M:%S')
print(d)
price_history.loc[d] = 4 
price_history.loc[d]

2025-10-31 00:00:00


np.float64(4.0)

In [None]:
'''
realistically the bechmark itself should be place into a dataframe of the form 
index = Dates | Benchmark_Price | Divisor | Market Cap 

''' 
'''
==========================================
        Data Availability Dataframe: 
==========================================
other thoughts on data availability. Overall what we have is 

'''


data_stats_df = pd.DataFrame(
    columns = ["has_market_cap_data","has_all_data"] 

)




In [None]:
df.loc[d,:]

In [63]:
""" 
==========================================================
         Index Composition and Reconstitution Class
==========================================================
"""
from dataframehelper import DataframeHelper


class IndexComposer: 
    def __init__(self,data: pd.DataFrame) -> None: 
        self.data: pd.DataFrame = data
        self.df_helper = DataframeHelper(data) 

    def get_midcap_800(self, day: datetime) -> pd.DataFrame: 
        ''' This needs to be date based: we only want to drop NA values 
            for a given day
        '''
        # get all of the data for that day 
        df = self.df_helper.slice_by_day(day) 
        # filter only for stocks that are not NA. 
        df = df.dropna(subset = ["market_cap"]).copy()
        #sort from largest to smallest 
        df.sort_values(by = "market_cap", ascending= False, inplace= True)
        # filter out the largest 200 stocks 
        df = df.iloc[199:,]

        return df
    
    def get_share_count_midcap_800(self, day: datetime) -> pd.Series: 
        # get all of the data for that day 
        df = self.get_midcap_800(day)
        return df["market_cap"] / df["close_price"]


    def compute_growth_probability(self, day: datetime, k=5.0) -> pd.Series:
        '''
        Computing Growth Probability

        Since right now we only have P/B data, we can approximate growth
        classification using inverse price-to-book (B/P) mapped into a 
        smooth probability via a logistic function.
        '''
        midcap = self.get_midcap_800(day)
        pb = midcap["price_to_book"].astype(float)

        #convert P/B to B/P since Russell uses B/P
        bp = np.where((pb > 0 ) & np.isfinite(pb), 1.0 / pb, np.nan)
        bp = pd.Series(bp, index=midcap.index)
        bp = bp.fillna(bp.median()) # to fill empty values (for later computation)

        z = (bp -bp.mean()) / (bp.std(ddof=0) + 1e-12) # z-score for standardization
        z_growth = -z # low B/P -> growth, so now high z_growth -> more growthlike

        #normalizing
        #very neg z_growth -> 0 (value), very pos z_growth -> 1 (growth)
        p = 1 / (1 + np.exp(-k * z_growth)) 

        return pd.Series(p, index=midcap.index, name="p_growth")
    
    def growth_subset_filter_v1(self,day) -> pd.Series:
        growth_subset_mask = self.compute_growth_probability(day) > .85
        growth_subset = self.get_midcap_800(day).loc[growth_subset_mask]
        return pd.Series(growth_subset.index.get_level_values("ticker"))




In [64]:
IC = IndexComposer(df) 
IC.get_midcap_800(datetime(2022,1,3))
IC.get_share_count_midcap_800(datetime(2022,1,3))
growth_mask = IC.compute_growth_probability(datetime(2022,1,3)) > .85
growth_subset = IC.get_midcap_800(datetime(2022,1,3)).loc[growth_mask]
growth_subset
growth_list = IC.growth_subset_filter_v1(datetime(2022,1,3))

In [65]:
DH = DataframeHelper(df)
day_df = DH.slice_by_day(datetime(2022,1,3))
day_df = day_df.reset_index() 
day_df = day_df.set_index("ticker")
day_df.loc[growth_list]

Unnamed: 0_level_0,date,market_cap,volume,price_to_book,price_to_sales,pe_ratio,current_cap_share_class,close_price
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
APTV,2022-01-03,44913.4627,622736.0000,5.3808,2.8778,66.6200,44913.4627,166.0300
CTAS,2022-01-03,43963.0519,822552.0000,12.3383,5.9520,39.5676,43963.0519,106.0230
TROW,2022-01-03,43732.1165,428456.0000,4.9423,5.7465,14.7179,43732.1165,194.5800
ANET,2022-01-03,43708.0164,1769040.0000,11.0000,14.7921,58.8227,43708.0164,35.5600
DELL,2022-01-03,43526.1629,656975.0000,4.8809,0.4443,15.2414,16523.1428,56.9800
...,...,...,...,...,...,...,...,...
WFRD,2022-01-03,2033.2856,53303.0000,4.2979,0.5565,,2033.2856,28.9800
VNOM,2022-01-03,1775.4663,67725.0000,4.6613,3.0560,99.4240,1775.4663,22.4400
ELF,2022-01-03,1725.8827,183423.0000,5.6597,4.4223,66.9805,1725.8827,33.1700
DJT,2022-01-03,1552.4080,163460.0000,5.4808,,,1552.4080,51.7000


In [62]:
type(pd.Series(growth_list))

pandas.core.series.Series