In [None]:
''' 
Development Jupyter notebook for the benchmark calculation over time.
''' 

from abc import abstractmethod, ABC 
import pandas as pd 
from typing import Iterable, List, Optional
from datetime import datetime
import numpy as np

In [None]:
"""
Load the cleaned data into the notebook.
"""
df = pd.read_csv("bloomberg_data_cleaned.csv")

# Parse the date column into datetime objects, then index rows by (date, ticker).
df["date"] = pd.to_datetime(df["date"])
df = df.set_index(["date", "ticker"])

# Show floats with thousands separators and four decimals.
pd.set_option("display.float_format", "{:,.4f}".format)
df.head()

In [None]:
""" 
==========================================================
         Index Composition and Reconstitution Class
==========================================================
`data` is our base dataframe.
"""
from dataframehelper import DataframeHelper


class IndexComposer:
    """Select the midcap growth index constituents for a single day.

    Based on Maria's filtering functions. IndexComposer works in conjunction
    with the Benchmark class: it takes the stock data universe and one day as
    inputs and produces the subset of tickers that constitute the midcap
    growth index on that day, together with their weights and share counts.

    Attributes set by the constructor:
        data: the full universe frame passed in.
        day: the day the composition is computed for.
        day_slice: rows of ``data`` for ``day``.
        growth_subset: Series of tickers selected as growth names.
        growth_subset_weights: market-cap weights of the selected tickers.
        growth_subset_shares: implied share counts of the selected tickers.
    """

    # Number of largest names (by market cap) excluded before selecting midcaps.
    LARGE_CAP_CUTOFF: int = 200

    def __init__(self, data: pd.DataFrame, day: datetime) -> None:
        self.data: pd.DataFrame = data
        self.df_helper = DataframeHelper(data)
        self.day: datetime = day
        self.day_slice: pd.DataFrame = self.df_helper.slice_by_day(day)
        # Order matters: each step below reads the attribute set just above it.
        self._midcap_800: pd.DataFrame = self.get_midcap_800()
        self.growth_subset: pd.Series = self.growth_subset_filter_v1()  # type: ignore
        self.growth_subset_weights = self.get_weights()
        self.growth_subset_shares = self.get_share_count()

    def get_midcap_800(self) -> pd.DataFrame:
        """Return the day's stocks with the largest 200 by market cap removed.

        Only rows of the current day are considered, so NA market caps are
        dropped per day rather than across the whole universe. The result is
        sorted from largest to smallest market cap.
        """
        ranked = (
            self.day_slice
            .dropna(subset=["market_cap"])
            .sort_values(by="market_cap", ascending=False)
        )
        # Skip exactly LARGE_CAP_CUTOFF rows; the previous iloc[199:] kept the
        # 200th-largest name in the result.
        return ranked.iloc[self.LARGE_CAP_CUTOFF:]

    def compute_growth_probability(self, k: float = 5.0) -> pd.Series:
        """Map each midcap stock to a growth probability in (0, 1).

        Since only P/B data is available, growth classification is
        approximated from inverse price-to-book (B/P), standardized and passed
        through a logistic function.

        Args:
            k: steepness of the logistic curve applied to the z-score.

        Returns:
            Series named ``p_growth`` indexed like the midcap frame.
        """
        midcap = self._midcap_800
        pb = midcap["price_to_book"].astype(float)

        # Convert P/B to B/P since Russell uses B/P; non-positive or
        # non-finite P/B values become NaN.
        bp = np.where((pb > 0) & np.isfinite(pb), 1.0 / pb, np.nan)
        bp = pd.Series(bp, index=midcap.index)
        # Fill the gaps with the median so every stock gets a score.
        bp = bp.fillna(bp.median())

        # z-score; the epsilon avoids division by zero when all B/P are equal.
        z = (bp - bp.mean()) / (bp.std(ddof=0) + 1e-12)
        # Low B/P means growth, so flip the sign: high z_growth is growth-like.
        z_growth = -z

        # Very negative z_growth -> 0 (value), very positive -> 1 (growth).
        p = 1 / (1 + np.exp(-k * z_growth))

        return pd.Series(p, index=midcap.index, name="p_growth")

    def growth_subset_filter_v1(self, threshold: float = 0.85) -> pd.Series:
        """Return the tickers whose growth probability exceeds ``threshold``."""
        growth_subset_mask = self.compute_growth_probability() > threshold
        growth_subset = self._midcap_800.loc[growth_subset_mask]
        return pd.Series(growth_subset.index.get_level_values("ticker"))

    def _growth_subset_frame(self) -> pd.DataFrame:
        """Return the day's rows for the growth subset, indexed by ticker."""
        by_ticker = self.day_slice.reset_index().set_index("ticker")
        return by_ticker.loc[self.growth_subset]

    def get_weights(self) -> pd.Series:
        """Return each selected ticker's share of the subset's total market cap."""
        subset = self._growth_subset_frame()
        total_cap = subset["market_cap"].sum()
        return subset["market_cap"] / total_cap

    def get_share_count(self) -> pd.Series:
        """Return market cap divided by close price for each selected ticker."""
        subset = self._growth_subset_frame()
        return subset["market_cap"] / subset["close_price"]

In [None]:
class Benchmark:
    """Compute a divisor-based benchmark level for every date in the data.

    The benchmark is constituted on the starting date with an IndexComposer,
    its divisor is chosen so the first level equals ``benchmark_start_val``,
    and on each annual reconstitution date the constituents are recomputed
    and the divisor is rescaled so the new constituents reproduce the
    previous date's benchmark level.

    Quarterly reconstitution dates are computed and exposed through
    ``is_reconstitution_date`` but are not used by ``calculate_benchmark``.
    """

    def __init__(self, data: pd.DataFrame, benchmark_start_val: float):
        """
        Args:
            data: frame indexed by a MultiIndex with "date" and "ticker" levels.
            benchmark_start_val: benchmark level assigned to the first
                calculated day; it is used as a divisor denominator, so it
                must be non-zero.
        """
        self.data: pd.DataFrame = data
        self.benchmark_start_val: float = benchmark_start_val
        self.cur_constituents: pd.Series  # all current members of the index
        self.dates: pd.Series = pd.Series(self.data.index.get_level_values("date").unique())
        self.cur_date: datetime = self.dates[0]
        self._i: int = 0  # position in self.dates of the next date next_date() returns
        # Attribute name (including its spelling) is kept for existing callers.
        self.bechmark_timeseries = pd.Series(data=0.0, index=self.dates)
        self.benchmark_divisor: float = 1
        self.quarterly_recon_dates: List[datetime] = []
        self._get_quarterly_reconst_dates()
        self.annual_recon_dates = self._get_annual_reconstitution_dates()
        self.benchmark_makeup_dict: dict = {}
        self.cur_divisor: float = 0.0
        self.DataHelper = DataframeHelper(self.data)

    #=========================================================
    #          Benchmark Constitution Methods
    #========================================================
    def _cap_weighted_numerator(self, composer: "IndexComposer", day: datetime) -> float:
        """Sum weights * shares * close price over the composer's tickers on ``day``."""
        day_df = self.DataHelper.slice_by_day(day).reset_index().set_index("ticker")
        # Copy so the helper columns are added to a private frame.
        day_df = day_df.loc[composer.growth_subset].copy()
        day_df["weights"] = composer.growth_subset_weights
        day_df["shares"] = composer.growth_subset_shares
        day_df["weight_adjusted_cap"] = day_df["weights"] * day_df["shares"] * day_df["close_price"]
        return day_df["weight_adjusted_cap"].sum()

    def calculate_benchmark(self):
        """Fill ``bechmark_timeseries`` from the current date to the last date.

        Side effects: updates ``cur_constituents``, ``cur_divisor``,
        ``benchmark_makeup_dict``, ``cur_date`` and the internal date cursor,
        and prints progress messages.
        """
        # Set the benchmark constitution on the first day of the calculation.
        start_date = self.cur_date
        composer = IndexComposer(self.data, start_date)
        self.cur_constituents = composer.growth_subset
        self.benchmark_makeup_dict[start_date] = composer.growth_subset

        # Calculate the benchmark for the first day: the divisor is chosen so
        # the first level equals the requested starting value.
        print(f"calculating benchmark for the first day {self.cur_date}")
        numerator = self._cap_weighted_numerator(composer, start_date)
        self.cur_divisor = numerator / self.benchmark_start_val
        self.bechmark_timeseries.loc[start_date] = numerator / self.cur_divisor  # type: ignore

        while self.next_date() is not None:
            if self.cur_date == start_date:
                # next_date() hands back the starting date first; it has just
                # been priced above and has no previous level to rescale to.
                continue

            if self.cur_date in self.annual_recon_dates:
                print(f"{self.cur_date} is an reconsitution date")
                composer = IndexComposer(self.data, self.cur_date)
                self.cur_constituents = composer.growth_subset
                self.benchmark_makeup_dict[self.cur_date] = composer.growth_subset
                # On recomposition the divisor is rescaled: take the new
                # constitution, weights and share counts and find the divisor
                # that makes the new constitution equal the previous
                # constitution's benchmark value.
                # next_date() already advanced the cursor past cur_date, so
                # the previous date sits two positions back.
                prev_date = self.dates.iloc[self._i - 2]
                prev_value = self.bechmark_timeseries.loc[prev_date]
                prev_numerator = self._cap_weighted_numerator(composer, prev_date)
                self.cur_divisor = prev_numerator / prev_value
                print(f" cur_divisor is {self.cur_divisor}, previous benchmark value is {prev_value}")

            numerator = self._cap_weighted_numerator(composer, self.cur_date)
            self.bechmark_timeseries.loc[self.cur_date] = numerator / self.cur_divisor  # type: ignore

    #========================================================
    #           Date Handling Methods:
    #========================================================

    def _get_quarterly_reconst_dates(self) -> None:
        """Append the last available date of each January, April, July and October.

        Months with no rows in the data are skipped. Assumes ``self.dates``
        is in ascending order, so the last matching entry is the latest date.
        """
        s_dates = self.dates
        if s_dates.empty:
            return
        first_year = s_dates.iloc[0].year
        last_year = s_dates.iloc[-1].year
        for year in range(first_year, last_year + 1):
            for month in (1, 4, 7, 10):
                in_month = s_dates[(s_dates.dt.year == year) & (s_dates.dt.month == month)]  # type: ignore
                if in_month.empty:
                    # e.g. the data starts or ends mid-year
                    continue
                self.quarterly_recon_dates.append(in_month.iloc[-1])

    def _get_annual_reconstitution_dates(self) -> List[datetime]:
        """Return one June Friday per year found in the data (Maria's method).

        The 4th June Friday present in the data is used; when fewer than four
        are present, the last available one is used. Years without any June
        Friday in the data are skipped.
        """
        dates = pd.DatetimeIndex(self.data.index.get_level_values("date").unique())
        recon = []
        for year in dates.year.unique():
            # Monday=0 ... Friday=4
            is_june_friday = (dates.year == year) & (dates.month == 6) & (dates.weekday == 4)
            fridays = sorted(dates[is_june_friday])
            if not fridays:
                continue
            if len(fridays) >= 4:
                recon.append(fridays[3])  # 4th Friday (0-indexed)
            else:
                recon.append(fridays[-1])  # fallback: last Friday available
        return recon

    def is_reconstitution_date(self) -> bool:
        """Return True when ``cur_date`` is an annual or quarterly reconstitution date."""
        return (
            self.cur_date in self.annual_recon_dates
            or self.cur_date in self.quarterly_recon_dates
        )

    def set_cur_date(self, day: datetime):
        """Move the cursor to ``day``.

        Raises:
            ValueError: if ``day`` is not one of the dates in the data.
        """
        all_dates = self.dates.to_list()
        if day not in all_dates:
            raise ValueError("Invalid Day Selected")
        self.cur_date = day
        self._i = all_dates.index(day)

    def next_date(self) -> Optional[datetime]:
        """Return the date at the cursor and advance it, or None when exhausted.

        The first call after construction (or after ``set_cur_date``) returns
        the current date itself, not the one after it.
        """
        if self._i >= len(self.dates):
            return None
        self.cur_date = self.dates.iloc[self._i]
        self._i += 1
        return self.cur_date
        

In [None]:
class DataframeHelper:
    """Slicing and feature helpers around a frame indexed by date and ticker.

    The slicing methods read the "date" and "ticker" levels of the frame's
    index. The most recently wrapped frame is also stored on the class as
    ``DATAOG``.
    """
    DATAOG: pd.DataFrame

    def __init__(self, data: pd.DataFrame) -> None:
        # Class-level reference; overwritten by every new instance.
        DataframeHelper.DATAOG = data
        self.data = data

    def slice_by_ticker(self, ticker: str) -> pd.DataFrame:
        """Return all rows whose "ticker" index level equals ``ticker``."""
        ticker_mask = self.data.index.get_level_values("ticker") == ticker
        return self.data.loc[ticker_mask]

    def slice_by_day(self, day: datetime) -> pd.DataFrame:
        """Return all rows whose "date" index level equals ``day``."""
        day_mask = self.data.index.get_level_values("date") == day
        return self.data.loc[day_mask]

    def slice_any_row_with_na(self) -> pd.DataFrame:
        """Return the rows that contain at least one NA value."""
        return self.data[self.data.isna().any(axis=1)]

    def slice_complete_na_rows(self) -> pd.DataFrame:
        """Return the rows in which every column is NA."""
        return self.data[self.data.isna().all(axis=1)]

    def slice_by_day_range(self, start_date: datetime, end_date: datetime) -> pd.DataFrame:
        """Not implemented yet: always returns an empty DataFrame."""
        # TODO: implement once the endpoint semantics (inclusive/exclusive) are decided.
        return pd.DataFrame()

    @staticmethod
    def _rolling_growth_rate(price_to_sales_series: pd.Series) -> float:
        """Return 1000 times the slope of a linear fit through the window.

        The x-axis is the position within the window (0, 1, 2, ...). Returns
        NaN when the window contains any NA value.
        """
        if price_to_sales_series.isna().any():
            return np.nan
        # The slope of the linear fit is taken as the growth rate.
        positions = np.arange(len(price_to_sales_series))
        slope, _intercept = np.polyfit(positions, price_to_sales_series, 1)
        return slope * 1000

    def add_roling_sales_growth_col(self, window: int = 252):
        """Add the "1year_PtoS_growth" column to ``self.data`` in place.

        For each ticker, the column holds the rolling growth rate of
        "price_to_sales" over ``window`` rows; rows without a full window
        are NaN. Not very performant: one rolling apply per ticker.

        Args:
            window: number of rows per rolling window (default 252, one
                trading year).
        """
        self.data["1year_PtoS_growth"] = 0.0
        ticker_level = self.data.index.get_level_values("ticker")
        for ticker in ticker_level.unique():
            # Build the mask from self.data, not from a notebook-level frame,
            # so the helper works on whichever frame it wraps.
            ticker_mask = ticker_level == ticker
            rolling_growth = (
                self.data.loc[ticker_mask, "price_to_sales"]
                .rolling(window, min_periods=window)
                .apply(self._rolling_growth_rate)
            )
            self.data.loc[ticker_mask, "1year_PtoS_growth"] = rolling_growth