In [26]:
import pandas as pd
import numpy as np
import os

# ==========================
# Paths / constants
# ==========================
# Constituents CSV loaded with pd.read_csv in step 1; its column headers are parsed
# there as dates using the '%d-%b-%y' format.
constituents_file_path = "H:/Tech Hardware Shared/$Mike/Quant/Constituents/SnP500_Constit.csv"
# Folder searched in step 2 for one "<ISIN>.csv" file per constituent.
csv_directory            = "H:/Tech Hardware Shared/$Mike/Quant/CSV_files/CSV_Daily_RI"
# Export folder and workbook names.
# NOTE(review): none of the four export_* names is referenced in the cells visible here;
# presumably they are consumed by QC export cells elsewhere - confirm before changing.
export_path              = "H:/Tech Hardware Shared/$Mike/Quant/Python_Outputs/QC/"
export_audit_file_name   = 'QC_Audit.xlsx'
export_output_file_name  = 'QC_Output.xlsx'
export_examine_CSV_file_name = 'QC_indiv_CSV.xlsx'

# Row count used by add_return_features for both the LTM return look-back and the
# rolling standard-deviation window.
TRADING_DAYS_1Y = 252  # trading-day approximation for LTM windows

# ==========================
# Helper: feature builder
# ==========================
def add_return_features(
    df: pd.DataFrame,
    ri_col: str,
    tag: str,
    ltm_days: "int | None" = None,
    forward_days: int = 63,
) -> pd.DataFrame:
    """
    Add trailing and forward return features for one return-index column.

    Parameters
    ----------
    df : per-ISIN daily dataframe with 'Date' and ri_col
    ri_col : 'RI_Abs' or 'RI_Rel'
    tag : 'Abs' or 'Rel' (suffix for new columns)
    ltm_days : number of rows in the trailing ("LTM") window; when None the
        module-level TRADING_DAYS_1Y is used, which is the original behaviour.
    forward_days : number of rows in the forward window (default 63). The output
        column keeps the name 'post_3m_<tag>' whatever value is passed.

    Returns
    -------
    The input object itself, untouched, when ri_col is not a column of df.
    Otherwise a copy sorted by 'Date' with four extra columns:
      TR_Daily_<tag>  return from the previous row to this row
      TR_LTM_<tag>    return over ltm_days rows, ending at the previous row
      SD_LTM_<tag>    std dev of the last ltm_days daily returns, ending at the previous row
      post_3m_<tag>   return from this row to the row forward_days ahead

    All windows are counted in rows, not calendar days, so they equal trading-day
    windows only if the frame holds one row per trading day (assumed, not checked).
    """
    if ri_col not in df.columns:
        return df

    if ltm_days is None:
        # Resolved at call time so the notebook-level constant stays the default.
        ltm_days = TRADING_DAYS_1Y

    out = df.sort_values('Date').copy()
    ri = out[ri_col]

    # Daily total return: from t-1 -> t (no forward-fill through NAs)
    daily = ri.pct_change(fill_method=None)
    out[f'TR_Daily_{tag}'] = daily

    # Trailing-window return, shifted one row so it ends at t-1
    out[f'TR_LTM_{tag}'] = (ri / ri.shift(ltm_days) - 1).shift(1)

    # Std dev of daily returns over a full trailing window, also ending at t-1;
    # min_periods == window means partially filled windows yield NaN.
    out[f'SD_LTM_{tag}'] = (
        daily
        .rolling(window=ltm_days, min_periods=ltm_days)
        .std()
        .shift(1)
    )

    # Forward window from t to t + forward_days rows
    out[f'post_3m_{tag}'] = ri.shift(-forward_days) / ri - 1

    return out

# ==========================
# 1) Read & expand monthly constituents to daily
# ==========================
# Column headers are parsed as dates ('%d-%b-%y'); headers that do not parse become NaT.
cons_raw = pd.read_csv(constituents_file_path, header=0)
cons_raw.columns = pd.to_datetime(cons_raw.columns, format='%d-%b-%y', errors='coerce')

# Wide -> long: one row per (MonthEnd, ISIN), with empty cells removed.
df = (
    cons_raw
    .melt(var_name='MonthEnd', value_name='ISIN')
    .dropna(subset=['ISIN'])
    .copy()
)
df['MonthEnd'] = pd.to_datetime(df['MonthEnd'])

# For each header date, repeat that column's ISIN list on every calendar day from
# the 1st of the month through the header date itself.
expanded_blocks = []
for month_end, members in df.groupby('MonthEnd', sort=True):
    isins = members['ISIN'].dropna().unique()
    if len(isins) == 0:
        continue
    all_days = pd.date_range(start=month_end.replace(day=1), end=month_end, freq='D')
    n_days, n_isins = len(all_days), len(isins)
    expanded_blocks.append(
        pd.DataFrame({
            'Date': np.repeat(all_days.values, n_isins),  # day-major ordering
            'ISIN': np.tile(isins, n_days),
        })
    )

if len(expanded_blocks) == 0:
    raise RuntimeError("No constituents found to expand. Check the constituents file.")

df = pd.concat(expanded_blocks, ignore_index=True)

# ==========================
# 2) Build per-ISIN data (daily), compute features, and collect
# ==========================
all_isin_data = []
# ISIN -> reason it contributed no rows. Kept so that constituents dropped for lack
# of data are visible instead of silently vanishing from the final panel.
skipped_isins = {}

for isin in df['ISIN'].unique():
    csv_file_path = os.path.join(csv_directory, f'{isin}.csv')
    if not os.path.exists(csv_file_path):
        skipped_isins[isin] = 'no CSV file'
        continue

    # "-" marks a missing value in the source files
    isin_data = pd.read_csv(csv_file_path, header=0, na_values=["-"])

    if 'Date' not in isin_data.columns:
        skipped_isins[isin] = "no 'Date' column"
        continue

    # Parse dates: values are day counts from 1899-12-30; anything unparseable -> NaT
    for col in ['Date', 'EOM_Date']:
        if col in isin_data.columns:
            isin_data[col] = pd.to_datetime(isin_data[col], unit='D', origin='1899-12-30', errors='coerce')

    isin_data = isin_data[isin_data['Date'].notna()].copy()
    isin_data['ISIN'] = isin

    # Numeric + clean zeros -> NaN (a zero index level would break the ratio-based returns)
    for col in ['RI_Abs', 'RI_Rel']:
        if col in isin_data.columns:
            isin_data[col] = pd.to_numeric(isin_data[col], errors='coerce')
            isin_data[col] = np.where(isin_data[col] == 0, np.nan, isin_data[col])

    # Features for both series
    for col, tag in [('RI_Abs', 'Abs'), ('RI_Rel', 'Rel')]:
        isin_data = add_return_features(isin_data, col, tag)

    all_isin_data.append(isin_data)

if skipped_isins:
    print(f"Skipped {len(skipped_isins)} ISIN(s) with no usable CSV data; see `skipped_isins` for details.")

# ==========================
# 3) Merge all ISIN data with expanded constituents
# ==========================
# Stop early with a readable message if step 2 collected nothing.
if not all_isin_data:
    raise RuntimeError("No ISIN CSVs found / parsed. Check csv_directory and filenames.")

# Stack the per-ISIN frames into one long table.
isin_final_df = pd.concat(all_isin_data, ignore_index=True)

# Inner join: keep only (Date, ISIN) pairs present in both the expanded constituents
# and the loaded data, then order the panel by date and ISIN.
df = (
    df.merge(isin_final_df, how='inner', on=['Date', 'ISIN'])
      .sort_values(['Date', 'ISIN'])
      .reset_index(drop=True)
)



In [27]:
import os
import pandas as pd
import numpy as np

# =========================================================
# Assumes df (from Block 1) has at least:
#   EOM_Date, ISIN, TR_Daily_Rel, post_3m_Rel
# TR_Daily_Rel is a decimal return (e.g., 0.30 = +30%)
# =========================================================

# ---- Config
TH_10, TH_20, TH_30 = 0.10, 0.20, 0.30  # thresholds for 10%, 20%, 30%

# Ensure EOM_Date is datetime.
# - is_datetime64_any_dtype also accepts pandas extension dtypes (e.g. tz-aware
#   datetimes), on which np.issubdtype raises TypeError.
# - A missing EOM_Date column is deliberately not an error here: the required-columns
#   check further down this cell reports it with a clearer message.
if 'EOM_Date' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['EOM_Date']):
    df = df.copy()
    df['EOM_Date'] = pd.to_datetime(df['EOM_Date'], errors='coerce')

# Buckets (include neutral band), listed from most negative to most positive;
# this order is used as the column order of the summary tables.
ordered_buckets = [
    '<-30%', '-30% to -20%', '-20% to -10%',
    '-10% to +10%',
    '+10% to +20%', '+20% to +30%', '>+30%'
]

def _bucket(x):
    """
    Return the bucket label for one return value x, or NaN when x is missing.

    Band edges come from the module-level TH_10 / TH_20 / TH_30 (expected in
    ascending order). The neutral band is closed on both sides; every outer band
    includes its edge that is further from zero, so +0.20 falls in '+10% to +20%'
    and -0.20 falls in '-20% to -10%'.
    """
    if pd.isna(x):
        return np.nan

    # The bands are symmetric around zero, so classify by size first, then by sign.
    size = abs(x)
    if size <= TH_10:
        return '-10% to +10%'

    rising = x > 0
    if size <= TH_20:
        return '+10% to +20%' if rising else '-20% to -10%'
    if size <= TH_30:
        return '+20% to +30%' if rising else '-30% to -20%'
    return '>+30%' if rising else '<-30%'

# ---- Build working frame (needs TR_Daily_Rel for bucket + post_3m_Rel for stats)
required_cols = ['EOM_Date', 'ISIN', 'TR_Daily_Rel', 'post_3m_Rel']
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"df is missing required columns for Block 2: {missing}")

work = df[required_cols].copy()
work['Bucket'] = work['TR_Daily_Rel'].apply(_bucket)
work = work.dropna(subset=['Bucket'])  # rows without a daily return get no bucket

# ---- Group by EOM and Bucket
g = work.groupby(['EOM_Date', 'Bucket'], sort=True)

# (A) Occurrence counts (each daily record counted)
occ = g.size().unstack('Bucket', fill_value=0)

# (B) Mean of the forward 3M-from-t return (post_3m_Rel)
means = g['post_3m_Rel'].agg('mean').unstack('Bucket')  # NaN if no data

# (C) Positive / Negative counts for that forward return.
# Built as 0/1 flags summed per group, which avoids running a Python lambda once
# per group. NaN forward returns compare False both ways, so they count as neither.
group_keys = [work['EOM_Date'], work['Bucket']]
fwd_ret = work['post_3m_Rel']
posc = ((fwd_ret > 0).astype('int64')
        .groupby(group_keys, sort=True).sum()
        .unstack('Bucket', fill_value=0))
negc = ((fwd_ret < 0).astype('int64')
        .groupby(group_keys, sort=True).sum()
        .unstack('Bucket', fill_value=0))

# ---- Ensure all buckets present and ordered
# reindex both adds any bucket that never occurred and fixes the column order.
occ   = occ.reindex(columns=ordered_buckets, fill_value=0)
means = means.reindex(columns=ordered_buckets)
posc  = posc.reindex(columns=ordered_buckets, fill_value=0)
negc  = negc.reindex(columns=ordered_buckets, fill_value=0)

# ---- Build final flat table
all_index = occ.index.union(means.index).union(posc.index).union(negc.index)
final = pd.DataFrame(index=all_index)

# counts (replace 0 with NaN for clarity as requested)
for b in ordered_buckets:
    final[f"{b}__count"] = occ.reindex(index=final.index)[b].replace(0, np.nan)

# mean of forward 3M from t
for b in ordered_buckets:
    final[f"{b}__mean__post_3m_Rel"] = means.reindex(index=final.index)[b]

# positive / negative counts (replace 0 with NaN to avoid confusion with blanks)
for b in ordered_buckets:
    final[f"{b}__pos_count__post_3m_Rel"] = posc.reindex(index=final.index)[b].replace(0, np.nan)
    final[f"{b}__neg_count__post_3m_Rel"] = negc.reindex(index=final.index)[b].replace(0, np.nan)

# Finish up
final = final.sort_index().reset_index()  # EOM_Date becomes a column

# ---- Export with explicit NaN text for missing cells
output_path = r"H:\Tech Hardware Shared\$Mike\Quant\CSV_files\CSV_Daily_RI\Output\Daily Rel TR output - 3m.xlsx"
# Create the output folder if needed so a missing directory does not discard the work above.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final.to_excel(output_path, index=False, na_rep="NaN")
print(f"3M summary (counts + means + pos/neg counts) exported to: {output_path}")


3M summary (counts + means + pos/neg counts) exported to: H:\Tech Hardware Shared\$Mike\Quant\CSV_files\CSV_Daily_RI\Output\Daily Rel TR output - 3m.xlsx


In [15]:
### this may be redundant if code above is fully functional

import os
import pandas as pd
import numpy as np

# --- Thresholds (decimal returns assumed: 0.30 = +30%)
TH_10, TH_20, TH_30 = 0.10, 0.20, 0.30
# If TR_Daily_Rel is whole % (e.g., 30 for 30%), use: TH_10, TH_20, TH_30 = 10, 20, 30

def _bucket(x):
    """
    Map one return value onto its bucket label; missing input stays NaN.

    Works as a cascade from the most negative band upwards, so each test only
    needs the band's upper edge (thresholds are expected in ascending order).
    Negative bands exclude their upper edge, while the neutral and positive
    bands include theirs.
    """
    if pd.isna(x):
        return np.nan
    if x < -TH_30:
        return '<-30%'
    if x < -TH_20:
        return '-30% to -20%'
    if x < -TH_10:
        return '-20% to -10%'
    if x <= TH_10:
        return '-10% to +10%'
    if x <= TH_20:
        return '+10% to +20%'
    if x <= TH_30:
        return '+20% to +30%'
    return '>+30%'

# Bucketize: label every daily row with the band its TR_Daily_Rel falls in
work = df[['EOM_Date','ISIN','TR_Daily_Rel']].copy()
work['Bucket'] = work['TR_Daily_Rel'].apply(_bucket)

# Column order for the output table, most negative band first
ordered_cols = [
    '<-30%', '-30% to -20%', '-20% to -10%',
    '-10% to +10%',
    '+10% to +20%', '+20% to +30%', '>+30%'
]

# Count rows (occurrences), not distinct ISINs, per EOM_Date and bucket.
# Rows without a bucket are left out; reindex then adds any band that never
# occurred as a zero column and puts the columns in ordered_cols order.
counts = (
    work.dropna(subset=['Bucket'])
        .groupby(['EOM_Date','Bucket'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=ordered_cols, fill_value=0)
        .sort_index()
)

# Row total across the seven bands
counts['Total'] = counts.sum(axis=1)

# Final table: EOM_Date moves from the index into a regular column
table = counts.reset_index()

# Optional preview of the first rows, with every column shown
with pd.option_context("display.max_rows", 10, "display.max_columns", None):
    print(table.head(10))

# Export
output_path = r"H:\Tech Hardware Shared\$Mike\Quant\CSV_files\CSV_Daily_RI\Output\Daily Rel TR output.xlsx"
table.to_excel(output_path, index=False)
print(f"Table exported to: {output_path}")


Bucket   EOM_Date  <-30%  -30% to -20%  -20% to -10%  -10% to +10%  \
0      1989-10-31      1             1            19         10614   
1      1989-11-30      0             1             6         10629   
2      1989-12-31      0             0             7         10146   
3      1990-01-31      0             1            16         11102   
4      1990-02-28      1             0             5          9655   
5      1990-03-31      1             0             9         10624   
6      1990-04-30      1             0            11         10142   
7      1990-05-31      0             2            10         11104   
8      1990-06-30      0             1             4         10173   
9      1990-07-31      1             1            17         10642   

Bucket  +10% to +20%  +20% to +30%  >+30%  Total  
0                  8             4      1  10648  
1                 11             0      1  10648  
2                 11             0      0  10164  
3                 11     

In [7]:
# Import first: the cell previously called pd.Timestamp before its own import line,
# so it only ran when `pd` was already present in the kernel.
import pandas as pd

# Pick the target date
target_date = pd.Timestamp("2023-06-15")

# Filter df for that date (exact match on the 'Date' column)
df_on_15jun = df.loc[df["Date"] == target_date]

# Display everything (all rows and columns); the options apply only inside this block
with pd.option_context("display.max_rows", None,
                       "display.max_columns", None,
                       "display.width", None):
    print(df_on_15jun)


              Date          ISIN   EOM_Date      RI_Abs      RI_Rel  \
4384871 2023-06-15  AN8068571086 2023-06-30     4522.04    0.475827   
4384872 2023-06-15  BMG0450A1053 2023-06-30     2971.06    0.312627   
4384873 2023-06-15  BMG3223R1088 2023-06-30     2696.96    0.283785   
4384874 2023-06-15  BMG491BT1088 2023-06-30      105.44    0.011095   
4384875 2023-06-15  BMG667211046 2023-06-30       78.82    0.008294   
4384876 2023-06-15  CH0044328745 2023-06-30     3652.54    0.384335   
4384877 2023-06-15  CH0114405324 2023-06-30     1921.66    0.202205   
4384878 2023-06-15  CH1300646267 2023-06-30      894.22    0.094093   
4384879 2023-06-15  IE000IVNQZ81 2023-06-30      479.29    0.050433   
4384880 2023-06-15  IE000S9YS762 2023-06-30     8032.88    0.845252   
4384881 2023-06-15  IE00B4BNMY34 2023-06-30     2932.75    0.308596   
4384882 2023-06-15  IE00B8KQN827 2023-06-30    54163.23    5.699275   
4384883 2023-06-15  IE00BDB6Q211 2023-06-30      843.76    0.088784   
438488