In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

import datetime

from arch import arch_model
import wrds

import numpy.linalg as la
from pandas.tseries.offsets import BDay
import matplotlib.pyplot as plt

from collections import defaultdict

from joblib import Parallel, delayed

from tqdm import tqdm

In [2]:
df = pd.read_csv("stocks_portfolio.csv")
df

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,roa_stability,revenue_growth_stability,year,quarter,jdate,beme_pct_rank
0,6307,12749.0,KG,1.152542,0.340217,0.015179,0.123322,-1.433203,0.275058,0.011219,0.009632,0.242875,1975,Q1,1975-01-31,0.269444
1,5606,27828.0,HWP,1.114345,0.270635,0.034255,0.518153,-0.554430,-0.134192,0.025455,0.005770,0.150968,1975,Q1,1975-01-31,0.197663
2,7646,47773.0,NCH,1.139344,0.171573,0.044560,0.702080,-0.634600,-0.000048,0.003616,0.005421,0.066373,1975,Q1,1975-01-31,0.102878
3,2269,49373.0,HRB,1.275862,0.269333,0.201333,0.399247,0.106593,2.591381,0.019595,0.096254,2.297455,1975,Q1,1975-01-31,0.196335
4,8504,49576.0,PST,1.248175,0.304979,0.065044,0.514239,-1.145176,0.248040,0.016154,0.010699,0.217788,1975,Q1,1975-01-31,0.233265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243891,1414,93330.0,PRI,1.023647,0.249410,0.011455,0.284959,0.290445,0.018074,0.000177,0.003658,0.022809,2024,Q4,2024-12-31,0.173690
243892,179841,93345.0,CDXS,1.548703,0.172415,-0.069632,-0.342078,0.829730,0.672251,0.011905,0.062299,0.876491,2024,Q4,2024-12-31,0.102349
243893,183974,93356.0,SPSC,0.947571,0.123651,0.017027,0.144780,0.702329,0.044115,0.006026,0.001995,0.016474,2024,Q4,2024-12-31,0.062014
243894,183945,93371.0,CRMD,1.002476,0.172248,0.113289,0.448254,0.966581,1.724337,0.000084,0.118385,1.954662,2024,Q4,2024-12-31,0.102163


In [3]:
df.isnull().sum()

gvkey                          0
permno                         0
ticker                         0
cumretx_q                      0
beme                           0
roa                            0
operating_margin              41
gross_margin                3133
revenue_growth              4417
capex_intensity                0
roa_stability                  0
revenue_growth_stability       0
year                           0
quarter                        0
jdate                          0
beme_pct_rank                  0
dtype: int64

In [4]:
# target features
features = ['operating_margin', 'gross_margin', 'revenue_growth']

# Sort the dataframe
df = df.sort_values(['permno', 'jdate'])

# Create previous quarter values
for col in features:
    df[col + '_prev'] = df.groupby('permno')[col].shift(1)

# Compute average % change for each feature across jdate
def compute_deltas(x):
    deltas = {}
    for col in features:
        current_mean = x[col].mean()
        prev_mean = x[col + '_prev'].mean()
        if pd.notna(prev_mean) and prev_mean != 0:
            deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
        else:
            deltas[col + '_delta'] = None
    return pd.Series(deltas)

avg_deltas = df.groupby('jdate').apply(compute_deltas)

# Merge deltas back
df = df.merge(avg_deltas, left_on='jdate', right_index=True, how='left')

# Extrapolate missing values using previous × (1 + delta)
for col in features:
    df[col] = df.apply(
        lambda row: row[col + '_prev'] * (1 + row[col + '_delta'])
        if pd.isna(row[col]) and pd.notna(row[col + '_prev']) and pd.notna(row[col + '_delta'])
        else row[col],
        axis=1
    )

# Clean up
df.drop(columns=[f + '_prev' for f in features] + [f + '_delta' for f in features], inplace=True)

# Check nulls
print(df[features].isnull().sum())

  deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
  deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  avg_deltas = df.groupby('jdate').apply(compute_deltas)


operating_margin      41
gross_margin        3124
revenue_growth      4391
dtype: int64


In [5]:
features_to_dropna = ['operating_margin', 'gross_margin', 'revenue_growth']
df = df.replace([np.inf, -np.inf], np.nan)  # remove inf
df = df.dropna(subset=features_to_dropna)
# Confirm
print(df[features_to_dropna].isnull().sum())
print(f"Remaining rows: {len(df):,}")

df

operating_margin    0
gross_margin        0
revenue_growth      0
dtype: int64
Remaining rows: 237,561


Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,roa_stability,revenue_growth_stability,year,quarter,jdate,beme_pct_rank
19894,12165,10008.0,GACO,1.213676,0.365155,0.019269,0.139914,0.339251,0.771964,0.015725,0.019758,0.441593,1986,Q1,1986-03-31,0.294805
19539,12622,10010.0,CBOT,1.000000,0.284875,0.012412,0.103623,0.592131,-0.130420,0.009602,0.009422,0.351080,1986,Q1,1986-01-31,0.212232
28055,12622,10010.0,CBOT,1.166667,0.359478,0.023699,0.110345,0.518897,0.069006,0.025768,0.009422,0.190848,1987,Q3,1987-07-31,0.281509
42590,12622,10010.0,CBOT,1.250000,0.123272,0.024862,0.142737,0.322984,0.152647,0.036632,0.003487,0.127214,1990,Q3,1990-07-31,0.060701
43575,12622,10010.0,CBOT,0.852941,0.230516,0.028277,0.174500,0.111473,0.111141,-0.012986,0.003485,0.121452,1990,Q4,1990-10-31,0.155288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239428,184996,93436.0,TSLA,0.993046,0.079142,0.074359,0.082012,0.225295,0.077816,0.021638,0.016941,0.118269,2023,Q4,2023-12-31,0.031369
240599,184996,93436.0,TSLA,0.707462,0.114840,0.012726,0.054974,0.232008,-0.153614,0.025424,0.018973,0.138303,2024,Q1,2024-03-31,0.056483
241661,184996,93436.0,TSLA,1.125662,0.105145,0.012408,0.087333,0.229647,0.197127,0.020136,0.020744,0.140593,2024,Q2,2024-06-30,0.049627
242785,184996,93436.0,TSLA,1.322164,0.083346,0.018131,0.110079,0.251966,-0.012471,0.029311,0.020845,0.114321,2024,Q3,2024-09-30,0.033368


In [6]:
df.isnull().sum()

gvkey                       0
permno                      0
ticker                      0
cumretx_q                   0
beme                        0
roa                         0
operating_margin            0
gross_margin                0
revenue_growth              0
capex_intensity             0
roa_stability               0
revenue_growth_stability    0
year                        0
quarter                     0
jdate                       0
beme_pct_rank               0
dtype: int64

In [7]:
# Create year_quarter column
df['year_quarter'] = df['year'].astype(str) + '-' + df['quarter']

# Define the features to cluster on
features = [
    'beme', 'roa', 'operating_margin', 'gross_margin', 'revenue_growth',
    'capex_intensity', 'roa_stability', 'revenue_growth_stability'
]

# Clean and normalize
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=features)  # drop rows with NaNs in key features

scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Apply KMeans (k=11) per quarter
def apply_kmeans(group, k=11):
    if len(group) < k:
        group['cluster'] = np.arange(len(group))  # fallback: unique IDs
    else:
        model = KMeans(n_clusters=k, random_state=0, n_init='auto')
        group = group.copy()
        group['cluster'] = model.fit_predict(group[features])
    return group

df_clustered = df.groupby('year_quarter').apply(apply_kmeans).reset_index(drop=True)

# Create readable group_id like "1975-Q1-03"
df_clustered['group_id'] = (
    df_clustered['year'].astype(str) + '-' +
    df_clustered['quarter'].astype(str) + '-' +
    df_clustered['cluster'].astype(str).str.zfill(2)
)

df_clustered

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weigh

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,roa_stability,revenue_growth_stability,year,quarter,jdate,beme_pct_rank,year_quarter,cluster,group_id
0,2504,10890.0,BGH,1.205298,1.092623,0.224869,0.034908,0.033447,-0.011460,0.359554,-0.352652,-0.230341,1975,Q1,1975-03-31,0.252214,1975-Q1,7,1975-Q1-07
1,3144,11308.0,KO,1.490566,0.010045,0.376797,0.037971,0.019406,-0.008650,0.031822,-0.356569,-0.283328,1975,Q1,1975-03-31,0.147155,1975-Q1,10,1975-Q1-10
2,4021,11690.0,DM,1.249283,0.386919,0.525212,0.036463,0.032450,-0.010259,0.369277,-0.361284,-0.242001,1975,Q1,1975-03-31,0.182376,1975-Q1,5,1975-Q1-05
3,4194,11754.0,EK,1.467198,0.148968,0.309265,0.035074,0.032314,-0.012117,-0.031929,-0.339578,-0.239373,1975,Q1,1975-03-31,0.160114,1975-Q1,5,1975-Q1-05
4,5686,12319.0,HM,1.329861,0.213929,0.354117,0.041465,0.016762,-0.006837,1.124468,-0.286274,-0.309547,1975,Q1,1975-03-31,0.166084,1975-Q1,3,1975-Q1-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237556,1414,93330.0,PRI,1.023647,0.327064,0.230340,0.035707,0.031454,-0.008239,-0.535102,-0.387538,-0.402213,2024,Q4,2024-12-31,0.173690,2024-Q4,0,2024-Q4-00
237557,179841,93345.0,CDXS,1.548703,-0.467967,-0.464391,0.032366,0.034932,-0.000578,-0.157092,0.144220,0.311711,2024,Q4,2024-12-31,0.102349,2024-Q4,0,2024-Q4-00
237558,183974,93356.0,SPSC,0.947571,-0.971505,0.278081,0.034960,0.034110,-0.007934,-0.346588,-0.402619,-0.407511,2024,Q4,2024-12-31,0.062014,2024-Q4,6,2024-Q4-06
237559,183945,93371.0,CRMD,1.002476,-0.469697,1.102824,0.036577,0.035815,0.011743,-0.538093,0.652809,1.213374,2024,Q4,2024-12-31,0.102163,2024-Q4,0,2024-Q4-00


In [8]:
df_clustered

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,roa_stability,revenue_growth_stability,year,quarter,jdate,beme_pct_rank,year_quarter,cluster,group_id
0,2504,10890.0,BGH,1.205298,1.092623,0.224869,0.034908,0.033447,-0.011460,0.359554,-0.352652,-0.230341,1975,Q1,1975-03-31,0.252214,1975-Q1,7,1975-Q1-07
1,3144,11308.0,KO,1.490566,0.010045,0.376797,0.037971,0.019406,-0.008650,0.031822,-0.356569,-0.283328,1975,Q1,1975-03-31,0.147155,1975-Q1,10,1975-Q1-10
2,4021,11690.0,DM,1.249283,0.386919,0.525212,0.036463,0.032450,-0.010259,0.369277,-0.361284,-0.242001,1975,Q1,1975-03-31,0.182376,1975-Q1,5,1975-Q1-05
3,4194,11754.0,EK,1.467198,0.148968,0.309265,0.035074,0.032314,-0.012117,-0.031929,-0.339578,-0.239373,1975,Q1,1975-03-31,0.160114,1975-Q1,5,1975-Q1-05
4,5686,12319.0,HM,1.329861,0.213929,0.354117,0.041465,0.016762,-0.006837,1.124468,-0.286274,-0.309547,1975,Q1,1975-03-31,0.166084,1975-Q1,3,1975-Q1-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237556,1414,93330.0,PRI,1.023647,0.327064,0.230340,0.035707,0.031454,-0.008239,-0.535102,-0.387538,-0.402213,2024,Q4,2024-12-31,0.173690,2024-Q4,0,2024-Q4-00
237557,179841,93345.0,CDXS,1.548703,-0.467967,-0.464391,0.032366,0.034932,-0.000578,-0.157092,0.144220,0.311711,2024,Q4,2024-12-31,0.102349,2024-Q4,0,2024-Q4-00
237558,183974,93356.0,SPSC,0.947571,-0.971505,0.278081,0.034960,0.034110,-0.007934,-0.346588,-0.402619,-0.407511,2024,Q4,2024-12-31,0.062014,2024-Q4,6,2024-Q4-06
237559,183945,93371.0,CRMD,1.002476,-0.469697,1.102824,0.036577,0.035815,0.011743,-0.538093,0.652809,1.213374,2024,Q4,2024-12-31,0.102163,2024-Q4,0,2024-Q4-00


In [9]:
# Map Q1, Q2, Q3, Q4 to end-of-quarter dates
quarter_end_map = {'Q1': '-03-31', 'Q2': '-06-30', 'Q3': '-09-30', 'Q4': '-12-31'}
df_clustered['quarter_end'] = df_clustered['year'].astype(str) + df_clustered['quarter'].map(quarter_end_map)
df_clustered['quarter_end'] = pd.to_datetime(df_clustered['quarter_end'])

# Create quarter_end first (if not already)
quarter_end_map = {'Q1': '-03-31', 'Q2': '-06-30', 'Q3': '-09-30', 'Q4': '-12-31'}
df_clustered['quarter_end'] = pd.to_datetime(
    df_clustered['year'].astype(str) + df_clustered['quarter'].map(quarter_end_map)
)

# Define trading_start = beginning of next quarter
df_clustered['trading_start'] = df_clustered['quarter_end'] + pd.offsets.QuarterBegin(startingMonth=1)

df_clustered

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,...,revenue_growth_stability,year,quarter,jdate,beme_pct_rank,year_quarter,cluster,group_id,quarter_end,trading_start
0,2504,10890.0,BGH,1.205298,1.092623,0.224869,0.034908,0.033447,-0.011460,0.359554,...,-0.230341,1975,Q1,1975-03-31,0.252214,1975-Q1,7,1975-Q1-07,1975-03-31,1975-04-01
1,3144,11308.0,KO,1.490566,0.010045,0.376797,0.037971,0.019406,-0.008650,0.031822,...,-0.283328,1975,Q1,1975-03-31,0.147155,1975-Q1,10,1975-Q1-10,1975-03-31,1975-04-01
2,4021,11690.0,DM,1.249283,0.386919,0.525212,0.036463,0.032450,-0.010259,0.369277,...,-0.242001,1975,Q1,1975-03-31,0.182376,1975-Q1,5,1975-Q1-05,1975-03-31,1975-04-01
3,4194,11754.0,EK,1.467198,0.148968,0.309265,0.035074,0.032314,-0.012117,-0.031929,...,-0.239373,1975,Q1,1975-03-31,0.160114,1975-Q1,5,1975-Q1-05,1975-03-31,1975-04-01
4,5686,12319.0,HM,1.329861,0.213929,0.354117,0.041465,0.016762,-0.006837,1.124468,...,-0.309547,1975,Q1,1975-03-31,0.166084,1975-Q1,3,1975-Q1-03,1975-03-31,1975-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237556,1414,93330.0,PRI,1.023647,0.327064,0.230340,0.035707,0.031454,-0.008239,-0.535102,...,-0.402213,2024,Q4,2024-12-31,0.173690,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
237557,179841,93345.0,CDXS,1.548703,-0.467967,-0.464391,0.032366,0.034932,-0.000578,-0.157092,...,0.311711,2024,Q4,2024-12-31,0.102349,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
237558,183974,93356.0,SPSC,0.947571,-0.971505,0.278081,0.034960,0.034110,-0.007934,-0.346588,...,-0.407511,2024,Q4,2024-12-31,0.062014,2024-Q4,6,2024-Q4-06,2024-12-31,2025-01-01
237559,183945,93371.0,CRMD,1.002476,-0.469697,1.102824,0.036577,0.035815,0.011743,-0.538093,...,1.213374,2024,Q4,2024-12-31,0.102163,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01


In [10]:
df_clustered.to_csv('knn-dataset.csv', index = False)