In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("stocks_portfolio_overall.csv")

# Filter Growth and Value portfolios
df_growth = df[df['portfolio_type'] == 'Growth'].copy()
df_value = df[df['portfolio_type'] == 'Value'].copy()

In [3]:
def prepare_and_cluster_portfolio(df, portfolio_type='growth', k_clusters=11):
    target_features = ['operating_margin', 'gross_margin', 'revenue_growth']
    df = df.sort_values(['permno', 'jdate'])

    # Create lagged features (previous quarter values)
    for col in target_features:
        df[col + '_prev'] = df.groupby('permno')[col].shift(1)

    # Compute average percentage changes (deltas)
    def compute_deltas(x):
        deltas = {}
        for col in target_features:
            current_mean = x[col].mean()
            prev_mean = x[col + '_prev'].mean()
            if pd.notna(prev_mean) and prev_mean != 0:
                deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
            else:
                deltas[col + '_delta'] = None
        return pd.Series(deltas)

    avg_deltas = df.groupby('jdate').apply(compute_deltas)
    df = df.merge(avg_deltas, left_on='jdate', right_index=True, how='left')

    # Extrapolate missing values using prev × (1 + delta)
    for col in target_features:
        df[col] = df.apply(
            lambda row: row[col + '_prev'] * (1 + row[col + '_delta'])
            if pd.isna(row[col]) and pd.notna(row[col + '_prev']) and pd.notna(row[col + '_delta'])
            else row[col],
            axis=1
        )

    # Clean temporary columns
    df.drop(columns=[f + '_prev' for f in target_features] + [f + '_delta' for f in target_features], inplace=True)

    # Remove inf and NaN rows for key features
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=target_features)

    # Create year_quarter column
    df['year_quarter'] = df['year'].astype(str) + '-' + df['quarter'].astype(str)

    # Features for Clustering
    cluster_features = [
        'beme', 'roa', 'operating_margin', 'gross_margin', 'revenue_growth',
        'capex_intensity', 'roa_stability', 'revenue_growth_stability'
    ]

    df = df.dropna(subset=cluster_features)
    scaler = StandardScaler()
    df[cluster_features] = scaler.fit_transform(df[cluster_features])

    # Apply KMeans Clustering
    def apply_kmeans(group, k=k_clusters):
        if len(group) < k:
            group['cluster'] = np.arange(len(group))
        else:
            model = KMeans(n_clusters=k, random_state=0, n_init='auto')
            group = group.copy()
            group['cluster'] = model.fit_predict(group[cluster_features])
        return group

    df_clustered = df.groupby('year_quarter').apply(apply_kmeans).reset_index(drop=True)

    # Create readable group_id
    df_clustered['group_id'] = (
        df_clustered['year'].astype(str) + '-' +
        df_clustered['quarter'].astype(str) + '-' +
        df_clustered['cluster'].astype(str).str.zfill(2)
    )

    # 📌 Add Quarter-End and Trading Start Dates
    quarter_end_map = {'Q1': '-03-31', 'Q2': '-06-30', 'Q3': '-09-30', 'Q4': '-12-31'}
    df_clustered['quarter_end'] = pd.to_datetime(
        df_clustered['year'].astype(str) + df_clustered['quarter'].map(quarter_end_map)
    )
    df_clustered['trading_start'] = df_clustered['quarter_end'] + pd.offsets.QuarterBegin(startingMonth=1)

    return df_clustered

In [4]:
df_growth_clustered = prepare_and_cluster_portfolio(df_growth)
df_value_clustered = prepare_and_cluster_portfolio(df_value)
df_overall = prepare_and_cluster_portfolio(df)

  deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
  deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  avg_deltas = df.groupby('jdate').apply(compute_deltas)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dis

In [5]:
df_growth_clustered.isnull().sum()

gvkey                       0
permno                      0
ticker                      0
cumretx_q                   0
beme                        0
roa                         0
operating_margin            0
gross_margin                0
revenue_growth              0
capex_intensity             0
roa_stability               0
revenue_growth_stability    0
year                        0
quarter                     0
jdate                       0
beme_pct_rank               0
portfolio_type              0
year_quarter                0
cluster                     0
group_id                    0
quarter_end                 0
trading_start               0
dtype: int64

In [6]:
df_value_clustered.isnull().sum()

gvkey                        0
permno                       0
ticker                      23
cumretx_q                    0
beme                         0
roa                          0
operating_margin             0
gross_margin                 0
revenue_growth               0
capex_intensity              0
roa_stability                0
revenue_growth_stability     0
year                         0
quarter                      0
jdate                        0
beme_pct_rank                0
portfolio_type               0
year_quarter                 0
cluster                      0
group_id                     0
quarter_end                  0
trading_start                0
dtype: int64

In [7]:
df_overall.isnull().sum()

gvkey                        0
permno                       0
ticker                      23
cumretx_q                    0
beme                         0
roa                          0
operating_margin             0
gross_margin                 0
revenue_growth               0
capex_intensity              0
roa_stability                0
revenue_growth_stability     0
year                         0
quarter                      0
jdate                        0
beme_pct_rank                0
portfolio_type               0
year_quarter                 0
cluster                      0
group_id                     0
quarter_end                  0
trading_start                0
dtype: int64

In [8]:
df_growth_clustered.to_csv('knn-dataset_growth.csv', index = False)
df_value_clustered.to_csv('knn-dataset_value.csv', index = False)
df_overall.to_csv('knn-dataset.csv', index = False)

In [9]:
df_growth_clustered

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,...,year,quarter,jdate,beme_pct_rank,portfolio_type,year_quarter,cluster,group_id,quarter_end,trading_start
0,2504,10890.0,BGH,1.205298,1.092623,0.224892,0.034909,0.033446,-0.011460,0.359557,...,1975,Q1,1975-03-31,0.252214,Growth,1975-Q1,7,1975-Q1-07,1975-03-31,1975-04-01
1,3144,11308.0,KO,1.490566,0.010045,0.376821,0.037971,0.019406,-0.008650,0.031825,...,1975,Q1,1975-03-31,0.147155,Growth,1975-Q1,10,1975-Q1-10,1975-03-31,1975-04-01
2,4021,11690.0,DM,1.249283,0.386919,0.525238,0.036463,0.032450,-0.010259,0.369281,...,1975,Q1,1975-03-31,0.182376,Growth,1975-Q1,5,1975-Q1-05,1975-03-31,1975-04-01
3,4194,11754.0,EK,1.467198,0.148968,0.309288,0.035074,0.032314,-0.012117,-0.031926,...,1975,Q1,1975-03-31,0.160114,Growth,1975-Q1,5,1975-Q1-05,1975-03-31,1975-04-01
4,5686,12319.0,HM,1.329861,0.213929,0.354142,0.041465,0.016762,-0.006837,1.124472,...,1975,Q1,1975-03-31,0.166084,Growth,1975-Q1,3,1975-Q1-03,1975-03-31,1975-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237556,1414,93330.0,PRI,1.023647,0.327064,0.230362,0.035707,0.031453,-0.008238,-0.535100,...,2024,Q4,2024-12-31,0.173689,Growth,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
237557,179841,93345.0,CDXS,1.548703,-0.467967,-0.464379,0.032366,0.034932,-0.000578,-0.157089,...,2024,Q4,2024-12-31,0.102348,Growth,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
237558,183974,93356.0,SPSC,0.947571,-0.971505,0.278104,0.034960,0.034110,-0.007934,-0.346586,...,2024,Q4,2024-12-31,0.062014,Growth,2024-Q4,6,2024-Q4-06,2024-12-31,2025-01-01
237559,183945,93371.0,CRMD,1.002476,-0.469697,1.102859,0.036577,0.035815,0.011743,-0.538091,...,2024,Q4,2024-12-31,0.102163,Growth,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01


In [10]:
df_value_clustered

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,...,year,quarter,jdate,beme_pct_rank,portfolio_type,year_quarter,cluster,group_id,quarter_end,trading_start
0,1010,10006.0,ACF,1.246213,-0.391590,0.192721,0.012822,0.009364,-0.009040,0.429298,...,1975,Q1,1975-03-31,0.775666,Value,1975-Q1,4,1975-Q1-04,1975-03-31,1975-04-01
1,1098,10057.0,NCM,1.101695,0.408882,0.145881,0.014344,-0.007924,-0.008010,-0.228074,...,1975,Q1,1975-03-31,0.960170,Value,1975-Q1,0,1975-Q1-00,1975-03-31,1975-04-01
2,1279,10137.0,AYP,1.196078,0.072299,0.219777,0.014383,0.010773,-0.002736,-0.041744,...,1975,Q1,1975-03-31,0.925073,Value,1975-Q1,0,1975-Q1-00,1975-03-31,1975-04-01
3,1300,10145.0,ACD,1.290749,-0.434907,0.206638,0.012646,0.009260,-0.008288,0.133880,...,1975,Q1,1975-03-31,0.749138,Value,1975-Q1,10,1975-Q1-10,1975-03-31,1975-04-01
4,1308,10153.0,AH,1.403846,1.584873,0.174483,0.013642,-0.007816,-0.008267,-0.365684,...,1975,Q1,1975-03-31,0.989669,Value,1975-Q1,6,1975-Q1-06,1975-03-31,1975-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218294,183224,93274.0,CNET,0.409091,-0.461216,-0.838286,0.010397,0.008459,-0.014900,-0.494786,...,2024,Q4,2024-12-31,0.722911,Value,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
218295,184167,93304.0,FIBK,1.058344,-0.434156,0.107089,0.014098,0.012264,-0.007182,-0.484279,...,2024,Q4,2024-12-31,0.741361,Value,2024-Q4,5,2024-Q4-05,2024-12-31,2025-01-01
218296,133764,93368.0,EBMT,0.970869,-0.127110,0.105302,0.013755,0.012814,-0.007631,-0.421495,...,2024,Q4,2024-12-31,0.877684,Value,2024-Q4,5,2024-Q4-05,2024-12-31,2025-01-01
218297,185138,93426.0,VPG,0.906178,-0.331807,0.106114,0.012162,0.010989,-0.008266,-0.268691,...,2024,Q4,2024-12-31,0.800949,Value,2024-Q4,5,2024-Q4-05,2024-12-31,2025-01-01


In [11]:
df_overall

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,...,year,quarter,jdate,beme_pct_rank,portfolio_type,year_quarter,cluster,group_id,quarter_end,trading_start
0,1010,10006.0,ACF,1.246213,0.123975,0.201636,0.025524,0.021344,-0.008678,0.224113,...,1975,Q1,1975-03-31,0.775666,Value,1975-Q1,8,1975-Q1-08,1975-03-31,1975-04-01
1,1098,10057.0,NCM,1.101695,1.075916,0.162258,0.026810,0.005364,-0.008068,-0.299151,...,1975,Q1,1975-03-31,0.960170,Value,1975-Q1,0,1975-Q1-00,1975-03-31,1975-04-01
2,1279,10137.0,AYP,1.196078,0.675644,0.224382,0.026842,0.022647,-0.004950,-0.150833,...,1975,Q1,1975-03-31,0.925073,Value,1975-Q1,8,1975-Q1-08,1975-03-31,1975-04-01
3,1300,10145.0,ACD,1.290749,0.072462,0.213336,0.025375,0.021248,-0.008233,-0.011037,...,1975,Q1,1975-03-31,0.749138,Value,1975-Q1,8,1975-Q1-08,1975-03-31,1975-04-01
4,1308,10153.0,AH,1.403846,2.474431,0.186304,0.026216,0.005464,-0.008220,-0.408688,...,1975,Q1,1975-03-31,0.989669,Value,1975-Q1,10,1975-Q1-10,1975-03-31,1975-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455853,133764,93368.0,EBMT,0.970869,0.438502,0.128145,0.026311,0.024534,-0.007845,-0.453113,...,2024,Q4,2024-12-31,0.877684,Value,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
455854,183945,93371.0,CRMD,1.002476,-0.578882,1.204838,0.027603,0.026470,0.017904,-0.508345,...,2024,Q4,2024-12-31,0.102163,Growth,2024-Q4,5,2024-Q4-05,2024-12-31,2025-01-01
455855,185138,93426.0,VPG,0.906178,0.195072,0.128827,0.024966,0.022846,-0.008220,-0.331482,...,2024,Q4,2024-12-31,0.800949,Value,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
455856,184259,93434.0,SANW,2.273116,0.571107,-0.106040,0.020517,0.022380,-0.013376,-0.482749,...,2024,Q4,2024-12-31,0.904525,Value,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
