In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

In [4]:
# Load the input panel from the CSV file in the working directory.
df = pd.read_csv("stocks_portfolio_overall.csv")

### **KNN Clustering**
Form 11 cluster every quarter. The number 11 is based on the Global Industry Classification Standard (GICS)

In [5]:
def _impute_from_previous_quarter(df, features):
    """Fill missing feature values from each stock's previous observation.

    Rows are sorted by ``permno`` and ``jdate``. For every column in
    ``features`` a missing value is replaced by ``prev * (1 + delta)``, where
    ``prev`` is the prior row's value for the same ``permno`` and ``delta`` is
    the relative change between the ``jdate``-wide mean of the current values
    and the ``jdate``-wide mean of the lagged values. Values stay missing when
    ``prev`` or ``delta`` is unavailable (including a zero lagged mean).

    Returns a new, sorted DataFrame; the input frame is not modified.
    """
    df = df.sort_values(['permno', 'jdate'])

    for col in features:
        # Lag is taken from the un-imputed column, so fills never chain.
        prev_value = df.groupby('permno')[col].shift(1)

        # Cross-sectional means per jdate, broadcast back to every row.
        current_mean = df[col].groupby(df['jdate']).transform('mean')
        prev_mean = prev_value.groupby(df['jdate']).transform('mean')

        # A zero lagged mean would divide by zero; treat the delta as missing.
        delta = (current_mean - prev_mean) / prev_mean.where(prev_mean != 0)

        # fillna only touches rows where the feature is missing, and leaves
        # them missing when the extrapolated value is itself NaN.
        df[col] = df[col].fillna(prev_value * (1 + delta))

    return df


def _assign_quarterly_clusters(df, features, k):
    """Add a ``cluster`` label column, fitting one KMeans model per year_quarter.

    Quarters with fewer than ``k`` rows cannot be split into ``k`` clusters, so
    each row there receives its own label (0..n-1). Groups are processed in
    sorted ``year_quarter`` order and the result gets a fresh 0..n-1 index.
    """
    labelled = []
    for _, quarter_rows in df.groupby('year_quarter', sort=True):
        # Work on a copy so the label assignment never writes into a view.
        quarter_rows = quarter_rows.copy()
        if len(quarter_rows) < k:
            quarter_rows['cluster'] = np.arange(len(quarter_rows))
        else:
            model = KMeans(n_clusters=k, random_state=0, n_init='auto')
            quarter_rows['cluster'] = model.fit_predict(quarter_rows[features])
        labelled.append(quarter_rows)

    if not labelled:
        # Nothing survived cleaning: return an empty frame with the column present.
        return df.assign(cluster=pd.Series(dtype='int64')).reset_index(drop=True)

    return pd.concat(labelled).reset_index(drop=True)


def prepare_and_cluster_portfolio(df, portfolio_type='growth', k_clusters=11):
    """Clean the stock panel, standardize features and cluster stocks per quarter.

    Steps:
      1. Impute missing ``operating_margin``, ``gross_margin`` and
         ``revenue_growth`` from the stock's previous observation scaled by the
         cross-sectional average change for that ``jdate``.
      2. Convert +/-inf to NaN and drop rows still missing any clustering feature.
      3. Standardize the clustering features with a single ``StandardScaler``.
      4. Run KMeans separately within each ``year``-``quarter`` group.
      5. Add ``group_id``, ``quarter_end`` and ``trading_start`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input panel. Must contain ``permno``, ``jdate``, ``year``, ``quarter``
        and every column listed in ``cluster_features`` below. ``quarter`` is
        mapped through 'Q1'..'Q4' when building ``quarter_end``.
    portfolio_type : str, default 'growth'
        Currently unused; kept so existing callers keep working.
    k_clusters : int, default 11
        Number of KMeans clusters fitted per quarter.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the clustering features
        replaced by their standardized values and the extra columns
        ``year_quarter``, ``cluster``, ``group_id``, ``quarter_end`` and
        ``trading_start``.
    """
    target_features = ['operating_margin', 'gross_margin', 'revenue_growth']
    cluster_features = [
        'beme', 'roa', 'operating_margin', 'gross_margin', 'revenue_growth',
        'capex_intensity', 'roa_stability', 'revenue_growth_stability'
    ]

    df = _impute_from_previous_quarter(df, target_features)

    # Remove inf and NaN rows for key features
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=target_features)

    # Create year_quarter column
    df = df.assign(year_quarter=df['year'].astype(str) + '-' + df['quarter'].astype(str))

    df = df.dropna(subset=cluster_features).copy()

    # NOTE: the scaler is fit on all quarters pooled together, so a quarter's
    # standardized values depend on rows from every other quarter.
    if not df.empty:  # StandardScaler rejects zero-row input
        df[cluster_features] = StandardScaler().fit_transform(df[cluster_features])

    df_clustered = _assign_quarterly_clusters(df, cluster_features, k_clusters)

    # Create readable group_id, e.g. "2012-Q1-03"
    df_clustered['group_id'] = (
        df_clustered['year'].astype(str) + '-' +
        df_clustered['quarter'].astype(str) + '-' +
        df_clustered['cluster'].astype(str).str.zfill(2)
    )

    # Add Quarter-End and Trading Start Dates
    quarter_end_map = {'Q1': '-03-31', 'Q2': '-06-30', 'Q3': '-09-30', 'Q4': '-12-31'}
    df_clustered['quarter_end'] = pd.to_datetime(
        df_clustered['year'].astype(str) + df_clustered['quarter'].map(quarter_end_map)
    )
    # Roll forward to the first day of the following calendar quarter.
    df_clustered['trading_start'] = df_clustered['quarter_end'] + pd.offsets.QuarterBegin(startingMonth=1)

    return df_clustered

In [6]:
# Run the pipeline with its default arguments (portfolio_type='growth', k_clusters=11).
df_overall = prepare_and_cluster_portfolio(df)

  deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
  deltas[col + '_delta'] = (current_mean - prev_mean) / prev_mean
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  avg_deltas = df.groupby('jdate').apply(compute_deltas)
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  r

In [7]:
# Count missing values per column in the clustered frame.
df_overall.isnull().sum()

gvkey                       0
permno                      0
ticker                      0
cumretx_q                   0
beme                        0
roa                         0
operating_margin            0
gross_margin                0
revenue_growth              0
capex_intensity             0
roa_stability               0
revenue_growth_stability    0
year                        0
quarter                     0
jdate                       0
beme_pct_rank               0
portfolio_type              0
year_quarter                0
cluster                     0
group_id                    0
quarter_end                 0
trading_start               0
dtype: int64

In [8]:
df_overall.to_csv('knn-dataset.csv', index = False) # write the clustered frame to CSV without the index column

Note: The Q1/Q2/Q3/Q4 on group_id is based on the quarter when the report was published. 
If the report was published in second quarter, the middle term in group_id is Q2 which is based on Q1 performance. 
It's used to trade on Q3 in the market. 

In [9]:
# Display the clustered frame.
df_overall

Unnamed: 0,gvkey,permno,ticker,cumretx_q,beme,roa,operating_margin,gross_margin,revenue_growth,capex_intensity,...,year,quarter,jdate,beme_pct_rank,portfolio_type,year_quarter,cluster,group_id,quarter_end,trading_start
0,12994,10001.0,EGAS,1.021016,-0.004166,0.308733,0.032853,0.027210,-0.021865,1.375250,...,2012,Q1,2012-03-31,0.727562,Value,2012-Q1,3,2012-Q1-03,2012-03-31,2012-04-01
1,11903,10025.0,AEPI,1.173712,-0.410733,0.132764,0.032374,0.026992,-0.008966,0.130465,...,2012,Q1,2012-01-31,0.282336,Growth,2012-Q1,0,2012-Q1-00,2012-03-31,2012-04-01
2,12096,10028.0,DGSE,1.001339,-0.479062,0.291150,0.032423,0.027176,-0.005776,0.015994,...,2012,Q1,2012-03-31,0.177050,Growth,2012-Q1,0,2012-Q1-00,2012-03-31,2012-04-01
3,12141,10107.0,MSFT,1.242488,-0.429878,0.481958,0.033437,0.029251,-0.010182,-0.200700,...,2012,Q1,2012-03-31,0.252457,Growth,2012-Q1,0,2012-Q1-00,2012-03-31,2012-04-01
4,12138,10138.0,TROW,1.146620,-0.456932,0.524397,0.033646,0.028042,-0.010634,-0.237260,...,2012,Q1,2012-03-31,0.211741,Growth,2012-Q1,0,2012-Q1-00,2012-03-31,2012-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96583,183945,93371.0,CRMD,1.002476,-0.491993,1.057806,0.033690,0.029759,0.018752,-0.526158,...,2024,Q4,2024-12-31,0.159319,Growth,2024-Q4,7,2024-Q4-07,2024-12-31,2025-01-01
96584,184899,93374.0,FAF,0.945917,-0.039725,0.165855,0.032570,0.026874,-0.007421,-0.278877,...,2024,Q4,2024-12-31,0.703441,Value,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
96585,185138,93426.0,VPG,0.906178,0.235632,0.139918,0.032332,0.027969,-0.011521,-0.217131,...,2024,Q4,2024-12-31,0.866314,Value,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
96586,184259,93434.0,SANW,2.273116,0.589157,-0.060435,0.030039,0.027738,-0.017496,-0.481434,...,2024,Q4,2024-12-31,0.939381,Value,2024-Q4,0,2024-Q4-00,2024-12-31,2025-01-01
