In [None]:
# Candidate numbers of mixture components to compare by BIC.
# (This name was previously bound as `_components_range`, which made every
# later reference to `n_components_range` fail on a fresh kernel.)
n_components_range = range(9, 20)  # tries 9 through 19 components
models = []
bics = []

selected_cols = ['pred_proba', 'win_streak_diff', 'lose_streak_diff', 'num_fights_red', 'num_fights_blue',
                 'red_fav_counts', 'red_dog_counts', 'blue_fav_counts', 'blue_dog_counts', 'age_diff', 'math_blue', 'math_red',
                 'open_blue', 'open_red', 'dec_fair_close1_blue',
       'dec_fair_close2_blue', 'dec_fair_open_blue', 'dec_fair_close1_red',
       'dec_fair_close2_red', 'dec_fair_open_red', 'event_age']

# The training matrix is the same for every candidate, so slice it once.
gmm_train_features = X_train_copy[selected_cols]

for n in n_components_range:
    gmm = GaussianMixture(n_components=n, covariance_type="full", random_state=42)
    gmm.fit(gmm_train_features)
    models.append(gmm)
    # BIC is evaluated on the same data the model was fitted on.
    bics.append(gmm.bic(gmm_train_features))

# Pick the model with lowest BIC
best_index = int(np.argmin(bics))
best_gmm = models[best_index]
print(f"Best number of clusters: {n_components_range[best_index]}")

# ------------------------------
# 4. Assign clusters
# ------------------------------

def _cluster_betting_stats(fights, n_clusters, red_odds_col, blue_odds_col):
    """Summarise flat 1-unit bets on the predicted winner, one row per cluster.

    Args:
        fights (pd.DataFrame): one row per fight with columns 'pred_winner',
            'winner', 'cluster' and the two odds columns.
        n_clusters (int): number of cluster labels to report (0 .. n_clusters-1).
        red_odds_col (str): decimal-odds column used when the pick is 1 (red).
        blue_odds_col (str): decimal-odds column used for any other pick.

    Returns:
        pd.DataFrame: columns 'cluster', 'num_fights', 'avg_units_gained',
        'accuracy'. A cluster with no fights gets NaN for both averages
        instead of a division by zero.
    """
    picked_red = (fights['pred_winner'] == 1).to_numpy()
    correct = (fights['pred_winner'] == fights['winner']).to_numpy()
    backed_odds = np.where(picked_red, fights[red_odds_col], fights[blue_odds_col])
    # Decimal odds include the returned stake, so a winning 1-unit bet nets
    # (odds - 1); a losing bet costs the 1-unit stake.
    profit = np.where(correct, backed_odds - 1.0, -1.0)
    labels = fights['cluster'].to_numpy()

    rows = []
    for c in range(n_clusters):
        in_cluster = labels == c
        num_fights = int(in_cluster.sum())
        rows.append({
            "cluster": c,
            "num_fights": num_fights,
            "avg_units_gained": profit[in_cluster].mean() if num_fights else np.nan,
            'accuracy': correct[in_cluster].mean() if num_fights else np.nan,
        })
    return pd.DataFrame(rows)


red_odds_col = 'dec_fair_close1_red'
blue_odds_col = 'dec_fair_close1_blue'
# Read the component count off the fitted model so it cannot drift from it.
n_clusters = best_gmm.n_components

# --- Rows after the first `train_len` (paired with X_test_copy) ---
clusters = best_gmm.predict(X_test_copy[selected_cols])

df = final_df[model_cols].dropna().iloc[train_len:].copy()
# NOTE(review): these two assignments align on index, while `cluster` below is
# positional — confirm X_test_copy shares df's index, otherwise picks become NaN
# and are silently scored as losses.
df['pred_winner'] = X_test_copy['pred_winner']
df['pred_proba'] = X_test_copy['pred_proba']
df["cluster"] = clusters

df_cluster_test = _cluster_betting_stats(df, n_clusters, red_odds_col, blue_odds_col)

# --- First `train_len` rows (paired with X_train_copy) ---
df = final_df[model_cols].dropna().iloc[:train_len].copy()
clusters = best_gmm.predict(X_train_copy[selected_cols])

# NOTE(review): same index-alignment caveat as for the test rows above.
df['pred_winner'] = X_train_copy['pred_winner']
df['pred_proba'] = X_train_copy['pred_proba']
df['cluster'] = clusters

df_cluster_train = _cluster_betting_stats(df, n_clusters, red_odds_col, blue_odds_col)

# Only a cell's last expression is rendered, so show both splits side by side
# (one row per cluster) rather than losing the test table mid-cell.
df_cluster_test.merge(df_cluster_train, on="cluster", suffixes=("_test", "_train"))

In [None]:
# Score with exactly the columns the mixture was fitted on. Dropping only
# 'winner' leaves extra columns such as 'pred_winner' in the frame, which does
# not match the fitted feature set.
train_log_likelihood = best_gmm.score(X_train_copy[selected_cols])
test_log_likelihood = best_gmm.score(X_test_copy[selected_cols])

# GaussianMixture.score returns the per-sample average log-likelihood.
print(train_log_likelihood)
print(test_log_likelihood)

print(f"Mean log-likelihood: {train_log_likelihood:.4f}")

In [None]:
def cluster_odds(df, red_odds_col, blue_odds_col, winner_col, pred_winner_col, n_clusters=25,
                 random_state=42):
    """
    Cluster fights by decimal odds and compute units gained per cluster.

    Every fight is treated as a flat 1-unit bet on the predicted winner. A
    correct pick nets (decimal odds - 1) units, because decimal odds include
    the returned stake; an incorrect pick loses the 1-unit stake.

    Args:
        df (pd.DataFrame): DataFrame with fight data (not modified; a copy is labelled)
        red_odds_col (str): column name for red fighter odds
        blue_odds_col (str): column name for blue fighter odds
        winner_col (str): column with fight outcome (1=red, 0=blue)
        pred_winner_col (str): column with predicted outcome (1=red, 0=blue)
        n_clusters (int): number of clusters for KMeans
        random_state (int | None): seed passed to KMeans so cluster labels are
            reproducible across runs; pass None for an unseeded fit.

    Returns:
        pd.DataFrame: one row per cluster with columns 'cluster',
        'num_fights' and 'avg_units_gained' (NaN for a cluster with no fights).
    """
    # Standardise the two odds columns before clustering on them.
    odds_scaled = StandardScaler().fit_transform(df[[red_odds_col, blue_odds_col]].copy())

    # KMeans clustering; labels are attached to a copy so the caller's frame is untouched.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df = df.copy()
    df['cluster'] = kmeans.fit_predict(odds_scaled)

    # Per-fight profit, computed for all rows at once instead of row by row.
    picked_red = (df[pred_winner_col] == 1).to_numpy()
    correct = (df[pred_winner_col] == df[winner_col]).to_numpy()
    backed_odds = np.where(picked_red, df[red_odds_col], df[blue_odds_col])
    profit = np.where(correct, backed_odds - 1.0, -1.0)
    labels = df['cluster'].to_numpy()

    # Compute units gained per cluster
    cluster_stats = []
    for c in range(n_clusters):
        in_cluster = labels == c
        num_fights = int(in_cluster.sum())
        cluster_stats.append({
            "cluster": c,
            "num_fights": num_fights,
            # Guard the empty cluster instead of dividing by zero.
            "avg_units_gained": profit[in_cluster].mean() if num_fights else np.nan,
        })

    return pd.DataFrame(cluster_stats)
    
# Turn the network's scores into hard picks: 1 where the score is at least 0.5, else 0.
df_copy_train['pred_winner'] = np.where(nn_preds >= 0.5, 1, 0)
# Attach the training labels as the observed outcome column.
df_copy_train['winner'] = y_train

# Per-cluster betting summary on the training fights, clustered by closing odds.
df_cluster = cluster_odds(
    df_copy_train,
    red_odds_col='dec_fair_close1_red',
    blue_odds_col='dec_fair_close1_blue',
    winner_col='winner',
    pred_winner_col='pred_winner',
)