In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

# Load the data
df = pd.read_csv('men_2026_teams_training.csv', index_col=0)

print("="*80)
print("BUILDING CUSTOM COMPOSITE SCORING MODELS")
print("="*80)

# User's custom categorization
offensive_vars = [
    '3man_dbpm', '5man_obpm', 'assist_to_usage_ratio', 'ast%',
    'effective_possession_rate', 'efg%', 'four_factors_composite',
    'free_throw_advantage', 'lineup_depth_quality', 'mid_range_reliance',
    'net_rebounding_margin', 'net_turnover_margin', 'off_3pt_fg%',
    'off_3pt_share', 'off_close2_fg%', 'off_close2_share', 'off_dunk_fg%',
    'off_dunk_share', 'off_far2_fg%', 'off_far2_share',
    'offensive_versatility_score', 'orb%', 'paint_touch_rate',
    'perimeter_efficiency', 'rim_efficiency', 'rim_to_three_ratio',
    'shot_quality_variance', 'size_speed_index', 'three_point_volume_efficiency',
    'tor', 'kenpom_off', 'torvik_off'
]

defensive_vars = [
    '3man_obpm', '5man_dbpm', 'ast%d', 'def_3pt_fg%', 'def_3pt_share',
    'def_assist_suppression', 'def_close2_fg%', 'def_close2_share',
    'def_dunk_fg%', 'def_dunk_share', 'def_effective_possession_rate',
    'def_experience_impact', 'def_far2_fg%', 'def_far2_share',
    'def_four_factors_composite', 'def_free_throw_advantage',
    'def_lineup_depth_quality', 'def_mid_range_reliance',
    'def_net_rebounding_margin', 'def_net_turnover_margin',
    'def_paint_touch_rate', 'def_perimeter_efficiency', 'def_rim_efficiency',
    'def_rim_to_three_ratio', 'def_shot_quality_variance',
    'def_size_speed_index', 'def_three_point_volume_efficiency',
    'defensive_versatility_score', 'drb%', 'efgd%', 'kenpom_def',
    'tord', 'torvik_def'
]

overall_vars = [
    '3man_bpm', '5man_bpm', 'bench_scoring_ratio', 'elite_outcome_probability',
    'kenpom_rtg', 'rotation_balance', 'torvik_rtg', 'wab'
]

# Filter to only variables that exist in dataset
offensive_vars = [v for v in offensive_vars if v in df.columns]
defensive_vars = [v for v in defensive_vars if v in df.columns]
overall_vars = [v for v in overall_vars if v in df.columns]

print(f"\nVariables used:")
print(f"  Offensive: {len(offensive_vars)}")
print(f"  Defensive: {len(defensive_vars)}")
print(f"  Overall: {len(overall_vars)}")

# Create performance target
df['performance'] = df['weekend']


BUILDING CUSTOM COMPOSITE SCORING MODELS

Variables used:
  Offensive: 32
  Defensive: 33
  Overall: 8


In [3]:
# ============================================================================
# OFFENSIVE COMPOSITE SCORE
# ============================================================================
print("\n" + "="*80)
print("BUILDING OFFENSIVE SCORE")
print("="*80)

# Prepare data
offense_df = df[offensive_vars].copy()
offense_df = offense_df.fillna(offense_df.median())

# Calculate importance weights
X = offense_df.values
y = df['performance'].values

# Correlation importance
correlations = {}
for metric in offensive_vars:
    corr = df[metric].corr(df['performance'])
    correlations[metric] = abs(corr)

# Random Forest importance
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf.fit(X, y)
rf_importance = dict(zip(offensive_vars, rf.feature_importances_))

# Combine correlation and RF importance (50/50)
combined_importance = {}
for metric in offensive_vars:
    corr_norm = correlations[metric] / sum(correlations.values())
    rf_norm = rf_importance[metric] / sum(rf_importance.values())
    combined_importance[metric] = (corr_norm + rf_norm) / 2

# Normalize to sum to 1
total = sum(combined_importance.values())
weights_off = {k: v/total for k, v in combined_importance.items()}

# Calculate weighted composite
scaler = StandardScaler()
offense_scaled = scaler.fit_transform(offense_df)
offense_weighted = np.average(offense_scaled, axis=1,
                              weights=[weights_off[m] for m in offensive_vars])

# Normalize to 0-10 and round to 2 decimals
min_max = MinMaxScaler(feature_range=(0, 10))
df['offensive_score'] = np.round(min_max.fit_transform(offense_weighted.reshape(-1, 1)).flatten(), 2)

print(f"Offensive scores created (range: {df['offensive_score'].min():.2f} - {df['offensive_score'].max():.2f})")



BUILDING OFFENSIVE SCORE
Offensive scores created (range: 0.00 - 10.00)


In [4]:
# ============================================================================
# DEFENSIVE COMPOSITE SCORE
# ============================================================================
print("\n" + "="*80)
print("BUILDING DEFENSIVE SCORE")
print("="*80)

# Prepare data
defense_df = df[defensive_vars].copy()
defense_df = defense_df.fillna(defense_df.median())

# Calculate importance weights
X_def = defense_df.values

# Correlation importance
correlations_def = {}
for metric in defensive_vars:
    corr = df[metric].corr(df['performance'])
    correlations_def[metric] = abs(corr)

# Random Forest importance
rf_def = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf_def.fit(X_def, y)
rf_importance_def = dict(zip(defensive_vars, rf_def.feature_importances_))

# Combine
combined_importance_def = {}
for metric in defensive_vars:
    corr_norm = correlations_def[metric] / sum(correlations_def.values())
    rf_norm = rf_importance_def[metric] / sum(rf_importance_def.values())
    combined_importance_def[metric] = (corr_norm + rf_norm) / 2

# Normalize
total_def = sum(combined_importance_def.values())
weights_def = {k: v/total_def for k, v in combined_importance_def.items()}

# Calculate weighted composite
scaler_def = StandardScaler()
defense_scaled = scaler_def.fit_transform(defense_df)
defense_weighted = np.average(defense_scaled, axis=1,
                              weights=[weights_def[m] for m in defensive_vars])

# Normalize to 0-10 and round to 2 decimals
df['defensive_score'] = np.round(min_max.fit_transform(defense_weighted.reshape(-1, 1)).flatten(), 2)

print(f"Defensive scores created (range: {df['defensive_score'].min():.2f} - {df['defensive_score'].max():.2f})")



BUILDING DEFENSIVE SCORE
Defensive scores created (range: 0.00 - 10.00)


In [5]:

# ============================================================================
# OVERALL COMPOSITE SCORE
# ============================================================================
print("\n" + "="*80)
print("BUILDING OVERALL SCORE")
print("="*80)

# Prepare data
overall_df = df[overall_vars].copy()
overall_df['offensive_score'] = df['offensive_score']
overall_df['defensive_score'] = df['defensive_score']
overall_df = overall_df.fillna(overall_df.median())

all_overall_vars = overall_vars + ['offensive_score', 'defensive_score']

# Calculate importance weights
X_ovr = overall_df.values

# Correlation importance
correlations_ovr = {}
for metric in all_overall_vars:
    corr = overall_df[metric].corr(df['performance'])
    correlations_ovr[metric] = abs(corr)

# Random Forest importance
rf_ovr = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf_ovr.fit(X_ovr, y)
rf_importance_ovr = dict(zip(all_overall_vars, rf_ovr.feature_importances_))

# Combine
combined_importance_ovr = {}
for metric in all_overall_vars:
    corr_norm = correlations_ovr[metric] / sum(correlations_ovr.values())
    rf_norm = rf_importance_ovr[metric] / sum(rf_importance_ovr.values())
    combined_importance_ovr[metric] = (corr_norm + rf_norm) / 2

# Normalize
total_ovr = sum(combined_importance_ovr.values())
weights_ovr = {k: v/total_ovr for k, v in combined_importance_ovr.items()}

# Calculate weighted composite
scaler_ovr = StandardScaler()
overall_scaled = scaler_ovr.fit_transform(overall_df)
overall_weighted = np.average(overall_scaled, axis=1,
                              weights=[weights_ovr[m] for m in all_overall_vars])

# Normalize to 0-10 and round to 2 decimals
df['overall_score'] = np.round(min_max.fit_transform(overall_weighted.reshape(-1, 1)).flatten(), 2)

print(f"Overall scores created (range: {df['overall_score'].min():.2f} - {df['overall_score'].max():.2f})")



BUILDING OVERALL SCORE
Overall scores created (range: 0.00 - 10.00)


In [6]:
# ============================================================================
# DISPLAY RESULTS
# ============================================================================
print("\n" + "="*80)
print("CORRELATION WITH TOURNAMENT PERFORMANCE")
print("="*80)

print(f"\nOverall Score:   {df['overall_score'].corr(df['performance']):.4f}")
print(f"Offensive Score: {df['offensive_score'].corr(df['performance']):.4f}")
print(f"Defensive Score: {df['defensive_score'].corr(df['performance']):.4f}")

print("\n" + "="*80)
print("CHAMPIONS ANALYSIS")
print("="*80)

champions = df[df['finish'] == 'Champion']
print(f"\nChampions Average Scores:")
print(f"  Overall:   {champions['overall_score'].mean():.2f}")
print(f"  Offensive: {champions['offensive_score'].mean():.2f}")
print(f"  Defensive: {champions['defensive_score'].mean():.2f}")

print("\n" + "="*80)
print("TOP 10 TEAMS (OVERALL SCORE)")
print("="*80)

top_10 = df.nlargest(10, 'overall_score')[['team', 'year', 'overall_score',
                                            'offensive_score', 'defensive_score', 'finish']]
print(f"\n{top_10.to_string(index=False)}")



CORRELATION WITH TOURNAMENT PERFORMANCE

Overall Score:   0.4786
Offensive Score: 0.4648
Defensive Score: 0.4470

CHAMPIONS ANALYSIS

Champions Average Scores:
  Overall:   8.29
  Offensive: 8.28
  Defensive: 7.16

TOP 10 TEAMS (OVERALL SCORE)

        team  year  overall_score  offensive_score  defensive_score      finish
    Kentucky  2015          10.00             9.50            10.00  Final Four
      Auburn  2025           9.52             9.18             8.00  Final Four
        Duke  2025           9.51            10.00             8.42  Final Four
 Connecticut  2024           9.41             9.97             7.98    Champion
     Florida  2025           9.03             8.92             7.83    Champion
     Houston  2025           9.00             7.31             8.38   Runner Up
   Wisconsin  2015           8.89             8.67             8.55   Runner Up
     Houston  2024           8.87             7.12             8.54    Sweet 16
Michigan St.  2016           8.86 

In [7]:
# ============================================================================
# SAVE RESULTS
# ============================================================================
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

# Save team scores
output_df = df[['team_id', 'year', 'team', 'seed', 'finish', 'weekend', 'conference',
                'overall_score', 'offensive_score', 'defensive_score']].copy()

output_df.to_csv('composite_scores_final.csv', index=False)
print("\nTeam scores saved to: composite_scores_final.csv")

# Save variable weights
weights_summary = pd.DataFrame({
    'Variable': list(weights_off.keys()),
    'Weight': list(weights_off.values()),
    'Score': 'Offensive'
})

weights_summary = pd.concat([
    weights_summary,
    pd.DataFrame({
        'Variable': list(weights_def.keys()),
        'Weight': list(weights_def.values()),
        'Score': 'Defensive'
    }),
    pd.DataFrame({
        'Variable': list(weights_ovr.keys()),
        'Weight': list(weights_ovr.values()),
        'Score': 'Overall'
    })
])

weights_summary.to_csv('model_weights_final.csv', index=False)
print("Variable weights saved to: model_weights_final.csv")

print("\n" + "="*80)
print("COMPLETE!")
print("="*80)
print("\nAll scores rounded to 2 decimal places")
print("Files saved to /mnt/user-data/outputs/")


SAVING RESULTS

Team scores saved to: composite_scores_final.csv
Variable weights saved to: model_weights_final.csv

COMPLETE!

All scores rounded to 2 decimal places
Files saved to /mnt/user-data/outputs/
