In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# LOAD AND JOIN DATA
# ============================================================================

print("="*80)
print("WOMEN'S BASKETBALL COMPOSITE SCORING MODEL")
print("="*80 + "\n")

# Load tournament data (basic info: team, year, seed, finish)
tournament = pd.read_csv('wncaat_teams_historical.csv')

# Load Torvik statistics (advanced metrics)
torvik = pd.read_csv('torvik_women_historical.csv')

# Join on torvik_id
df = tournament.merge(torvik, on='torvik_id', how='left', suffixes=('', '_torvik'))

print(f"✓ Loaded {len(df)} teams from {df['year'].min()}-{df['year'].max()}\n")



WOMEN'S BASKETBALL COMPOSITE SCORING MODEL

✓ Loaded 336 teams from 2021-2025



In [13]:
# ============================================================================
# CREATE PERFORMANCE TARGET VARIABLE
# ============================================================================

print("Creating performance target variable...")

# Encode tournament finish as numerical performance (1-7)
finish_mapping = {
    'First Round': 1,
    'Second Round': 2,
    'Sweet 16': 3,
    'Elite Eight': 4,
    'Final Four': 5,
    'Runner Up': 6,
    'Champion': 7
}

df['performance'] = df['finish'].map(finish_mapping)
print(f"✓ Performance variable created (range: {df['performance'].min()}-{df['performance'].max()})\n")

# ============================================================================
# INVERT DEFENSIVE FEATURES (LOWER IS BETTER → HIGHER IS BETTER)
# ============================================================================

print("Inverting defensive features where lower = better...")
df['adj_de_inv'] = -df['adj_de']       # Lower defensive efficiency = better
df['efgd_pct_inv'] = -df['efgd_pct']   # Lower opponent shooting = better
df['ftrd_inv'] = -df['ftrd']           # Lower opponent FTs = better
df['2pd_pct_inv'] = -df['2pd_pct']     # Lower opponent 2P% = better
df['3pd_pct_inv'] = -df['3pd_pct']     # Lower opponent 3P% = better
df['3prd_inv'] = -df['3prd']           # Lower opponent 3P rate = better
print("✓ Inverted: adj_de, efgd_pct, ftrd, 2pd_pct, 3pd_pct, 3prd\n")

# ============================================================================
# FEATURE CATEGORIZATION
# ============================================================================

# Offensive features (higher = better)
offensive_vars = [
    'adj_oe',      # Adjusted offensive efficiency
    'efg_pct',     # Effective field goal percentage
    'orb_pct',     # Offensive rebounding percentage
    'ftr',         # Free throw rate
    '2p_pct',      # 2-point percentage
    '3p_pct',      # 3-point percentage
    '3pr',         # 3-point rate
    'tor'          # Turnover rate
]

# Defensive features (inverted so higher = better)
defensive_vars = [
    'adj_de_inv',     # Adjusted defensive efficiency (inverted)
    'efgd_pct_inv',   # Opponent effective FG% (inverted)
    'drb_pct',        # Defensive rebounding percentage
    'ftrd_inv',       # Opponent free throw rate (inverted)
    '2pd_pct_inv',    # Opponent 2P% (inverted)
    '3pd_pct_inv',    # Opponent 3P% (inverted)
    '3prd_inv',       # Opponent 3P rate (inverted)
    'tord'            # Opponent turnovers (higher = better)
]

# Overall features
overall_vars = ['barthag', 'wab']

print(f"Feature counts:")
print(f"  Offensive: {len(offensive_vars)}")
print(f"  Defensive: {len(defensive_vars)}")
print(f"  Overall: {len(overall_vars)}\n")

Creating performance target variable...
✓ Performance variable created (range: 1-7)

Inverting defensive features where lower = better...
✓ Inverted: adj_de, efgd_pct, ftrd, 2pd_pct, 3pd_pct, 3prd

Feature counts:
  Offensive: 8
  Defensive: 8
  Overall: 2



In [14]:
# ============================================================================
# BUILD OFFENSIVE COMPOSITE SCORE
# ============================================================================

print("="*80)
print("BUILDING OFFENSIVE SCORE")
print("="*80 + "\n")

# Prepare data
offense_df = df[offensive_vars].copy()
offense_df = offense_df.fillna(offense_df.median())

# Get target variable
X = offense_df.values
y = df['performance'].values

# Calculate correlation importance
correlations = {}
for metric in offensive_vars:
    corr = df[metric].corr(df['performance'])
    correlations[metric] = abs(corr)

# Calculate Random Forest importance
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf.fit(X, y)
rf_importance = dict(zip(offensive_vars, rf.feature_importances_))

# Combine correlation and RF importance (50/50 weight)
combined_importance = {}
for metric in offensive_vars:
    corr_norm = correlations[metric] / sum(correlations.values())
    rf_norm = rf_importance[metric] / sum(rf_importance.values())
    combined_importance[metric] = (corr_norm + rf_norm) / 2

# Normalize weights to sum to 1
total = sum(combined_importance.values())
weights_off = {k: v/total for k, v in combined_importance.items()}

print("Offensive Feature Weights:")
for metric, weight in sorted(weights_off.items(), key=lambda x: x[1], reverse=True):
    print(f"  {metric:15s}: {weight:.4f}")

# Calculate weighted composite score
scaler = StandardScaler()
offense_scaled = scaler.fit_transform(offense_df)
offense_weighted = np.average(offense_scaled, axis=1,
                              weights=[weights_off[m] for m in offensive_vars])

# Scale to 0-10 and round to 2 decimals
min_max = MinMaxScaler(feature_range=(0, 10))
df['offensive_score'] = np.round(
    min_max.fit_transform(offense_weighted.reshape(-1, 1)).flatten(), 2
)

print(f"\n✓ Offensive scores created (range: {df['offensive_score'].min():.2f} - {df['offensive_score'].max():.2f})\n")


BUILDING OFFENSIVE SCORE

Offensive Feature Weights:
  adj_oe         : 0.3945
  orb_pct        : 0.1213
  3pr            : 0.1152
  2p_pct         : 0.1017
  efg_pct        : 0.0922
  tor            : 0.0743
  3p_pct         : 0.0698
  ftr            : 0.0310

✓ Offensive scores created (range: 0.00 - 10.00)



In [15]:
# ============================================================================
# BUILD DEFENSIVE COMPOSITE SCORE
# ============================================================================

print("="*80)
print("BUILDING DEFENSIVE SCORE")
print("="*80 + "\n")

# Prepare data
defense_df = df[defensive_vars].copy()
defense_df = defense_df.fillna(defense_df.median())

# Calculate importance weights
X_def = defense_df.values

# Correlation importance
correlations_def = {}
for metric in defensive_vars:
    corr = df[metric].corr(df['performance'])
    correlations_def[metric] = abs(corr)

# Random Forest importance
rf_def = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf_def.fit(X_def, y)
rf_importance_def = dict(zip(defensive_vars, rf_def.feature_importances_))

# Combine
combined_importance_def = {}
for metric in defensive_vars:
    corr_norm = correlations_def[metric] / sum(correlations_def.values())
    rf_norm = rf_importance_def[metric] / sum(rf_importance_def.values())
    combined_importance_def[metric] = (corr_norm + rf_norm) / 2

# Normalize
total_def = sum(combined_importance_def.values())
weights_def = {k: v/total_def for k, v in combined_importance_def.items()}

print("Defensive Feature Weights:")
for metric, weight in sorted(weights_def.items(), key=lambda x: x[1], reverse=True):
    print(f"  {metric:15s}: {weight:.4f}")

# Calculate weighted composite
scaler_def = StandardScaler()
defense_scaled = scaler_def.fit_transform(defense_df)
defense_weighted = np.average(defense_scaled, axis=1,
                              weights=[weights_def[m] for m in defensive_vars])

# Scale to 0-10 and round to 2 decimals
df['defensive_score'] = np.round(
    min_max.fit_transform(defense_weighted.reshape(-1, 1)).flatten(), 2
)

print(f"\n✓ Defensive scores created (range: {df['defensive_score'].min():.2f} - {df['defensive_score'].max():.2f})\n")



BUILDING DEFENSIVE SCORE

Defensive Feature Weights:
  adj_de_inv     : 0.4287
  drb_pct        : 0.1218
  efgd_pct_inv   : 0.1101
  2pd_pct_inv    : 0.1059
  ftrd_inv       : 0.0926
  3pd_pct_inv    : 0.0501
  tord           : 0.0491
  3prd_inv       : 0.0416

✓ Defensive scores created (range: 0.00 - 10.00)



In [16]:
# ============================================================================
# BUILD OVERALL COMPOSITE SCORE
# ============================================================================

print("="*80)
print("BUILDING OVERALL SCORE")
print("="*80 + "\n")

# Prepare data - include offensive and defensive scores
overall_df = df[overall_vars].copy()
overall_df['offensive_score'] = df['offensive_score']
overall_df['defensive_score'] = df['defensive_score']
overall_df = overall_df.fillna(overall_df.median())

all_overall_vars = overall_vars + ['offensive_score', 'defensive_score']

# Calculate importance weights
X_ovr = overall_df.values

# Correlation importance
correlations_ovr = {}
for metric in all_overall_vars:
    corr = overall_df[metric].corr(df['performance'])
    correlations_ovr[metric] = abs(corr)

# Random Forest importance
rf_ovr = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf_ovr.fit(X_ovr, y)
rf_importance_ovr = dict(zip(all_overall_vars, rf_ovr.feature_importances_))

# Combine
combined_importance_ovr = {}
for metric in all_overall_vars:
    corr_norm = correlations_ovr[metric] / sum(correlations_ovr.values())
    rf_norm = rf_importance_ovr[metric] / sum(rf_importance_ovr.values())
    combined_importance_ovr[metric] = (corr_norm + rf_norm) / 2

# Normalize
total_ovr = sum(combined_importance_ovr.values())
weights_ovr = {k: v/total_ovr for k, v in combined_importance_ovr.items()}

print("Overall Feature Weights:")
for metric, weight in sorted(weights_ovr.items(), key=lambda x: x[1], reverse=True):
    print(f"  {metric:20s}: {weight:.4f}")

# Calculate weighted composite
scaler_ovr = StandardScaler()
overall_scaled = scaler_ovr.fit_transform(overall_df)
overall_weighted = np.average(overall_scaled, axis=1,
                              weights=[weights_ovr[m] for m in all_overall_vars])

# Scale to 0-10 and round to 2 decimals
df['overall_score'] = np.round(
    min_max.fit_transform(overall_weighted.reshape(-1, 1)).flatten(), 2
)

print(f"\n✓ Overall scores created (range: {df['overall_score'].min():.2f} - {df['overall_score'].max():.2f})\n")


BUILDING OVERALL SCORE

Overall Feature Weights:
  barthag             : 0.4110
  wab                 : 0.2686
  defensive_score     : 0.1632
  offensive_score     : 0.1572

✓ Overall scores created (range: 0.00 - 10.00)



In [17]:
# ============================================================================
# DISPLAY RESULTS
# ============================================================================

print("="*80)
print("CORRELATION WITH TOURNAMENT PERFORMANCE")
print("="*80 + "\n")

print(f"Overall Score:   {df['overall_score'].corr(df['performance']):.4f}")
print(f"Offensive Score: {df['offensive_score'].corr(df['performance']):.4f}")
print(f"Defensive Score: {df['defensive_score'].corr(df['performance']):.4f}")

print("\n" + "="*80)
print("CHAMPIONS ANALYSIS")
print("="*80 + "\n")

champions = df[df['finish'] == 'Champion']
print(f"Champions (n={len(champions)}):")
print(champions[['team', 'year', 'overall_score', 'offensive_score', 'defensive_score']].to_string(index=False))

print(f"\nChampions Average Scores:")
print(f"  Overall:   {champions['overall_score'].mean():.2f}")
print(f"  Offensive: {champions['offensive_score'].mean():.2f}")
print(f"  Defensive: {champions['defensive_score'].mean():.2f}")

print("\n" + "="*80)
print("TOP 20 TEAMS (OVERALL SCORE)")
print("="*80 + "\n")

top_20 = df.nlargest(20, 'overall_score')[['year', 'team', 'seed', 'finish',
                                            'overall_score', 'offensive_score', 'defensive_score']]
print(top_20.to_string(index=False))

CORRELATION WITH TOURNAMENT PERFORMANCE

Overall Score:   0.6111
Offensive Score: 0.5144
Defensive Score: 0.5090

CHAMPIONS ANALYSIS

Champions (n=5):
          team  year  overall_score  offensive_score  defensive_score
      Stanford  2021           9.64             8.06             8.94
South Carolina  2022           9.84             6.26            10.00
           LSU  2023           8.68             7.63             6.84
South Carolina  2024           9.87             8.11             9.23
   Connecticut  2025           9.69             9.02             8.69

Champions Average Scores:
  Overall:   9.54
  Offensive: 7.82
  Defensive: 8.74

TOP 20 TEAMS (OVERALL SCORE)

 year               team  seed       finish  overall_score  offensive_score  defensive_score
 2023     South Carolina     1   Final Four          10.00             8.02             9.57
 2025     South Carolina     1    Runner Up           9.99             7.25             8.98
 2024     South Carolina     1     Cha

In [18]:

# ============================================================================
# SAVE RESULTS
# ============================================================================

output_cols = [
    'team_id', 'year', 'team', 'region', 'seed', 'finish', 'weekend',
    'conf', 'adj_oe', 'adj_de', 'barthag', 'wab',
    'offensive_score', 'defensive_score', 'overall_score'
]

df[output_cols].to_csv('women_composite_scores_final.csv', index=False)
print("\n✓ Saved: women_composite_scores_final.csv")
print(f"  {len(df)} teams with composite scores\n")

print("="*80)
print("COMPLETE!")
print("="*80)


✓ Saved: women_composite_scores_final.csv
  336 teams with composite scores

COMPLETE!
