In [1]:
import statsmodels.api as sm
import pandas as pd
import zipfile
import numpy as np 
# Location of the Kaggle competition archive and the member files read from it.
zip_path = "march-machine-learning-mania-2025.zip"
inner_file = "MRegularSeasonDetailedResults.csv"
conference_file = "MTeamConferences.csv"
mmassey_rankings = 'MMasseyOrdinals.csv'

# Open the archive and its three CSV members in one `with` statement so that
# every handle is closed on exit, then parse them in the order they were opened.
with zipfile.ZipFile(zip_path) as z, \
        z.open(inner_file) as f1, \
        z.open(conference_file) as f2, \
        z.open(mmassey_rankings) as f3:
    df_teams, df_conferences, df_massey = [
        pd.read_csv(member) for member in (f1, f2, f3)
    ]
def calculate_glm_quality(df_games):
    """
    Use GLM coefficients as team quality measure.

    For each season a binomial GLM is fitted with ``HomeWin``
    (``HomeSpread > 0``) as the response and one indicator column per home
    team (``Home_<id>``) plus one per away team (``Away_<id>``) as predictors,
    so the linear predictor of a game is
    ``home_coef[home team] + away_coef[away team]``.

    A strong team raises the home-win probability when it hosts (large home
    coefficient) and lowers it when it visits (negative away coefficient).
    The raw quality of a team is therefore ``(home_coef - away_coef) / 2``;
    adding the two coefficients would cancel the team's strength. A
    coefficient that does not exist (the team never appeared in that role
    during the season) counts as 0.

    Raw qualities are z-scored within the season and exponentiated. When the
    standard deviation is not strictly positive (zero or NaN) every team of
    that season gets a quality of 1.0.

    Parameters
    ----------
    df_games : pandas.DataFrame
        Must contain the columns 'Season', 'HomeTeamID', 'AwayTeamID' and
        'HomeSpread'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Season', 'TeamID' and 'quality', one row per team and
        successfully fitted season. The frame is empty (but has these
        columns) when there is no season to fit or every fit failed.
    """

    print("\n4. Calculating GLM Quality Metrics...")

    result_columns = ['Season', 'TeamID', 'quality']

    # Prepare data
    game_data = df_games[['Season', 'HomeTeamID', 'AwayTeamID', 'HomeSpread']].copy()
    game_data['HomeWin'] = (game_data['HomeSpread'] > 0).astype(int)

    quality_results = []

    for season in game_data['Season'].unique():
        season_data = game_data[game_data['Season'] == season]

        # Best effort: a season whose fit fails is reported and skipped so the
        # remaining seasons are still returned.
        try:
            # Every team that played at least one game this season
            all_teams = pd.unique(season_data[['HomeTeamID', 'AwayTeamID']].values.ravel())

            # dtype=float: recent pandas versions return boolean dummies by
            # default; hand statsmodels a plain numeric design matrix.
            home_dummies = pd.get_dummies(season_data['HomeTeamID'], prefix='Home', dtype=float)
            away_dummies = pd.get_dummies(season_data['AwayTeamID'], prefix='Away', dtype=float)
            X = pd.concat([home_dummies, away_dummies], axis=1)
            y = season_data['HomeWin']

            # NOTE(review): the output recorded with this notebook shows
            # repeated `t = np.exp(-z)` lines, presumably overflow warnings
            # raised during this fit - confirm whether some teams' results
            # are perfectly separable.
            params = sm.GLM(y, X, family=sm.families.Binomial()).fit().params

            # Label-based lookup on the coefficient Series; 0 when absent
            home_coef = np.array(
                [params.get(f'Home_{team_id}', 0.0) for team_id in all_teams],
                dtype=float,
            )
            away_coef = np.array(
                [params.get(f'Away_{team_id}', 0.0) for team_id in all_teams],
                dtype=float,
            )

            season_quality = pd.DataFrame({
                'Season': season,
                'TeamID': all_teams,
                # The away coefficient measures how much a visiting team helps
                # the HOME side, so it enters with a negative sign.
                'quality_raw': (home_coef - away_coef) / 2,
            })

            # Normalize quality scores within the season
            mean_quality = season_quality['quality_raw'].mean()
            std_quality = season_quality['quality_raw'].std()

            if std_quality > 0:
                z_score = (season_quality['quality_raw'] - mean_quality) / std_quality
                season_quality['quality'] = np.exp(z_score)
            else:
                season_quality['quality'] = 1.0

            quality_results.append(season_quality[result_columns])

        except Exception as e:
            print(f"  GLM failed for season {season}: {e}")
            continue

    # pd.concat raises on an empty list; return a well-formed empty frame instead
    if not quality_results:
        return pd.DataFrame(columns=result_columns)

    glm_quality = pd.concat(quality_results, ignore_index=True)
    return glm_quality


def calculate_point_spread(df_teams):
    """
    Re-express winner/loser game rows from the home team's point of view.

    ``WLoc`` describes where the WINNING team played:
    - 'H': the winner was the home team
    - 'A': the winner was the away team
    - 'N': neutral site

    Returns a copy of ``df_teams`` (the input is left untouched) with the
    columns HomeTeamID, AwayTeamID, HomeScore, AwayScore, HomeSpread and
    IsNeutral added.
    """
    games = df_teams.copy()

    # The loser occupies the home slot only when the winner was away ('A').
    # For 'H' the winner is at home, and for neutral games ('N') the winner is
    # arbitrarily placed in the home slot, so both cases share one branch.
    winner_was_away = games['WLoc'] == 'A'

    # Fill the team-id pair first, then the score pair, with the same rule.
    for field in ('TeamID', 'Score'):
        winner_values = games['W' + field]
        loser_values = games['L' + field]
        games['Home' + field] = np.where(winner_was_away, loser_values, winner_values)
        games['Away' + field] = np.where(winner_was_away, winner_values, loser_values)

    # Spread as seen by the home slot: > 0 means it won by that many points,
    # < 0 means it lost by that many.
    games['HomeSpread'] = games['HomeScore'] - games['AwayScore']

    # 1 for neutral-site games, 0 otherwise
    games['IsNeutral'] = (games['WLoc'] == 'N').astype(int)

    return games

# Re-express every loaded regular-season game from the home team's perspective
# (adds HomeTeamID/AwayTeamID, HomeScore/AwayScore, HomeSpread and IsNeutral).
df_games = calculate_point_spread(df_teams)


In [2]:
# Only seasons from FIRST_SEASON onwards are used for the GLM quality fit.
FIRST_SEASON = 2015
df_games = df_games[df_games['Season'] >= FIRST_SEASON]
glm_quality = calculate_glm_quality(df_games)
# index=False keeps the row index out of the file, so reloading it with
# pd.read_csv('glm_quality.csv') gives back the same columns instead of
# picking up an extra "Unnamed: 0" column.
glm_quality.to_csv('glm_quality.csv', index=False)
# Alternative to refitting: reload the table written above.
#glm_quality = pd.read_csv('glm_quality.csv')


4. Calculating GLM Quality Metrics...


  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
