In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn import ensemble

In [2]:
# Load datasets
games = pd.read_csv('processed_data/games.csv')
players = pd.read_csv('processed_data/players.csv')
test = pd.read_csv('processed_data/test.csv')

# Function to compute aggregated player stats for a single game
def compute_team_stats(player_ids, year, players_df):
    # Filter player data by the specific year and player_ids involved in the game
    filtered_players = players_df[(players_df['player_id'].isin(player_ids)) & (players_df['year'] == year)]
    
    # Calculate team-level statistics (e.g., mean player rating)
    if not filtered_players.empty:
        return filtered_players['overall_rating'].mean()
    return 0  # Return 0 if no players match (as a fallback)

# Iterate through each game and calculate aggregated player stats
home_team_ratings = []
away_team_ratings = []

for index, game in games.iterrows():
    year = game['year']
    
    # Get the home and away player IDs
    home_players = game[['home_player_1', 'home_player_2', 'home_player_3', 'home_player_4', 
                         'home_player_5', 'home_player_6', 'home_player_7', 'home_player_8', 
                         'home_player_9', 'home_player_10', 'home_player_11']]
    
    away_players = game[['away_player_1', 'away_player_2', 'away_player_3', 'away_player_4', 
                         'away_player_5', 'away_player_6', 'away_player_7', 'away_player_8', 
                         'away_player_9', 'away_player_10', 'away_player_11']]

    # Compute the mean player rating for home and away teams
    home_rating = compute_team_stats(home_players, year, players)
    away_rating = compute_team_stats(away_players, year, players)

    # Store the computed team ratings
    home_team_ratings.append(home_rating)
    away_team_ratings.append(away_rating)

# Append the new features to the games DataFrame
games['home_team_rating'] = home_team_ratings
games['away_team_rating'] = away_team_ratings
games['rating_diff'] = games['home_team_rating'] - games['away_team_rating']

# Now we can proceed with the model training as before
# Define the target: win_by value (home_team_goal - away_team_goal)
games['win_by'] = games['home_team_goal'] - games['away_team_goal']

# Features and target
features = ['home_team_rating', 'away_team_rating', 'rating_diff']
X = games[features]
y = games['win_by']


In [17]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

# Predicting win_by value
win_by_model_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values by filling with the mean
    ('regressor', ensemble.GradientBoostingRegressor(**params))
])
win_by_model_pipeline.fit(X_train, y_train)

y_pred_winby = win_by_model_pipeline.predict(X_val)
mse = mean_squared_error(y_val, y_pred_winby)
print(f'Mean Squared Error: {mse}')

# Predicting is_home_win
games['is_home_win'] = games['win_by'].apply(lambda x: 1 if x > 0 else 1)
y_winner = games['is_home_win']

is_home_winner_model_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values by filling with the mean
    ('regressor', ensemble.GradientBoostingRegressor(**params))
])
is_home_winner_model_pipeline.fit(X_train, y_winner.loc[y_train.index])

y_is_home_win = is_home_winner_model_pipeline.predict(X_val)
mse = mean_squared_error(y_val, y_is_home_win)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 2.6023900631586088
Mean Squared Error: 3.442267108833283


In [9]:
# Function to compute aggregated player stats for a single team using the latest available rating for each player
def compute_team_stats_latest(player_ids, players_df):
    # Remove NA values from player_ids list
    player_ids = player_ids.dropna().tolist()

    # If no valid player IDs, return a default value (e.g., 0)
    if len(player_ids) == 0:
        return 0
    
    # Filter player data to only include the relevant players
    filtered_players = players_df[players_df['player_id'].isin(player_ids)]
    
    # For each player, use the most recent (highest year) rating available
    if not filtered_players.empty:
        latest_ratings = filtered_players.groupby('player_id')['overall_rating'].max()  # Get the latest rating per player
        return latest_ratings.mean()
    
    return 0  # Return 0 if no players match (as a fallback)


# Lists to store the computed ratings for home and away teams
home_team_ratings = []
away_team_ratings = []
rating_diffs = []

# Iterate through each test game and calculate aggregated player stats for both home and away teams
for index, game in test.iterrows():
    # Get the home and away player IDs from the test game
    home_players = game[['home_player_1', 'home_player_2', 'home_player_3', 'home_player_4', 
                         'home_player_5', 'home_player_6', 'home_player_7', 'home_player_8', 
                         'home_player_9', 'home_player_10', 'home_player_11']]
    
    away_players = game[['away_player_1', 'away_player_2', 'away_player_3', 'away_player_4', 
                         'away_player_5', 'away_player_6', 'away_player_7', 'away_player_8', 
                         'away_player_9', 'away_player_10', 'away_player_11']]

    # Compute the mean player ratings for the home and away teams using the latest available ratings
    home_rating = compute_team_stats_latest(home_players, players)
    away_rating = compute_team_stats_latest(away_players, players)

    # Store the computed team ratings
    home_team_ratings.append(home_rating)
    away_team_ratings.append(away_rating)

    # Calculate the rating difference (home team rating minus away team rating)
    rating_diffs.append(home_rating - away_rating)

# Append the new features to the test DataFrame
test['home_team_rating'] = home_team_ratings
test['away_team_rating'] = away_team_ratings
test['rating_diff'] = rating_diffs

# Output the result
print(test[['home_team_id', 'away_team_id', 'home_team_rating', 'away_team_rating', 'rating_diff']])


     home_team_id  away_team_id  home_team_rating  away_team_rating  \
0           239.0         144.0         75.272727         83.272727   
1           117.0         246.0         72.000000         74.090909   
2            92.0           NaN         71.700000         72.500000   
3            71.0           NaN         75.500000         69.000000   
4           182.0         176.0         73.444444         70.454545   
..            ...           ...               ...               ...   
995         111.0         204.0         80.500000         70.700000   
996         224.0         131.0         73.272727         66.833333   
997         261.0          69.0         68.363636         80.700000   
998         143.0         135.0         69.750000         69.333333   
999         129.0           NaN         67.857143         75.000000   

     rating_diff  
0      -8.000000  
1      -2.090909  
2      -0.800000  
3       6.500000  
4       2.989899  
..           ...  
995     9.8000

In [21]:
test_features = test[features]
y_reg_pred = win_by_model_pipeline.predict(test_features)
y_is_home_winner_pred = is_home_winner_model_pipeline.predict(test_features)


In [22]:
y_is_home_winner_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [15]:
df = pd.DataFrame({
    'winner': y_reg_pred,
    'winby': 1 
})

# Save the DataFrame to a CSV file
df.to_csv('example_winner.csv', index=False)

In [16]:
df = pd.read_csv('data/example.csv')
df['winby'] = y_class_pred

df.to_csv('example_winby.csv', index=False)