In [3]:
import pandas as pd
import os

data_folder = 'processed_data'
csv_files = [f for f in os.listdir(data_folder) if f.endswith('.csv')]
data_frames = {file: pd.read_csv(os.path.join(data_folder, file)) for file in csv_files}


In [34]:
col_to_remove = ['player_id', 'year']
columns = list(data_frames['players.csv'].columns)
for col in col_to_remove:
    if col in columns:    
        columns.remove(col)


In [43]:
columns

['overall_rating',
 'potential',
 'attacking_work_rate',
 'defensive_work_rate',
 'crossing',
 'preferred_foot_left',
 'preferred_foot_right']

In [4]:
# Function to extract player stats for home and away players from a single game
def get_player_stats(game_row, players_df):
    player_stats = []
    year = game_row['year']
    
    # Extract stats for home players
    for i in range(1, 12):
        home_player_id = game_row[f'home_player_{i}']
        home_player_stats = players_df[(players_df['player_id'] == home_player_id) & (players_df['year'] == year)]
        
        # Check if player exists in the dataset for the given year
        # If not, look for the most recent data available
        while year > 2008 and home_player_stats.empty:
            year -= 1
            home_player_stats = players_df[(players_df['player_id'] == home_player_id) & (players_df['year'] == year)]
        if not home_player_stats.empty:
            home_player_stats = home_player_stats.drop(columns=col_to_remove)
            player_stats.append(home_player_stats.iloc[0])
        else:
            player_stats.append([0] * (players_df.shape[1] - 2))  # If no data, append zeroes

    # Extract stats for away players
    for i in range(1, 12):
        away_player_id = game_row[f'away_player_{i}']
        away_player_stats = players_df[(players_df['player_id'] == away_player_id) & (players_df['year'] == year)]
        
        # Check if player exists in the dataset for the given year
        # If not, look for the most recent data available
        while year > 2008 and away_player_stats.empty:
            year -= 1
            away_player_stats = players_df[(players_df['player_id'] == home_player_id) & (players_df['year'] == year)]
        if not away_player_stats.empty:
            away_player_stats = away_player_stats.drop(columns=col_to_remove)
            player_stats.append(away_player_stats.iloc[0])
        else:
            player_stats.append([0] * (players_df.shape[1] - 2))  # If no data, append zeroes

    return player_stats



In [26]:
from sklearn.model_selection import train_test_split


home_players_formation = [f"home_player_X{i}" for i in range(1, 12)]
home_players_formation.extend([f"home_player_Y{i}" for i in range(1, 12)])
away_players_formation = [f"away_player_X{i}" for i in range(1, 12)]
away_players_formation.extend([f"away_player_Y{i}" for i in range(1, 12)])

# Load the datasets
# games = data_frames['games.csv']
games = data_frames['games.csv']
players = data_frames['players.csv']

# Prepare the features and labels (assuming 'home_team_win' as the target)
X = []
y = games['win_by'] 

# Extract features dynamically for each game
for index, game_row in games.iterrows():
    player_stats = get_player_stats(game_row, players)
    x_col = []
    x_col.extend([item for sublist in player_stats for item in sublist])
    x_col.extend([game_row[col] for col in home_players_formation])
    x_col.extend([game_row[col] for col in away_players_formation]) 
    X.append(x_col) 

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [59]:
from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values by filling with the mean
    ('classifier', ensemble.GradientBoostingRegressor(**params))
])

pipeline.fit(X_train, y_train)
y_win_by = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_win_by)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 1.3700798227836752


In [17]:
def get_player_stats_test(game_row, players_df):
    player_stats = []
    
    # Extract stats for home players
    for i in range(1, 12):
        home_player_id = game_row[f'home_player_{i}']
        home_player_stats = players_df[(players_df['player_id'] == home_player_id)]
        
        if not home_player_stats.empty:
            home_player_stats = home_player_stats.drop(columns=['player_id'])
            player_stats.append(home_player_stats.iloc[0])
        else:
            player_stats.append([0] * (players_df.shape[1] - 1))  # If no data, append zeroes

    # Extract stats for away players
    for i in range(1, 12):
        away_player_id = game_row[f'away_player_{i}']
        away_player_stats = players_df[(players_df['player_id'] == away_player_id)]
         
        if not away_player_stats.empty:
            away_player_stats = away_player_stats.drop(columns=['player_id'])
            player_stats.append(away_player_stats.iloc[0])
        else:
            player_stats.append([0] * (players_df.shape[1] - 1))  # If no data, append zeroes

    return player_stats

In [61]:
predict_X = []

tests = data_frames["test.csv"]
player_mean = data_frames['players.csv'].groupby('player_id').mean().drop('year', axis=1).reset_index()

for index, test_row in tests.iterrows():
    player_stats = get_player_stats_test(test_row, player_mean)
    x_col = []
    x_col.extend([item for sublist in player_stats for item in sublist])
    # x_col.extend([test_row[col] for col in home_players_formation])
    # x_col.extend([test_row[col] for col in away_players_formation]) 
    predict_X.append(x_col) 

In [62]:
predict_res = pipeline.predict(predict_X)


In [64]:
df = pd.read_csv('winner_boost.csv')
df['winby'] = predict_res
df.to_csv('winby_and_winner.csv', index=False)