In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import ensemble
import pandas as pd
import numpy as np
import pandas as pd
import os


In [2]:
# Directory that holds the pre-processed CSV inputs.
data_folder = 'processed_data'

# Every entry directly inside the data folder whose name ends in '.csv'.
csv_files = [
    entry
    for entry in os.listdir(data_folder)
    if entry.endswith('.csv')
]

# Load each CSV once; the dict is keyed by file name (e.g. 'games.csv').
data_frames = {
    csv_name: pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in csv_files
}

In [4]:
# Add the two derived target columns in place on the loaded games table.
games_df = data_frames['games.csv']
home_goals = games_df['home_team_goal']
away_goals = games_df['away_team_goal']

# 1.0 when the home side scored strictly more goals, 0.0 otherwise.
games_df['home_team_win'] = home_goals.gt(away_goals).astype(float)
# Absolute goal difference between the two sides, stored as an integer.
games_df['win_by'] = home_goals.sub(away_goals).abs().astype(int)

In [5]:
# Earliest year the backwards search for a player's stats will reach.
_EARLIEST_STATS_YEAR = 2008


def _lookup_player_stats(player_id, year, players_df, min_year=_EARLIEST_STATS_YEAR):
    """Return one player's stats for `year`, falling back to earlier years.

    The search starts at `year` and, while nothing is found and the year is
    still above `min_year`, steps back one year at a time.

    Returns the first matching row as a Series with 'player_id' and 'year'
    removed, or a list of zeroes of the same width when no row is found.
    """
    # Filter by player once; the per-year probes below then scan only this subset.
    player_rows = players_df[players_df['player_id'] == player_id]

    # Work on a private copy of the year so one player's fallback can never
    # change the starting year used for any other player.
    lookup_year = year
    match = player_rows[player_rows['year'] == lookup_year]
    while lookup_year > min_year and match.empty:
        lookup_year -= 1
        match = player_rows[player_rows['year'] == lookup_year]

    if match.empty:
        return [0] * (players_df.shape[1] - 2)  # If no data, use zeroes
    return match.drop(columns=['player_id', 'year']).iloc[0]


# Function to extract player stats for home and away players from a single game
def get_player_stats(game_row, players_df):
    """Collect the stats of all 22 players listed on one game row.

    Parameters
    ----------
    game_row : mapping-like row (e.g. a pandas Series)
        Must provide 'year', 'home_player_1'..'home_player_11' and
        'away_player_1'..'away_player_11'.
    players_df : pandas.DataFrame
        Player stats with at least the columns 'player_id' and 'year'.

    Returns
    -------
    list
        22 entries, home players 1-11 followed by away players 1-11. Each
        entry is either a Series of that player's stats (without 'player_id'
        and 'year') or a list of zeroes of the same width when no data exists.

    Every player is looked up independently, starting from the game's year,
    and each side is looked up by its own player id.
    """
    year = game_row['year']
    player_stats = []
    for side in ('home', 'away'):
        for i in range(1, 12):
            player_id = game_row[f'{side}_player_{i}']
            player_stats.append(_lookup_player_stats(player_id, year, players_df))
    return player_stats



In [6]:

# Formation columns per side: the eleven X coordinates first, then the eleven Y coordinates.
home_players_formation = [
    f"home_player_{axis}{i}" for axis in ("X", "Y") for i in range(1, 12)
]
away_players_formation = [
    f"away_player_{axis}{i}" for axis in ("X", "Y") for i in range(1, 12)
]

# Source tables
games = data_frames['games.csv']
players = data_frames['players.csv']

# Target: the 'home_team_win' column of the games table.
y = games['home_team_win']

# One flat feature row per game: the stats of all 22 players, followed by
# the home formation columns and then the away formation columns.
X = []
for index, game_row in games.iterrows():
    player_stats = get_player_stats(game_row, players)
    x_col = []
    for stats in player_stats:
        x_col.extend(stats)
    x_col += [game_row[col] for col in home_players_formation]
    x_col += [game_row[col] for col in away_players_formation]
    X.append(x_col)

# Hold out 5% of the games for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42
)


In [None]:
# Random-forest baseline: mean-impute missing features, then fit a
# RandomForestRegressor on the training split. Because a regressor is used,
# predictions are continuous scores rather than class labels.
# NOTE: the estimator step keeps its original name 'classifier' so that any
# existing pipeline.named_steps['classifier'] lookups continue to work.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values by filling with the mean
    ('classifier', RandomForestRegressor(random_state=42)),  # fixed seed -> reproducible fits
])

pipeline.fit(X_train, y_train)

In [None]:
def get_player_stats_test(game_row, players_df):
    """Collect stats for the 22 players on a row, matching on player id only.

    `game_row` must provide 'home_player_1'..'home_player_11' and
    'away_player_1'..'away_player_11'; `players_df` must have a 'player_id'
    column. No year filtering is applied here.

    Returns a list of 22 entries (home 1-11, then away 1-11). Each entry is
    the first matching row of `players_df` without 'player_id', or a list of
    zeroes of that width when the player is not present.
    """
    slots = [
        f'{side}_player_{n}'
        for side in ('home', 'away')
        for n in range(1, 12)
    ]

    collected = []
    for slot in slots:
        matches = players_df[players_df['player_id'] == game_row[slot]]
        if matches.empty:
            # Unknown player: placeholder zeroes, one per non-id column.
            collected.append([0] * (players_df.shape[1] - 1))
            continue
        collected.append(matches.drop(columns=['player_id']).iloc[0])
    return collected


    

In [None]:
tests = data_frames["test.csv"]

# One row per player: every stat column averaged over that player's rows,
# with the averaged 'year' column removed and 'player_id' restored as a column.
player_mean = (
    data_frames['players.csv']
    .groupby('player_id')
    .mean()
    .drop(columns='year')
    .reset_index()
)

# Build one flat feature row per test game: stats of all 22 players, then
# the home formation columns, then the away formation columns.
predict_X = []
for index, test_row in tests.iterrows():
    player_stats = get_player_stats_test(test_row, player_mean)
    x_col = []
    for stats in player_stats:
        x_col.extend(stats)
    x_col += [test_row[col] for col in home_players_formation]
    x_col += [test_row[col] for col in away_players_formation]
    predict_X.append(x_col)

In [None]:
# Score the test features with the random-forest pipeline.
# The predictions go into their own variable: assigning them back to
# predict_X would replace the feature matrix with model output, which makes
# this cell fail on a second run and corrupts any later cell that needs the
# features again.
rf_predictions = pipeline.predict(predict_X)
predict_X_np = np.array(rf_predictions)

# Submission table: the model output as 'winner' plus a constant 'winby' of 1.
df = pd.DataFrame(predict_X_np, columns=['winner'])

df['winby'] = 1
df.to_csv('winner_small.csv', index=False)


In [None]:
# Hyper-parameters passed straight to GradientBoostingRegressor.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    "random_state": 42,  # fixed seed so repeated fits give the same model
}

# Gradient-boosting alternative: mean-impute missing features, then fit a
# GradientBoostingRegressor on the same training split.
boosting_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values by filling with the mean
    ('regressor', ensemble.GradientBoostingRegressor(**params))
])

boosting_pipeline.fit(X_train, y_train)

In [None]:
# Score the test features with the gradient-boosting pipeline. Calling
# `pipeline` here would write random-forest predictions into the boosting file.
# predict_X must be the 2-D feature matrix built from test.csv, not an array
# of earlier predictions.
boosting_predict_Y = boosting_pipeline.predict(predict_X)
boosting_predict_Y = np.array(boosting_predict_Y)

# Submission table: the model output as 'winner' plus a constant 'winby' of 1.
df = pd.DataFrame(boosting_predict_Y, columns=['winner'])

df['winby'] = 1
df.to_csv('winner_boost.csv', index=False)
