In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import numpy as np

In [2]:
# Load the dataset
data = pd.read_csv('barca_dataset.csv')

In [3]:
# Check for missing values in the target column
print("Missing values in MATCH_OUTCOME:", data['MATCH_OUTCOME'].isnull().sum())

Missing values in MATCH_OUTCOME: 0


In [7]:
# Map the outcomes to numerical values
outcome_mapping = {'Win': 0, 'Draw': 1, 'Loss': 2}
data['MATCH_OUTCOME'] = data['MATCH_OUTCOME'].map(outcome_mapping)

In [9]:
# Convert percentage columns to float
percentage_columns = ['POSSESSION', 'PASS_ACCURACY']
for col in percentage_columns:
    data[col] = data[col].str.rstrip('%').astype(float) / 100.0

In [11]:
# Encode home/away and opponent names
data['HOME_OR_AWAY'] = data['HOME_OR_AWAY'].map({'Home': 0, 'Away': 1})
label_encoder = LabelEncoder()
data['OPPONENT'] = label_encoder.fit_transform(data['OPPONENT'])

In [13]:
# Feature Engineering: Add new features
# 1. Average goals scored and conceded in the last 5 matches
data['AVG_GOALS_SCORED_LAST_5'] = data['GOALS_SCORED'].rolling(window=5, min_periods=1).mean()
data['AVG_GOALS_CONCEDED_LAST_5'] = data['GOALS_CONCEDED'].rolling(window=5, min_periods=1).mean()

# 2. Difference between expected goals and actual goals
data['GOALS_DIFF'] = data['GOALS_SCORED'] - data['EXPECTED_GOALS']

# 3. Interaction feature: Possession * Shots on Target
data['POSSESSION_SHOTS'] = data['POSSESSION'] * data['SHOTS_ON_TARGET']

# 4. Win/Loss streak (last 3 matches)
data['WIN_STREAK'] = data['MATCH_OUTCOME'].rolling(window=3, min_periods=1).apply(lambda x: (x == 0).sum())
data['LOSS_STREAK'] = data['MATCH_OUTCOME'].rolling(window=3, min_periods=1).apply(lambda x: (x == 2).sum())

In [15]:
# Select features and target
features = [
    'HOME_OR_AWAY', 'OPPONENT', 'POSSESSION', 'SHOTS_ON_TARGET', 'PASS_ACCURACY', 
    'EXPECTED_GOALS', 'AVG_GOALS_SCORED_LAST_5', 'AVG_GOALS_CONCEDED_LAST_5', 
    'GOALS_DIFF', 'POSSESSION_SHOTS', 'WIN_STREAK', 'LOSS_STREAK'
]
target = 'MATCH_OUTCOME'

X = data[features]
y = data[target]

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Standardization: Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
# Feature Selection: Use Recursive Feature Elimination (RFE)
model_for_rfe = RandomForestClassifier(random_state=42)
rfe = RFE(estimator=model_for_rfe, n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe.transform(X_test_scaled)

In [23]:
# Hyperparameter Tuning for XGBoost
param_grid = {
    'n_estimators': [50,100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

xgb_model = XGBClassifier(random_state=42, objective='multi:softprob')
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_rfe, y_train)

print("Best Parameters for XGBoost:", grid_search.best_params_)

Best Parameters for XGBoost: {'colsample_bytree': 0.9, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 50, 'subsample': 1.0}


In [24]:
# Train the final model with the best parameters
final_model = grid_search.best_estimator_
final_model.fit(X_train_rfe, y_train)

In [25]:
# Evaluate the model on the test set
y_pred = final_model.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {accuracy:.2%}")

Model Accuracy on Test Set: 72.22%


In [26]:
# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(final_model, X_train_rfe, y_train, cv=kfold, scoring='accuracy')
print(f"K-Fold Cross-Validation Accuracy: {cv_scores.mean():.2%} (±{cv_scores.std():.2%})")

K-Fold Cross-Validation Accuracy: 73.52% (±9.63%)


In [27]:
# Function to predict outcome
def predict_outcome(opponent_name, home_or_away):
    # Encode the opponent name
    try:
        opponent_encoded = label_encoder.transform([opponent_name])[0]
    except ValueError:
        print(f"Opponent '{opponent_name}' not found in the dataset. Please enter a valid opponent name.")
        return None

    # Encode home/away
    home_or_away_encoded = 0 if home_or_away.lower() == 'home' else 1

    # Use average values for other features
    input_data = [[
        home_or_away_encoded, opponent_encoded, X_train['POSSESSION'].mean(), 
        X_train['SHOTS_ON_TARGET'].mean(), X_train['PASS_ACCURACY'].mean(), 
        X_train['EXPECTED_GOALS'].mean(), X_train['AVG_GOALS_SCORED_LAST_5'].mean(), 
        X_train['AVG_GOALS_CONCEDED_LAST_5'].mean(), X_train['GOALS_DIFF'].mean(), 
        X_train['POSSESSION_SHOTS'].mean(), X_train['WIN_STREAK'].mean(), 
        X_train['LOSS_STREAK'].mean()
    ]]

    # Scale the input data
    input_data_scaled = scaler.transform(input_data)

    # Transform input data using RFE
    input_data_rfe = rfe.transform(input_data_scaled)

    # Predict probabilities
    probabilities = final_model.predict_proba(input_data_rfe)[0]
    return {'Win': probabilities[0], 'Draw': probabilities[1], 'Loss': probabilities[2]}

In [71]:
# Ask user for input
while True:
    print("\nEnter opponent name and home/away status to predict the match outcome.")
    opponent_name = input("Enter opponent name: ")
    home_or_away = input("Is the match Home or Away? (Enter 'Home' or 'Away'): ")

    # Validate home/away input
    if home_or_away.lower() not in ['home', 'away']:
        print("Invalid input for home/away. Please enter 'Home' or 'Away'.")
        continue

    # Predict outcome
    outcome = predict_outcome(opponent_name, home_or_away)
    if outcome:
        print(f"\nPredicted Outcome Probabilities:")
        print(f"Win: {outcome['Win']:.2%}")
        print(f"Draw: {outcome['Draw']:.2%}")
        print(f"Loss: {outcome['Loss']:.2%}")

    # Ask if the user wants to predict another match
    another_prediction = input("\nDo you want to predict another match? (Enter 'yes' or 'no'): ")
    if another_prediction.lower() != 'yes':
        break


Enter opponent name and home/away status to predict the match outcome.


Enter opponent name:  PSG
Is the match Home or Away? (Enter 'Home' or 'Away'):  Home





Predicted Outcome Probabilities:
Win: 89.34%
Draw: 10.02%
Loss: 0.64%



Do you want to predict another match? (Enter 'yes' or 'no'):  yes



Enter opponent name and home/away status to predict the match outcome.


Enter opponent name:  PSG
Is the match Home or Away? (Enter 'Home' or 'Away'):  Away





Predicted Outcome Probabilities:
Win: 83.70%
Draw: 15.38%
Loss: 0.92%



Do you want to predict another match? (Enter 'yes' or 'no'):  no
