In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the datasets
df_20_21 = pd.read_excel("Premier league 20_21.xlsx")  # Load premier league validation dataset
df_21_22 = pd.read_excel("Premier league 21_22.xlsx")
df_22_23 = pd.read_excel("Premier league 22_23.xlsx")
df_23_24 = pd.read_excel("Premier league 23_24.xlsx")

In [None]:
# Combine Premier League datasets for training and testing
df = pd.concat([df_21_22, df_22_23, df_23_24], ignore_index=True)

In [None]:
# Select relevant columns
selected_columns = [
    'FTHG', 'FTAG', 'HTHG', 'HTAG',             # Goals scored (full-time and halftime)
    'B365H', 'B365D', 'B365A',                  # Bet365 odds for home win, draw, away win
    'IWH', 'IWD', 'IWA',                        # Alternative odds for win, draw, and loss
    'AHCh',                                     # Asian Handicap for home win
    'B365CAHH', 'B365CAHA',                     # Asian Handicap betting odds from Bet365
    'P>2.5', 'P<2.5',                           # Probability of over/under 2.5 goals
    'FTR'                                       # Target variable: Full-Time Result
]


In [None]:
# Filter the training data
df = df[selected_columns]

In [None]:
# Handle missing values
missing_value_threshold = 0.3
for column in df.columns:
    if df[column].isnull().mean() > missing_value_threshold:
        df.drop(column, axis=1, inplace=True)
df.fillna(df.mean(numeric_only=True), inplace=True)

In [None]:
# Add a TotalGoals column for the target variable
df['TotalGoals'] = df['FTHG'] + df['FTAG']
target = df['TotalGoals']
features = df.drop(columns=['TotalGoals', 'FTR'])  # Drop non-relevant columns for training

In [None]:
# Process Bundesliga validation dataset
df_20_21 = df_20_21[selected_columns]  # Ensure the same columns are present
df_20_21.fillna(df_20_21.mean(numeric_only=True), inplace=True)
df_20_21['TotalGoals'] = df_20_21['FTHG'] + df_20_21['FTAG']  # Add TotalGoals column
validation_features = df_20_21.drop(columns=['TotalGoals', 'FTR'])
validation_target = df_20_21['TotalGoals']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20_21.fillna(df_20_21.mean(numeric_only=True), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20_21['TotalGoals'] = df_20_21['FTHG'] + df_20_21['FTAG']  # Add TotalGoals column


In [None]:
# Split the Premier League data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
validation_features = scaler.transform(validation_features)  # Scale validation data

In [None]:
# Train the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred_test = model.predict(X_test)
y_pred_validation = model.predict(validation_features)

In [None]:
# Evaluate model performance
print("Evaluation on Test Data:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_test))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_test))
print("R^2 Score:", r2_score(y_test, y_pred_test))

print("\nEvaluation on Validation Data (Bundesliga 20-21):")
print("Mean Absolute Error:", mean_absolute_error(validation_target, y_pred_validation))
print("Mean Squared Error:", mean_squared_error(validation_target, y_pred_validation))
print("R^2 Score:", r2_score(validation_target, y_pred_validation))

Evaluation on Test Data:
Mean Absolute Error: 0.02026315789473682
Mean Squared Error: 0.009473684210526313
R^2 Score: 0.9964423382552663

Evaluation on Validation Data (Bundesliga 20-21):
Mean Absolute Error: 0.030473684210526302
Mean Squared Error: 0.023902631578947363
R^2 Score: 0.9922273816386827
