## Data Pre-processing

In [9]:
import pandas as pd
import numpy as np

df= pd.read_csv('results.csv')
wc_games = pd.read_excel('wc_results.xlsx')
N = len(wc_games)
df = pd.concat([df, wc_games], axis=0)
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

# Only include data from 1995 onwards. This marks roughly the beginning of the professional era
df = df[df['date'] > '1995-01-01']

def determine_winner_and_loser(row):
    # Function which adds winner and loser columns to the df
    if row['home_score'] > row['away_score']:
        winner = row['home_team']
        loser = row['away_team']
    elif row['home_score'] < row['away_score']:
        winner = row['away_team']
        loser = row['home_team']
    else:
        winner = 'Draw'
        loser = 'Draw'
    return winner, loser

def calculate_team_form(df, team_name, current_row_index, n_games):
    # Function which calculates form of each team over the last N games
    team_form = []
    count = 0
    games = df.copy()
    games_b = games.iloc[:current_row_index]
    
    for index in range(len(games_b)-1, -1, -1):
        row1 = games_b.iloc[index]

        if row1['home_team'] == team_name or row1['away_team'] == team_name:

            if row1['winner'] == 'draw':
                team_form.append(0.5)
            elif row1['winner'] == team_name:
                team_form.append(1)
            else:
                team_form.append(0)


       
    
    
    return sum(team_form[:n_games])

# Apply the function to create new 'winner' and 'loser' columns
df[['winner', 'loser']] = df.apply(determine_winner_and_loser, axis=1).apply(pd.Series)

# Initialise columns to store the live ranking points of both teams
df['ranking_points_home'] = 0
df['ranking_points_away'] = 0

# Add a column representing the margin in favour of the 'home' team
df['margin'] = df['home_score'] - df['away_score']

# Add a column which specifies match result as either home_win away_win or draw
df['result'] = df['margin'].apply(lambda x: 'home_win' if x > 0 else ('away_win' if x < 0 else 'draw'))

# Initialise rankings dictionary
ranking_points = {'Scotland': 80, 'England': 80, 'Wales': 80, 'Italy': 80, 'France': 80, 'Ireland': 80, 'New Zealand': 80, 'Argentina': 80, 'South Africa': 80, 'Australia': 80 }

for i, row in df.iterrows():
    # For each match in the dataframe add the live rankings of both teams which are kept track of in the dictionary ranking_points
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Update ranking_points_home and ranking_points_away
    df.at[i, 'ranking_points_home'] = ranking_points[home_team]
    df.at[i, 'ranking_points_away'] = ranking_points[away_team]
    if row['neutral'] == True:
        home_points = ranking_points[home_team]
    else:
        home_points = ranking_points[home_team] + 3
    away_points = ranking_points[away_team]
    gap = home_points - away_points
    if gap < -10:
        gap = -10
    elif gap > 10:
        gap = 10
    if row['winner'] == 'Draw':
        core = gap*0.1
    elif row['winner'] == home_team:
        core = 1 - (gap*0.1)
    else:
        core = 1 + (gap*0.1)
        
    if np.abs(row['home_score'] - row['away_score']) > 15:
        core *= 1.5
        
    if row['world_cup'] == True:
        core *= 2
        
    if row['winner'] != 'Draw':
        ranking_points[row['winner']] += core
        ranking_points[row['loser']] -= core
    else:
        ranking_points[home_team] -= core
        ranking_points[away_team] += core
        
    
df.reset_index(drop=True, inplace=True)

for idx, row in df.iterrows():
    df.at[idx, 'home_form'] = calculate_team_form(df, row['home_team'], idx, n_games=5)
    df.at[idx, 'away_form'] = calculate_team_form(df, row['away_team'], idx, n_games=5)
    
wc_games = df.iloc[-N:]
df_sliced = df.iloc[:-N]


## Predicting Points Margin between Team 1 and Team 2. 

A positive margin is in favour of team 1 (or the team designated as home in the dataframe).

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# Consider games from 1996 onwards so the rankings have had time to calibrate
data = df_sliced.copy()
data = data[data['date'] > '1996-01-01']

# Encode the neutral and world cup columns as binary indicator variables
data['neutral'] = data['neutral'].astype(int)
data['world_cup'] = data['world_cup'].astype(int)

# Split into train and test datasets, using 1/1/2017 as the cut-off point. 
# Models are trained on data from games before this date and evaluated on data from games after this date
train_data = data[data['date'] < '2017-01-01']
test_data = data[data['date'] >= '2017-01-01']
X_train = train_data[['neutral', 'world_cup', 'ranking_points_home', 'ranking_points_away', 'home_form', 'away_form']]
y_train = train_data['margin']
X_test = test_data[['neutral', 'world_cup', 'ranking_points_home', 'ranking_points_away', 'home_form', 'away_form']]
y_test = test_data['margin']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Support Vector Regressor': SVR()
}

for model_name, model in models.items():
    # Fit the model to the training data
    model.fit(X_train_scaled, y_train)
    
    # Predict on the test data
    y_pred = model.predict(X_test_scaled)
    
    # Calculate mean squared error (MSE) and R-squared (R2) for evaluation
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print results
    print(f"Model: {model_name}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R-squared (R2): {r2:.2f}")
    print("="*50)

Model: Linear Regression
Mean Squared Error (MSE): 203.76
R-squared (R2): 0.45
Model: Random Forest Regressor
Mean Squared Error (MSE): 226.61
R-squared (R2): 0.38
Model: Support Vector Regressor
Mean Squared Error (MSE): 234.31
R-squared (R2): 0.36


## Predicting Match Winner

In [11]:
y_train = train_data['result']
y_test = test_data['result']

# Initialise and fit a Random Forest Classifier

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred = clf.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print results for Random Forest
print('Random Forest Results')
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_rep)



# Initialize and fit a Logistic Regression classifier
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_logreg = logreg.predict(X_test_scaled)

# Evaluate the Logistic Regression model
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
classification_rep_logreg = classification_report(y_test, y_pred_logreg)

# Print results for Logistic Regression
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_logreg:.2f}")
print("Classification Report:")
print(classification_rep_logreg)
print("="*50)



# Initialize and fit a Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_decision_tree = decision_tree.predict(X_test_scaled)

# Evaluate the Decision Tree model
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
classification_rep_decision_tree = classification_report(y_test, y_pred_decision_tree)

# Print results for Decision Tree
print("Decision Tree Results:")
print(f"Accuracy: {accuracy_decision_tree:.2f}")
print("Classification Report:")
print(classification_rep_decision_tree)
print("="*50)


Random Forest Results
Accuracy: 0.68
Classification Report:
              precision    recall  f1-score   support

    away_win       0.63      0.60      0.61       121
        draw       0.00      0.00      0.00         9
    home_win       0.71      0.77      0.74       185

    accuracy                           0.68       315
   macro avg       0.45      0.45      0.45       315
weighted avg       0.66      0.68      0.67       315

Logistic Regression Results:
Accuracy: 0.74
Classification Report:
              precision    recall  f1-score   support

    away_win       0.71      0.65      0.68       121
        draw       0.00      0.00      0.00         9
    home_win       0.75      0.83      0.79       185

    accuracy                           0.74       315
   macro avg       0.49      0.49      0.49       315
weighted avg       0.71      0.74      0.72       315

Decision Tree Results:
Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   supp

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## RWC 2023

In [12]:
# Train on all data pre RWC 2023 and test on the RWC games
X_train = data[['neutral', 'world_cup', 'ranking_points_home', 'ranking_points_away', 'home_form', 'away_form']]
y_train = data['result']
X_test = wc_games[['neutral', 'world_cup', 'ranking_points_home', 'ranking_points_away', 'home_form', 'away_form']]
y_test = wc_games['result']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit a Logistic Regression classifier
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_scaled, y_train)

# Predict on the test data
y_pred_logreg = logreg.predict(X_test_scaled)

# Evaluate the Logistic Regression model
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
classification_rep_logreg = classification_report(y_test, y_pred_logreg)

# Print results for Logistic Regression
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_logreg:.2f}")
print("Classification Report:")
print(classification_rep_logreg)
print("="*50)

Logistic Regression Results:
Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

    away_win       1.00      1.00      1.00         1
    home_win       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



So far, all of the matches have been predicted correctly, perhaps surprisingly as SA beat Scotland.