# Predicting Manchester United's Match Outcomes: A Machine Learning Approach

This project builds a machine learning model that predicts the results of Manchester United's matches in the English Premier League. It first scrapes historical match data from the web, cleans it, and visualises it with charts to understand the data better. The data is then split into two parts: one for training the model and one for testing it. The model is built with TensorFlow, a library for building machine learning models, and learns from the training data. After training, it is evaluated on the test data to see how well it predicts match outcomes. Finally, the trained model is saved so it can be reused to predict the results of future Manchester United matches, showing how computer science can be applied to sports.

# Importing the dependencies

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf
import requests
from sklearn.model_selection import train_test_split
import re
from tensorflow.keras import layers
from tensorflow.keras import regularizers
import matplotlib.pyplot as plt

# Scraping and Cleaning the Data

In [None]:
# Manchester United's Premier League match log, one URL per season.
# url1[i] and url2[i] are paired by position and must describe the same season.
url1 = np.array([
    'https://fbref.com/en/squads/19538871/2023-2024/matchlogs/c9/schedule/Manchester-United-Scores-and-Fixtures-Premier-League',
    'https://fbref.com/en/squads/19538871/2022-2023/matchlogs/c9/schedule/Manchester-United-Scores-and-Fixtures-Premier-League',
    'https://fbref.com/en/squads/19538871/2021-2022/matchlogs/c9/schedule/Manchester-United-Scores-and-Fixtures-Premier-League',
    'https://fbref.com/en/squads/19538871/2020-2021/matchlogs/c9/schedule/Manchester-United-Scores-and-Fixtures-Premier-League',
    'https://fbref.com/en/squads/19538871/2019-2020/matchlogs/c9/schedule/Manchester-United-Scores-and-Fixtures-Premier-League',
    'https://fbref.com/en/squads/19538871/2018-2019/matchlogs/c9/schedule/Manchester-United-Scores-and-Fixtures-Premier-League',
    'https://fbref.com/en/squads/19538871/2017-2018/matchlogs/c9/schedule/Manchester-United-Scores-and-Fixtures-Premier-League'])

# League-wide squad statistics, one URL per season, in the same order as url1.
# The first entry is pinned to 2023-2024 like every other entry: an undated
# '.../comps/9/Premier-League-Stats' URL carries no season, so it presumably
# follows whichever season is current and would stop matching url1[0].
url2 = np.array([
    'https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats',
    'https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats',
    'https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats',
    'https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats',
    'https://fbref.com/en/comps/9/2019-2020/2019-2020-Premier-League-Stats',
    'https://fbref.com/en/comps/9/2018-2019/2018-2019-Premier-League-Stats',
    'https://fbref.com/en/comps/9/2017-2018/2017-2018-Premier-League-Stats'])

from io import StringIO

# Seconds to wait on each HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

df_matches = pd.DataFrame()
season_frames = []

# url1 and url2 are paired by position (one entry per season), so walk them
# in lockstep. A season is skipped entirely when either page is unusable, so
# match rows from one season are never merged with another season's stats.
for url_1, url_2 in zip(url1, url2):
    # --- Match log for this season ---
    response1 = requests.get(url_1, timeout=REQUEST_TIMEOUT)
    if response1.status_code != 200:
        print(f"Failed to retrieve the webpage {url_1}. Status code: {response1.status_code}")
        continue

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    tables1 = pd.read_html(StringIO(response1.text))
    if len(tables1) == 0:
        print(f"No tables found on the webpage {url_1}.")
        continue

    # Assumes the match log is the first table on the page.
    # Only venue, result and opponent are required. 'Attendance' is dropped
    # again before modelling, so a missing attendance figure alone should not
    # discard a match.
    df_matches = tables1[0][['Venue', 'Result', 'Opponent', 'Attendance']].dropna(
        subset=['Venue', 'Result', 'Opponent'])

    # --- League-wide squad statistics for the same season ---
    response2 = requests.get(url_2, timeout=REQUEST_TIMEOUT)
    if response2.status_code != 200:
        print(f"Failed to retrieve the webpage {url_2}. Status code: {response2.status_code}")
        continue

    tables2 = pd.read_html(StringIO(response2.text))
    # Assumes the squad statistics are the third table on the page.
    if len(tables2) <= 2:
        print(f"No tables found on the webpage {url_2}.")
        continue

    df_strength = tables2[2]

    # Keep only the second level of the two-level column header, then keep
    # the first column (the 'Squad' name used below) plus everything from
    # column 22 onwards.
    df_strength.columns = df_strength.columns.get_level_values(1)
    df_strength = df_strength.iloc[:, [0] + list(range(22, df_strength.shape[1]))]

    # Opponent view: every stat gets an '_Opp' suffix and the squad name
    # becomes the 'Opponent' merge key.
    df_oppstrength = df_strength.add_suffix('_Opp')
    df_oppstrength = df_oppstrength.rename(columns={'Squad_Opp': 'Opponent'})

    # Manchester United's own row for the season, suffixed with '_MU'.
    df_MUStrength = (
        df_strength.loc[df_strength['Squad'] == 'Manchester Utd']
        .drop(columns=['Squad'])
        .add_suffix('_MU')
    )

    # One row per match, carrying the opponent's season stats.
    df_merged = pd.merge(df_oppstrength, df_matches, on='Opponent', how='inner')
    if df_merged.empty:
        # pd.concat([]) below would raise on an empty list of frames.
        print(f"No matches could be joined to the squad stats from {url_2}.")
        continue

    # Repeat United's stats row once per match and place it alongside.
    mu_repeated = pd.concat([df_MUStrength] * len(df_merged), ignore_index=True)
    df_merged = pd.concat([mu_repeated, df_merged], axis=1)

    # Replace characters outside the allowed set with '_' in column names.
    df_merged.columns = [re.sub(r'[^A-Za-z0-9_.\\/>-]', '_', col) for col in df_merged.columns]

    season_frames.append(df_merged)

if not season_frames:
    raise RuntimeError("No season could be scraped; result_df would be empty.")

# Concatenate once at the end instead of growing result_df inside the loop.
result_df = pd.concat(season_frames, ignore_index=True)
print(result_df)

# Numeric class label for each match outcome: loss=0, draw=1, win=2.
result_mapping = dict(L=0, D=1, W=2)

# Swap the outcome letters for their codes, then force a numeric dtype;
# anything still non-numeric after the replacement is coerced to NaN.
encoded_result = result_df['Result'].replace(result_mapping)
result_df['Result'] = pd.to_numeric(encoded_result, errors='coerce')


# Visualising the Data

In [None]:
# Lift pandas' display limits so no columns or rows are elided when printing.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Annotated correlation heatmap of result_df with 'Opponent' and 'Venue'
# left out, drawn on a large 20x20 figure.
fig, ax = plt.subplots(figsize=(20, 20))
correlations = result_df.drop(columns=['Opponent', 'Venue']).corr()
sns.heatmap(correlations, cmap='YlGnBu', annot=True, ax=ax)

# Splitting the Data into training and test datasets

In [None]:
# Fixed seed so the same rows land in train and test on every run.
SPLIT_SEED = 42

# 'Opponent' and 'Attendance' are not used as model features.
# errors='ignore' lets this cell be re-run after the columns are already gone.
result_df = result_df.drop(columns=['Opponent', 'Attendance'], errors='ignore')

# Target is the encoded match result; every remaining column is a feature.
y = result_df['Result']
X = result_df.drop(columns=['Result'])

# Hold out 20% for testing, stratified on the outcome so that train and test
# keep the same proportion of each result class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y)

# One-hot encode the three outcome classes.
y_train = tf.one_hot(y_train, depth=3)
y_test = tf.one_hot(y_test, depth=3)

In [None]:
def _as_feature_dict(frame):
    """Return {column name: NumPy array of that column's values} for frame."""
    return {name: np.array(values) for name, values in frame.items()}


# Column-wise dicts of arrays for the training and test features.
X_train_features_dict = _as_feature_dict(X_train)
X_test_features_dict = _as_feature_dict(X_test)

In [None]:
# Pair each feature dict with its labels and slice them into tf.data datasets
# of (features, label) elements.
_to_dataset = tf.data.Dataset.from_tensor_slices
X_train_ds = _to_dataset((X_train_features_dict, y_train))
X_test_ds = _to_dataset((X_test_features_dict, y_test))

# Preprocessing the Dataset

In [None]:
# One symbolic Keras input per feature column, keyed and named after the
# column: object-dtype columns become string inputs, all others float32.
inputs = {
    name: tf.keras.Input(
        shape=(1,),
        name=name,
        dtype=tf.string if column.dtype == object else tf.float32,
    )
    for name, column in X_train.items()
}

In [None]:
# Collect the float32 inputs; string inputs are left out of this group.
numeric_inputs = {}
for feature_name, feature_input in inputs.items():
    if feature_input.dtype == tf.float32:
        numeric_inputs[feature_name] = feature_input

# Concatenate the numeric inputs into a single tensor.
x = layers.Concatenate()(list(numeric_inputs.values()))

# Adapt a Normalization layer to the matching training columns, then apply
# it to the concatenated inputs.
norm = layers.Normalization()
numeric_training_values = np.array(X_train[list(numeric_inputs)])
norm.adapt(numeric_training_values)
all_numeric_inputs = norm(x)

In [None]:
# Start the list of preprocessed tensors with the normalised numeric block.
preprocessed_inputs = []
preprocessed_inputs.append(all_numeric_inputs)

# Bare expression: the notebook displays the list as the cell output.
preprocessed_inputs

In [None]:
# Encode every non-float32 (string) input as a category vector and append it
# to preprocessed_inputs. The loop variable is deliberately not called
# `input`, so the builtin input() is not shadowed at notebook scope.
for name, feature_input in inputs.items():
    if feature_input.dtype == tf.float32:
        continue  # numeric inputs are not string-encoded here

    # The vocabulary is the set of distinct training values in this column.
    lookup = layers.StringLookup(vocabulary=np.unique(X_train[name]))
    # One slot per vocabulary token, as reported by the lookup layer.
    one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

    # string -> integer index -> category vector
    encoded = one_hot(lookup(feature_input))
    preprocessed_inputs.append(encoded)

In [None]:
# Join every preprocessed tensor into one feature tensor.
concat_layer = layers.Concatenate()
preprocessed_inputs_cat = concat_layer(preprocessed_inputs)

# Wrap the preprocessing graph as a model: dict of raw inputs in,
# concatenated features out.
X_train_preprocessing = tf.keras.Model(
    inputs=inputs,
    outputs=preprocessed_inputs_cat,
)

In [None]:
# Rebuild the training features as {column name: NumPy array}.
train_columns = {}
for column_name, column_values in X_train.items():
    train_columns[column_name] = np.array(column_values)
X_train_features_dict = train_columns

# Creating the model

In [None]:
def build_epl_model(preprocessing_head, inputs):
    """Build and compile the match-outcome classifier.

    Args:
        preprocessing_head: Callable (here a Keras model) that maps the dict
            of raw inputs to a single preprocessed feature tensor.
        inputs: Dict of symbolic Keras inputs, one per feature.

    Returns:
        A compiled tf.keras.Model that goes from the raw inputs to a 3-way
        softmax output.
    """
    # Two regularised hidden layers, each followed by dropout, then one
    # softmax unit per outcome class.
    body = tf.keras.Sequential([
        layers.Dense(32, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(32, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(3, activation='softmax')
    ])

    # Chain preprocessing and classifier into one end-to-end model.
    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)

    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    return model


# The builder has its own name so that binding the model to `epl_model` does
# not overwrite the function that created it.
epl_model = build_epl_model(X_train_preprocessing, inputs)

In [None]:
# Shuffle each dataset with a buffer as large as its label tensor, then group
# the elements into batches of 16.
BATCH_SIZE = 16
X_train_batches = X_train_ds.shuffle(buffer_size=len(y_train)).batch(BATCH_SIZE)
X_test_batches = X_test_ds.shuffle(buffer_size=len(y_test)).batch(BATCH_SIZE)

# Bare expression: the notebook displays the dataset as the cell output.
X_test_batches

# Train and evaluate the model

In [None]:
# Model.fit runs a single epoch unless told otherwise, so the number of
# passes over the training batches is set explicitly.
# NOTE(review): 50 is a starting point, not a tuned value -- adjust as needed.
EPOCHS = 50
history = epl_model.fit(X_train_batches, epochs=EPOCHS)

In [None]:
# Score the model on the test batches without progress output; evaluate
# returns the loss followed by the compiled accuracy metric.
evaluation = epl_model.evaluate(X_test_batches, verbose=0)
test_loss, test_acc = evaluation
print('test accuracy: ', test_acc)

# Save the model

In [None]:
# Persist the trained model under this path.
# NOTE(review): the path has no file extension; newer Keras releases may
# require '.keras' or '.h5' here -- confirm against the installed version.
model_path = 'MUMatchPredictor_Latest'
epl_model.save(model_path)