<a href="https://www.kaggle.com/code/prashantronsa/predict-football-match-winner?scriptVersionId=143694011" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Load the dataset

In [None]:
import pandas as pd

# Read the match log from the Kaggle input directory; the CSV's first
# column becomes the DataFrame index.
df_matches = pd.read_csv(
    '/kaggle/input/english-premier-league-2021-2022-matches/EPL matches 2021-2022.csv',
    index_col=0,
)

# Preview the top of the table
df_matches.head()

In [None]:
# (number of rows, number of columns) of the loaded table
df_matches.shape

# Exploratory Data Analysis

EDA will help us understand the data and uncover insights, trends, and potential anomalies.

### Summary Statistics & Missing Values

In [None]:
# Number of null entries in each column
df_missing = df_matches.isna().sum()

# Descriptive statistics (count, mean, std, quartiles, ...)
df_summary = df_matches.describe()

# Show both results as the cell output
df_summary, df_missing

In [None]:
# Inspect the dtype pandas assigned to each column
data_types = df_matches.dtypes
data_types

In [None]:
# Distinct values of the 'team' column, in order of first appearance
unique_teams = df_matches['team'].unique()
unique_teams

- We have data for 23 teams

In [None]:
# Distinct values of the 'round' column, in order of first appearance
unique_rounds = df_matches['round'].unique()
unique_rounds

### Visualizations

Distribution of goals scored per match

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to the charts
sns.set_style("whitegrid")

# Histogram of the 'gf' column with a KDE curve overlaid
plt.figure(figsize=(12, 6))
sns.histplot(data=df_matches, x='gf', bins=10, kde=True, color='blue')
plt.xlabel('Goals Scored')
plt.ylabel('Number of Matches')
plt.title('Distribution of Goals Scored')
plt.show()

Wins per team

In [None]:
# Count the rows whose result is 'W', per team
team_wins = df_matches.loc[df_matches['result'] == 'W', 'team'].value_counts()

# Bar chart, one bar per team
team_wins.plot.bar(color='skyblue')
plt.xlabel('Team')
plt.ylabel('Wins')
plt.title('Number of Wins per Team')
plt.xticks(rotation=90)
plt.show()

Frequency of home vs. away wins

In [None]:
# Count the rows whose result is 'W', per venue value
home_away_wins = df_matches.loc[df_matches['result'] == 'W', 'venue'].value_counts()

# Bar chart, one bar per venue value
home_away_wins.plot.bar(color='lightgreen')
plt.xlabel('Venue')
plt.ylabel('Wins')
plt.title('Frequency of Home vs Away Wins')
plt.show()

In [None]:
# Notebook shell escape (IPython "!" syntax, not plain Python): installs
# pandas_profiling into the kernel's environment.
!pip install pandas_profiling

In [None]:
# import pandas_profiling

# # Generate the report
# profile = pandas_profiling.ProfileReport(df_matches)
# profile.to_notebook_iframe()

The profiling cell above can be run in a Jupyter or Colab notebook.

In [None]:
# Match statistics to correlate with one another
numerical_columns = [
    'gf', 'ga', 'sh', 'sot',
    'dist', 'fk', 'pk', 'pkatt',
]
correlation_matrix = df_matches.loc[:, numerical_columns].corr()

# Annotated heatmap; the diverging palette is centred on zero correlation
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', center=0, annot=True)
plt.title('Correlation Matrix Heatmap')
plt.show()

# Data Preprocessing

In [None]:
# Number of rows recorded for each team
df_matches["team"].value_counts()

In [None]:
# All rows for Liverpool, ordered by the 'date' column
df_matches[df_matches['team'] == 'Liverpool'].sort_values(by='date')

- Liverpool played 52 matches in the 2021 season

In [None]:
# Remove the 'comp' column in place; no later visible cell reads it
del df_matches['comp']

In [None]:
# Remove the 'notes' column in place; no later visible cell reads it
del df_matches['notes']

In [None]:
# Convert 'date' to pandas datetimes; the .dt accessor and the date-based
# train/test split used later need a datetime column.
df_matches["date"] = pd.to_datetime(df_matches["date"])

In [None]:
# Binary label: 1 when result is "W", 0 for every other result
df_matches["target"] = (df_matches["result"] == "W").astype("int")

In [None]:
# Preview the table with the new 'target' column
df_matches.head()

In [None]:
# Encode 'venue' as integer category codes so the model can consume it
df_matches["venue_code"] = df_matches["venue"].astype("category").cat.codes

In [None]:
# Encode 'opponent' as integer category codes so the model can consume it
df_matches["opp_code"] = df_matches["opponent"].astype("category").cat.codes

In [None]:
# Kick-off hour: strip everything from the first ':' onward, then cast
df_matches["hour"] = (
    df_matches["time"]
    .str.replace(":.+", "", regex=True)
    .astype("int")
)

# Day of the week as an integer (Monday=0 ... Sunday=6)
df_matches["day_code"] = df_matches["date"].dt.weekday

df_matches

In [None]:
# Count rows per (season, round), then pivot the rounds into columns so
# each season is one row of the resulting table
match_rounds_per_season = df_matches.groupby('season')['round'].value_counts().unstack()
match_rounds_per_season

- For the 2021 season, there are 20 rows for each match week — one per team, since every match is recorded once from each side — which is consistent with a typical Premier League season where each of the 20 teams plays once per match week.

- For the 2022 season, the match weeks mostly have 19 rows. Additionally, data for the last few match weeks (from Matchweek 35 to Matchweek 38) is missing. This could imply that the dataset does not include all the matches for the 2022 season.

Given the missing matches in the 2022 season, we'll acknowledge this and use the available data for our analyses.

# Model Building

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 50 trees; a node needs at least 10 samples before it may
# be split; the fixed random_state makes runs repeatable.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [None]:
# Chronological split: fit on rows dated before 2022, evaluate on the rest.
train = df_matches[df_matches["date"] < '2022-01-01']
# `>=` rather than `>`: rows dated exactly 2022-01-01 would otherwise be
# silently left out of both the training and the test set.
test = df_matches[df_matches["date"] >= '2022-01-01']

In [None]:
# Feature columns fed to the model (all created in the preprocessing cells)
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [None]:
# Fit the forest on the training rows
rf.fit(train[predictors], train["target"])

In [None]:
# Predicted label for every test row
preds = rf.predict(test[predictors])

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows classified correctly.
# NOTE: despite its name, `error` holds the accuracy (higher is better).
error = accuracy_score(test["target"], preds)
error

In [None]:
# Put each true label next to its prediction
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

In [None]:
# Confusion table: rows are actual labels, columns are predicted labels
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

In [None]:
from sklearn.metrics import precision_score

# Precision for label 1: of the rows predicted as wins, the share that
# actually were wins
precision_score(test["target"], preds)

In [None]:
# Group the rows by team
group_matches = df_matches.groupby("team")

# One team's rows in date order, used below to try out rolling_averages
group = group_matches.get_group("Manchester City").sort_values("date")

In [None]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of the preceding rows as new columns.

    The rows are ordered by ``date``; for each row, the mean of ``cols``
    over the ``window`` rows immediately before it is stored under the
    matching name in ``new_cols``. ``closed='left'`` keeps the current row
    out of its own window, so a row's features use earlier rows only.

    Args:
        group: DataFrame with a ``date`` column and every column in ``cols``.
        cols: Names of the columns to average.
        new_cols: Names for the rolling columns, positionally matched
            to ``cols``.
        window: Number of preceding rows to average (default 3).

    Returns:
        A date-sorted copy of ``group`` with the new columns added. Rows
        that lack a full window of history (NaN in any new column) are
        dropped. The input frame is not modified.

    Raises:
        ValueError: If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError("cols and new_cols must have the same length")

    # sort_values returns a new frame, so the caller's data stays untouched.
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    # Columns are assigned positionally: cols[i] -> new_cols[i].
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)

In [None]:
# Source stat columns and the names of their rolling-average counterparts
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

# Try the helper on the single-team frame built earlier
rolling_averages(group, cols, new_cols)

In [None]:
# Compute the rolling features separately for every team; groupby.apply
# forwards the extra positional arguments to rolling_averages.
matches_rolling = df_matches.groupby("team").apply(rolling_averages, cols, new_cols)
matches_rolling

In [None]:
# Drop the 'team' index level that the groupby/apply step added
matches_rolling = matches_rolling.droplevel('team')
matches_rolling

In [None]:
# Relabel the rows 0..n-1; the index-based merge performed later relies on
# these labels.
matches_rolling.index = range(matches_rolling.shape[0])

In [None]:
def make_predictions(data, predictors, cutoff='2022-01-01'):
    """Fit the shared forest on rows before ``cutoff`` and score the rest.

    Note: this refits the module-level ``rf`` model in place.

    Args:
        data: DataFrame with a datetime ``date`` column, a ``target``
            column and every column named in ``predictors``.
        predictors: Feature column names passed to the model.
        cutoff: Date separating training rows (strictly before) from test
            rows (on or after). Defaults to '2022-01-01'.

    Returns:
        A tuple ``(combined, precision)``: ``combined`` is a DataFrame
        indexed like the test rows with ``actual`` and ``predicted``
        columns; ``precision`` is the precision score on the test rows.
    """
    train = data[data["date"] < cutoff]
    # `>=` rather than `>`: rows dated exactly on the cutoff would otherwise
    # be silently left out of both the training and the test set.
    test = data[data["date"] >= cutoff]

    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])

    combined = pd.DataFrame(
        {"actual": test["target"], "predicted": preds},
        index=test.index,
    )
    precision = precision_score(test["target"], preds)
    return combined, precision

In [None]:
# Retrain with the rolling-average features added to the base predictors.
# `error` receives the second value returned by make_predictions, which is
# a precision score (higher is better), not an error rate.
combined, error = make_predictions(matches_rolling, predictors + new_cols)
error

In [None]:
# Attach match details to each prediction by joining on the row index
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined.head()

In [None]:
class MissingDict(dict):
    """A dict whose lookups fall back to the key itself when it is absent."""

    def __missing__(self, key):
        # Invoked by dict.__getitem__ on a miss; the key is returned as the
        # value and nothing is inserted into the dict.
        return key

# Long team names -> their shorter spellings; any name not listed here is
# returned unchanged by MissingDict.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}
mapping = MissingDict(**map_values)

In [None]:
# Translate team names through `mapping` (presumably to the spelling used in
# the 'opponent' column — verify against the data).
combined["new_team"] = combined["team"].map(mapping)

# Self-join: pair each row with the row whose 'opponent' equals this row's
# 'new_team' on the same date. Columns from the left row get the suffix _x
# and those from the right row get _y (pandas' default suffixes).
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [None]:
# Display the paired predictions
merged

In [None]:
# Among pairs where the left row is predicted 1 and the right row is
# predicted 0, tally the left row's actual labels.
merged.loc[
    (merged["predicted_x"] == 1) & (merged["predicted_y"] == 0),
    "actual_x",
].value_counts()