In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import missingno as msno

# Load the match results. Forward slashes keep the relative path portable:
# a backslash-separated path only resolves on Windows, while "/" is accepted
# on Windows, Linux and macOS alike.
df = pd.read_csv("../datasets/results.csv")
df.shape

DATA EXPLORATION SECTION
1. Feature Exploration

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Per-column non-null counts and dtypes, plus memory usage.
df.info()

b) Summary statistics

In [None]:
# Summarise the object-dtype (text) columns.
categorical = df.select_dtypes(include=['object'])
categorical.describe()

In [None]:
# Summarise the numeric columns.
numerical = df.select_dtypes(include=['number'])
numerical.describe()

2. Data Cleaning

a) Null Value Analysis

In [None]:
# Number of missing values in each column.
df.isnull().sum()

b) Checking for duplicates

In [None]:
# Rows that exactly repeat an earlier row; the cell displays how many exist.
duplicated_rows = df.loc[df.duplicated()]
len(duplicated_rows)

c) Conversion to preferred datatypes

In [None]:
# Parse the date column, then derive calendar features from it through the
# datetime accessor.
df['date'] = pd.to_datetime(df['date'])
match_dates = df['date'].dt
df['year'] = match_dates.year
df['month'] = match_dates.month
df['month_name'] = match_dates.month_name()
df['day'] = match_dates.day
df['day_name'] = match_dates.day_name()
df.head(3)

d) Outlier detection

In [None]:
# Sanity check: select fixtures where the home and away team are the same.
same_team_mask = df['home_team'].eq(df['away_team'])
outlier = df.loc[same_team_mask]
outlier.shape  # author's note: none found, so we continue

3. Exploratory Data Analysis (EDA)

In [None]:
def plot_categorical_frequency(
        data, n, x_label, y_label, title, x_rotation=None, y_rotation=None, horizontal=None, vertical=None):
    """Plot a bar chart of the ``n`` most frequent values in ``data``.

    Parameters
    ----------
    data : pandas.Series
        Values to count; frequencies come from ``data.value_counts()``.
    n : int
        Number of most frequent categories to keep.
    x_label, y_label, title : str
        Axis labels and figure title, applied exactly as given (they are not
        swapped for horizontal charts).
    x_rotation, y_rotation : float, optional
        Tick-label rotation in degrees; ``None`` leaves the default.
    horizontal, vertical : bool, optional
        Bar orientation. When neither is truthy, vertical bars are drawn so
        the figure is never empty. When both are truthy, both bar sets are
        drawn on the same axes.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    frequency_count = data.value_counts().head(n)

    # No orientation requested: fall back to vertical bars rather than
    # showing a blank figure.
    if not horizontal and not vertical:
        vertical = True

    plt.figure(figsize=(10, 6))
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)

    if horizontal:
        # barh takes (y positions, bar widths): categories first, counts second.
        plt.barh(frequency_count.index, frequency_count.values)
    if vertical:
        plt.bar(frequency_count.index, frequency_count.values)

    # Rotate after drawing so the setting targets the ticks of the finished
    # plot; `is not None` lets an explicit 0 through.
    if x_rotation is not None:
        plt.xticks(rotation=x_rotation)
    if y_rotation is not None:
        plt.yticks(rotation=y_rotation)
    plt.show()

# NOTE(review): n=6 keeps only the six most frequent day names; a week has
# seven, so one day may be dropped — confirm this is intended.
plot_categorical_frequency(
    df['day_name'],
    n=6,
    x_label='Category',
    y_label='Frequency',
    title='Category-Frequency Bar Graph',
    vertical=True,
)
# As shown on the graph, most games are played on Sunday, followed by Wednesday

In [None]:
plot_categorical_frequency(
    df['tournament'],
    n=5,
    x_label='Category',
    y_label='Frequency',
    title='Category-Frequency Bar Graph',
    x_rotation=15,
    vertical=True,
)
# As shown on the graph, friendlies account for the most games, followed by
# FIFA World Cup qualification and Euro qualification

In [None]:
plot_categorical_frequency(
    df['month_name'],
    n=12,
    x_label='Category',
    y_label='Frequency',
    title='Category-Frequency Bar Graph',
    x_rotation=15,
    vertical=True,
)
# June had the most matches overall, followed by October and November

FIFA WORLD CUP

In [None]:
# Keep only the two team columns for FIFA World Cup fixtures.
is_world_cup = df['tournament'] == 'FIFA World Cup'
wc_tournament = df.loc[is_world_cup, ['home_team', 'away_team']]
len(wc_tournament)
# 964 WC games have been played

In [None]:
# Stack the home and away columns so every World Cup appearance is one entry,
# then count appearances per team.
wc_teams = pd.concat([wc_tournament['home_team'], wc_tournament['away_team']])
wc_count = wc_teams.value_counts()
len(wc_count)
# NOTE(review): this is the number of distinct teams that appear in World Cup
# fixtures (as home or away side), not the number of host countries as the
# earlier note ("18 countries which hosted the wc") stated — confirm the
# intended metric and the figure.

In [None]:
# Relationship between location of game with win-rate in friendlies
# To check if a team playing friendlies at home is more likely to win
# .copy() yields an independent frame, so adding a column below does not
# write into a slice of df (avoids SettingWithCopyWarning).
friendly = df[df.tournament == 'Friendly'].copy()
home_win = friendly.home_score > friendly.away_score
away_win = friendly.home_score < friendly.away_score
# The default is a string so it shares a dtype with the choices: an integer
# default turns draws into the label '0' on older NumPy and may fail dtype
# promotion on newer releases.
# NOTE(review): rows with a missing score satisfy neither condition and are
# labelled 'Draw' as well — confirm the friendlies have no null scores.
friendly['winner'] = np.select(
    [home_win, away_win],
    ['Home Team', 'Away Team'],
    default='Draw')
friendly['winner']

In [None]:
# Share of friendlies per outcome label, drawn as a pie chart.
friendly_counts = friendly['winner'].value_counts()
proportions = friendly_counts.div(friendly_counts.sum())
plt.figure(figsize=(10, 6))
plt.pie(
    proportions,
    labels=proportions.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Win rate vs Location')
plt.axis('equal')
plt.show()
# As shown on the pie chart, the home team wins 47.2% of friendlies, the away
# team wins 27.8%, and 25% end in a draw

In [None]:
# Heatmap of the number of matches per month for each year from 2010 onwards.
# unstack() orders the month columns alphabetically, so they are reindexed
# into calendar order; a month with no matches shows up as a column of zeros.
month_order = list(pd.date_range('2000-01-01', periods=12, freq='MS').month_name())
duration = (
    df[df['year'] >= 2010]
    .groupby(['year', 'month_name'])
    .size()
    # fill_value keeps the counts as integers (fillna(0) would make them floats)
    .unstack(fill_value=0)
    .reindex(columns=month_order, fill_value=0)
)
print(duration)
plt.figure(figsize=(10, 5))
# Counts are whole numbers, so annotate them as integers.
sns.heatmap(duration, cmap='Blues', annot=True, fmt='d')
plt.title('Matches per month since 2010')
plt.xlabel('Month')
plt.ylabel('Year')
plt.show()