In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import datetime as dt
import itertools
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


In [3]:
df = pd.read_csv("/content/merged_data.csv")

  df = pd.read_csv("/content/merged_data.csv")


In [4]:
print(len(df))

7260


# **Dataset Analysis**

- shape
- columns
- head()
- sample()
- describe()
- finding categorical features
- checking if any null/nan values
- filtering data instances based on indices
- filtering data instances based on conditional statements


In [6]:
df.shape


(7260, 145)

In [8]:
# Basic Dataset Information
# Shape - shows number of rows and columns
print("Dataset Shape:", df.shape)

# Display column names
print("\nColumns:", df.columns.tolist())

# View first few rows
print("\nFirst 5 rows:")
print(df.head())

# View random sample of rows
print("\nRandom sample of 5 rows:")
print(df.sample(n=5))

# Statistical summary of numerical columns
print("\nDescriptive Statistics:")
print(df.describe())

# Finding categorical features
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
print("\nCategorical columns:", categorical_columns.tolist())

# Check for null values
print("\nNull value counts per column:")
print(df.isnull().sum())

Dataset Shape: (7260, 145)

Columns: ['Unnamed: 0', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Attendance', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HHW', 'AHW', 'HC', 'AC', 'HF', 'AF', 'HO', 'AO', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'GBH', 'GBD', 'GBA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'SBH', 'SBD', 'SBA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'BSH', 'BSD', 'BSA', 'Bb1X2', 'BbMxH', 'BbAvH', 'BbMxD', 'BbAvD', 'BbMxA', 'BbAvA', 'BbOU', 'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5', 'BbAH', 'BbAHh', 'BbMxAHH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA', 'PSH', 'PSD', 'PSA', 'PSCH', 'PSCD', 'PSCA', 'Time', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA', 'IWCH', 'IWCD', 

In [9]:
# Check for duplicates
print("\nNumber of duplicate rows:", df.duplicated().sum())


Number of duplicate rows: 0


In [10]:
for cat_col in categorical_columns:
    print(f"\nValue counts for {cat_col}:")
    print(df[cat_col].value_counts())


Value counts for Div:
Div
E0    7260
Name: count, dtype: int64

Value counts for Date:
Date
26/12/08      10
11/05/14      10
11/05/2003    10
11/05/08      10
26/12/2002    10
              ..
20/08/00       1
08/03/10       1
10/03/10       1
15/03/10       1
19/02/2020     1
Name: count, Length: 1921, dtype: int64

Value counts for HomeTeam:
HomeTeam
Chelsea             364
Everton             364
Arsenal             364
Man United          363
Liverpool           362
Tottenham           361
Man City            344
Newcastle           326
Aston Villa         317
West Ham            307
Sunderland          266
Fulham              254
Southampton         229
West Brom           228
Bolton              209
Blackburn           209
Stoke               190
Middlesbrough       171
Leicester           154
Wigan               152
Crystal Palace      134
Portsmouth          133
Swansea             133
Charlton            133
Birmingham          133
Norwich             108
Wolves             

# ⚽ Football Match Dataset Summary

## Dataset Overview
- **Total Records**: 7,260 matches
- **Features**: 145 columns
- **Time Span**: 2000 - 2020

## Main Features
1. **Match Identifiers**:
   - **Date**: Match date
   - **Time**: Match start time
   - **Teams**: `HomeTeam`, `AwayTeam`

2. **Scores**:
   - **Full-Time**:
     - `FTHG` (Full-Time Home Goals)
     - `FTAG` (Full-Time Away Goals)
     - `FTR` (Full-Time Result)
   - **Half-Time**:
     - `HTHG` (Half-Time Home Goals)
     - `HTAG` (Half-Time Away Goals)
     - `HTR` (Half-Time Result)

3. **Match Statistics**:
   - **Attendance**: Crowd size
   - **Referee**: Official overseeing the game

4. **Team Performance Metrics**:
   - **Shots**: `HS` (Home Shots), `AS` (Away Shots)
   - **Shots on Target**: `HST` (Home Shots on Target), `AST` (Away Shots on Target)

5. **Betting Odds**:
   - Odds from multiple bookmakers (e.g., B365, BW, GB, IW)

---



## **Team Performance Statistics:**

In [11]:
# Calculate win percentages and average goals for each team
team_stats = pd.DataFrame()

# Home performance
home_stats = df.groupby('HomeTeam').agg({
    'FTHG': ['mean', 'sum'],
    'FTAG': ['mean'],
    'FTR': lambda x: (x == 'H').mean()  # Win percentage
}).round(3)

# Rename columns for clarity
home_stats.columns = ['Avg_Goals_Scored_Home', 'Total_Goals_Home',
                     'Avg_Goals_Conceded_Home', 'Home_Win_Rate']

# Sort by win rate
print("Home Performance (Top 10 teams by win rate):")
print(home_stats.sort_values('Home_Win_Rate', ascending=False).head(10))

Home Performance (Top 10 teams by win rate):
             Avg_Goals_Scored_Home  Total_Goals_Home  Avg_Goals_Conceded_Home  \
HomeTeam                                                                        
Man United                   2.118               769                    0.713   
Arsenal                      2.181               794                    0.865   
Chelsea                      2.115               770                    0.799   
Liverpool                    1.964               711                    0.757   
Man City                     2.015               693                    0.942   
Tottenham                    1.803               651                    1.028   
Middlesboro                  1.895                36                    1.105   
Everton                      1.607               585                    1.063   
Newcastle                    1.457               475                    1.178   
Fulham                       1.413               359            

## **Home/Away Pattern Analysis:**

In [12]:
# Away performance
away_stats = df.groupby('AwayTeam').agg({
    'FTAG': ['mean', 'sum'],
    'FTHG': ['mean'],
    'FTR': lambda x: (x == 'A').mean()  # Win percentage
}).round(3)

away_stats.columns = ['Avg_Goals_Scored_Away', 'Total_Goals_Away',
                     'Avg_Goals_Conceded_Away', 'Away_Win_Rate']

print("\nAway Performance (Top 10 teams by win rate):")
print(away_stats.sort_values('Away_Win_Rate', ascending=False).head(10))


Away Performance (Top 10 teams by win rate):
            Avg_Goals_Scored_Away  Total_Goals_Away  Avg_Goals_Conceded_Away  \
AwayTeam                                                                       
Man United                  1.647               598                    1.066   
Chelsea                     1.597               578                    0.975   
Arsenal                     1.638               593                    1.185   
Liverpool                   1.549               564                    1.187   
Man City                    1.506               518                    1.291   
Leeds                       1.289                98                    1.539   
Tottenham                   1.315               480                    1.422   
Ipswich                     1.237                47                    1.763   
Leicester                   1.253               193                    1.597   
Blackpool                   1.316                25                    2.1

## **Scoring Patterns Analysis:**

In [13]:
# First Half vs Second Half scoring
df['SecondHalfHomeGoals'] = df['FTHG'] - df['HTHG']
df['SecondHalfAwayGoals'] = df['FTAG'] - df['HTAG']

scoring_patterns = {
    'First Half Home Goals (Avg)': df['HTHG'].mean(),
    'First Half Away Goals (Avg)': df['HTAG'].mean(),
    'Second Half Home Goals (Avg)': df['SecondHalfHomeGoals'].mean(),
    'Second Half Away Goals (Avg)': df['SecondHalfAwayGoals'].mean()
}

print("\nScoring Patterns:")
for key, value in scoring_patterns.items():
    print(f"{key}: {value:.2f}")

# Goals by time period comparison
print("\nGoals Distribution:")
print("First Half Goals (Avg):", df['HTHG'].mean() + df['HTAG'].mean())
print("Second Half Goals (Avg):",
      df['SecondHalfHomeGoals'].mean() + df['SecondHalfAwayGoals'].mean())


Scoring Patterns:
First Half Home Goals (Avg): 0.68
First Half Away Goals (Avg): 0.50
Second Half Home Goals (Avg): 0.84
Second Half Away Goals (Avg): 0.64

Goals Distribution:
First Half Goals (Avg): 1.1790633608815426
Second Half Goals (Avg): 1.4853994490358127


## **Referee Analysis:**

In [15]:
# Calculate referee statistics
ref_stats = df.groupby('Referee').agg({
    'FTR': ['count', lambda x: (x == 'H').mean()],  # Total matches and home win rate
    'HY': 'mean',    # Avg home yellows
    'AY': 'mean',    # Avg away yellows
    'HR': 'mean',    # Avg home reds
    'AR': 'mean'     # Avg away reds
}).round(3)

# Flatten the multi-level columns and rename them
ref_stats.columns = ['Matches', 'Home_Win_Rate', 'Avg_Home_Yellows',
                    'Avg_Away_Yellows', 'Avg_Home_Reds', 'Avg_Away_Reds']

# Filter refs with minimum 50 matches
experienced_refs = ref_stats[ref_stats['Matches'] >= 50].sort_values('Matches', ascending=False)

print("\nReferee Statistics (minimum 50 matches):")
print(experienced_refs.head(10))


Referee Statistics (minimum 50 matches):
               Matches  Home_Win_Rate  Avg_Home_Yellows  Avg_Away_Yellows  \
Referee                                                                     
M Dean             454          0.436             1.778             1.910   
M Atkinson         381          0.457             1.496             1.882   
A Marriner         312          0.455             1.433             1.776   
P Dowd             301          0.482             1.449             2.090   
H Webb             296          0.449             1.439             1.872   
M Clattenburg      293          0.471             1.485             1.768   
L Mason            263          0.502             1.384             1.730   
C Foy              256          0.500             1.152             1.566   
M Oliver           237          0.443             1.464             1.700   
A Taylor           230          0.457             1.678             1.813   

               Avg_Home_Reds  Avg

## **Additional Team Performance Metrics:**

In [16]:
# Calculate shots conversion rate and shooting efficiency
team_efficiency = df.groupby('HomeTeam').agg({
    'HS': 'mean',    # Avg shots
    'HST': 'mean',   # Avg shots on target
    'FTHG': 'mean',  # Avg goals
}).round(3)

team_efficiency['Shooting_Accuracy'] = (team_efficiency['HST'] / team_efficiency['HS']).round(3)
team_efficiency['Conversion_Rate'] = (team_efficiency['FTHG'] / team_efficiency['HST']).round(3)

print("\nTeam Efficiency Metrics (Top 10 by Conversion Rate):")
print(team_efficiency.sort_values('Conversion_Rate', ascending=False).head(10))


Team Efficiency Metrics (Top 10 by Conversion Rate):
                 HS    HST   FTHG  Shooting_Accuracy  Conversion_Rate
HomeTeam                                                             
Brighton     11.513  3.641  1.359              0.316            0.373
Middlesboro  12.316  5.632  1.895              0.457            0.336
Bournemouth  12.684  4.278  1.430              0.337            0.334
Cardiff      12.786  3.750  1.143              0.293            0.305
Watford      12.278  4.052  1.186              0.330            0.293
Leicester    12.305  4.831  1.409              0.393            0.292
Stoke        11.479  4.495  1.305              0.392            0.290
Man City     15.427  7.270  2.015              0.471            0.277
Chelsea      16.401  7.786  2.115              0.475            0.272
Burnley      11.216  4.082  1.093              0.364            0.268


## **Season Trends:**

In [17]:
# Extract season from date and analyze scoring trends
df['Season'] = df['Date'].str.split('/').str[-1].astype(int)
df['Season'] = df['Season'].apply(lambda x: 1900+x if x > 99 else 2000+x)

season_stats = df.groupby('Season').agg({
    'FTHG': 'mean',
    'FTAG': 'mean',
    'FTR': lambda x: (x == 'H').mean()
}).round(3)

season_stats.columns = ['Avg_Home_Goals', 'Avg_Away_Goals', 'Home_Win_Rate']
season_stats['Total_Goals_Per_Game'] = season_stats['Avg_Home_Goals'] + season_stats['Avg_Away_Goals']

print("\nScoring Trends by Season:")
print(season_stats.sort_index())


Scoring Trends by Season:
        Avg_Home_Goals  Avg_Away_Goals  Home_Win_Rate  Total_Goals_Per_Game
Season                                                                     
2000             1.659           1.029          0.507                 2.688
2001             1.421           1.155          0.426                 2.576
2002             1.505           1.137          0.473                 2.642
2003             1.452           1.122          0.431                 2.574
2004             1.510           1.161          0.436                 2.671
2005             1.455           1.027          0.484                 2.482
2006             1.477           0.957          0.520                 2.434
2007             1.553           1.124          0.456                 2.677
2008             1.412           1.135          0.438                 2.547
2009             1.593           1.058          0.503                 2.651
2010             1.591           1.056          0.471        

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def create_visualizations(home_stats, away_stats, scoring_patterns, ref_stats, team_efficiency, season_stats):
    # Set basic style parameters
    plt.rcParams['figure.figsize'] = (12, 6)
    plt.rcParams['axes.titlesize'] = 12
    plt.rcParams['axes.labelsize'] = 10
    plt.rcParams['xtick.labelsize'] = 9
    plt.rcParams['ytick.labelsize'] = 9

    def plot_team_performance():
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

        # Home Performance
        top_home = home_stats.sort_values('Home_Win_Rate', ascending=False).head(8)
        x = np.arange(len(top_home))
        width = 0.35

        ax1.bar(x - width/2, top_home['Home_Win_Rate'], width, label='Win Rate')
        ax1.bar(x + width/2, top_home['Avg_Goals_Scored_Home'], width, label='Avg Goals')
        ax1.set_xticks(x)
        ax1.set_xticklabels(top_home.index, rotation=45)
        ax1.set_title('Top 8 Teams - Home Performance')
        ax1.legend()

        # Away Performance
        top_away = away_stats.sort_values('Away_Win_Rate', ascending=False).head(8)
        x = np.arange(len(top_away))

        ax2.bar(x - width/2, top_away['Away_Win_Rate'], width, label='Win Rate')
        ax2.bar(x + width/2, top_away['Avg_Goals_Scored_Away'], width, label='Avg Goals')
        ax2.set_xticks(x)
        ax2.set_xticklabels(top_away.index, rotation=45)
        ax2.set_title('Top 8 Teams - Away Performance')
        ax2.legend()

        plt.tight_layout()
        plt.savefig('team_performance.png', dpi=300, bbox_inches='tight')
        plt.close()

    def plot_scoring_patterns():
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # First vs Second Half
        halves = ['First Half', 'Second Half']
        home_goals = [scoring_patterns['First Half Home Goals (Avg)'],
                     scoring_patterns['Second Half Home Goals (Avg)']]
        away_goals = [scoring_patterns['First Half Away Goals (Avg)'],
                     scoring_patterns['Second Half Away Goals (Avg)']]

        x = np.arange(len(halves))
        width = 0.35

        ax1.bar(x - width/2, home_goals, width, label='Home')
        ax1.bar(x + width/2, away_goals, width, label='Away')
        ax1.set_xticks(x)
        ax1.set_xticklabels(halves)
        ax1.set_title('Goals Distribution by Half')
        ax1.legend()

        # Goals trend over seasons
        ax2.plot(season_stats.index, season_stats['Total_Goals_Per_Game'])
        ax2.set_xticks(season_stats.index[::2])
        ax2.set_xticklabels(season_stats.index[::2], rotation=45)
        ax2.set_title('Goals per Game Trend by Season')

        plt.tight_layout()
        plt.savefig('scoring_patterns.png', dpi=300, bbox_inches='tight')
        plt.close()

    def plot_referee_analysis():
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

        top_refs = ref_stats.sort_values('Matches', ascending=False).head(8)
        x = np.arange(len(top_refs))
        width = 0.35

        # Cards distribution
        ax1.bar(x - width/2, top_refs['Avg_Home_Yellows'], width, label='Home Yellows')
        ax1.bar(x + width/2, top_refs['Avg_Away_Yellows'], width, label='Away Yellows')
        ax1.set_xticks(x)
        ax1.set_xticklabels(top_refs.index, rotation=45)
        ax1.set_title('Yellow Cards Distribution by Referee')
        ax1.legend()

        # Home advantage
        ax2.bar(x, top_refs['Home_Win_Rate'], width*1.5)
        ax2.set_xticks(x)
        ax2.set_xticklabels(top_refs.index, rotation=45)
        ax2.set_title('Home Win Rate by Referee')

        plt.tight_layout()
        plt.savefig('referee_analysis.png', dpi=300, bbox_inches='tight')
        plt.close()

    def plot_team_efficiency():
        top_efficient = team_efficiency.sort_values('Conversion_Rate', ascending=False).head(8)
        x = np.arange(len(top_efficient))
        width = 0.35

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(x - width/2, top_efficient['Shooting_Accuracy'], width, label='Shooting Accuracy')
        ax.bar(x + width/2, top_efficient['Conversion_Rate'], width, label='Conversion Rate')

        ax.set_xticks(x)
        ax.set_xticklabels(top_efficient.index, rotation=45)
        ax.set_title('Team Shooting Efficiency')
        ax.legend()

        plt.tight_layout()
        plt.savefig('team_efficiency.png', dpi=300, bbox_inches='tight')
        plt.close()

    def plot_season_trends():
        fig, ax = plt.subplots(figsize=(12, 6))

        ax.plot(season_stats.index, season_stats['Home_Win_Rate'], label='Home Win Rate')
        ax.plot(season_stats.index, season_stats['Total_Goals_Per_Game'], label='Goals per Game')

        ax.set_xticks(season_stats.index[::2])
        ax.set_xticklabels(season_stats.index[::2], rotation=45)
        ax.set_title('Season Trends')
        ax.legend()

        plt.tight_layout()
        plt.savefig('season_trends.png', dpi=300, bbox_inches='tight')
        plt.close()

    # Create all visualizations
    plot_team_performance()
    plot_scoring_patterns()
    plot_referee_analysis()
    plot_team_efficiency()
    plot_season_trends()

# Usage example:
if __name__ == "__main__":
    create_visualizations(
        home_stats=home_stats,
        away_stats=away_stats,
        scoring_patterns=scoring_patterns,
        ref_stats=ref_stats,
        team_efficiency=team_efficiency,
        season_stats=season_stats
    )

In [21]:
create_visualizations(
    home_stats=home_stats,
    away_stats=away_stats,
    scoring_patterns=scoring_patterns,
    ref_stats=ref_stats,
    team_efficiency=team_efficiency,
    season_stats=season_stats
)