In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle as pk
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = '/content/drive/My Drive/premier-league-matches.csv'
final_dataset = pd.read_csv(file_path)


In [None]:
final_dataset.head()

Unnamed: 0,Season_End_Year,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H
1,1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H
2,1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H
3,1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D
4,1993,1,1992-08-15,Arsenal,2,4,Norwich City,A


In [None]:

# Initialize LabelEncoder
team_encoder = LabelEncoder()

# Fit and transform home and away teams
final_dataset['home_team_encoded'] = team_encoder.fit_transform(final_dataset['Home'])
final_dataset['away_team_encoded'] = team_encoder.transform(final_dataset['Away'])

# Show the encoded values
print(final_dataset[['Home', 'home_team_encoded', 'Away', 'away_team_encoded']].head())


             Home  home_team_encoded            Away  away_team_encoded
0   Coventry City                 15   Middlesbrough                 28
1    Leeds United                 23       Wimbledon                 48
2   Sheffield Utd                 36  Manchester Utd                 27
3  Crystal Palace                 16       Blackburn                  4
4         Arsenal                  0    Norwich City                 30


In [None]:
# Initialize columns for total goals scored and conceded
final_dataset['home_team_total_goals_scored'] = 0
final_dataset['away_team_total_goals_scored'] = 0
final_dataset['home_team_total_goals_conceded'] = 0
final_dataset['away_team_total_goals_conceded'] = 0

# Dictionary to keep track of cumulative statistics
team_stats = {}

# Process each game
for index, row in final_dataset.iterrows():
    home_team = row['Home']
    away_team = row['Away']
    home_goals = row['HomeGoals']
    away_goals = row['AwayGoals']
    season = row['Season_End_Year']

    # Ensure dictionaries are initialized for each team
    if home_team not in team_stats:
        team_stats[home_team] = {'season': season, 'total_goals_scored': 0, 'total_goals_conceded': 0}

    if away_team not in team_stats:
        team_stats[away_team] = {'season': season, 'total_goals_scored': 0, 'total_goals_conceded': 0}

    # Update goals scored and conceded
    if team_stats[home_team]['season'] == season:
        team_stats[home_team]['total_goals_scored'] += home_goals
        team_stats[home_team]['total_goals_conceded'] += away_goals
    else:
        # Reset for new season
        team_stats[home_team] = {'season': season, 'total_goals_scored': home_goals, 'total_goals_conceded': away_goals}

    if team_stats[away_team]['season'] == season:
        team_stats[away_team]['total_goals_scored'] += away_goals
        team_stats[away_team]['total_goals_conceded'] += home_goals
    else:
        # Reset for new season
        team_stats[away_team] = {'season': season, 'total_goals_scored': away_goals, 'total_goals_conceded': home_goals}

    # Store cumulative statistics
    final_dataset.at[index, 'home_team_total_goals_scored'] = team_stats[home_team]['total_goals_scored']
    final_dataset.at[index, 'away_team_total_goals_scored'] = team_stats[away_team]['total_goals_scored']
    final_dataset.at[index, 'home_team_total_goals_conceded'] = team_stats[home_team]['total_goals_conceded']
    final_dataset.at[index, 'away_team_total_goals_conceded'] = team_stats[away_team]['total_goals_conceded']

# Show the updated DataFrame
print(final_dataset)

       Season_End_Year  Wk        Date            Home  HomeGoals  AwayGoals  \
0                 1993   1  1992-08-15   Coventry City          2          1   
1                 1993   1  1992-08-15    Leeds United          2          1   
2                 1993   1  1992-08-15   Sheffield Utd          2          1   
3                 1993   1  1992-08-15  Crystal Palace          3          3   
4                 1993   1  1992-08-15         Arsenal          2          4   
...                ...  ..         ...             ...        ...        ...   
12021             2023  38  2023-05-28         Everton          1          0   
12022             2023  38  2023-05-28  Leicester City          2          1   
12023             2023  38  2023-05-28     Aston Villa          2          1   
12024             2023  38  2023-05-28    Leeds United          1          4   
12025             2023  38  2023-05-28       Brentford          1          0   

                  Away FTR  home_team_e

In [None]:
import pandas as pd

# Initialize columns for cumulative statistics
final_dataset['home_team_pts'] = 0
final_dataset['away_team_pts'] = 0
final_dataset['home_team_formPts'] = 0
final_dataset['away_team_formPts'] = 0

# Dictionaries to keep track of cumulative statistics
team_stats = {}
form_points = {'home': {}, 'away': {}}

# Track the last processed season to reset statistics accordingly
last_season = None

# Process each game
for index, row in final_dataset.iterrows():
    season = row['Season_End_Year']
    home_team = row['home_team_encoded']
    away_team = row['away_team_encoded']
    home_goals = row['HomeGoals']
    away_goals = row['AwayGoals']
    home_result = row['FTR'] == 'H'
    away_result = row['FTR'] == 'A'

    # Check if the season has changed
    if season != last_season:
        # Reset statistics for the new season
        team_stats = {}
        form_points = {'home': {}, 'away': {}}
        last_season = season

    # Initialize team stats for new teams
    if home_team not in team_stats:
        team_stats[home_team] = {'points': 0}
    if away_team not in team_stats:
        team_stats[away_team] = {'points': 0}

    if home_team not in form_points['home']:
        form_points['home'][home_team] = []
    if away_team not in form_points['away']:
        form_points['away'][away_team] = []

    # Calculate points
    if home_result:
        team_stats[home_team]['points'] += 3
    elif away_result:
        team_stats[away_team]['points'] += 3
    else:
        team_stats[home_team]['points'] += 1
        team_stats[away_team]['points'] += 1

    # Update cumulative statistics
    final_dataset.at[index, 'home_team_pts'] = team_stats[home_team]['points']
    final_dataset.at[index, 'away_team_pts'] = team_stats[away_team]['points']

    # Update form points
    form_points['home'][home_team].append(3 if home_result else (1 if row['FTR'] == 'D' else 0))
    form_points['away'][away_team].append(3 if away_result else (1 if row['FTR'] == 'D' else 0))

    if len(form_points['home'][home_team]) > 5:
        form_points['home'][home_team].pop(0)
    if len(form_points['away'][away_team]) > 5:
        form_points['away'][away_team].pop(0)

    # Calculate form points (sum of last 5 games)
    final_dataset.at[index, 'home_team_formPts'] = sum(form_points['home'][home_team])
    final_dataset.at[index, 'away_team_formPts'] = sum(form_points['away'][away_team])

# Calculate the difference between home and away team points and form points
final_dataset['diffPts'] = final_dataset['home_team_pts'] - final_dataset['away_team_pts']
final_dataset['diffFormPts'] = final_dataset['home_team_formPts'] - final_dataset['away_team_formPts']

# Show the updated DataFrame
print(final_dataset.tail())


       Season_End_Year  Wk        Date            Home  HomeGoals  AwayGoals  \
12021             2023  38  2023-05-28         Everton          1          0   
12022             2023  38  2023-05-28  Leicester City          2          1   
12023             2023  38  2023-05-28     Aston Villa          2          1   
12024             2023  38  2023-05-28    Leeds United          1          4   
12025             2023  38  2023-05-28       Brentford          1          0   

                  Away FTR  home_team_encoded  away_team_encoded  \
12021      Bournemouth   H                 18                  7   
12022         West Ham   H                 24                 46   
12023         Brighton   H                  1                 10   
12024        Tottenham   A                 23                 43   
12025  Manchester City   H                  9                 26   

       home_team_total_goals_scored  away_team_total_goals_scored  \
12021                            34      

In [None]:

# Encode team names
team_encoder = {}
team_decoder = {}
current_code = 0

# Encode Home and Away Teams
for team in pd.concat([final_dataset['Home'], final_dataset['Away']]).unique():
    if team not in team_encoder:
        team_encoder[team] = current_code
        team_decoder[current_code] = team
        current_code += 1

# Add encoded columns to the DataFrame
final_dataset['home_team_encoded'] = final_dataset['Home'].map(team_encoder)
final_dataset['away_team_encoded'] = final_dataset['Away'].map(team_encoder)

# Print encoded DataFrame
print(final_dataset)

# Print the mappings
print("Team Encoder:", team_encoder)
print("Team Decoder:", team_decoder)

# Optionally save mappings to CSV files for later use
pd.DataFrame(list(team_encoder.items()), columns=['Team', 'Encoded']).to_csv('team_encoder.csv', index=False)
pd.DataFrame(list(team_decoder.items()), columns=['Encoded', 'Team']).to_csv('team_decoder.csv', index=False)


       Season_End_Year  Wk        Date            Home  HomeGoals  AwayGoals  \
0                 1993   1  1992-08-15   Coventry City          2          1   
1                 1993   1  1992-08-15    Leeds United          2          1   
2                 1993   1  1992-08-15   Sheffield Utd          2          1   
3                 1993   1  1992-08-15  Crystal Palace          3          3   
4                 1993   1  1992-08-15         Arsenal          2          4   
...                ...  ..         ...             ...        ...        ...   
12021             2023  38  2023-05-28         Everton          1          0   
12022             2023  38  2023-05-28  Leicester City          2          1   
12023             2023  38  2023-05-28     Aston Villa          2          1   
12024             2023  38  2023-05-28    Leeds United          1          4   
12025             2023  38  2023-05-28       Brentford          1          0   

                  Away FTR  home_team_e

In [None]:
final_dataset['home_team_avg_goals_scored'] = 0
final_dataset['home_team_avg_goals_conceded'] = 0
final_dataset['home_team_win_ratio'] = 0
final_dataset['away_team_avg_goals_scored'] = 0
final_dataset['away_team_avg_goals_conceded'] = 0
final_dataset['away_team_win_ratio'] = 0

# Dictionary to keep track of historical metrics
team_metrics = {}

# Calculate historical metrics
for index, row in final_dataset.iterrows():
    home_team = row['Home']
    away_team = row['Away']
    home_goals = row['HomeGoals']
    away_goals = row['AwayGoals']
    season = row['Season_End_Year']

    # Initialize metrics for new teams
    if home_team not in team_metrics:
        team_metrics[home_team] = {'games': 0, 'goals_scored': 0, 'goals_conceded': 0, 'wins': 0, 'draws': 0}
    if away_team not in team_metrics:
        team_metrics[away_team] = {'games': 0, 'goals_scored': 0, 'goals_conceded': 0, 'wins': 0, 'draws': 0}

    # Update metrics
    team_metrics[home_team]['games'] += 1
    team_metrics[home_team]['goals_scored'] += home_goals
    team_metrics[home_team]['goals_conceded'] += away_goals
    team_metrics[away_team]['games'] += 1
    team_metrics[away_team]['goals_scored'] += away_goals
    team_metrics[away_team]['goals_conceded'] += home_goals

    # Determine wins
    if home_goals > away_goals:
        team_metrics[home_team]['wins'] += 1
    elif home_goals < away_goals:
        team_metrics[away_team]['wins'] += 1
    else:
        team_metrics[home_team]['draws'] += 1
        team_metrics[away_team]['draws'] += 1

    # Calculate averages and win ratios
    home_team_avg_goals_scored = team_metrics[home_team]['goals_scored'] / team_metrics[home_team]['games']
    home_team_avg_goals_conceded = team_metrics[home_team]['goals_conceded'] / team_metrics[home_team]['games']
    home_team_win_ratio = team_metrics[home_team]['wins'] / team_metrics[home_team]['games']

    away_team_avg_goals_scored = team_metrics[away_team]['goals_scored'] / team_metrics[away_team]['games']
    away_team_avg_goals_conceded = team_metrics[away_team]['goals_conceded'] / team_metrics[away_team]['games']
    away_team_win_ratio = team_metrics[away_team]['wins'] / team_metrics[away_team]['games']

    # Update DataFrame
    final_dataset.at[index, 'home_team_avg_goals_scored'] = home_team_avg_goals_scored
    final_dataset.at[index, 'home_team_avg_goals_conceded'] = home_team_avg_goals_conceded
    final_dataset.at[index, 'home_team_win_ratio'] = home_team_win_ratio
    final_dataset.at[index, 'away_team_avg_goals_scored'] = away_team_avg_goals_scored
    final_dataset.at[index, 'away_team_avg_goals_conceded'] = away_team_avg_goals_conceded
    final_dataset.at[index, 'away_team_win_ratio'] = away_team_win_ratio

    # Reset metrics after each season
    if index < len(final_dataset) - 1 and final_dataset.at[index + 1, 'Season_End_Year'] != season:
        team_metrics = {}

print(final_dataset)


  final_dataset.at[index, 'home_team_avg_goals_conceded'] = home_team_avg_goals_conceded
  final_dataset.at[index, 'home_team_win_ratio'] = home_team_win_ratio
  final_dataset.at[index, 'away_team_avg_goals_conceded'] = away_team_avg_goals_conceded
  final_dataset.at[index, 'home_team_avg_goals_scored'] = home_team_avg_goals_scored
  final_dataset.at[index, 'away_team_win_ratio'] = away_team_win_ratio
  final_dataset.at[index, 'away_team_avg_goals_scored'] = away_team_avg_goals_scored


       Season_End_Year  Wk        Date            Home  HomeGoals  AwayGoals  \
0                 1993   1  1992-08-15   Coventry City          2          1   
1                 1993   1  1992-08-15    Leeds United          2          1   
2                 1993   1  1992-08-15   Sheffield Utd          2          1   
3                 1993   1  1992-08-15  Crystal Palace          3          3   
4                 1993   1  1992-08-15         Arsenal          2          4   
...                ...  ..         ...             ...        ...        ...   
12021             2023  38  2023-05-28         Everton          1          0   
12022             2023  38  2023-05-28  Leicester City          2          1   
12023             2023  38  2023-05-28     Aston Villa          2          1   
12024             2023  38  2023-05-28    Leeds United          1          4   
12025             2023  38  2023-05-28       Brentford          1          0   

                  Away FTR  home_team_e

In [None]:
import pandas as pd

# Ensure 'final_dataset' DataFrame is sorted by date if it isn't already
final_dataset['Date'] = pd.to_datetime(final_dataset['Date'])
final_dataset.sort_values(by='Date', inplace=True)

# Encode team names
team_encoder = {}
team_decoder = {}
current_code = 0

# Corrected code to concatenate 'Home' and 'Away' columns
for team in pd.concat([final_dataset['Home'], final_dataset['Away']]).unique():
    if team not in team_encoder:
        team_encoder[team] = current_code
        team_decoder[current_code] = team
        current_code += 1

final_dataset['home_team_encoded'] = final_dataset['Home'].map(team_encoder)
final_dataset['away_team_encoded'] = final_dataset['Away'].map(team_encoder)

# Initialize columns for cumulative GD and GDform
final_dataset['home_team_GD_cumulative'] = 0
final_dataset['away_team_GD_cumulative'] = 0
final_dataset['home_team_GDform'] = 0
final_dataset['away_team_GDform'] = 0

# Dictionary to keep track of cumulative GD and GDform
team_GD_cumulative = {}
team_GDform = {}

# Variable to keep track of the current season
current_season = None

# Calculate cumulative GD and GDform
for index, row in final_dataset.iterrows():
    home_team = row['home_team_encoded']
    away_team = row['away_team_encoded']
    home_GD = row['HomeGoals'] - row['AwayGoals']
    away_GD = row['AwayGoals'] - row['HomeGoals']
    season = row['Season_End_Year']

    # Reset cumulative values and GDform lists at the start of a new season
    if season != current_season:
        team_GD_cumulative = {}
        team_GDform = {}
        current_season = season

    # Update cumulative GD for home and away teams
    if home_team not in team_GD_cumulative:
        team_GD_cumulative[home_team] = 0
    if away_team not in team_GD_cumulative:
        team_GD_cumulative[away_team] = 0

    team_GD_cumulative[home_team] += home_GD
    team_GD_cumulative[away_team] += away_GD

    # Update cumulative GD columns
    final_dataset.at[index, 'home_team_GD_cumulative'] = team_GD_cumulative[home_team]
    final_dataset.at[index, 'away_team_GD_cumulative'] = team_GD_cumulative[away_team]

    # Initialize GDform lists if they do not exist for the team
    if home_team not in team_GDform:
        team_GDform[home_team] = []
    if away_team not in team_GDform:
        team_GDform[away_team] = []

    # Append current GD to the list of GDform
    team_GDform[home_team].append(home_GD)
    team_GDform[away_team].append(away_GD)

    # Limit GDform list to the last 5 games
    if len(team_GDform[home_team]) > 5:
        team_GDform[home_team].pop(0)
    if len(team_GDform[away_team]) > 5:
        team_GDform[away_team].pop(0)

    # Calculate GDform for the last 5 games
    home_GDform = sum(team_GDform[home_team])
    away_GDform = sum(team_GDform[away_team])

    # Update GDform columns
    final_dataset.at[index, 'home_team_GDform'] = home_GDform
    final_dataset.at[index, 'away_team_GDform'] = away_GDform

# Show the updated DataFrame
print(final_dataset)


       Season_End_Year  Wk       Date            Home  HomeGoals  AwayGoals  \
0                 1993   1 1992-08-15   Coventry City          2          1   
7                 1993   1 1992-08-15     Southampton          0          0   
6                 1993   1 1992-08-15         Everton          1          1   
5                 1993   1 1992-08-15    Ipswich Town          1          1   
8                 1993   1 1992-08-15         Chelsea          1          1   
...                ...  ..        ...             ...        ...        ...   
12018             2023  38 2023-05-28  Manchester Utd          2          1   
12017             2023  38 2023-05-28         Chelsea          1          1   
12016             2023  38 2023-05-28  Crystal Palace          1          1   
12019             2023  38 2023-05-28         Arsenal          5          0   
12025             2023  38 2023-05-28       Brentford          1          0   

                  Away FTR  home_team_encoded  away

In [None]:
# Print unique teams in Home and Away columns
print("Unique Home Teams:", final_dataset['Home'].unique())
print("Unique Away Teams:", final_dataset['Away'].unique())

# Print the total number of unique teams
unique_teams = pd.concat([final_dataset['Home'], final_dataset['Away']]).unique()
print("Total Unique Teams:", len(unique_teams))

# Print encoded teams to verify
print("Encoded Teams Mapping:", team_encoder)
print("Decoded Teams Mapping:", team_decoder)


Unique Home Teams: ['Coventry City' 'Southampton' 'Everton' 'Ipswich Town' 'Chelsea'
 'Crystal Palace' 'Sheffield Utd' 'Leeds United' 'Arsenal'
 "Nott'ham Forest" 'Manchester City' 'Blackburn' 'Wimbledon' 'QPR'
 'Sheffield Weds' 'Manchester Utd' 'Norwich City' 'Tottenham'
 'Oldham Athletic' 'Aston Villa' 'Liverpool' 'Middlesbrough' 'West Ham'
 'Newcastle Utd' 'Swindon Town' 'Leicester City' 'Bolton' 'Derby County'
 'Sunderland' 'Barnsley' 'Charlton Ath' 'Watford' 'Bradford City' 'Fulham'
 'West Brom' 'Birmingham City' 'Portsmouth' 'Wolves' 'Wigan Athletic'
 'Reading' 'Hull City' 'Stoke City' 'Burnley' 'Blackpool' 'Swansea City'
 'Cardiff City' 'Bournemouth' 'Brighton' 'Huddersfield' 'Brentford']
Unique Away Teams: ['Middlesbrough' 'Tottenham' 'Sheffield Weds' 'Aston Villa'
 'Oldham Athletic' 'Blackburn' 'Manchester Utd' 'Wimbledon' 'Norwich City'
 'Liverpool' 'QPR' 'Arsenal' 'Ipswich Town' 'Southampton'
 "Nott'ham Forest" 'Everton' 'Chelsea' 'Coventry City' 'Crystal Palace'
 'Leeds Uni

In [None]:
# Initialize columns for cumulative metrics
final_dataset['home_team_games_played'] = 0
final_dataset['away_team_games_played'] = 0
final_dataset['home_team_loss_ratio'] = 0
final_dataset['away_team_loss_ratio'] = 0
final_dataset['home_team_draw_ratio'] = 0
final_dataset['away_team_draw_ratio'] = 0

# Dictionary to keep track of cumulative metrics
team_metrics = {}

for index, row in final_dataset.iterrows():
    home_team = row['Home']
    away_team = row['Away']
    home_goals = row['HomeGoals']
    away_goals = row['AwayGoals']
    season = row['Season_End_Year']

    # Initialize metrics for new teams
    if home_team not in team_metrics:
        team_metrics[home_team] = {'games': 0, 'losses': 0, 'draws': 0}
    if away_team not in team_metrics:
        team_metrics[away_team] = {'games': 0, 'losses': 0, 'draws': 0}

    # Update metrics
    team_metrics[home_team]['games'] += 1
    team_metrics[away_team]['games'] += 1

    if home_goals > away_goals:
        team_metrics[home_team]['draws'] += 0
        team_metrics[away_team]['draws'] += 0
        team_metrics[home_team]['losses'] += 0
        team_metrics[away_team]['losses'] += 1
    elif home_goals < away_goals:
        team_metrics[home_team]['draws'] += 0
        team_metrics[away_team]['draws'] += 0
        team_metrics[home_team]['losses'] += 1
        team_metrics[away_team]['losses'] += 0
    else:
        team_metrics[home_team]['draws'] += 1
        team_metrics[away_team]['draws'] += 1
        team_metrics[home_team]['losses'] += 0
        team_metrics[away_team]['losses'] += 0

    # Calculate ratios
    home_team_loss_ratio = team_metrics[home_team]['losses'] / team_metrics[home_team]['games']
    away_team_loss_ratio = team_metrics[away_team]['losses'] / team_metrics[away_team]['games']
    home_team_draw_ratio = team_metrics[home_team]['draws'] / team_metrics[home_team]['games']
    away_team_draw_ratio = team_metrics[away_team]['draws'] / team_metrics[away_team]['games']

    # Update DataFrame
    final_dataset.at[index, 'home_team_games_played'] = team_metrics[home_team]['games']
    final_dataset.at[index, 'away_team_games_played'] = team_metrics[away_team]['games']
    final_dataset.at[index, 'home_team_loss_ratio'] = home_team_loss_ratio
    final_dataset.at[index, 'away_team_loss_ratio'] = away_team_loss_ratio
    final_dataset.at[index, 'home_team_draw_ratio'] = home_team_draw_ratio
    final_dataset.at[index, 'away_team_draw_ratio'] = away_team_draw_ratio

    # Reset metrics after each season
    if index < len(final_dataset) - 1 and final_dataset.at[index + 1, 'Season_End_Year'] != season:
        team_metrics = {}

print(final_dataset)

  final_dataset.at[index, 'home_team_draw_ratio'] = home_team_draw_ratio
  final_dataset.at[index, 'away_team_draw_ratio'] = away_team_draw_ratio
  final_dataset.at[index, 'away_team_loss_ratio'] = away_team_loss_ratio
  final_dataset.at[index, 'home_team_loss_ratio'] = home_team_loss_ratio


       Season_End_Year  Wk       Date            Home  HomeGoals  AwayGoals  \
0                 1993   1 1992-08-15   Coventry City          2          1   
7                 1993   1 1992-08-15     Southampton          0          0   
6                 1993   1 1992-08-15         Everton          1          1   
5                 1993   1 1992-08-15    Ipswich Town          1          1   
8                 1993   1 1992-08-15         Chelsea          1          1   
...                ...  ..        ...             ...        ...        ...   
12018             2023  38 2023-05-28  Manchester Utd          2          1   
12017             2023  38 2023-05-28         Chelsea          1          1   
12016             2023  38 2023-05-28  Crystal Palace          1          1   
12019             2023  38 2023-05-28         Arsenal          5          0   
12025             2023  38 2023-05-28       Brentford          1          0   

                  Away FTR  home_team_encoded  away

In [None]:
# Drop columns
columns_to_drop = ['Home','Away','Date']
final_dataset = final_dataset.drop(columns=columns_to_drop)


In [None]:
 final_dataset.columns

 # Initialize LabelEncoder
 le = LabelEncoder()

# Fit and transform the 'FTR' column
final_dataset['FTR_encoded'] = le.fit_transform(final_dataset['FTR'])

# Print the DataFrame to see the results
print(final_dataset)

# Print the mapping of labels to integers
print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

       Season_End_Year  Wk  HomeGoals  AwayGoals FTR  home_team_encoded  \
0                 1993   1          2          1   H                  0   
7                 1993   1          0          0   D                  1   
6                 1993   1          1          1   D                  2   
5                 1993   1          1          1   D                  3   
8                 1993   1          1          1   D                  4   
...                ...  ..        ...        ...  ..                ...   
12018             2023  38          2          1   H                 15   
12017             2023  38          1          1   D                  4   
12016             2023  38          1          1   D                  5   
12019             2023  38          5          0   H                  8   
12025             2023  38          1          0   H                 49   

       away_team_encoded  home_team_total_goals_scored  \
0                     21                 

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBRegressor

final_dataset = final_dataset.drop(['FTR'], axis=1)
final_dataset = final_dataset.drop(['HomeGoals'], axis=1)
final_dataset = final_dataset.drop(['AwayGoals'], axis=1)
# Prepare your dataset
y = final_dataset['FTR_encoded'].astype(int)
X = final_dataset.drop(['FTR_encoded'], axis=1)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import numpy as np
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train_scaled, y_train)

# Feature importance
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print(f"{X.columns[indices[f]]}: {importances[indices[f]]}")

# Create a DataFrame for feature importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns[indices],
    'Importance': importances[indices]
})

Feature ranking:
diffFormPts: 0.1242901086986324
home_team_formPts: 0.05733769227872457
home_team_GDform: 0.05532110540660246
away_team_formPts: 0.05512633479402582
away_team_draw_ratio: 0.05053636539253847
away_team_GDform: 0.04904423857498442
home_team_draw_ratio: 0.044522639810299816
diffPts: 0.04145815281497889
away_team_loss_ratio: 0.03822748251317005
home_team_win_ratio: 0.036624972965732125
away_team_win_ratio: 0.03509104084831904
home_team_loss_ratio: 0.03263385704684551
away_team_avg_goals_conceded: 0.026276211689744915
away_team_avg_goals_scored: 0.025973574828885505
home_team_avg_goals_scored: 0.025615673779459888
home_team_avg_goals_conceded: 0.02553287986400037
home_team_GD_cumulative: 0.024206789436225336
away_team_GD_cumulative: 0.021989481024175868
home_team_encoded: 0.02183629864563542
away_team_encoded: 0.021412366755566747
Season_End_Year: 0.021173505611845955
home_team_pts: 0.02000823847168304
away_team_total_goals_conceded: 0.01990478994306873
away_team_pts: 0.0195

In [None]:
top_features = feature_importance_df.head(15)
top_feature_names = top_features['Feature'].tolist()
print("Top features:", top_feature_names)

# Create a new DataFrame with only the top features
X_train_selected = X_train[top_feature_names]
X_test_selected = X_test[top_feature_names]

# Scale the selected features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

Top features: ['diffFormPts', 'home_team_formPts', 'home_team_GDform', 'away_team_formPts', 'away_team_draw_ratio', 'away_team_GDform', 'home_team_draw_ratio', 'diffPts', 'away_team_loss_ratio', 'home_team_win_ratio', 'away_team_win_ratio', 'home_team_loss_ratio', 'away_team_avg_goals_conceded', 'away_team_avg_goals_scored', 'home_team_avg_goals_scored']


In [None]:
# Define parameter grids for each model

# Logistic Regression parameter grid for hiyperparameter tuning
lr_param_grid = {
    'C': [0.01,0.1, 1, 10,100],
    'penalty': ['l2'],
    'solver': ['liblinear', 'saga']

}



# Perform Grid Search for Logistic Regression

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_grid_search = GridSearchCV(lr_model, lr_param_grid, cv=5, scoring='accuracy', verbose=1)
lr_grid_search.fit(X_train_scaled, y_train)
lr_best_model = lr_grid_search.best_estimator_

# Evaluate the Logistic Regression model
lr_pred = lr_best_model.predict(X_test_scaled)
print("Logistic Regression Best Parameters:")
print(lr_grid_search.best_params_)
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, lr_pred))
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, lr_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Logistic Regression Best Parameters:
{'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
Logistic Regression Confusion Matrix:
[[ 751  122  127]
 [ 209  398  341]
 [ 132  133 1395]]

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.75      0.72      1000
           1       0.61      0.42      0.50       948
           2       0.75      0.84      0.79      1660

    accuracy                           0.71      3608
   macro avg       0.68      0.67      0.67      3608
weighted avg       0.70      0.71      0.69      3608



In [None]:
# Decision Tree parameter grid
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30,40],
    'min_samples_split': [2, 5, 10]
}

dt_model = DecisionTreeClassifier(random_state=0)
dt_grid_search = GridSearchCV(dt_model, dt_param_grid, cv=5, scoring='accuracy', verbose=1)
dt_grid_search.fit(X_train_scaled, y_train)
dt_best_model = dt_grid_search.best_estimator_

# Evaluate the Decision Tree model
dt_pred = dt_best_model.predict(X_test_scaled)
print("Decision Tree Best Parameters:")
print(dt_grid_search.best_params_)
print("Decision Tree Confusion Matrix:")
print(confusion_matrix(y_test, dt_pred))
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, dt_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Decision Tree Best Parameters:
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2}
Decision Tree Confusion Matrix:
[[ 718  159  123]
 [ 228  435  285]
 [ 148  206 1306]]

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.72      0.69      1000
           1       0.54      0.46      0.50       948
           2       0.76      0.79      0.77      1660

    accuracy                           0.68      3608
   macro avg       0.65      0.65      0.65      3608
weighted avg       0.68      0.68      0.68      3608



In [None]:
   # Infer input shape
input_shape = X_train_scaled.shape[1]

# Build the model
model = Sequential()
model.add(Dense(128, input_shape=(input_shape,), activation='relu'))  # First layer, specify input shape
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))  # Second layer
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))  # Third layer
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu'))  # Fourth layer
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # Output layer

# Modify the model compilation to use sparse categorical crossentropy
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with integer-encoded labels
history = model.fit(X_train_scaled, y_train, epochs=1000, batch_size=32, validation_data=(X_test_scaled, y_test))


# Evaluate the model
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/1000
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.4235 - loss: 1.0881 - val_accuracy: 0.6394 - val_loss: 0.8084
Epoch 2/1000
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6108 - loss: 0.8539 - val_accuracy: 0.6899 - val_loss: 0.7566
Epoch 3/1000
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6327 - loss: 0.8312 - val_accuracy: 0.6924 - val_loss: 0.7260
Epoch 4/1000
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6526 - loss: 0.7961 - val_accuracy: 0.6998 - val_loss: 0.7214
Epoch 5/1000
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6518 - loss: 0.7988 - val_accuracy: 0.6979 - val_loss: 0.7114
Epoch 6/1000
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6672 - loss: 0.7810 - val_accuracy: 0.6973 - val_loss: 0.6974
Epoch 7/1000


In [None]:
import pickle

# Save the best model from GridSearchCV
with open('best_lr_model.pkl', 'wb') as file:
    pickle.dump(lr_best_model, file)


In [None]:
! pip install streamlit -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%writefile app.py


import streamlit as st
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


# Title of the app
st.title("Match Predictions App")

# Load the saved model
with open('best_lr_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)




#upload the file again

uploaded_file = st.file_uploader("C:\\Users\\rayba\\Downloads\\premier-league-matches.csv", type="csv")
if uploaded_file is not None:
    # Load the data
    final_dataset = pd.read_csv(uploaded_file)
    st.write(final_dataset.head())

    # Show data information
    st.write("Data Information:")
    st.write(final_dataset.info())

    # Select features and target
    if st.checkbox("Select features and target"):
        features = st.multiselect("Select features", final_dataset.columns.tolist())
        target = st.selectbox("Select target variable", final_dataset.columns.tolist())

        # Split the data
        X = final_dataset[features]
        y = final_dataset[target]#
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train a model
        model = RandomForestClassifier()
        model.fit(X_train, y_train)

        # Show model accuracy
        accuracy = model.score(X_test, y_test)
        st.write(f"Model Accuracy: {accuracy:.2f}")

# Visualizations
if st.checkbox("Show Visualizations"):
    st.subheader("Home Goals Distribution")
    sns.histplot(final_dataset['HomeGoals'], kde=True)
    st.pyplot(plt)

    st.subheader("Away Goals Distribution")
    sns.histplot(final_dataset['AwayGoals'], kde=True)
    st.pyplot(plt)




Writing app.py


In [None]:
! wget -q -O - ipv4.icanhazip.com

34.105.46.6


In [None]:
! streamlit run app.py & npx localtunnel --port 8501