# Import necessary functions

In [23]:
#handle data
import pandas as pd
import numpy as np

# Functions

## League features
Set the league goal averages, standard deviaton and win, draw percentages

In [3]:
def league(season):
    
    last_2_season = df[(df["Season"] == season - 1) | (df["Season"] == season - 2)]

    # Compute average and standard deviation for home and away goals
    df.loc[df["Season"] == season, "league_home_goal_for_avg"] = last_2_season["FTHG"].mean()
    df.loc[df["Season"] == season, "league_home_goal_for_std"] = last_2_season["FTHG"].std()
    df.loc[df["Season"] == season, "league_away_goal_for_avg"] = last_2_season["FTAG"].mean()
    df.loc[df["Season"] == season, "league_away_goal_for_std"] = last_2_season["FTAG"].std()

    # Compute win count for home, draw, and away games
    league_home_win_cnt = last_2_season["FTR"].eq(1).sum()
    league_draw_win_cnt = last_2_season["FTR"].eq(0).sum()

    # Compute total game count for each team in the league
    league_game_cnt = len(last_2_season["FTR"])

    # Compute win percentage and draw percentage for the league
    df.loc[df["Season"] == season, "league_home_win_pct"] = league_home_win_cnt / league_game_cnt
    df.loc[df["Season"] == season, "league_draw_pct"] = league_draw_win_cnt / league_game_cnt

    df.loc[df["Season"] == season, "team_cnt"] = df[df["Season"] == season - 1]["HomeTeam"].nunique()

    df.loc[df["Season"] == season, "goal_diff_std"] = (last_2_season["FTHG"] - last_2_season["FTAG"]).std()

    df.loc[df["Season"] == season, "rnd_cnt"] = df[df["Season"] == season - 1]["Round"].max()

## Historic strenght
statistics of teams playing home and away from their current and last 2 seasons.

In [4]:
def historical_strength(i,match):
    season = match["Season"]    
    
    home_team = match["HomeTeam"]
    away_team = match["AwayTeam"]
    
    # filter the the last 2 seasons for home and away team in both venue
    home_team_current_and_last_2_season_home = df[((df["Season"]==season-1) | (df["Season"]==season-2) | (df["Season"]==season)) & (df.index<i) & (df["HomeTeam"]==home_team)]
    away_team_current_and_last_2_season_away = df[((df["Season"]==season-1) | (df["Season"]==season-2) | (df["Season"]==season)) & (df.index<i) & (df["AwayTeam"]==away_team)]
    home_team_current_and_last_2_season_away = df[((df["Season"]==season-1) | (df["Season"]==season-2) | (df["Season"]==season)) & (df.index<i) & (df["AwayTeam"]==home_team)]
    away_team_current_and_last_2_season_home = df[((df["Season"]==season-1) | (df["Season"]==season-2) | (df["Season"]==season)) & (df.index<i) & (df["HomeTeam"]==away_team)]
    
    #Home team
    # Home team home results
    home_team_home_results = home_team_current_and_last_2_season_home["FTR"].value_counts()
    home_team_home_wins = home_team_home_results.get(1, 0)
    home_team_home_losses = home_team_home_results.get(-1, 0)
    home_team_home_draws = home_team_home_results.get(0, 0)

    # Home team away results
    home_team_away_results = home_team_current_and_last_2_season_away["FTR"].value_counts()
    home_team_away_wins = home_team_away_results.get(-1, 0)
    home_team_away_losses = home_team_away_results.get(1, 0)
    home_team_away_draws = home_team_away_results.get(0, 0)

    # Count of played matches
    home_team_home_played_games = len(home_team_current_and_last_2_season_home)
    home_team_away_played_games = len(home_team_current_and_last_2_season_away)
    
    # Home team home result percentages
    if home_team_home_played_games == 0:
        df.loc[i,'home_team_home_win_pct'] = 0.0
        df.loc[i,'home_team_home_draw_pct'] = 0.0
    else:
        df.loc[i,'home_team_home_win_pct'] = home_team_home_wins / home_team_home_played_games
        df.loc[i,'home_team_home_draw_pct'] = home_team_home_draws / home_team_home_played_games
    
    # Home team away result percentages
    if home_team_away_played_games == 0:
        df.loc[i,'home_team_away_win_pct'] = 0.0
        df.loc[i,'home_team_away_draw_pct'] = 0.0
    else:
        df.loc[i,'home_team_away_win_pct'] = home_team_away_wins / home_team_away_played_games
        df.loc[i,'home_team_away_draw_pct'] = home_team_away_draws / home_team_away_played_games
    
    #Home team home goals for
    df.loc[i,'home_team_home_goals_for_avg'] = home_team_current_and_last_2_season_home["FTHG"].mean()
    df.loc[i,'home_team_home_goals_for_std'] = home_team_current_and_last_2_season_home["FTHG"].std()
    #Home team home goals against
    df.loc[i,'home_team_home_goals_against_avg'] = home_team_current_and_last_2_season_home["FTAG"].mean()
    df.loc[i,'home_team_home_goals_against_std'] = home_team_current_and_last_2_season_home["FTAG"].std()
    #Home team away goals for
    df.loc[i,'home_team_away_goals_for_avg'] = home_team_current_and_last_2_season_away["FTAG"].mean()
    df.loc[i,'home_team_away_goals_for_std'] = home_team_current_and_last_2_season_away["FTAG"].std()
    #Home team away goals against
    df.loc[i,'home_team_away_goals_against_avg'] = home_team_current_and_last_2_season_away["FTHG"].mean()
    df.loc[i,'home_team_away_goals_against_std'] = home_team_current_and_last_2_season_away["FTHG"].std()
    
    
    #IDEGENBELI
    # Away team home results
    away_team_home_results = away_team_current_and_last_2_season_home["FTR"].value_counts()
    away_team_home_wins = away_team_home_results.get(1, 0)
    away_team_home_losses = away_team_home_results.get(-1, 0)
    away_team_home_draws = away_team_home_results.get(0, 0)

    # Away team away results
    away_team_away_results = away_team_current_and_last_2_season_away["FTR"].value_counts()
    away_team_away_wins = away_team_away_results.get(-1, 0)
    away_team_away_losses = away_team_away_results.get(1, 0)
    away_team_away_draws = away_team_away_results.get(0, 0)

    # Count of played matches
    away_team_home_played_games = len(away_team_current_and_last_2_season_home)
    away_team_away_played_games = len(away_team_current_and_last_2_season_away)
    
    # Home team home result percentages
    if away_team_home_played_games == 0:
        df.loc[i,'away_team_home_win_pct'] = 0.0
        df.loc[i,'away_team_home_draw_pct'] = 0.0
    else:
        df.loc[i,'away_team_home_win_pct'] = away_team_home_wins / away_team_home_played_games
        df.loc[i,'away_team_home_draw_pct'] = away_team_home_draws / away_team_home_played_games
    
    # Home team away result percentages
    if away_team_away_played_games == 0:
        df.loc[i,'away_team_away_win_pct'] = 0.0
        df.loc[i,'away_team_away_draw_pct'] = 0.0
    else:
        df.loc[i,'away_team_away_win_pct'] = away_team_away_wins / away_team_away_played_games
        df.loc[i,'away_team_away_draw_pct'] = away_team_away_draws / away_team_away_played_games
    
    #Home team home goals for
    df.loc[i,'away_team_home_goals_for_avg'] = away_team_current_and_last_2_season_home["FTHG"].mean()
    df.loc[i,'away_team_home_goals_for_std'] = away_team_current_and_last_2_season_home["FTHG"].std()
    #Home team home goals against
    df.loc[i,'away_team_home_goals_against_avg'] = away_team_current_and_last_2_season_home["FTAG"].mean()
    df.loc[i,'away_team_home_goals_against_std'] = away_team_current_and_last_2_season_home["FTAG"].std()
    #Home team away goals for
    df.loc[i,'away_team_away_goals_for_avg'] = away_team_current_and_last_2_season_away["FTAG"].mean()
    df.loc[i,'away_team_away_goals_for_std'] = away_team_current_and_last_2_season_away["FTAG"].std()
    #Home team away goals against
    df.loc[i,'away_team_away_goals_against_avg'] = away_team_current_and_last_2_season_away["FTHG"].mean()
    df.loc[i,'away_team_away_goals_against_std'] = away_team_current_and_last_2_season_away["FTHG"].std()

## Current form
statistics of teams from their last 5 matches

In [5]:
def current_form(i, current_match):
    # mash the home team's away and home games togheter and filter the last 5
    home_team = current_match["HomeTeam"]
    home_mask = (df["HomeTeam"] == home_team) | (df["AwayTeam"] == home_team)
    home_last_5_match = df.loc[:index-1][home_mask].tail(5)
    
    # mash the away team's away and home games togheter and filter the last 5
    away_team = current_match["AwayTeam"]
    away_mask = (df["HomeTeam"] == away_team) | (df["AwayTeam"] == away_team)
    away_last_5_match = df.loc[:index-1][away_mask].tail(5)

    #Home team
    
    # separate the venues the home team played on in their last 5 mathces
    home_team_home_last_5_match = home_last_5_match[home_last_5_match["HomeTeam"]==home_team]
    home_team_away_last_5_match = home_last_5_match[home_last_5_match["AwayTeam"]==home_team]
    
    # Home team home results
    home_team_home_results = home_team_home_last_5_match["FTR"].value_counts()
    home_team_wins = home_team_home_results.get(1, 0)
    home_team_draws = home_team_home_results.get(0, 0)

    # Home team away results
    home_team_away_results = home_team_away_last_5_match["FTR"].value_counts()
    home_team_wins += home_team_away_results.get(-1, 0)
    home_team_draws += home_team_away_results.get(0, 0)

    # Count of played matches - 0 at the beginning of the database
    home_team_played_games = len(home_last_5_match)
    
    # Home team home result percentages
    if home_team_played_games == 0:
        df.loc[i,'home_team_last_5_win_pct'] = 0.0
        df.loc[i,'home_team_last_5_draw_pct'] = 0.0
    else:
        df.loc[i,'home_team_last_5_win_pct'] = home_team_wins / home_team_played_games
        df.loc[i,'home_team_last_5_draw_pct'] = home_team_draws / home_team_played_games
        
    
    # Calculate the goals goals scored and conceded by home team    
    home_team_goals_for = pd.concat([home_team_home_last_5_match["FTHG"], home_team_away_last_5_match["FTAG"]])
    home_team_goals_against = pd.concat([home_team_home_last_5_match["FTAG"], home_team_away_last_5_match["FTHG"]])

    #Home team goals for
    df.loc[i,'home_team_last_5_goals_for_avg'] = home_team_goals_for.mean() if home_team_played_games!=0 else 0.0
    df.loc[i,'home_team_last_5_goals_for_std'] = home_team_goals_for.std()  if home_team_played_games!=0 else 0.0
    #Home team goals against
    df.loc[i,'home_team_last_5_goals_against_avg'] = home_team_goals_against.mean() if home_team_played_games!=0 else 0.0
    df.loc[i,'home_team_last_5_goals_against_std'] = home_team_goals_against.std() if home_team_played_games!=0 else 0.0
    
    # Calculate rest days for the home team
    home_last_played = df.loc[:index-1, "Date"].where(home_mask).dropna().tail(1)
    if not home_last_played.empty:
        df.loc[i, "home_rest"] = (current_match["Date"] - home_last_played.iloc[0]).days
    else:
        df.loc[i, "home_rest"] = 80
    
    #Away team
    
    # separate the venues the away team played on in their last 5 mathces
    away_team_home_last_5_match = away_last_5_match[away_last_5_match["HomeTeam"]==away_team]
    away_team_away_last_5_match = away_last_5_match[away_last_5_match["AwayTeam"]==away_team]
    
    # Away team home results
    away_team_home_results = away_team_home_last_5_match["FTR"].value_counts()
    away_team_wins = away_team_home_results.get(1, 0)
    away_team_draws = away_team_home_results.get(0, 0)

    # Away team away results
    away_team_away_results = away_team_away_last_5_match["FTR"].value_counts()
    away_team_wins += away_team_away_results.get(-1, 0)
    away_team_draws += away_team_away_results.get(0, 0)
    
    # Count of played matches - 0 at the beginning of the database
    away_team_played_games = len(away_last_5_match)
    
    # Home team home result percentages
    if away_team_played_games == 0:
        df.loc[i,'away_team_last_5_win_pct'] = 0.0
        df.loc[i,'away_team_last_5_draw_pct'] = 0.0
    else:
        df.loc[i,'away_team_last_5_win_pct'] = away_team_wins / away_team_played_games
        df.loc[i,'away_team_last_5_draw_pct'] = away_team_draws / away_team_played_games
    
    # Calculate the goals goals scored and conceded by away team    
    away_team_goals_for = pd.concat([away_team_home_last_5_match["FTHG"], away_team_away_last_5_match["FTAG"]])
    away_team_goals_against = pd.concat([away_team_home_last_5_match["FTAG"], away_team_away_last_5_match["FTHG"]])


    # Away team goals for
    df.loc[i,'away_team_last_5_goals_for_avg'] = away_team_goals_for.mean() if away_team_played_games!=0 else 0.0
    df.loc[i,'away_team_last_5_goals_for_std'] = away_team_goals_for.std() if away_team_played_games!=0 else 0.0
    # Away team goals against
    df.loc[i,'away_team_last_5_goals_against_avg'] = away_team_goals_against.mean() if away_team_played_games!=0 else 0.0
    df.loc[i,'away_team_last_5_goals_against_std'] = away_team_goals_against.std() if away_team_played_games!=0 else 0.0
    
    # Calculate rest days for the away team
    away_last_played = df.loc[:index-1, "Date"].where(away_mask).dropna().tail(1)
    if not away_last_played.empty:
        df.loc[i, "away_rest"] = (current_match["Date"] - away_last_played.iloc[0]).days
    else:
        df.loc[i, "away_rest"] = 80

## Pi - rating
shows the strenght of a team compared to other teams based on goals scored and conceded

In [6]:
# Define a function to calculate the expected goal difference between two teams
def expected_goal_difference(c, ratings, home_team, away_team):
    # Extract the home and away ratings
    home_rating = np.abs(ratings[home_team]["home"])
    away_rating = np.abs(ratings[away_team]["away"])
    
    # Calculate the expected goal difference for the home team
    if ratings[home_team]["home"] < 0:
        expected_home_win_by = -(10 ** (home_rating / c) - 1)
    else:
        expected_home_win_by = 10 ** (home_rating / c) - 1
    
    # Calculate the expected goal difference for the away team
    if ratings[away_team]["away"] < 0:
        expected_away_win_by = -(10 ** (away_rating / c) - 1)
    else:
        expected_away_win_by = 10 ** (away_rating / c) - 1
    
    # Return the expected goal differences for both teams
    return expected_home_win_by, expected_away_win_by

In [7]:
def pi_rating(alpha, beta, c, ratings, i, match):        
    # Extract the home and away teams and their scores
    home_team = match["HomeTeam"]
    away_team = match["AwayTeam"]
    home_score = match["FTHG"]
    away_score = match["FTAG"]

    # Calculate the expected goal difference and adjust for home advantage
    expected_home_win_by, expected_away_win_by = expected_goal_difference(c, ratings, home_team, away_team)
    expected_goal_diff = expected_home_win_by - expected_away_win_by
    goal_diff = home_score - away_score

    # Calculate the error between expected and actual goal difference
    e = abs(goal_diff - expected_goal_diff)
    error = c * math.log10(1 + e)

    # Determine the error for the home and away teams
    error_home = error if expected_goal_diff < goal_diff else -error
    error_away = error if expected_goal_diff > goal_diff else -error

    # Store the updated pi-ratings in the DataFrame
    df.at[i, "home_team_home_Pi"] = ratings[home_team]["home"]
    df.at[i, "home_team_away_Pi"] = ratings[home_team]["away"]
    df.at[i, "away_team_away_Pi"] = ratings[away_team]["away"]
    df.at[i, "away_team_home_Pi"] = ratings[away_team]["home"]
    df.at[i, "exp_GD"] = expected_goal_diff

    # Update the ratings based on the error
    ratings[home_team]["home"] += alpha * error_home
    ratings[home_team]["away"] += beta * (alpha * error_home)
    ratings[away_team]["away"] += alpha * error_away
    ratings[away_team]["home"] += beta * (alpha * error_away)

## Page Rank
shows the strenght of a team compared to other teams based on the outcome of the matches

In [8]:
def calc_page_ranks(df_rank):
    G = nx.MultiDiGraph()

    # Populate the transition matrix with match results - winning team +1; in case of draw both team gets 1 edge
    for _, row in df_rank.iterrows():
        if row["FTR"] == -1:
            G.add_edge(row['HomeTeam'], row['AwayTeam'])
        elif row["FTR"] == 1:
            G.add_edge(row['AwayTeam'], row['HomeTeam'])
        else:
            G.add_edge(row['AwayTeam'], row['HomeTeam'])
            G.add_edge(row['HomeTeam'], row['AwayTeam'])

    # Calculate PageRank
    pagerank = nx.pagerank(G)
    return pagerank

In [10]:
def page_ranking(i, match):
    home_team = match["HomeTeam"]
    away_team = match["AwayTeam"]
    
    season = match["Season"]
    df_rank = df[((df["Season"]==season-1) | (df["Season"]==season-2) | (df["Season"]==season)) & (df.index<i)]
    if season == df["Season"].min():
        next
    else:
        pageranks = calc_page_ranks(df_rank)
    
        # Look up page ranks for home and away teams
        home_pagerank = pageranks.get(home_team)
        away_pagerank = pageranks.get(away_team)

        # Update DataFrame with page ranks
        df.at[i, "Home_PageRank"] = home_pagerank
        df.at[i, "Away_PageRank"] = away_pagerank

## Match importance
how far a team is from the first 6 and last 4 places

In [12]:
def calculate_point_differences(home_team_position, away_team_position, team_points, team_games_played):
    # calculate the number of teams and sort the teams based on their points
    num_teams = len(team_points)
    team_rank = sorted(range(num_teams), key=lambda x: team_points[x], reverse=True)
    
    # get the home and away teams position and games played
    home_pi = team_points[home_team_position]
    home_gi = team_games_played[home_team_position]
    
    away_pi = team_points[away_team_position]
    away_gi = team_games_played[away_team_position]
    
    #calculate the point differences    
    home_point_differences = []
    away_point_differences = []
    for k in range(num_teams):
        home_point_difference = ((team_points[k]-home_pi) / home_gi) if home_gi!=0 else 0.0
        home_point_differences.append(home_point_difference)
        
        away_point_difference = ((team_points[k]-away_pi) / away_gi) if away_gi!=0 else 0.0
        away_point_differences.append(away_point_difference)
        
    # get only the first 6 and last 4
    home_head_tail = home_point_differences[:6] + home_point_differences[-4:]
    away_head_tail = away_point_differences[:6] + away_point_differences[-4:]
    return home_head_tail, away_head_tail

In [13]:
def importance(season):
    sea = df[df["Season"] == season].copy()
        
    #get all teams that played in this season
    teams = sea["HomeTeam"].unique().tolist() + sea["AwayTeam"].unique().tolist()
    teams = list(set(teams))
    
    #set up table
    league_table = pd.DataFrame(columns=["Team", "Points", "Matches Played"])
    league_table["Team"] = teams
    league_table["Points"] = 0
    league_table["Matches Played"] = 0
    
    # Iterate over the matches in the DataFrame
    for i, match in sea.iterrows():
        #get the playing teams
        home_team = match["HomeTeam"]
        away_team = match["AwayTeam"]
        
        #Update rank
        # Sort the DataFrame based on points in descending order
        league_table.sort_values(by="Points", ascending=False, inplace=True)
        # Reset the index of the DataFrame
        league_table.reset_index(drop=True, inplace=True)
        league_table["Rank"] = league_table["Points"].rank(ascending=False, method="min")
    
        # Calculate the importance of the match
        #get the teams points and games played
        team_points = league_table["Points"].values
        team_games_played = league_table["Matches Played"].values
            
        # Find the row numbers where the team matches
        home_team_position = league_table[league_table["Team"] == home_team].index
        away_team_position = league_table[league_table["Team"] == away_team].index
        
        #get home and away distances with the function calculate_point_differences
        home_distances, away_distances = calculate_point_differences(home_team_position, away_team_position, team_points, team_games_played)
        
        # Access the distances using indexing
        for venue in ['home', 'away']:
            distances = eval(f'{venue}_distances')
    
            for fd in range(6):
                df.loc[i, f'{venue}_first_distance_{fd+1}'] = distances[fd]
            for ld in range(6,10):
                df.loc[i, f'{venue}_last_distance_{ld-6}'] = distances[ld]
                
        # update league table with current match                
        # Assign points based on the result
        result = match["FTR"]  # FTR column contains the match result (1 = home win, 0 = draw, -1 = away win)
        if result == 1:
            home_points = 3
            away_points = 0
        elif result == 0:
            home_points = 1
            away_points = 1
        else:
            home_points = 0
            away_points = 3    
    
        # Update home team
        league_table.loc[league_table["Team"]==home_team,"Points"] += home_points
        league_table.loc[league_table["Team"]==home_team,"Matches Played"] += 1
    
        # Update away team
        league_table.loc[league_table["Team"]==away_team,"Points"] += away_points
        league_table.loc[league_table["Team"]==away_team,"Matches Played"] += 1

## Create dataframe

In [14]:
def create_df():
    global df
    df = data.copy().reset_index(drop=True)
    
    # Define the learning rates for updating the ratings
    alpha = 0.035  # Learning rate for the home team
    beta = 0.7  # Learning rate for the away team
    c = 3  # Replace with the desired value of c
    # Initialize the ratings dictionary for each team
    ratings = {team: {"home": 0.0, "away": 0.0} for team in set(df["HomeTeam"]).union(set(df["AwayTeam"]))}
    
    
    seasons = df["Season"].unique()
    #go through each season one by one
    for season in seasons:
        #set rounds
        df.loc[df["Season"] == season, 'Round'] = ((df.loc[df["Season"] == season].index - df[df["Season"] == season].head(1).index) // 10) + 1
        league(season)
        importance(season)
        
        
    for i, match in df.iterrows():
        historical_strength(i,match)
        current_form(i,match)
        pi_rating(alpha, beta, c, ratings, i, match)
        page_ranking(i,match)
        
    #Filter the DataFrame to keep only the matches where the rolling has caught up
    df = df[df["Season"]>1995]
    df.reset_index(inplace=True, drop=True)
    df["Date"] = pd.to_datetime(df["Date"])

# Create DF
using the previous functions

In [19]:
#load raw data
raw_data = pd.read_csv("raw_data.csv", sep=',', engine='python')

In [20]:
#choose the necessary columns
selected_cols = ["Date","Season","HomeTeam","AwayTeam","FTHG","FTAG","FTR",'MaxH','MaxD','MaxA','Max>2.5', 'Max<2.5']
selected_df = raw_data.loc[:, selected_cols]
data=selected_df.copy()

In [21]:
# make sure to have the same data format
data.dropna(subset=['Date'], inplace=True)
data["Date"] = pd.to_datetime(data["Date"], dayfirst=True, errors="coerce")

#sort by Date
data.sort_index(inplace=True)

In [24]:
#add FTR classification
data["FTR"]=np.select(
        [data["FTR"]=="A", data["FTR"]=="H",data["FTR"]=="D"],
        [-1, 1, 0],
    )

#total goal
data["total_goal"]=data["FTHG"]+data["FTAG"]

#add O/U 2.5 classification
data["O2.5"]=np.select(
        [data["total_goal"]>2,data["total_goal"]<=2],
        [1,0])

In [25]:
#look at the tidied up dataframe
data

Unnamed: 0,Date,Season,HomeTeam,AwayTeam,FTHG,FTAG,FTR,MaxH,MaxD,MaxA,Max>2.5,Max<2.5,total_goal,O2.5
0,1993-08-14,1993,Arsenal,Coventry,0.0,3.0,-1,,,,,,3.0,1
1,1993-08-14,1993,Aston Villa,QPR,4.0,1.0,1,,,,,,5.0,1
2,1993-08-14,1993,Chelsea,Blackburn,1.0,2.0,-1,,,,,,3.0,1
3,1993-08-14,1993,Liverpool,Sheffield Weds,2.0,0.0,1,,,,,,2.0,0
4,1993-08-14,1993,Man City,Leeds,1.0,1.0,0,,,,,,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12256,2023-05-28,2022,Everton,Bournemouth,1.0,0.0,1,1.53,5.0,7.25,1.74,2.29,1.0,0
12257,2023-05-28,2022,Leeds,Tottenham,1.0,4.0,-1,2.88,4.1,2.45,1.52,2.72,5.0,1
12258,2023-05-28,2022,Leicester,West Ham,2.0,1.0,1,2.00,4.2,3.82,1.74,2.28,3.0,1
12259,2023-05-28,2022,Man United,Fulham,2.0,1.0,1,1.56,4.9,6.00,1.53,2.80,3.0,1


In [26]:
# add new feature to the existing ones by creating a new dataframe using the create_df() function
create_df()

  df.loc[df["Season"] == season, "league_home_win_pct"] = league_home_win_cnt / league_game_cnt
  df.loc[df["Season"] == season, "league_draw_pct"] = league_draw_win_cnt / league_game_cnt


KeyboardInterrupt: 

In [27]:
# save the dataframe to csv
df.to_csv('cleaned_data_with_new_features.csv', index=False)

Unnamed: 0,Date,Season,HomeTeam,AwayTeam,FTHG,FTAG,FTR,MaxH,MaxD,MaxA,...,away_first_distance_1,away_first_distance_2,away_first_distance_3,away_first_distance_4,away_first_distance_5,away_first_distance_6,away_last_distance_0,away_last_distance_1,away_last_distance_2,away_last_distance_3
0,1993-08-14,1993,Arsenal,Coventry,0.0,3.0,-1,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1993-08-14,1993,Aston Villa,QPR,4.0,1.0,1,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1993-08-14,1993,Chelsea,Blackburn,1.0,2.0,-1,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1993-08-14,1993,Liverpool,Sheffield Weds,2.0,0.0,1,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1993-08-14,1993,Man City,Leeds,1.0,1.0,0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11559,2023-05-28,2022,Everton,Bournemouth,1.0,0.0,1,1.53,5.0,7.25,...,,,,,,,,,,
11560,2023-05-28,2022,Leeds,Tottenham,1.0,4.0,-1,2.88,4.1,2.45,...,,,,,,,,,,
11561,2023-05-28,2022,Leicester,West Ham,2.0,1.0,1,2.00,4.2,3.82,...,,,,,,,,,,
11562,2023-05-28,2022,Man United,Fulham,2.0,1.0,1,1.56,4.9,6.00,...,,,,,,,,,,
