# Get Data

In [44]:
import pandas as pd
import pickle
from scipy.stats import poisson
from bs4 import BeautifulSoup
import requests

In [45]:
def get_matches(year):
    web = f'https://en.wikipedia.org/wiki/{year}_FIFA_Women%27s_World_Cup'
    response = requests.get(web)
    content = response.text
    soup = BeautifulSoup(content, 'lxml')
    matches = soup.find_all('div', class_='footballbox')

    home = []
    score = []
    away = []

    for match in matches:
        home.append(match.find('th', class_='fhome').get_text())
        score.append(match.find('th', class_='fscore').get_text())
        away.append(match.find('th', class_='faway').get_text())

    dict_football = {'home': home, 'score': score, 'away': away}
    df_football = pd.DataFrame(dict_football)
    df_football['year'] = year
    return df_football

# fixture
df_fixture = get_matches(2023)

# Get Groups

In [48]:
import pandas as pd
import pickle
from string import ascii_uppercase as alphabet

# extracting all tables in website
all_tables = pd.read_html('https://en.wikipedia.org/wiki/2023_FIFA_Women%27s_World_Cup')

In [49]:
all_tables[9]

Unnamed: 0,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,1,New Zealand (H),0,0,0,0,0,0,0,0,Advance to knockout stage
1,2,Norway,0,0,0,0,0,0,0,0,Advance to knockout stage
2,3,Philippines,0,0,0,0,0,0,0,0,
3,4,Switzerland,0,0,0,0,0,0,0,0,


In [None]:
# A -> H
# 9 -> 7*8 + 9 = 65
all_tables = pd.read_html('https://en.wikipedia.org/wiki/2023_FIFA_Women%27s_World_Cup')
for i in range(9,65,7):
    df = all_tables[i]
    df.rename(columns={df.columns[1]:'Team'}, inplace=True)
    df.pop('Qualification')

In [52]:
all_tables = pd.read_html('https://en.wikipedia.org/wiki/2023_FIFA_Women%27s_World_Cup')

dict_table = {}
for letter, i in zip(alphabet, range(9,65,7)):
    df = all_tables[i]
    df.rename(columns={df.columns[1]:'Team'}, inplace=True)
    df.pop('Qualification')
    dict_table[f'Group {letter}'] = df

In [54]:
dict_table['Group H']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Germany,0,0,0,0,0,0,0,0
1,2,Morocco,0,0,0,0,0,0,0,0
2,3,Colombia,0,0,0,0,0,0,0,0
3,4,South Korea,0,0,0,0,0,0,0,0


# Predict

In [55]:
df_historical_data = pd.read_csv('df_historical_data_kaggle.csv')

In [56]:
df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

df_home = df_home.rename(columns={'HomeTeam':'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'})
df_away = df_away.rename(columns={'AwayTeam':'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'})

df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby(['Team']).mean()
df_team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania,0.945946,2.891892
Algeria,1.552632,1.657895
American Samoa,0.000000,7.000000
Andorra,0.500000,5.500000
Angola,1.333333,1.733333
...,...,...
West Germany,1.000000,2.500000
Western Australia,0.666667,1.666667
Yugoslavia,0.000000,3.000000
Zambia,1.500000,1.800000


In [57]:
def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        # goals_scored * goals_conceded
        lamb_home = df_team_strength.at[home,'GoalsScored'] * df_team_strength.at[away,'GoalsConceded']
        lamb_away = df_team_strength.at[away,'GoalsScored'] * df_team_strength.at[home,'GoalsConceded']
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,11): #number of goals home team
            for y in range(0, 11): #number of goals away team
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p

        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return (points_home, points_away)
    else:
        return (0, 0)

In [58]:
print(predict_points('England', 'United States'))
print(predict_points('Argentina', 'Mexico'))
print(predict_points('Australia (H)', 'Nigeria'))

(0.6579067409799734, 2.183782154061449)
(1.3951906903982843, 1.4447466333378534)
(0, 0)


In [59]:
df_fixture_group_48 = df_fixture[:48].copy()
df_fixture_knockout = df_fixture[48:56].copy()
df_fixture_quarter = df_fixture[56:60].copy()
df_fixture_semi = df_fixture[60:62].copy()
df_fixture_final = df_fixture[62:].copy()

In [None]:
for group in dict_table:
    teams_in_group = dict_table[group]['Team'].values
    df_fixture_group_6 = df_fixture_group_48[df_fixture_group_48['home'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += points_home
        dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += points_away

    dict_table[group] = dict_table[group].sort_values('Pts', ascending=False).reset_index()
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table[group] = dict_table[group].round(0)

In [61]:
dict_table['Group B']

Unnamed: 0,Team,Pts
0,Canada,4.0
1,Nigeria,4.0
2,Republic of Ireland,1.0
3,Australia (H),0.0


In [62]:
df_fixture_knockout

Unnamed: 0,home,score,away,year
48,Winner Group A,Match 49,Runner-up Group C,2023
49,Winner Group C,Match 50,Runner-up Group A,2023
50,Winner Group E,Match 51,Runner-up Group G,2023
51,Winner Group G,Match 52,Runner-up Group E,2023
52,Winner Group D,Match 54,Runner-up Group B,2023
53,Winner Group B,Match 53,Runner-up Group D,2023
54,Winner Group H,Match 56,Runner-up Group F,2023
55,Winner Group F,Match 55,Runner-up Group H,2023


In [63]:
for group in dict_table:
    group_winner = dict_table[group].loc[0, 'Team']
    runners_up = dict_table[group].loc[1, 'Team']
    df_fixture_knockout.replace({f'Winner {group}':group_winner,
                                 f'Runner-up {group}':runners_up}, inplace=True)

df_fixture_knockout['winner'] = '?'
df_fixture_knockout

Unnamed: 0,home,score,away,year,winner
48,Norway,Match 49,Japan,2023,?
49,Spain,Match 50,Switzerland,2023,?
50,United States,Match 51,Italy,2023,?
51,Sweden,Match 52,Netherlands,2023,?
52,England,Match 54,Nigeria,2023,?
53,Canada,Match 53,Denmark,2023,?
54,Germany,Match 56,France,2023,?
55,Brazil,Match 55,South Korea,2023,?


In [64]:
def get_winner(df_fixture_updated):
    for index, row in df_fixture_updated.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
        df_fixture_updated.loc[index, 'winner'] = winner
    return df_fixture_updated

## Oitavas

In [65]:
get_winner(df_fixture_knockout)

Unnamed: 0,home,score,away,year,winner
48,Norway,Match 49,Japan,2023,Norway
49,Spain,Match 50,Switzerland,2023,Spain
50,United States,Match 51,Italy,2023,United States
51,Sweden,Match 52,Netherlands,2023,Sweden
52,England,Match 54,Nigeria,2023,England
53,Canada,Match 53,Denmark,2023,Denmark
54,Germany,Match 56,France,2023,Germany
55,Brazil,Match 55,South Korea,2023,Brazil


In [66]:
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'score']
        df_fixture_round_2.replace({f'Winner {match}':winner}, inplace=True)
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2

In [67]:
update_table(df_fixture_knockout, df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Norway,Match 57,United States,2023,?
57,Spain,Match 58,Sweden,2023,?
58,Denmark,Match 59,Brazil,2023,?
59,England,Match 60,Germany,2023,?


## Quartas

In [68]:
get_winner(df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Norway,Match 57,United States,2023,United States
57,Spain,Match 58,Sweden,2023,Sweden
58,Denmark,Match 59,Brazil,2023,Brazil
59,England,Match 60,Germany,2023,Germany


In [69]:
update_table(df_fixture_quarter, df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,United States,Match 61,Sweden,2023,?
61,Brazil,Match 62,Germany,2023,?


## Semi

In [70]:
get_winner(df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,United States,Match 61,Sweden,2023,United States
61,Brazil,Match 62,Germany,2023,Germany


In [71]:
update_table(df_fixture_semi, df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Loser Match 61,Match 63,Loser Match 62,2023,?
63,United States,Match 64,Germany,2023,?


## Final

In [72]:
get_winner(df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Loser Match 61,Match 63,Loser Match 62,2023,Loser Match 62
63,United States,Match 64,Germany,2023,United States
