## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import pickle
from string import ascii_uppercase as alphabet

In [2]:
def get_missing_data(year):
    web = f'https://en.wikipedia.org/wiki/{year}_Copa_América'
    response = requests.get(web)
    soup = BeautifulSoup(response.text, 'html.parser')

    matches = soup.select('td[align="right"], td[style="text-align:right;"]')

    home = []
    score = []
    away = []

    for match in matches:
        row = match.find_parent('tr')
        cells = row.find_all('td')
        if len(cells) >= 3:
            home.append(cells[0].text.strip())
            score.append(cells[1].text.strip())
            away.append(cells[2].text.strip())

    dict_football = {'home': home, 'score': score, 'away': away}
    df_football = pd.DataFrame(dict_football)
    df_football['year'] = year
    time.sleep(2)
    return df_football

years = [1975, 1979, 1983, 1987, 1989, 1991, 1993, 1995, 1997, 1999,
         2001, 2004, 2007, 2011, 2015, 2016, 2019, 2021]

fifa = [get_missing_data(year) for year in years]

# Display the combined dataframe
fifa_df = pd.concat(fifa, ignore_index=True)
fifa_df.to_csv("copa_america_missing_data.csv", index=False)

In [None]:
def get_matches(year):
    web = f'https://en.wikipedia.org/wiki/{year}_Copa_América'
    response = requests.get(web)
    content = response.text
    soup = BeautifulSoup(content, 'lxml')
    matches = soup.find_all('div', class_='footballbox')

    home = []
    score = []
    away = []

    for match in matches:
        home.append(match.find('th', class_='fhome').get_text())
        score.append(match.find('th', class_='fscore').get_text())
        away.append(match.find('th', class_='faway').get_text())

    dict_football = {'home': home, 'score': score, 'away': away}
    df_football = pd.DataFrame(dict_football)
    df_football['year'] = year
    return df_football


# results: historical data
fifa = [get_matches(year) for year in years]
df_fifa = pd.concat(fifa, ignore_index=True)
df_fifa.to_csv("copa_america_historical_data.csv", index=False)

# fixture
df_fixture = get_matches(2024)
df_fixture.to_csv('copa_america_fixture.csv', index=False)

## Getting current tournament group stage tables

In [None]:
# Extracting all tables in the website
url = 'https://en.wikipedia.org/wiki/2024_Copa_América'

response = requests.get(url)
response.encoding = 'utf-8'  # Ensure the encoding is set to 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables in the page
tables = soup.find_all('table')

# Use pandas to read the HTML tables
all_tables = pd.read_html(str(tables))


In [None]:
# finding tables in group stage
(all_tables[16])
(all_tables[23])
(all_tables[30])

In [None]:
# groups A -> D
for i in range(16,44,7): # starting at 16, there are 7 tables in between with 4 tables total. 44 is our upper bound 16 lower
    print(i)
    df = all_tables[i]
    df.rename(columns={df.columns[1]:'Team'}, inplace=True)
    df.pop('Qualification')

In [None]:
print(alphabet)

In [None]:
dict_table = {}
for letter, i in zip(alphabet, range(16,44,7)): # A=16, B=23, ...
    df = all_tables[i]
    df.rename(columns={df.columns[1]:'Team'}, inplace=True)
    dict_table[f'Group {letter}'] = df

In [None]:
dict_table.keys()

In [None]:
dict_table['Group D']


In [None]:
# Upload (..verify if uploaded correctly)
with open('dict_table', 'wb') as output:
    pickle.dump(dict_table, output)

## Data Cleaning

In [None]:
df_historical_data = pd.read_csv('./data/copa_america_historical_data.csv')
df_fixture = pd.read_csv('./data/copa_america_fixture.csv')
df_missing_data = pd.read_csv('./data/copa_america_missing_data.csv')

In [None]:
df_fixture['home'] = df_fixture['home'].str.strip()
df_fixture['away'] = df_fixture['away'].str.strip()

In [None]:
df_missing_data.dropna(inplace=True)
df_historical_data = pd.concat([df_historical_data, df_missing_data], ignore_index=True)
df_historical_data.drop_duplicates(inplace=True)
df_historical_data.sort_values('year', inplace=True)
df_historical_data

In [None]:
# cleanning score and home/away columns
df_historical_data['score'] = df_historical_data['score'].str.replace('[^\d–]', '', regex=True)
df_historical_data['home'] = df_historical_data['home'].str.strip()
df_historical_data['away'] = df_historical_data['away'].str.strip()

# splitting score columns into home and away goals and dropping score column
df_historical_data[['HomeGoals', 'AwayGoals']] = df_historical_data['score'].str.split('–', expand=True)
df_historical_data.drop('score', axis=1, inplace=True)

# renaming columns and changing data types
df_historical_data.rename(columns={'home': 'HomeTeam', 'away': 'AwayTeam', 
                                   'year':'Year'}, inplace=True)
df_historical_data = df_historical_data.astype({'HomeGoals': int, 'AwayGoals':int, 'Year': int})

# creating new column "totalgoals"
df_historical_data['TotalGoals'] = df_historical_data['HomeGoals'] + df_historical_data['AwayGoals']
df_historical_data

In [None]:
df_historical_data.to_csv('./data/clean_copa_america_matches.csv',index=False)
df_fixture.to_csv('./data/clean_copa_america_fixture.csv',index=False)

In [None]:
##confirm number of matches per year
for year in years:
    print(year, len(df_historical_data[df_historical_data['Year']==year]))

In [None]:
# verify data collected for a team
print(df_historical_data[df_historical_data['HomeTeam'].str.contains('Colombia')])
print(df_historical_data[df_historical_data['AwayTeam'].str.contains('Colombia')])

## Get last 5 matches   

In [1]:
## get teams and ID from sofascore to scrape
import requests
import pandas as pd

##CONMEBOL
import requests

cookies = {
    '_gcl_au': '1.1.1568534472.1718898726',
    '_ga': 'GA1.1.753054649.1718898726',
    'logglytrackingsession': '0859cf9a-b937-4777-a97b-33db13738b1f',
    'exco-uid': 'pkcl3v8vk2xheagq',
    '_li_dcdm_c': '.sofascore.com',
    '_lc2_fpi': 'a78faec1e09d--01hwdb32mq14v3sfdf4v0t6ae4',
    '_lc2_fpi_meta': '{%22w%22:1714139794071}',
    '_lr_env_src_ats': 'false',
    'idl_env': 'AiNo502mm94sIq0tXUIsriVtawU5YOf1Xr9oJNbxf4-CJ0Y8RFz3X6Qnr3A',
    'idl_env_cst': 'Ryz5LF8s6Q%3D%3D',
    'idl_env_last': 'Mon%2C%2024%20Jun%202024%2013%3A51%3A10%20GMT',
    'panoramaId_expiry': '1719841870734',
    '_cc_id': '813f38570303168e86ec72223d5eeefb',
    'panoramaId': 'a26edf51218aaf1f35e0d11bbb0b4945a7027a866d919b59238dfd459c872938',
    'pbjs-unifiedid': '%7B%22TDID%22%3A%229beb3b12-eaf4-4cb8-b2d4-89709156f99d%22%2C%22TDID_LOOKUP%22%3A%22TRUE%22%2C%22TDID_CREATED_AT%22%3A%222024-05-24T13%3A51%3A10%22%7D',
    'pbjs-unifiedid_cst': 'Ryz5LF8s6Q%3D%3D',
    '__gads': 'ID=a73a314369da22b5:T=1718898728:RT=1719240794:S=ALNI_MYnexAAhM-Zz3TFQDa_4fBe-aT44g',
    '__gpi': 'UID=00000e35a54504a0:T=1718898728:RT=1719240794:S=ALNI_Mb7QnjnET63s0lNLdkqEwPXB3UxHw',
    '__eoi': 'ID=b6d813e372124385:T=1718898728:RT=1719240794:S=AA-AfjbLAnfv1PNEIfNVvkzwjD4g',
    'FCNEC': '%5B%5B%22AKsRol9sUceDiWKhB5NONeO-pxKrAJBJpA1SFYgpgdKl-qNCx4KutTZdwRYy6cPcrEy7I89xZziY3ORmRwzzuaGiqRR07BIoSCtLKGdOaP-gXEK5BuP0sEo1j6VitrsvWcy0-WcCC_2z273vOb_pNY2CrTstJeVNVQ%3D%3D%22%5D%5D',
    '_ga_HNQ9P9MGZR': 'GS1.1.1719237052.2.1.1719240921.58.0.0',
    '_ga_QH2YGS7BB4': 'GS1.1.1719237052.2.1.1719240974.0.0.0',
    '_ga_3KF4XTPHC4': 'GS1.1.1719237052.2.1.1719240974.5.0.0',
}

headers = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9,es;q=0.8',
    'cache-control': 'max-age=0',
    # 'cookie': '_gcl_au=1.1.1568534472.1718898726; _ga=GA1.1.753054649.1718898726; logglytrackingsession=0859cf9a-b937-4777-a97b-33db13738b1f; exco-uid=pkcl3v8vk2xheagq; _li_dcdm_c=.sofascore.com; _lc2_fpi=a78faec1e09d--01hwdb32mq14v3sfdf4v0t6ae4; _lc2_fpi_meta={%22w%22:1714139794071}; _lr_env_src_ats=false; idl_env=AiNo502mm94sIq0tXUIsriVtawU5YOf1Xr9oJNbxf4-CJ0Y8RFz3X6Qnr3A; idl_env_cst=Ryz5LF8s6Q%3D%3D; idl_env_last=Mon%2C%2024%20Jun%202024%2013%3A51%3A10%20GMT; panoramaId_expiry=1719841870734; _cc_id=813f38570303168e86ec72223d5eeefb; panoramaId=a26edf51218aaf1f35e0d11bbb0b4945a7027a866d919b59238dfd459c872938; pbjs-unifiedid=%7B%22TDID%22%3A%229beb3b12-eaf4-4cb8-b2d4-89709156f99d%22%2C%22TDID_LOOKUP%22%3A%22TRUE%22%2C%22TDID_CREATED_AT%22%3A%222024-05-24T13%3A51%3A10%22%7D; pbjs-unifiedid_cst=Ryz5LF8s6Q%3D%3D; __gads=ID=a73a314369da22b5:T=1718898728:RT=1719240794:S=ALNI_MYnexAAhM-Zz3TFQDa_4fBe-aT44g; __gpi=UID=00000e35a54504a0:T=1718898728:RT=1719240794:S=ALNI_Mb7QnjnET63s0lNLdkqEwPXB3UxHw; __eoi=ID=b6d813e372124385:T=1718898728:RT=1719240794:S=AA-AfjbLAnfv1PNEIfNVvkzwjD4g; FCNEC=%5B%5B%22AKsRol9sUceDiWKhB5NONeO-pxKrAJBJpA1SFYgpgdKl-qNCx4KutTZdwRYy6cPcrEy7I89xZziY3ORmRwzzuaGiqRR07BIoSCtLKGdOaP-gXEK5BuP0sEo1j6VitrsvWcy0-WcCC_2z273vOb_pNY2CrTstJeVNVQ%3D%3D%22%5D%5D; _ga_HNQ9P9MGZR=GS1.1.1719237052.2.1.1719240921.58.0.0; _ga_QH2YGS7BB4=GS1.1.1719237052.2.1.1719240974.0.0.0; _ga_3KF4XTPHC4=GS1.1.1719237052.2.1.1719240974.5.0.0',
    'priority': 'u=1, i',
    'referer': 'https://www.sofascore.com/tournament/football/south-america/copa-america/133',
    'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'x-requested-with': 'aa9e1a',
}

copa_america_response = requests.get(
    'https://www.sofascore.com/api/v1/unique-tournament/133/season/57114/power-rankings/round/126',
    cookies=cookies,
    headers=headers,
)

# Check if the request was successful
if copa_america_response.status_code == 200:
    copa_america_data = copa_america_response.json()
else:
    print(f"Request failed with status code {copa_america_response.status_code}")

# Extracting team names and IDs
teams = []
for item in copa_america_data['powerRankings']:
    team_name = item['team']['name']
    team_id = item['team']['id']
    teams.append({'Team Name': team_name, 'Team ID': team_id})

# Creating a DataFrame
copa_america_teams_df = pd.DataFrame(teams)

# Iterating over the DataFrame
for index, row in copa_america_teams_df.iterrows():
    print(f"Team Name: {row['Team Name']}, Team ID: {row['Team ID']}")

Team Name: Argentina, Team ID: 4819
Team Name: Brazil, Team ID: 4748
Team Name: Uruguay, Team ID: 4725
Team Name: Colombia, Team ID: 4820
Team Name: USA, Team ID: 4724
Team Name: Mexico, Team ID: 4781
Team Name: Ecuador, Team ID: 4757
Team Name: Chile, Team ID: 4754
Team Name: Venezuela, Team ID: 4722
Team Name: Canada, Team ID: 4752
Team Name: Paraguay, Team ID: 4789
Team Name: Peru, Team ID: 4790
Team Name: Panama, Team ID: 5164
Team Name: Jamaica, Team ID: 4769
Team Name: Bolivia, Team ID: 4746
Team Name: Costa Rica, Team ID: 4756


In [2]:
import requests

cookies = {
    '_gcl_au': '1.1.1568534472.1718898726',
    '_ga': 'GA1.1.753054649.1718898726',
    'logglytrackingsession': '0859cf9a-b937-4777-a97b-33db13738b1f',
    'exco-uid': 'pkcl3v8vk2xheagq',
    '_li_dcdm_c': '.sofascore.com',
    '_lc2_fpi': 'a78faec1e09d--01hwdb32mq14v3sfdf4v0t6ae4',
    '_lc2_fpi_meta': '{%22w%22:1714139794071}',
    '_lr_retry_request': 'true',
    '_lr_env_src_ats': 'false',
    'idl_env': 'AiNo502mm94sIq0tXUIsriVtawU5YOf1Xr9oJNbxf4-CJ0Y8RFz3X6Qnr3A',
    'idl_env_cst': 'Ryz5LF8s6Q%3D%3D',
    'idl_env_last': 'Mon%2C%2024%20Jun%202024%2013%3A51%3A10%20GMT',
    'panoramaId_expiry': '1719841870734',
    '_cc_id': '813f38570303168e86ec72223d5eeefb',
    'panoramaId': 'a26edf51218aaf1f35e0d11bbb0b4945a7027a866d919b59238dfd459c872938',
    'pbjs-unifiedid': '%7B%22TDID%22%3A%229beb3b12-eaf4-4cb8-b2d4-89709156f99d%22%2C%22TDID_LOOKUP%22%3A%22TRUE%22%2C%22TDID_CREATED_AT%22%3A%222024-05-24T13%3A51%3A10%22%7D',
    'pbjs-unifiedid_cst': 'Ryz5LF8s6Q%3D%3D',
    'FCNEC': '%5B%5B%22AKsRol-4WAfbZykEh_7TQugFgxIJPVDcb48N1h3bx3voDeMfQFIkw7zsu4V3wx2xm1kxSttFfPK3gr0jiyQJlUcmjH04zYq1yVlSeulnTolqMEjKPh_NzW3QX8_z24MeijFSZhXd7wXuzxepyCKxmAeGzjKmfpVLvg%3D%3D%22%5D%5D',
    '__gads': 'ID=a73a314369da22b5:T=1718898728:RT=1719239219:S=ALNI_MYnexAAhM-Zz3TFQDa_4fBe-aT44g',
    '__gpi': 'UID=00000e35a54504a0:T=1718898728:RT=1719239219:S=ALNI_Mb7QnjnET63s0lNLdkqEwPXB3UxHw',
    '__eoi': 'ID=b6d813e372124385:T=1718898728:RT=1719239219:S=AA-AfjbLAnfv1PNEIfNVvkzwjD4g',
    '_ga_QH2YGS7BB4': 'GS1.1.1719237052.2.1.1719239292.0.0.0',
    '_ga_3KF4XTPHC4': 'GS1.1.1719237052.2.1.1719239292.23.0.0',
    '_ga_HNQ9P9MGZR': 'GS1.1.1719237052.2.1.1719239292.20.0.0',
}

headers = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9,es;q=0.8',
    'cache-control': 'max-age=0',
    # 'cookie': '_gcl_au=1.1.1568534472.1718898726; _ga=GA1.1.753054649.1718898726; logglytrackingsession=0859cf9a-b937-4777-a97b-33db13738b1f; exco-uid=pkcl3v8vk2xheagq; _li_dcdm_c=.sofascore.com; _lc2_fpi=a78faec1e09d--01hwdb32mq14v3sfdf4v0t6ae4; _lc2_fpi_meta={%22w%22:1714139794071}; _lr_retry_request=true; _lr_env_src_ats=false; idl_env=AiNo502mm94sIq0tXUIsriVtawU5YOf1Xr9oJNbxf4-CJ0Y8RFz3X6Qnr3A; idl_env_cst=Ryz5LF8s6Q%3D%3D; idl_env_last=Mon%2C%2024%20Jun%202024%2013%3A51%3A10%20GMT; panoramaId_expiry=1719841870734; _cc_id=813f38570303168e86ec72223d5eeefb; panoramaId=a26edf51218aaf1f35e0d11bbb0b4945a7027a866d919b59238dfd459c872938; pbjs-unifiedid=%7B%22TDID%22%3A%229beb3b12-eaf4-4cb8-b2d4-89709156f99d%22%2C%22TDID_LOOKUP%22%3A%22TRUE%22%2C%22TDID_CREATED_AT%22%3A%222024-05-24T13%3A51%3A10%22%7D; pbjs-unifiedid_cst=Ryz5LF8s6Q%3D%3D; FCNEC=%5B%5B%22AKsRol-4WAfbZykEh_7TQugFgxIJPVDcb48N1h3bx3voDeMfQFIkw7zsu4V3wx2xm1kxSttFfPK3gr0jiyQJlUcmjH04zYq1yVlSeulnTolqMEjKPh_NzW3QX8_z24MeijFSZhXd7wXuzxepyCKxmAeGzjKmfpVLvg%3D%3D%22%5D%5D; __gads=ID=a73a314369da22b5:T=1718898728:RT=1719239219:S=ALNI_MYnexAAhM-Zz3TFQDa_4fBe-aT44g; __gpi=UID=00000e35a54504a0:T=1718898728:RT=1719239219:S=ALNI_Mb7QnjnET63s0lNLdkqEwPXB3UxHw; __eoi=ID=b6d813e372124385:T=1718898728:RT=1719239219:S=AA-AfjbLAnfv1PNEIfNVvkzwjD4g; _ga_QH2YGS7BB4=GS1.1.1719237052.2.1.1719239292.0.0.0; _ga_3KF4XTPHC4=GS1.1.1719237052.2.1.1719239292.23.0.0; _ga_HNQ9P9MGZR=GS1.1.1719237052.2.1.1719239292.20.0.0',
    'priority': 'u=1, i',
    'referer': 'https://www.sofascore.com/team/football/colombia/4820',
    'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'x-requested-with': 'af6fcd',
}

# Initialize an empty DataFrame to store all match data
all_matches_df = pd.DataFrame()

# Iterate over each team in the DataFrame
for index, row in copa_america_teams_df.iterrows():
    team_id = row['Team ID']
    url = f'https://www.sofascore.com/api/v1/team/{team_id}/performance'

    # Fetch data for the current team
    response = requests.get(url, cookies=cookies, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        last_matches_data = response.json()

        # Extracting match details
        matches = []
        for event in last_matches_data['events']:
            home_team = event['homeTeam']['name']
            away_team = event['awayTeam']['name']
            goals_home = event['homeScore']['current']
            goals_away = event['awayScore']['current']
            event_name = event['tournament']['name']
            event_date = pd.to_datetime(event['startTimestamp'], unit='s').strftime('%Y-%m-%d')

            if goals_home > goals_away:
                result = 'Home Win'
                points = 3
            elif goals_home < goals_away:
                result = 'Away Win'
                points = 0
            else:
                result = 'Draw'
                points = 1

            matches.append({
                'Event': event_name,
                'Home': home_team,
                'Away': away_team,
                'Goals Home': goals_home,
                'Goals Away': goals_away,
                'Result': result,
                'Event Date': event_date,
                'Points': points
            })

        # Creating DataFrame for the current team
        team_matches_df = pd.DataFrame(matches)

        # Append the current team's DataFrame to the overall DataFrame
        all_matches_df = pd.concat([all_matches_df, team_matches_df], ignore_index=True)

    else:
        print(f"Request failed for team {row['Team Name']} with status code {response.status_code}")

# Remove duplicates from the combined DataFrame
all_matches_df = all_matches_df.drop_duplicates()

all_matches_df.to_excel('./data/previous_matches.xlsx', index=False)

# Display the combined DataFrame
print(all_matches_df)


                                                 Event        Home  \
0                    World Cup Qualification, CONMEBOL     Bolivia   
1                    World Cup Qualification, CONMEBOL   Argentina   
2                    World Cup Qualification, CONMEBOL        Peru   
3                    World Cup Qualification, CONMEBOL   Argentina   
4                    World Cup Qualification, CONMEBOL      Brazil   
..                                                 ...         ...   
150                                Int. Friendly Games  Costa Rica   
153                                Int. Friendly Games  Costa Rica   
154   CONCACAF Nations League, Copa America Qualifying  Costa Rica   
157  World Cup Qualification, CONCACAF 2nd round, G...  Costa Rica   
158  World Cup Qualification, CONCACAF 2nd round, G...     Grenada   

                      Away  Goals Home  Goals Away    Result  Event Date  \
0                Argentina           0           3  Away Win  2023-09-12   
1      

In [3]:
# Calculate metrics for each team based on the last 8 games
metrics = []
for team_name in copa_america_teams_df['Team Name']:
    
    team_matches = all_matches_df[(all_matches_df['Home'] == team_name) | (all_matches_df['Away'] == team_name)].tail(8)
    
    points_per_game = team_matches['Points'].sum() / len(team_matches)
    win_rate = len(team_matches[team_matches['Points'] == 3]) / len(team_matches)
    goals_scored = team_matches.apply(lambda x: x['Goals Home'] if x['Home'] == team_name else x['Goals Away'], axis=1).sum()
    goals_conceded = team_matches.apply(lambda x: x['Goals Away'] if x['Home'] == team_name else x['Goals Home'], axis=1).sum()
    goal_difference = goals_scored - goals_conceded

    metrics.append({
        'Team': team_name,
        'Points Per Game': points_per_game,
        'Win Rate': win_rate,
        'Goals Scored': goals_scored / len(team_matches),
        'Goals Conceded': goals_conceded / len(team_matches),
        'Goal Difference': goal_difference / len(team_matches)
    })

# Create a DataFrame for the metrics
metrics_df = pd.DataFrame(metrics)


In [4]:
# Change 'USA' to 'United States' in the DataFrame
metrics_df['Team'] = metrics_df['Team'].replace({'USA': 'United States'})


In [5]:
# Display the metrics DataFrame
print(metrics_df)

# Export to Excel
metrics_df.to_excel('./data/team_metrics.xlsx', index=False)

             Team  Points Per Game  Win Rate  Goals Scored  Goals Conceded  \
0       Argentina            1.500     0.500         2.000           0.500   
1          Brazil            1.125     0.250         1.375           1.375   
2         Uruguay            1.500     0.375         2.125           0.875   
3        Colombia            1.250     0.375         2.125           0.625   
4   United States            2.625     0.875         2.500           1.000   
5          Mexico            2.000     0.625         1.875           1.375   
6         Ecuador            2.000     0.625         1.375           0.875   
7           Chile            1.875     0.500         1.250           0.750   
8       Venezuela            1.500     0.375         1.000           0.625   
9          Canada            2.000     0.625         1.875           2.375   
10       Paraguay            1.500     0.375         0.250           0.625   
11           Peru            2.000     0.500         1.000      

## Getting Team Overall rating from FC24

In [15]:
import requests
import re
import json

def extract_json_from_html(url, save_output=False):
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    })
    
    response = session.get(url)
    if response.status_code != 200:
        return "Failed to retrieve the HTML content. Status code: {}".format(response.status_code)

    html = response.text
    regex_pattern = r'(?<=require\.config\.params\["args"\].=.)[\s\S]*?;'
    match = re.findall(regex_pattern, html)
    if not match:
        return "No match found"

    # Correcting data format by adding quotes around keys
    data_txt = match[0]
    keys_to_replace = ['matchId', 'matchCentreData', 'matchCentreEventTypeJson', 'formationIdNameMappings']
    for key in keys_to_replace:
        data_txt = data_txt.replace(key, f'"{key}"')
    data_txt = data_txt.replace('};', '}')

    # Try to parse the JSON data
    try:
        data_json = json.loads(data_txt)
    except json.JSONDecodeError:
        return "JSON decoding failed"

    # Print the JSON structure summarization
    print_json_structure(data_json)

    # Optionally, save the JSON data to a text file
    if save_output:
        output_filename = 'output.txt'
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            json.dump(data_json, output_file, indent=4)

    return data_json

def print_json_structure(data, indent=0, max_list_example=3):
    for key, value in data.items():
        if isinstance(value, dict):
            print('  ' * indent + f"{key} (dict with {len(value.keys())} keys)")
            print_json_structure(value, indent + 1)
        elif isinstance(value, list):
            print('  ' * indent + f"{key} (list with {len(value)} items)")
            if value:
                if isinstance(value[0], dict):
                    print('  ' * (indent + 1) + f"Example item from list:")
                    print_json_structure(value[0], indent + 2)
                else: 
                    example_items = value[:max_list_example]
                    print('  ' * (indent + 1) + f"Example items: {example_items}")
        else:
            print('  ' * indent + f"{key} ({type(value).__name__})")

# Example usage
url = "https://sofifa.com/teams?keyword=brazil&type=history"
data_json = extract_json_from_html(url, save_output=True)

print(data_json)


Failed to retrieve the HTML content. Status code: 403
