In [2]:
# import libraries for data manipulation
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup

# to suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
def generate_seasons_years(from_date, to_date):
    """Build the list of season labels between two start years.

    One label of the form "<year>-<year + 1>" is produced for every start
    year in ``range(from_date, to_date)``; ``to_date`` itself is therefore
    never used as a start year.

    Args:
        from_date: First start year (int).
        to_date: Exclusive upper bound for the start year (int).

    Returns:
        List of season label strings in ascending order.
    """
    return [f"{start_year}-{start_year + 1}" for start_year in range(from_date, to_date)]

In [4]:
def fill_data(global_dict, country, season, goals, empty_seasons_dictionary=None):
    """Add ``goals`` to the running total of ``country`` for ``season``.

    ``global_dict`` maps country -> {season: goals} and is updated in place.
    A country seen for the first time is initialised with a copy of
    ``empty_seasons_dictionary`` so that the template itself is never mutated.

    Args:
        global_dict: Nested totals dictionary to update.
        country: Country key.
        season: Season key inside the country's dictionary.
        goals: Number of goals to add.
        empty_seasons_dictionary: Optional template ({season: 0}) used when
            the country is new. When omitted, the country starts with an
            empty dictionary instead of failing on ``None.copy()``.

    Returns:
        None. ``global_dict`` is modified in place.
    """
    if country not in global_dict:
        template = {} if empty_seasons_dictionary is None else empty_seasons_dictionary
        global_dict[country] = template.copy()

    season_totals = global_dict[country]
    # .get() lets a season that is absent from the template start at zero.
    season_totals[season] = season_totals.get(season, 0) + goals

In [5]:
def parse_and_fill(global_dict, url, season, empty_seasons_dictionary, timeout=30):
    """Scrape one goal-scorer page and add its goals to the per-country totals.

    The page at ``url`` is downloaded, the first table with CSS class
    ``standard_tabelle`` is located, and every row after the first is read:
    the fourth cell is taken as the country and the leading number of the
    sixth cell as the goal count. Only countries listed in
    ``cumulative_25_list`` are accumulated (via ``fill_data``).

    Args:
        global_dict: Nested {country: {season: goals}} dictionary, updated in place.
        url: Page to download.
        season: Season key under which the goals are accumulated.
        empty_seasons_dictionary: Template passed to ``fill_data`` for new countries.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status.
        ValueError: If the expected table is missing from the page, or a
            tracked country's goal cell does not start with an integer.
    """
    cumulative_25_list = ["Germany", "England", "France", "Spain", "Italy",
                          "Argentina", "Brazil", "Scotland", "Netherlands", "Uruguay",
                          "Croatia", "Wales", "Ireland", "Serbia", "Portugal",
                          "Denmark", "Belgium", "Senegal", "Sweden", "Poland",
                          "Ivory Coast", "Algeria", "Nigeria", "Bosnia-Herzegovina", "Austria"]
    # Step 1: Fetch the webpage content. Without a timeout a stalled
    # connection would block the whole scraping run indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Step 2: Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 3: Locate the table containing the data (goal scorers, etc.)
    table = soup.find('table', class_='standard_tabelle')
    if table is None:
        raise ValueError(f"No 'standard_tabelle' table found at {url}")

    # Step 4: Extract the data
    rows = table.find_all('tr')
    for row in rows[1:]:  # Skip the header row
        cols = [col.text.strip() for col in row.find_all('td')]
        if len(cols) < 6:
            # Rows without the country/goals cells carry no data to accumulate.
            continue
        country = cols[3]
        if country not in cumulative_25_list:
            continue
        # Only the leading token of the goals cell is parsed as the count.
        goals = int(cols[5].rsplit(" ")[0])
        fill_data(global_dict, country, season, goals, empty_seasons_dictionary)

In [6]:
def get_urls(league, season):
    """Return the goal-scorer page URL(s) for one league and season.

    Most league/season pairs map to a single URL of the form
    ``<base>/<league>-<season>/``. Two seasons of "esp-primera-division"
    are handled specially:

    * "2016-2017" uses a single URL with an extra "_2" suffix.
    * "1986-1987" is spread over four separate pages.

    Args:
        league: League slug as used in the site's URLs.
        season: Season label such as "2000-2001".

    Returns:
        List of URL strings (one entry, or four for the 1986-1987 case).
    """
    base_url = 'https://www.worldfootball.net/goalgetter/'

    if league == "esp-primera-division":
        if season == "2016-2017":
            return [base_url + league + "-" + season + "_2/"]
        if season == "1986-1987":
            split_season_pages = (
                "esp-primera-division-1986-1987-playoff-1-6",
                "esp-primera-division-1986-1987-playoff-13-18",
                "esp-primera-division-1986-1987-playoff-7-12",
                "esp-primera-division-1986-1987-vorrunde",
            )
            return [base_url + page_slug + "/" for page_slug in split_season_pages]

    return [base_url + league + "-" + season + "/"]

In [7]:
def create_empty_seasons_dictionary(seasons):
    """Return a dictionary mapping every season in ``seasons`` to 0.

    Args:
        seasons: Iterable of season labels.

    Returns:
        New dictionary {season: 0} in iteration order.
    """
    return {season_label: 0 for season_label in seasons}

In [8]:
def only_top_20(full_dictionary, season):
    """Prune ``full_dictionary`` to the 20 countries with the most goals in ``season``.

    The dictionary is updated in place, so a caller that ignores the return
    value still sees the pruned result. Afterwards its keys are ordered by
    descending goals in ``season``; countries with equal goals keep their
    original relative order.

    Args:
        full_dictionary: Nested {country: {season: goals}} dictionary.
        season: Season key used for the ranking. It must exist for every country.

    Returns:
        The same ``full_dictionary`` object, now holding at most 20 countries.

    Raises:
        KeyError: If some country has no entry for ``season``.
    """
    season_goals = {country: data[season] for country, data in full_dictionary.items()}
    # Sort countries by goals in descending order and pick the top 20
    top_20_countries = sorted(season_goals.items(), key=lambda x: x[1], reverse=True)[:20]

    # Collect the survivors first, then replace the contents of the original
    # dictionary so that the caller's reference is updated.
    kept = {country: full_dictionary[country] for country, _ in top_20_countries}
    full_dictionary.clear()
    full_dictionary.update(kept)
    return full_dictionary

In [9]:
def extract_values_top_5_leagues(from_date, to_date):
    """Collect goals per country and season across the five configured leagues.

    For every season produced by ``generate_seasons_years`` and every league
    slug in the list below, the URLs from ``get_urls`` are scraped with
    ``parse_and_fill`` and accumulated into one dictionary.

    Args:
        from_date: First start year; converted with ``int()``.
        to_date: Exclusive upper bound for the start year; converted with ``int()``.

    Returns:
        Dictionary {country: {season: goals}} with countries sorted alphabetically.
    """
    season_labels = generate_seasons_years(int(from_date), int(to_date))
    zeroed_seasons = create_empty_seasons_dictionary(season_labels)
    league_slugs = ("eng-premier-league", "fra-ligue-1", "bundesliga", "ita-serie-a", "esp-primera-division")

    totals_by_country = {}
    for season_label in season_labels:
        for league_slug in league_slugs:
            for page_url in get_urls(league_slug, season_label):
                parse_and_fill(totals_by_country, page_url, season_label, zeroed_seasons)

    alphabetical_items = sorted(totals_by_country.items())
    return dict(alphabetical_items)

In [93]:
# Scrape seasons 1963-1964 through 2023-2024 (the end year is exclusive as a
# start year). Every league page is fetched over HTTP, so this cell is slow.
final_dictionary = extract_values_top_5_leagues(1963,2024)

In [97]:
# Display the nested {country: {season: goals}} result.
final_dictionary

{'Algeria': {'1963-1964': 91,
  '1964-1965': 49,
  '1965-1966': 58,
  '1966-1967': 27,
  '1967-1968': 34,
  '1968-1969': 32,
  '1969-1970': 12,
  '1970-1971': 15,
  '1971-1972': 4,
  '1972-1973': 10,
  '1973-1974': 32,
  '1974-1975': 21,
  '1975-1976': 19,
  '1976-1977': 38,
  '1977-1978': 27,
  '1978-1979': 35,
  '1979-1980': 33,
  '1980-1981': 36,
  '1981-1982': 18,
  '1982-1983': 40,
  '1983-1984': 33,
  '1984-1985': 18,
  '1985-1986': 17,
  '1986-1987': 19,
  '1987-1988': 28,
  '1988-1989': 18,
  '1989-1990': 22,
  '1990-1991': 8,
  '1991-1992': 10,
  '1992-1993': 9,
  '1993-1994': 14,
  '1994-1995': 25,
  '1995-1996': 26,
  '1996-1997': 13,
  '1997-1998': 11,
  '1998-1999': 15,
  '1999-2000': 18,
  '2000-2001': 18,
  '2001-2002': 8,
  '2002-2003': 19,
  '2003-2004': 11,
  '2004-2005': 17,
  '2005-2006': 8,
  '2006-2007': 34,
  '2007-2008': 23,
  '2008-2009': 30,
  '2009-2010': 30,
  '2010-2011': 27,
  '2011-2012': 25,
  '2012-2013': 30,
  '2013-2014': 28,
  '2014-2015': 38,
  '201

In [95]:
# Total goals per country over all seasons. The built-in sum() is used instead
# of an accumulator variable named `sum`, which would shadow the built-in for
# every cell executed afterwards in the same kernel.
cumulative_dictionary = {
    country: sum(inner_dict.values())
    for country, inner_dict in final_dictionary.items()
}
# Order the countries from most to fewest total goals.
sorted_cumulative_dictionary = dict(sorted(cumulative_dictionary.items(), key=lambda x: x[1], reverse = True))
sorted_cumulative_dictionary

{'Germany': 39164,
 'England': 37995,
 'France': 37713,
 'Spain': 34409,
 'Italy': 27814,
 'Argentina': 10598,
 'Brazil': 9390,
 'Scotland': 6809,
 'Netherlands': 3974,
 'Uruguay': 3115,
 'Croatia': 2696,
 'Wales': 2661,
 'Ireland': 2584,
 'Serbia': 2562,
 'Portugal': 2542,
 'Denmark': 2266,
 'Belgium': 2237,
 'Senegal': 2108,
 'Sweden': 1991,
 'Poland': 1985,
 'Ivory Coast': 1914,
 'Algeria': 1782,
 'Nigeria': 1758,
 'Bosnia-Herzegovina': 1623,
 'Austria': 1575}

In [1]:
# Rows destined for the CSV; filled by a later cell.
list_for_csv = []
# Column names: "Countries" followed by the season keys of the first country
# (or nothing more when final_dictionary is empty).
first_country_seasons = next(iter(final_dictionary.values()), {})
headers = ["Countries"]
headers.extend(first_country_seasons.keys())

NameError: name 'final_dictionary' is not defined

In [112]:
# One row per country: the name followed by its goals for every season.
# The list is rebuilt from scratch here rather than appended to, so that
# re-running this cell does not duplicate the rows.
list_for_csv = [
    [country, *(int(value) for value in inner_dict.values())]
    for country, inner_dict in final_dictionary.items()
]

In [146]:
# NOTE: np.array() on rows that mix a str with ints yields an all-string array.
# The name is kept in case later cells use it, but the frame below is built
# from the original rows so the goal columns keep their integer dtype.
np_list = np.array(list_for_csv)
df = pd.DataFrame(list_for_csv, columns=headers)
# Total goals over all season columns (everything after the country name).
df['sum'] = df.iloc[:, 1:].sum(axis=1)
# Rank countries by total goals and renumber the index to match the ranking.
df = df.sort_values(by="sum", ascending = False).reset_index(drop = True)
# df.to_csv('top_5_leagues_25_countries_cumulative.csv', index=False)