In [16]:
# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import libraries for statistical calculations
import scipy.stats as stats

import requests
from bs4 import BeautifulSoup

# to suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [17]:
def generate_seasons_years(from_date, to_date):
    """Return season labels of the form "YYYY-YYYY" for consecutive start years.

    One label is produced for every start year in ``range(from_date, to_date)``,
    so ``to_date`` itself is never used as a start year. Each label joins the
    start year and the following year with a hyphen.

    Args:
        from_date: First start year (integer, inclusive).
        to_date: Stop year (integer, exclusive).

    Returns:
        A list of strings in ascending order; empty when the range is empty.
    """
    return [
        "-".join((str(start_year), str(start_year + 1)))
        for start_year in range(from_date, to_date)
    ]

In [22]:
def fill_dictionary(global_dict, url, season, timeout=30):
    """Download one goal-scorer page and add its goals, per country, to ``global_dict``.

    The page is expected to contain a ``<table class="standard_tabelle">``.
    For every data row, the 4th cell is read as the country and the text
    before the first space of the 6th cell is parsed as the goal count.

    Args:
        global_dict: Nested mapping ``{season: {country: goals}}``. It is
            updated in place; goals are added to any existing total.
        url: Address of the page to download.
        season: Key of ``global_dict`` under which the goals are accumulated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: If the download fails, times
            out, or the server answers with an HTTP error status.
        ValueError: If the page has no ``standard_tabelle`` table, or a goals
            cell does not start with an integer.
    """
    # Step 1: Fetch the webpage content.
    # Without a timeout a single stalled connection would block the whole scrape.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Step 2: Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 3: Locate the table containing the data (goal scorers, etc.)
    table = soup.find('table', class_='standard_tabelle')
    if table is None:
        # find() returns None when nothing matches; report which page was the problem.
        raise ValueError("No 'standard_tabelle' table found at " + url)

    # Step 4: Extract the data
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < 6:
            # Not a data row (e.g. a row without enough <td> cells); nothing to count.
            continue
        country = cells[3]
        # Only the leading number of the cell is the goal count.
        goals = int(cells[5].split(" ", 1)[0])
        season_totals = global_dict.setdefault(season, {})
        season_totals[country] = season_totals.get(country, 0) + goals

In [23]:
def sorted_dictionary(dictionary_to_be_sorted, top_n=20):
    """Rank every inner dictionary by value and keep only its largest entries.

    Each inner dictionary is replaced by a new one whose items are ordered
    from the highest to the lowest value and cut off after ``top_n`` items.
    Items with equal values keep their original relative order. The order of
    the outer keys is unchanged.

    Args:
        dictionary_to_be_sorted: Mapping ``{outer_key: {inner_key: value}}``.
            It is modified in place (the inner dictionaries are replaced).
        top_n: Maximum number of items kept per inner dictionary. ``None``
            keeps all items. Defaults to 20.

    Returns:
        The same ``dictionary_to_be_sorted`` object, after modification.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        # A negative slice bound would silently drop items from the end instead.
        raise ValueError("top_n must be non-negative or None")

    # Re-assigning values of existing keys does not resize the dict, so it is
    # safe to do while iterating over it.
    for outer_key in dictionary_to_be_sorted:
        ranked_items = sorted(
            dictionary_to_be_sorted[outer_key].items(),
            key=lambda item: item[1],
            reverse=True,
        )
        dictionary_to_be_sorted[outer_key] = dict(ranked_items[:top_n])
    return dictionary_to_be_sorted

In [25]:
def extract_values_top_5_leagues(from_date, to_date):
    """Collect goals per country and season across five league scorer pages.

    For every season label produced by ``generate_seasons_years`` and every
    league listed below, the matching page under the base URL is passed to
    ``fill_dictionary``. The result is then handed to ``sorted_dictionary``.

    Args:
        from_date: First season start year; converted with ``int``.
        to_date: Stop year (exclusive); converted with ``int``.

    Returns:
        Whatever ``sorted_dictionary`` returns for the accumulated
        ``{season: {country: goals}}`` mapping.
    """
    base_url = 'https://www.worldfootball.net/goalgetter/'
    leagues = ["eng-premier-league", "fra-ligue-1", "bundesliga", "ita-serie-a", "esp-primera-division"]

    # Pages whose address does not follow the usual "<league>-<season>" pattern.
    # Each entry lists every page that has to be read for that league/season.
    page_overrides = {
        ("esp-primera-division", "2016-2017"): [
            "esp-primera-division-2016-2017_2",
        ],
        ("esp-primera-division", "1986-1987"): [
            "esp-primera-division-1986-1987-playoff-1-6",
            "esp-primera-division-1986-1987-playoff-13-18",
            "esp-primera-division-1986-1987-playoff-7-12",
            "esp-primera-division-1986-1987-vorrunde",
        ],
    }

    goals_per_nation_and_year = {}
    for season in generate_seasons_years(int(from_date), int(to_date)):
        for league in leagues:
            regular_page = [league + "-" + season]
            for page in page_overrides.get((league, season), regular_page):
                fill_dictionary(goals_per_nation_and_year, base_url + page + "/", season)
    return sorted_dictionary(goals_per_nation_and_year)

In [26]:
# Build the per-season goal totals for seasons 1963-1964 through 2023-2024
# (the second argument is an exclusive stop year).
# NOTE: this downloads at least one web page per league and season, so the
# cell needs network access and takes a while to run.
final_dictionary = extract_values_top_5_leagues(1963,2024)

In [27]:
final_dictionary

{'1963-1964': {'England': 1132,
  'Germany': 830,
  'France': 721,
  'Spain': 495,
  'Italy': 373,
  'Scotland': 266,
  'Brazil': 139,
  'Argentina': 129,
  'Algeria': 91,
  'Wales': 80,
  'Paraguay': 59,
  'Hungary': 45,
  'Ireland': 35,
  'Northern Ireland': 26,
  'Sweden': 24,
  'Denmark': 23,
  'Cameroon': 18,
  'Austria': 18,
  'USA': 15,
  'Uruguay': 15},
 '1964-1965': {'England': 1061,
  'Germany': 760,
  'France': 701,
  'Spain': 495,
  'Italy': 457,
  'Scotland': 337,
  'Brazil': 103,
  'Argentina': 92,
  'Paraguay': 65,
  'Algeria': 49,
  'Northern Ireland': 46,
  'Ireland': 41,
  'Wales': 39,
  'Peru': 24,
  'Hungary': 19,
  'Sweden': 18,
  'Switzerland': 16,
  'Denmark': 16,
  'Angola': 13,
  'Austria': 12},
 '1965-1966': {'France': 967,
  'England': 953,
  'Germany': 953,
  'Spain': 499,
  'Italy': 466,
  'Scotland': 324,
  'Brazil': 107,
  'Argentina': 104,
  'Northern Ireland': 64,
  'Wales': 59,
  'Algeria': 58,
  'Paraguay': 45,
  'Uruguay': 26,
  'Ireland': 23,
  'Den