## **English Football Dataset Creation (with Python and BeautifulSoup)**

### **Import Libraries**

In [1]:
import requests # pip install requests
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4


### **Get Club Info**

In [2]:
def _clean_text(text):
    """Replace non-breaking spaces and drop space-padded BOM / literal ``&nbsp;`` residue."""
    return text.replace("\xa0", " ").replace(" \ufeff ", "").replace("&nbsp;", "")


def get_content_value(row_data):
    """Extract the cleaned text content of an infobox cell.

    Args:
        row_data: A BeautifulSoup tag (the ``<td>`` of an infobox row).

    Returns:
        * a list with one cleaned string per nested ``<td>`` when the cell
          contains ``<td>`` elements;
        * otherwise a list with one cleaned string per text fragment when the
          cell contains a ``<br>``;
        * otherwise the cell's whole text as a single cleaned string.
    """
    if row_data.find("td"):
        return [_clean_text(td.get_text(" ", strip=True)) for td in row_data.find_all("td")]
    if row_data.find("br"):
        # The fragments get the same clean-up as the other branches so list
        # values do not keep "\xa0" while plain values have it replaced.
        return [_clean_text(text) for text in row_data.stripped_strings]
    return _clean_text(row_data.get_text(" ", strip=True))
    
def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from *soup* in place.

    ``decompose()`` deletes the element together with everything inside it,
    so any text wrapped in those tags is removed as well. Returns ``None``.
    """
    unwanted = soup.find_all(["sup", "span"])
    for element in unwanted:
        element.decompose()
        
def get_info_box(url, timeout=30):
    """Scrape the infobox of a Wikipedia page into a dictionary.

    Downloads *url*, finds the first element with class ``infobox vcard`` and
    maps each row's header text (``<th>``) to the value that
    ``get_content_value`` extracts from the row's ``<td>``.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang a long scraping run.

    Returns:
        dict mapping header text to a string or a list of strings.

    Raises:
        requests.RequestException: if the download fails or times out.
        AttributeError: if the page has no ``infobox vcard`` element.
    """
    r = requests.get(url, timeout=timeout)
    # NOTE: no parser is named, so BeautifulSoup picks the best one installed;
    # pin one (e.g. "html.parser") if results must match across machines.
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vcard")
    info_rows = info_box.find_all("tr")

    # decompose() edits the tree in place, so the rows collected above are
    # cleaned as well.
    clean_tags(soup)

    club_info = {}

    # The first row is skipped (presumably the infobox title row).
    for row in info_rows[1:]:
        header = row.find("th")
        cell = row.find("td")
        # Rows without both a header and a data cell carry no key/value pair.
        if header is None or cell is None:
            continue
        club_info[header.get_text(" ", strip=True)] = get_content_value(cell)

    return club_info

In [23]:
# Try the scraper on a single club page (performs a live HTTP request).
# NOTE(review): 'Full name' and 'Website' are empty in the recorded output —
# presumably their text sits inside <span> tags that clean_tags removes; confirm.
get_info_box("https://en.wikipedia.org/wiki/Hamworthy_United_F.C.")

{'Full name': '',
 'Nickname(s)': 'The Hammers',
 'Founded': '1970',
 'Ground': 'The County Ground, Hamworthy',
 'Capacity': '2,000',
 'Chairman': 'Steve Mitchener',
 'Manager': 'Tim Sills',
 'League': 'Wessex League Premier Division',
 '2020–21': 'Wessex League Premier Division (season curtailed)',
 'Website': ''}

### **Get Club List**

In [4]:
from urllib.parse import urljoin

# Download the index page that links to the individual club articles.
r = requests.get("https://en.wikipedia.org/wiki/List_of_football_clubs_in_England", timeout=30)
# Fail loudly on an HTTP error instead of silently building an empty dataset.
r.raise_for_status()

# Convert to a beautiful soup object
soup = bs(r.content)
# Every link inside a wikitable is a candidate page.
clubs = soup.select(".wikitable a")

base_path = "https://en.wikipedia.org/"

club_info_list = []
# (url, error) pairs for pages that could not be scraped, kept for inspection.
failed_pages = []
for club in clubs:
    relative_path = club.get('href')
    # Only anchors carrying both an href and a title are followed.
    if relative_path is None or club.get('title') is None:
        continue

    # urljoin avoids the double slash produced by plain concatenation
    # ("https://en.wikipedia.org/" + "/wiki/...") and leaves absolute hrefs intact.
    full_path = urljoin(base_path, relative_path)

    try:
        club_info_list.append(get_info_box(full_path))
    except Exception as e:
        # Best effort: a page without an infobox or a failed download must not
        # abort the whole run, but the failure is recorded rather than hidden.
        failed_pages.append((full_path, repr(e)))

print(f"Scraped {len(club_info_list)} pages, skipped {len(failed_pages)}")


### **Get Full Capacity and Seated Capacity as Integers**

In [5]:
# Inspect the raw 'Capacity' strings before parsing; clubs without that key show as 'N/A'.
print([club.get('Capacity', 'N/A') for club in club_info_list])

['N/A', '3,000 (500 seated)', '1,200', 'N/A', '1,976 (196 seated)', '2,920 (150 seated)', 'N/A', '3,000 (554 seated)', '2,500 (250 seated)', '1,500', '2,500 (200 seated)', 'N/A', '3,500 (216 seated)', '6,500 (5,419 seated)', 'N/A', '23,287', '5,000 (250 seated)', 'N/A', '5,045 (1,000 seated)', '2,000 (100 seated)', '4,000 (160 seated)', '3,000 (240 seated)', 'N/A', '2,500 (256 seated)', '1,600', '4,250 (250 seated)', '2,000 (400 seated)', 'N/A', '2,000', 'N/A', '3,528 (1,006 seated)', 'N/A', 'N/A', '4,000 (120 seated)', '1,200', '3,000', 'N/A', 'N/A', '3,000 (300 seated)', '3,000 (300 seated)', '3,000 (300 seated)', '2,650 (500 seated)', '2,650 (500 seated)', '2,100 (250 seated)', '4,100 (500 seated)', '2,500 (170 seated)', 'N/A', 'N/A', 'N/A', '6,000 (375 seated)', '3,000 (300 seated)', '3,000 (300 seated)', '2,000 (260 seated)', '5,000 (2,000 seated)', '1,970 (370 seated)', '3,000 (173 seated)', 'N/A', 'N/A', 'N/A', 'N/A', '29,409', 'N/A', 'N/A', '2,004 (500 seated)', '1,500 (100 sea

In [6]:
from re import search

In [42]:
def full_capacity_to_integer(capacity):
    """Parse the total capacity out of a capacity string.

    The first space-separated token is read as the total, with thousands
    separators removed: ``"3,000 (500 seated)"`` -> ``3000``.

    Args:
        capacity: Raw capacity text, or ``None`` when the club has none.

    Returns:
        The total capacity as an ``int``, or ``None`` when *capacity* is not a
        string or its first token is not a number (e.g. ``"N/A"`` or ``""``).
    """
    if not isinstance(capacity, str):
        return None

    first_token = capacity.partition(" ")[0].replace(",", "")
    try:
        return int(first_token)
    except ValueError:
        # Placeholder or free text instead of a number.
        return None

def seated_capacity_to_integer(capacity):
    """Parse the seated capacity out of a capacity string.

    The second space-separated token is read as the seated figure, with the
    opening parenthesis and thousands separators removed:
    ``"6,500 (5,419 seated)"`` -> ``5419``.

    Args:
        capacity: Raw capacity text, or ``None`` when the club has none.

    Returns:
        The seated capacity as an ``int``, or ``None`` when *capacity* is not
        a string, has no second token, or that token is not a number.
    """
    if not isinstance(capacity, str):
        return None

    tokens = capacity.split(" ")
    if len(tokens) < 2:
        # Only a total was given, e.g. "1,200".
        return None

    # NOTE(review): the word after the number is not checked, so any
    # "<total> (<number> ...)" text is read as a seated figure.
    try:
        return int(tokens[1].replace(",", "").replace("(", ""))
    except ValueError:
        return None



In [43]:
# Add integer 'Full Capacity' / 'Seated Capacity' fields to every club record.
for club in club_info_list:
    # None (not a placeholder string) marks a missing capacity, which both
    # converters map to None.
    capacity = club.get('Capacity')
    for key, convert in (
        ('Full Capacity', full_capacity_to_integer),
        ('Seated Capacity', seated_capacity_to_integer),
    ):
        try:
            club[key] = convert(capacity)
        except (ValueError, AttributeError):
            # The converters may raise ValueError (non-numeric text) or
            # AttributeError (value is a list rather than a string). Store
            # None so every record ends up with the same keys.
            club[key] = None

In [44]:
# Check the parsed values: one entry per club, 'N/A' only where the key was never set.
print([club.get('Seated Capacity', 'N/A') for club in club_info_list])

[None, 500, None, None, 196, 150, None, 554, 250, None, 200, None, 216, 5419, None, None, 250, None, 1000, 100, 160, 240, None, 256, None, 250, 400, None, None, None, 1006, None, None, 120, None, None, None, None, 300, 300, 300, 500, 500, 250, 500, 170, None, None, None, 375, 300, 300, 260, 2000, 370, 173, None, None, None, None, None, None, None, 500, 100, None, None, 525, None, 100, None, None, 180, None, 100, 556, None, 400, 367, None, None, 230, None, None, 1700, None, 450, None, 90, None, None, None, None, None, 500, None, 600, None, None, None, 1800, 100, 553, None, 200, None, 50, None, 128, 740, None, 180, 370, None, 100, None, None, None, None, 350, 150, None, 200, 3000, None, None, None, None, None, 200, None, 1606, 2017, None, None, None, 120, 408, None, None, 156, None, None, 2034, None, 350, 300, None, 490, None, None, 78, 196, None, 4376, None, 150, None, 200, 500, None, None, None, None, None, 240, None, None, None, 220, None, None, None, None, None, 500, None, None, None

### **Save/Reload Club Info**

In [19]:
import json

In [20]:
def save_data(title, data):
    """Write *data* as JSON to the file named *title*.

    The file is UTF-8 encoded, indented by two spaces, and keeps non-ASCII
    characters as they are rather than escaping them.
    """
    with open(title, mode='w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(data, indent=2, ensure_ascii=False))

In [21]:
def load_data(title):
    """Read the UTF-8 JSON file named *title* and return the decoded object."""
    with open(title, mode='r', encoding="utf-8") as source:
        raw = source.read()
    return json.loads(raw)

In [22]:
# Persist the cleaned records so they can be restored with load_data() without re-scraping.
save_data("club_data_cleaned.json", club_info_list)