In [10]:
import csv
import os
import re

import requests
from bs4 import BeautifulSoup as bs

In [49]:
def get_history(country, from_date='20170924'):
    """Locate a revision link for a country's national-team Wikipedia article.

    The article title differs between countries, so each known title suffix
    is tried in order and the first one whose history page yields a usable
    revision link wins.

    Args:
        country: Country name, inserted verbatim into the ``title`` query
            parameter (the caller in this notebook passes it with spaces
            already replaced by underscores).
        from_date: Value sent as the history page's ``offset`` query
            parameter.

    Returns:
        The revision ``href`` produced by ``get_revision_link`` for the first
        title variant that works, or ``False`` when none of them does.
    """
    # Title variants, in the order they are attempted.
    title_suffixes = (
        'national_soccer_team',
        'national_football_team',
        'men%27s_national_soccer_team',
    )

    for suffix in title_suffixes:
        url = (
            'https://en.wikipedia.org/w/index.php'
            f'?title={country}_{suffix}&offset={from_date}&action=history'
        )
        try:
            # Without a timeout a single stalled connection blocks the whole run.
            page = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Treat a network failure like a missing page and try the next title.
            print(f'Request failed for {url}: {exc}')
            continue

        if page.status_code != 200:
            continue

        res = get_revision_link(page)
        if res:
            return res

    return False

In [50]:
def get_revision_link(page, min_year=2017):
    """Extract the link of the first revision listed on a history page.

    Args:
        page: Response-like object whose ``content`` attribute holds the HTML
            of a Wikipedia history page.
        min_year: Oldest acceptable year for the first listed revision. An
            older revision suggests the wrong article was fetched.

    Returns:
        The ``href`` attribute of the first revision's date link, or ``False``
        when the history list, its first entry or the date link is missing,
        when the year cannot be read from the link text, or when that year is
        earlier than ``min_year``.
    """
    soup = bs(page.content, 'html.parser')

    page_history = soup.select('ul#pagehistory')
    if len(page_history) < 1:
        return False

    first_history = page_history[0].find('li')
    if not first_history:
        return False

    revision_link = first_history.select('a.mw-changeslist-date')
    if len(revision_link) < 1:
        return False

    revision_data = revision_link[0]

    # The year is taken from the last space-separated token of the link text.
    # Strip first so trailing whitespace does not leave an empty last token.
    year_text = revision_data.text.strip().split(' ')[-1]
    try:
        revision_year = int(year_text)
    except ValueError:
        # Unreadable date: we cannot confirm the revision is recent enough.
        return False

    # If the revision isn't very recent, we're possibly at the wrong page
    if revision_year < min_year:
        return False

    return revision_data['href']

In [63]:
def _normalize_player_row(cols):
    """Turn the seven stripped cell texts of a squad row into an 8-field record.

    The fourth cell is split into two output fields: a ``YYYY-MM-DD`` date and
    the number that follows a non-breaking space. When either pattern is
    absent, the raw cell text is kept in that field and 'Match failed' is
    printed.
    """
    number, position, name, age_cell, caps, goals, club = cols

    # Remove number from position
    position = re.sub(r"[0-9]", "", position)

    # Remove captain/vice-captain from name (a trailing "(...)" token)
    name_parts = name.split(' ')
    if name_parts[-1].startswith('('):
        name = ' '.join(name_parts[:-1])

    # Extract age
    age_match = re.search(r"\xa0([0-9]+)", age_cell)
    if age_match:
        age = age_match.group(1)
    else:
        age = age_cell
        print('Match failed')

    # Extract birthday
    birthday_match = re.search(r"([0-9]{4}-[0-9]{2}-[0-9]{2})", age_cell)
    if birthday_match:
        birthday = birthday_match.group()
    else:
        birthday = age_cell
        print('Match failed')

    return [number, position, name, birthday, age, caps, goals, club]


def extract_players(page):
    """Parse the "Current squad" table of a team article into player records.

    Args:
        page: Response-like object whose ``content`` attribute holds the HTML
            of the article.

    Returns:
        A list of 8-item lists, one per accepted table row, or ``False`` when
        the "Current squad" heading or a table after it cannot be found. Rows
        with fewer than two cells and the header row are skipped silently.
        Other rows that do not have exactly seven cells are skipped with a
        printed notice.
    """
    soup = bs(page.content, 'html.parser')

    # Get the first table after "Current squad" heading
    squad_heading = soup.find(lambda x: x.name in ['h2', 'h3'] and x.text.startswith('Current squad'))
    if not squad_heading:
        print('Cannot find current squad')
        return False

    squad_table = squad_heading.find_next_sibling('table')
    if squad_table is None:
        # The heading may be wrapped in a container, in which case the table
        # is not its sibling. Fall back to the next table in document order.
        squad_table = squad_heading.find_next('table')
    if squad_table is None:
        print('Cannot find current squad table')
        return False

    # html.parser does not synthesize <tbody>; use the table itself if absent.
    squad_table_body = squad_table.find('tbody')
    if squad_table_body is None:
        squad_table_body = squad_table

    # Get the rows of the table and add data to all_players
    all_players = []
    for row in squad_table_body.find_all('tr'):
        cols = [c.text.strip() for c in row.find_all(['td', 'th'])]

        if len(cols) < 2:
            continue
        if len(cols) != 7:
            print('Unexpected number of columns')
            continue
        # skip header
        if cols[0] == 'No.':
            continue

        all_players.append(_normalize_player_row(cols))

    return all_players

In [67]:
def create_players_csv(country, dir='teams-2017'):
    """Fetch a country's squad from Wikipedia and write it to a CSV file.

    Args:
        country: Country name as it appears in the article title; spaces are
            replaced by underscores before it is used.
        dir: Directory that receives ``<country>.csv``. It is created if it
            does not exist. (The name shadows the builtin but is kept for
            backward compatibility.)

    Returns:
        ``True`` when the CSV was written. ``False`` when the revision link
        could not be found, the page could not be retrieved, or no players
        were extracted.
    """
    print(f'Getting players for {country}')

    country = country.replace(' ', '_')

    link = get_history(country)
    if not link:
        print('Error when finding revision history')
        return False

    # The href is already site-absolute; drop its leading slash so the final
    # URL does not contain a double slash after the host.
    url = 'https://en.wikipedia.org/' + link.lstrip('/')
    try:
        page = requests.get(url, timeout=30)
    except requests.RequestException:
        page = None
    # A Response is falsy when its status code signals an error.
    if not page:
        print('Error when retrieving page')
        return False

    all_players = extract_players(page)
    if not all_players:
        return False

    # Spaces were already turned into underscores above, so the file name is
    # the lower-cased, underscore-separated country name.
    country_basename = country.lower()
    if dir:
        os.makedirs(dir, exist_ok=True)
    # newline='' is what the csv module expects (avoids blank rows on Windows);
    # an explicit UTF-8 encoding keeps non-ASCII player names writable.
    with open(f'{dir}/{country_basename}.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['No.', 'Position', 'Player', 'DOB', 'Current age', 'Caps', 'Goals', 'Club'])
        writer.writerows(all_players)

    return True

In [65]:
def create_all_countries(countries_file='all-countries.txt', fail_log='fail-logs.txt'):
    """Create a squad CSV for every country listed in a text file.

    Args:
        countries_file: Path of a text file with one country name per line.
            Blank lines are ignored.
        fail_log: Path of the file that receives, one per line, the countries
            for which ``create_players_csv`` returned a falsy value.
    """
    failed = []
    with open(countries_file, 'r') as f:
        for line in f:
            country = line.strip()
            if not country:
                # A blank line would otherwise trigger requests for an empty title.
                continue
            if not create_players_csv(country):
                failed.append(country)

    with open(fail_log, 'w') as f:
        print('\n'.join(failed), file=f)

In [66]:
# Run the scrape for every country listed in all-countries.txt; countries
# that could not be processed are written to fail-logs.txt.
create_all_countries()

Getting players for Iran
Getting players for Japan
Getting players for South Korea
Getting players for Australia
Getting players for Qatar
Getting players for United Arab Emirates
Cannot find current squad
Getting players for Saudi Arabia
Getting players for China PR
Error when finding revision history
Getting players for Iraq
Getting players for Syria
Getting players for Uzbekistan
Getting players for Lebanon
Cannot find current squad
Getting players for Oman
Getting players for Kyrgyzstan
Cannot find current squad
Getting players for Jordan
Getting players for Vietnam
Match failed
Match failed
Match failed
Getting players for Palestine
Getting players for India
Getting players for Bahrain
Getting players for Thailand
Getting players for Tajikistan
Match failed
Match failed
Getting players for North Korea
Unexpected number of columns
Getting players for Philippines
Unexpected number of columns
Unexpected number of columns
Unexpected number of columns
Unexpected number of columns
Unexp