In [79]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

### Get match data for 1991–1999, the 2003 knockout stage, 2007, 2011 and 2015

In [320]:
def match_finder(year = ''):
    """Download a Women's World Cup Wikipedia article and return its match boxes.

    Parameters
    ----------
    year : str
        Tournament year, interpolated into the article URL
        (``.../wiki/{year}_FIFA_Women%27s_World_Cup``).

    Returns
    -------
    bs4 result set
        Every ``<div class="vevent">`` when ``year == '2015'``, otherwise
        every ``<div class="footballbox">`` found in the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not answer within the timeout.

    Notes
    -----
    The HTTP status code is not checked: whatever body the server returns is
    parsed, so a page without match boxes simply produces an empty result.
    """
    url = f'https://en.wikipedia.org/wiki/{year}_FIFA_Women%27s_World_Cup'

    # requests waits forever by default; bound the wait so a stalled
    # connection cannot hang the whole notebook run.
    webpage = requests.get(url, timeout=30)
    soup = BeautifulSoup(webpage.content, 'html.parser')

    # The 2015 article is matched on a different container class than the
    # other years; only the class name differs between the two lookups.
    box_class = "vevent" if year == '2015' else "footballbox"
    return soup.find_all("div", {"class": box_class})

In [324]:
def tournament_parser(year = ''):
    """Scrape one tournament and return its matches as a DataFrame.

    Parameters
    ----------
    year : str
        Tournament year, passed straight to ``match_finder``. The value
        ``'2015'`` selects the ``<td>``-based parsing branch; every other
        value uses the ``fhome`` / ``fscore`` / ``faway`` header cells.

    Returns
    -------
    pandas.DataFrame
        One row per match box with the columns ``date``, ``home_team``,
        ``away_team``, ``home_score`` and ``away_score``. All values are the
        raw strings taken from the page; the two scores are the pieces of the
        score text on either side of the en dash.

    Raises
    ------
    IndexError
        If a score text contains no en dash, or a 2015 box has fewer than
        four ``<td>`` cells.
    AttributeError
        If a non-2015 box lacks a ``<time>`` element or one of the expected
        ``<th>`` cells.
    """
    matches = match_finder(year)

    results = {'date': [],
               'home_team': [],
               'away_team': [],
               'home_score': [],
               'away_score': []}

    for match in matches:

        if year == '2015':
            # Cell order in a 2015 box: date, home team, score, away team.
            cells = match.find_all('td')

            match_date = cells[0].text
            home_team = cells[1].text
            away_team = cells[3].text
            score_list = cells[2].text.split('–')

        else:
            # Prefer the machine-readable attribute and fall back to the
            # visible text. Only the two failures the lookup can produce are
            # caught (attribute missing -> KeyError, no <time> tag ->
            # TypeError) so that interrupts and real bugs are not swallowed.
            try:
                match_date = match.find('time')['datetime']
            except (KeyError, TypeError):
                match_date = match.find('time').text

            home_team = match.find('th', {'class': 'fhome'}).text
            away_team = match.find('th', {'class': 'faway'}).text
            score_list = match.find('th', {'class': 'fscore'}).text.split('–')

        results['date'].append(match_date)
        results['home_team'].append(home_team)
        results['away_team'].append(away_team)
        results['home_score'].append(score_list[0])
        results['away_score'].append(score_list[1])

    return pd.DataFrame(results)

In [327]:
# Scrape every tournament whose article can be parsed by tournament_parser.
# The second edition was played in 1995 (there was no 1993 tournament); the
# old '1993' entry pointed at a non-existent article and silently contributed
# zero rows, so 1995 was missing from the results.
year_results = []
for year in ['1991', '1995', '1999', '2003', '2007', '2011', '2015']:
    year_results.append(tournament_parser(year))

# Each per-year frame keeps its own 0-based index, so index labels repeat
# across tournaments in the concatenated frame.
partial_results = pd.concat(year_results)
partial_results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,1991-11-16T20:45:00+00:00,China PR,Norway,4,0
1,1991-11-17T19:45:00+00:00,Denmark,New Zealand,3,0
2,1991-11-19T15:30:00+00:00,Norway,New Zealand,4,0
3,1991-11-19T19:45:00+00:00,China PR,Denmark,2,2
4,1991-11-21T19:45:00+00:00,China PR,New Zealand,4,1


### Group Stage 2003

In [334]:
def match_finder(year = ''):
    """Download a Women's World Cup Wikipedia article and return all its tables.

    NOTE(review): this redefines ``match_finder`` from the earlier cell, which
    returned match-box ``<div>`` elements. ``tournament_parser`` looks the name
    up at call time, so running it again after this cell hands it ``<table>``
    elements instead. Re-run the earlier definition first if that is needed.

    Parameters
    ----------
    year : str
        Tournament year, interpolated into the article URL
        (``.../wiki/{year}_FIFA_Women%27s_World_Cup``).

    Returns
    -------
    bs4 result set
        Every ``<table>`` element in the page, in document order.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not answer within the timeout.
    """
    url = f'https://en.wikipedia.org/wiki/{year}_FIFA_Women%27s_World_Cup'

    # requests waits forever by default; bound the wait so a stalled
    # connection cannot hang the whole notebook run.
    webpage = requests.get(url, timeout=30)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    return soup.find_all("table")

In [335]:
# All <table> elements of the 2003 article. This relies on the table-returning
# match_finder defined in the cell directly above being the active definition.
matches_2003 = match_finder(year='2003')

In [336]:
# Positions of the four group-stage fixture tables within matches_2003.
# These are hard-coded offsets into the page's table list, so they only hold
# for the page layout they were read from.
group_a, group_b, group_c, group_d = (
    matches_2003[table_idx] for table_idx in (5, 7, 9, 11)
)

In [337]:
def match_extracter(group, group_name):
    """Extract the six matches of one 2003 group-stage table.

    The table is read row by row (``<tr>``). Which rows hold a date and which
    hold a match is hard-coded per group, so the function only works for the
    row layout those indices were taken from.

    Parameters
    ----------
    group : bs4 tag (or any object exposing ``find_all``)
        The ``<table>`` element holding the group's fixtures.
    group_name : str
        ``'a'``/``'b'`` or ``'c'``/``'d'``; selects the row layout. Any other
        value matches no rows and yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``home_team``, ``away_team``, ``home_score`` and
        ``away_score``; all values are the raw cell strings, with the score
        cell split on the en dash.

    Raises
    ------
    IndexError
        If a match row has fewer than three ``<td>`` cells or its score text
        contains no en dash.
    ValueError
        Raised by pandas if the number of dates and matches collected differ
        (for example when the table has fewer rows than the layout expects).
    """
    # Layout per group: {date row index: number of match rows sharing that
    # date} and the set of row indices that contain a match.
    if group_name in ('c', 'd'):
        date_rows = {1: 2, 4: 2, 7: 2}
        match_rows = {2, 3, 5, 6, 8, 9}
    elif group_name in ('a', 'b'):
        date_rows = {1: 1, 3: 1, 5: 2, 8: 2}
        match_rows = {2, 4, 6, 7, 9, 10}
    else:
        date_rows = {}
        match_rows = set()

    results = {'date': [],
               'home_team': [],
               'away_team': [],
               'home_score': [],
               'away_score': []}

    for row_idx, row in enumerate(group.find_all('tr')):

        if row_idx in match_rows:
            # Cell order in a match row: home team, score, away team.
            cells = row.find_all('td')
            home_team = cells[0].text
            away_team = cells[2].text
            score_list = cells[1].text.split('–')

            results['home_team'].append(home_team)
            results['away_team'].append(away_team)
            results['home_score'].append(score_list[0])
            results['away_score'].append(score_list[1])

        elif row_idx in date_rows:
            # A date row is repeated once for every match listed under it so
            # the date column stays aligned with the match columns.
            date_text = row.find('td').text
            results['date'].extend([date_text] * date_rows[row_idx])

    return pd.DataFrame(results)

In [338]:
# Parse each group's table into a DataFrame.
# NOTE(review): the names group_a..group_d are rebound from the raw tables to
# the parsed frames, so this cell cannot be re-run on its own; re-run the cell
# that selects the tables from matches_2003 first.
group_a, group_b, group_c, group_d = (
    match_extracter(table, letter)
    for table, letter in zip((group_a, group_b, group_c, group_d), 'abcd')
)

In [339]:
# Stack the four parsed groups row-wise into a single 2003 group-stage frame.
# Each group keeps its own 0-based index, so labels repeat in the result.
group_frames_2003 = [pd.DataFrame(frame)
                     for frame in (group_a, group_b, group_c, group_d)]
group_results_2003 = pd.concat(group_frames_2003, axis = 0)

In [342]:
# Display the scraped tournaments followed by the 2003 group stage.
# The combined frame is only shown here; it is not assigned to a variable.
pd.concat([partial_results, group_results_2003], axis = 0)

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,1991-11-16T20:45:00+00:00,China PR,Norway,4,0
1,1991-11-17T19:45:00+00:00,Denmark,New Zealand,3,0
2,1991-11-19T15:30:00+00:00,Norway,New Zealand,4,0
3,1991-11-19T19:45:00+00:00,China PR,Denmark,2,2
4,1991-11-21T19:45:00+00:00,China PR,New Zealand,4,1
5,1991-11-21T19:45:00+00:00,Norway,Denmark,2,1
6,1991-11-17T19:45:00+00:00,Japan,Brazil,0,1
7,1991-11-17T19:45:00+00:00,Sweden,United States,2,3
8,1991-11-19T19:45:00+00:00,Japan,Sweden,0,8
9,1991-11-19T19:45:00+00:00,Brazil,United States,0,5
