In [126]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

In [127]:
# Results page for the 2018-2019 Premier League season; passed to game_spider
# as the starting point of the scrape.
betstudy_url='https://www.betstudy.com/soccer-stats/c/england/premier-league/d/results/2018-2019/'

To get the lineups, the spider needs to iterate over every game on the results page, collecting the date and team names (for validation), then follow the game's hyperlink to its lineup page.

Once on the lineup page, iterate over both lineups and collect every player's name.

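The final dataframe should have two rows per game, with 20 columns: one for the team name, one for the date and 18 for the lineup. (Note: the code below actually builds one row per game, holding the date, both team names, 18 player columns per team and the four match officials, with player positions stored in a parallel dataframe.)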

In [128]:
def _lineup_slot_keys():
    '''
    input: nothing
    output: list of the 36 player-slot keys in column order:
    1starter_1..1starter_11, 1sub_1..1sub_7, then the same for team 2.
    '''
    keys = []
    for team in ('1', '2'):
        for role, count in (('starter', 11), ('sub', 7)):
            for number in range(1, count + 1):
                keys.append(team + role + '_' + str(number))
    return keys


def game_spider(url, timeout=30):
    '''
    input: url of the results page and (optionally) the timeout in
    seconds used for the results-page request
    output: two dictionaries of lists, one with the player names (plus date,
    team names and match officials) the other with the player positions.
    as both are scraped in parallel the location of the name is the location of the position
    e.g. player at lineups["1starter_5"][7] has position lineups_pos["1starter_5"][7].
    raises: requests exceptions if the results page times out or
    answers with an HTTP error status
    
    function that iterates over table rows
    also runs other functions that collects data
    '''
    #gets soup of url
    #timeout stops a stalled connection from hanging the notebook forever,
    #raise_for_status turns an error page into an explicit HTTP error
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    #NOTE: no parser is named, so BeautifulSoup picks the best one installed
    soup = BeautifulSoup(r.text)
    
    #gets list of each table row (one row per game)
    table = soup.find('table', class_='schedule-table').find_all('tr', class_='rounds')
    
    slot_keys = _lineup_slot_keys()
    
    #insertion order is the column order of the final dataframe:
    #date, teams, team 1 players, team 2 players, officials
    lineups = {'date': [], 'team1': [], 'team2': []}
    lineups.update((key, []) for key in slot_keys)
    lineups.update((key, []) for key in ('referee', 'assistant_1', 'assistant_2', 'fourth'))
    
    #positions only exist for players, so only the slot keys are needed
    lineups_pos = {key: [] for key in slot_keys}
    
    for game in tqdm_notebook(table):
        #collect data for game into one entry per list (one row per game)
        
        #scrape date of the game
        lineups['date'].append(get_date(game))
        
        #get team names
        lineups['team1'].append(get_team(game, 'left'))
        lineups['team2'].append(get_team(game, 'right'))
        
        #get soup for lineup page
        lineup_soup = get_lineup_soup(game)
        
        #gets the 18 players for each team
        name_pos_dict = get_lineup(lineup_soup)
        
        #parsing the name_pos_dict into
        #the lineups and lineups_pos dicts
        for key, name_and_pos in name_pos_dict.items():
            lineups[key].append(name_and_pos[0]) #player_name
            lineups_pos[key].append(name_and_pos[1]) #player_pos
            
        #gets the 4 refs for each game
        ref_dict = get_referees(lineup_soup)
        
        #parsing the ref_dict into the lineups dicts
        for key, ref_name in ref_dict.items():
            lineups[key].append(ref_name)
            
    return lineups, lineups_pos

In [129]:
def get_date(game):
    '''
    input: game soup (one row of the results table)
    output: text of the first td cell in the row, which holds the date of the game
    '''
    first_cell = game.find('td')
    return first_cell.text

def get_team(game, team):
    '''
    input: game soup and team (left or right)
    output: team name (text of the matching td cell)
    '''
    #the site mirrors the sides: the cell with class right-align
    #is the team we call left, anything else maps to left-align
    css_class = 'right-align' if team == 'left' else 'left-align'
    
    team_cell = game.find('td', class_=css_class)
    return team_cell.text

def get_lineup_soup(game, timeout=30):
    '''
    input: game soup (one row of the results table) and (optionally)
    the timeout in seconds for the request
    output: soup of the lineup webpage that the row links to
    raises: requests exceptions if the page times out or answers
    with an HTTP error status
    '''
    #the first link inside the row's first li element points at the lineup page
    url_end = game.find('li').find('a')['href']
    url_start = 'https://www.betstudy.com'
    full_url = url_start + url_end
    
    #timeout stops a stalled connection from hanging the whole scrape,
    #raise_for_status fails here instead of while parsing an error page
    r_lineup = requests.get(full_url, timeout=timeout)
    r_lineup.raise_for_status()
    
    #NOTE: no parser is named, so BeautifulSoup picks the best one installed
    lineup_soup = BeautifulSoup(r_lineup.text)
        
    return lineup_soup

def get_lineup(lineup_soup):
    '''
    input: lineup page soup
    output: one dictionary covering both teams. Keys are
    1starter_1..1starter_11 and 1sub_1..1sub_7 for the left team and
    the same with prefix 2 for the right team. Each value is a list
    [player name, player position]; a substitute slot that is missing
    (or whose profile could not be read) is [np.nan, np.nan].
    '''
    #the page has 4 info-table tables: the first
    #and third are for the left team, the second and fourth
    #are for the right team. The first table for each team is
    #their starting 11, the second table for each team are their
    #subs.
    more_tables = lineup_soup.find_all('table', 'info-table')
    
    left_11 = more_tables[0].find_all('tr')
    left_subs = more_tables[2].find_all('tr')
    right_11 = more_tables[1].find_all('tr')
    right_subs = more_tables[3].find_all('tr')
    
    def sub_or_missing(rows, index):
        #not often but sometimes teams dont have 7 subs,
        #if so the player and their position are np.nan
        if index >= len(rows):
            return [np.nan, np.nan]
        try:
            return get_player_data(rows[index])
        except Exception:
            #best effort for subs: an unreadable profile also becomes np.nan.
            #Exception (not a bare except) so that interrupting the kernel
            #still stops the scrape
            return [np.nan, np.nan]
    
    name_pos_dict = {}
    
    for i in range(1, 12): #get starting 11
        #first element is header so skip
        #to get players full name and position
        #spider needs to go into their profile page.
        name_pos_dict['1starter_' + str(i)] = get_player_data(left_11[i])
        name_pos_dict['2starter_' + str(i)] = get_player_data(right_11[i])
        
    for i in range(1, 8): #get sub 7
        #first element is header so skip
        name_pos_dict['1sub_' + str(i)] = sub_or_missing(left_subs, i)
        name_pos_dict['2sub_' + str(i)] = sub_or_missing(right_subs, i)
        
    #The dictionary is now full, each key has
    #a list as an element, with the first element in the
    #list being the name of the player and the second
    #is the position
    
    return name_pos_dict

def get_player_soup(table_row, timeout=30):
    '''
    input: row of a lineup table and (optionally) the timeout in
    seconds for the request
    output: soup from player profile page linked in the row
    raises: requests exceptions if the page times out or answers
    with an HTTP error status
    '''
    #the first link in the row points at the player's profile
    url_end = table_row.find('a')['href']
    url_start = 'https://www.betstudy.com'
    full_url = url_start + url_end
    
    #timeout stops a stalled connection from hanging the whole scrape,
    #raise_for_status fails here instead of while parsing an error page
    r_player = requests.get(full_url, timeout=timeout)
    r_player.raise_for_status()
    
    #NOTE: no parser is named, so BeautifulSoup picks the best one installed
    player_soup = BeautifulSoup(r_player.text)
    
    return player_soup
    
#profile href -> (player name, player position) for every profile already
#scraped. The same players appear in many games, so this avoids downloading
#the same profile page again and again. Re-running this cell resets it.
_PLAYER_DATA_CACHE = {}


def get_player_data(row):
    '''
    input: row of a lineup table
    output: list [player name, player position] read from the
    player's profile page (downloaded once per profile, then reused)
    '''
    #same link get_player_soup follows; used as the cache key
    profile_href = row.find('a')['href']
    
    if profile_href not in _PLAYER_DATA_CACHE:
        player_soup = get_player_soup(row)
        
        player_name = player_soup.find('div', class_='compare-heading').find('h1').text
        
        #position is taken from the 8th dd element of the player-bio block
        #NOTE(review): fixed index, breaks if the bio layout changes
        player_position = player_soup.find('div', class_='player-bio').find_all('dd')[7].text
        
        _PLAYER_DATA_CACHE[profile_href] = (player_name, player_position)
    
    #fresh list each call so callers cannot alter the cached entry
    return list(_PLAYER_DATA_CACHE[profile_href])

def get_referees(lineup_soup):
    '''
    input: lineup soup
    output: dict with the names of all 4 match officials, keyed
    referee, assistant_1, assistant_2 and fourth
    '''
    #there are 3 table-holder div elements. 3rd is for referees.
    officials_holder = lineup_soup.find_all('div', class_='table-holder')[2]
    
    #(title, table row, table column) of each official,
    #the table is a 2x2 grid:
    #   Referee      |  Assistant 1
    #   Assistant 2  |  Fourth
    layout = (
        ('referee', 0, 0),
        ('assistant_1', 0, 1),
        ('assistant_2', 1, 0),
        ('fourth', 1, 1),
    )
    
    #first collect every profile link, then resolve the names
    profile_links = {}
    for title, row_index, column_index in layout:
        cell = officials_holder.find_all('tr')[row_index].find_all('td')[column_index]
        profile_links[title] = cell.find('a')['href']
    
    return {title: get_ref_name(link) for title, link in profile_links.items()}
    
#ref url ending -> ref name for every official already scraped. The same
#officials appear in many games, so this avoids downloading the same
#profile page again and again. Re-running this cell resets it.
_REF_NAME_CACHE = {}


def get_ref_name(ref_url, timeout=30):
    '''
    input: ending of ref url and (optionally) the timeout in
    seconds for the request
    output: ref name
    raises: requests exceptions if the page times out or answers
    with an HTTP error status
    
    gets soup then collects name (downloaded once per url, then reused)
    '''
    if ref_url not in _REF_NAME_CACHE:
        url_start = 'https://www.betstudy.com'
        full_url = url_start + ref_url
        
        #timeout stops a stalled connection from hanging the whole scrape,
        #raise_for_status fails here instead of while parsing an error page
        r_ref = requests.get(full_url, timeout=timeout)
        r_ref.raise_for_status()
        
        #NOTE: no parser is named, so BeautifulSoup picks the best one installed
        ref_soup = BeautifulSoup(r_ref.text)
        
        _REF_NAME_CACHE[ref_url] = ref_soup.find('div', class_='compare-heading').find('h1').text
    
    return _REF_NAME_CACHE[ref_url]
    

In [130]:
# Runs the whole scrape. game_spider returns (lineups, lineups_pos):
# `player` holds date, team names, player names and officials per game,
# `position` holds the matching player positions.
# Slow: every game triggers requests for its lineup, player and official pages.
player, position = game_spider(betstudy_url)

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

In [131]:
# Each dict maps a column name to a list with one entry per game,
# so each dataframe gets one row per game.
player_df = pd.DataFrame(player)
position_df = pd.DataFrame(position)

In [132]:
# Save both frames to the current working directory so the scrape does not
# have to be repeated. to_csv writes the row index as the first column by default.
player_df.to_csv('lineup_names.csv')
position_df.to_csv('lineup_positions.csv')

In [137]:
# Total number of missing cells in the names frame. get_lineup fills a
# substitute slot with np.nan when it cannot be read.
player_df.isnull().sum().sum()

2

In [135]:
# Total number of missing cells in the positions frame. get_lineup sets name
# and position to np.nan together, so this should match the count for names.
position_df.isnull().sum().sum()

2