In [39]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as soup

In [40]:
import time

In [41]:
# Table of match results; its 'Data Link' column is read below to build match-page URLs.
df_main = pd.read_csv('all_results_data.csv')
# Site root that a 'Data Link' value is appended to in order to form a full URL.
base_url = "https://www.11v11.com"

In [42]:
# Fetch and parse the match page of the first row of df_main; `page` is the
# working example that the extractor functions below are tried out on.
example_url = base_url + df_main.iloc[0]['Data Link']
# The request is sent with an explicit User-Agent header.
response = requests.get(example_url,headers={'User-Agent': 'Custom'})
page = soup(response.content, "html.parser")

In [43]:
def get_basic_data(page):
    """Extract the referee and stadium from a parsed match page.

    The values are read positionally from the ``<td>`` cells of the first
    ``<div class="basicData">`` element: cell index 3 is reported as the
    referee and cell index 7 as the stadium.

    Parameters
    ----------
    page : parsed page object exposing ``findAll`` (a BeautifulSoup document).

    Returns
    -------
    dict with the keys ``'Referee'`` and ``'Stadium'``.

    Raises
    ------
    IndexError
        If the page has no ``basicData`` div or that div has fewer than
        eight ``<td>`` cells.
    """
    # NOTE(review): the fixed positions assume every match page lists the
    # same rows in the same order -- confirm against pages with missing data.
    info_panel = page.findAll('div', {'class': 'basicData'})[0]
    cells = info_panel.findAll('td')
    referee = cells[3].text
    stadium = cells[7].text
    return {'Referee': referee, 'Stadium': stadium}

In [44]:
# Try the extractor on the example page; the bare name on the last line displays the result.
basicDataDict = get_basic_data(page)
basicDataDict

{'Referee': 'Alan Gunn', 'Stadium': 'Highbury'}

In [45]:
def get_lineups_data(page):
    """Return the player slugs listed in the first ``lineup`` block of a match page.

    Within the first ``<div class="lineup">`` the ``home`` and ``away`` divs
    are searched for ``<div class="player flagged">`` entries. Each entry's
    first link contributes the third ``/``-separated component of its
    ``href`` (e.g. ``'david-seaman-5'``).

    Parameters
    ----------
    page : parsed page object exposing ``findAll`` (a BeautifulSoup document).

    Returns
    -------
    dict ``{'home': [slug, ...], 'away': [slug, ...]}`` in page order.

    Raises
    ------
    IndexError
        If the lineup block, or its home/away div, is missing.
    """
    starters_block = page.findAll('div', {'class': 'lineup'})[0]

    lineups = {}
    for side in ('home', 'away'):
        side_block = starters_block.findAll('div', {'class': side})[0]
        slugs = []
        for entry in side_block.findAll('div', {'class': 'player flagged'}):
            # href is split on "/" and the component at index 2 is kept as the slug
            slugs.append(entry.a['href'].split("/")[2])
        lineups[side] = slugs

    return lineups

In [46]:
# Starting lineups of the example match, as player slugs per side.
lineupsDict = get_lineups_data(page)
lineupsDict

{'home': ['david-seaman-5',
  'lee-dixon-14',
  'tony-adams-31',
  'steve-bould-484',
  'nigel-winterburn-20',
  'david-hillier-4377',
  'john-jensen-7800',
  'anders-limpar-37097',
  'paul-merson-57',
  'kevin-campbell-196',
  'alan-smith-25601'],
 'away': ['bryan-gunn-38002',
  'ian-culverhouse-24730',
  'ian-butterworth-26985',
  'john-polston-4690',
  'rob-newman-8253',
  'chris-sutton-98',
  'david-phillips-38680',
  'mark-bowen-846',
  'ruel-fox-511',
  'jeremy-goss-27123',
  'gary-megson-32983']}

In [64]:
def get_subs_data(page):
    """Collect the substitutions listed on a match page, per side.

    Each ``<tr>`` inside the ``home`` / ``away`` div of the first
    ``<div class="substitutions">`` is read as one substitution. The first
    two ``<span>`` texts (stripped) are joined as ``"<first>><second>"`` and
    mapped to the text of the row's ``<td class="time">`` cell.

    Parameters
    ----------
    page : parsed page object exposing ``findAll`` (a BeautifulSoup document).

    Returns
    -------
    dict ``{'home': {...}, 'away': {...}}`` mapping ``"A>B"`` to the minute
    text, e.g. ``{'Ian Wright>Paul Merson': '73'}``. When the page has no
    substitutions block at all, both sides are empty *lists* instead.

    Raises
    ------
    IndexError
        If a side div is missing, or a row lacks two spans or a time cell.
    """
    subs_sections = page.findAll('div', {'class': 'substitutions'})
    if not subs_sections:
        return {'home': [], 'away': []}
    section = subs_sections[0]

    subs_by_side = {}
    for side in ('home', 'away'):
        rows = section.findAll('div', {'class': side})[0].findAll('tr')
        side_subs = {}
        for row in rows:
            names = row.findAll('span')
            first_name = names[0].text.strip()
            second_name = names[1].text.strip()
            minute = row.findAll('td', {'class': 'time'})[0].text
            side_subs[first_name + ">" + second_name] = minute
        subs_by_side[side] = side_subs

    return subs_by_side

In [48]:
# Substitutions of the example match, keyed "A>B" with the minute text as value.
all_subs_dict = get_subs_data(page)
all_subs_dict

{'home': {'Ian Wright>Paul Merson': '73'},
 'away': {'Mark Robins>Chris Sutton': '58', 'Ian Crook>Gary Megson': '87'}}

In [2]:
def get_all_goal_scorers(page):
    """Return the goalscorer names listed on a match page, per side.

    Every ``<tr>`` inside the ``home`` / ``away`` div of the first
    ``<div class="goals">`` is treated as one goal. The text of the row's
    first ``<td>`` is lower-cased and its spaces replaced by hyphens
    (``'Steve Bould'`` -> ``'steve-bould'``). A player who scored more than
    once therefore appears more than once.

    Parameters
    ----------
    page : parsed page object exposing ``findAll`` (a BeautifulSoup document).

    Returns
    -------
    dict ``{'home': [name, ...], 'away': [name, ...]}`` in page order; both
    lists are empty when the page has no goals block.

    Raises
    ------
    IndexError
        If the goals block lacks a home/away div or a row has no ``<td>``.
    """
    goalsBlocks = page.findAll('div',{'class':'goals'})
    if len(goalsBlocks) == 0:
        return {'home':[],'away':[]}
    goalsBlock = goalsBlocks[0]

    # Debug prints of the raw HTML were removed: this function runs once per
    # scraped match and must not write to stdout.
    all_goalscorers = {}
    for side in ('home', 'away'):
        side_block = goalsBlock.findAll('div',{'class':side})[0]
        scorers = []
        for goal_row in side_block.findAll('tr'):
            scorer_name = goal_row.findAll('td')[0].text
            scorers.append(scorer_name.lower().replace(" ","-"))
        all_goalscorers[side] = scorers
    return all_goalscorers

In [50]:
# Goalscorer names of the example match (hyphenated, lower-case), one entry per goal.
all_goalscorers = get_all_goal_scorers(page)
all_goalscorers

{'home': ['steve-bould', 'kevin-campbell'],
 'away': ['mark-robins', 'david-phillips', 'ruel-fox', 'mark-robins']}

In [51]:
# At this point we can extract the lineups, substitutions, goalscorers, referee and stadium:
#   basicDataDict    -> referee and stadium
#   lineupsDict      -> lineups
#   all_subs_dict    -> substitutions
#   all_goalscorers  -> goalscorers

In [65]:
def get_subs_on_data(page):
    """Return the player slugs listed in the second ``lineup`` block of a match page.

    The second ``<div class="lineup">`` is searched the same way as the
    first one: per side, each ``<div class="player flagged">`` contributes
    the third ``/``-separated component of its first link's ``href``.

    Parameters
    ----------
    page : parsed page object exposing ``findAll`` (a BeautifulSoup document).

    Returns
    -------
    dict ``{'home': [slug, ...], 'away': [slug, ...]}``; both lists are
    empty when the page has fewer than two ``lineup`` blocks.

    Raises
    ------
    IndexError
        If the second lineup block lacks a home or away div.
    """
    lineup_blocks = page.findAll('div', {'class': 'lineup'})
    if len(lineup_blocks) < 2:
        return {'home': [], 'away': []}
    bench_block = lineup_blocks[1]

    came_on = {}
    for side in ('home', 'away'):
        side_div = bench_block.findAll('div', {'class': side})[0]
        entries = side_div.findAll('div', {'class': 'player flagged'})
        came_on[side] = [entry.a['href'].split("/")[2] for entry in entries]

    return came_on

In [66]:
def _match_player_slug(name_slug, player_slugs):
    """Return the entry of ``player_slugs`` that refers to ``name_slug``, or None.

    An exact match on the slug with its last ``-``-separated component removed
    is preferred, so 'ian-wright' selects 'ian-wright-2700' rather than
    'brian-wright-99'. Failing that, the first slug that merely contains
    ``name_slug`` is returned.
    """
    for slug in player_slugs:
        if slug.rsplit("-", 1)[0] == name_slug:
            return slug
    for slug in player_slugs:
        if name_slug in slug:
            return slug
    return None


def compute_minutes_played_dict(teamLineup,teamSubs,subsOnLinks,match_length=90,default_sub_minute=75):
    """Compute the minutes played by every player of one team.

    Every starter is credited with ``match_length`` minutes. For each
    substitution the second-named player is credited with the substitution
    minute and the first-named player with ``match_length - minute``
    (never less than 0).

    Parameters
    ----------
    teamLineup : list of str
        Slugs of the starting players, e.g. ``'paul-merson-57'``.
    teamSubs : dict (or empty list)
        Maps ``"<player on>><player off>"`` display names to the minute as a
        string, e.g. ``{'Ian Wright>Paul Merson': '73'}``.
    subsOnLinks : list of str
        Slugs of the players who came on.
    match_length : int, optional
        Minutes credited for a full match (default 90).
    default_sub_minute : int, optional
        Minute assumed when a substitution has an empty time (default 75).
        A message is printed whenever this fallback is used.

    Returns
    -------
    dict mapping player slug -> minutes played, always as a string.

    Raises
    ------
    ValueError
        If a key of ``teamSubs`` does not contain exactly one ``">"``, or a
        minute is not an integer string while its substitute is in
        ``subsOnLinks``.
    """
    mins_played_dict = {player: str(match_length) for player in teamLineup}
    if len(teamSubs) == 0:
        return mins_played_dict

    for sub, minute in teamSubs.items():
        [sub_on,sub_off] = sub.split(">")
        if minute == "":
            print("{} -> {} has no time".format(sub_on,sub_off))
            # Keep the fallback a string so every value in the result has one type.
            minute = str(default_sub_minute)

        # Display names are turned into the same hyphenated form the slugs use.
        formatted_name_on = sub_on.lower().replace(" ","-")
        formatted_name_off = sub_off.lower().replace(" ","-")

        replaced = _match_player_slug(formatted_name_off, teamLineup)
        if replaced is not None:
            mins_played_dict[replaced] = minute

        substitute = _match_player_slug(formatted_name_on, subsOnLinks)
        if substitute is not None:
            # Clamp at 0: a substitution after match_length must not give negative minutes.
            mins_played_dict[substitute] = str(max(0, match_length - int(minute)))

    return mins_played_dict

In [54]:
def get_all_minutes_played_data(page):
    """Build the minutes-played mapping for both teams of a match page.

    Combines ``get_lineups_data``, ``get_subs_data`` and ``get_subs_on_data``
    and passes each side's three pieces to ``compute_minutes_played_dict``.

    Parameters
    ----------
    page : parsed match page accepted by the three extractor functions.

    Returns
    -------
    dict ``{'home': {...}, 'away': {...}}`` where each value is the result
    of ``compute_minutes_played_dict`` for that side.
    """
    starters = get_lineups_data(page)
    substitutions = get_subs_data(page)
    came_on = get_subs_on_data(page)

    minutes_by_side = {}
    for side in ('home', 'away'):
        minutes_by_side[side] = compute_minutes_played_dict(
            starters[side], substitutions[side], came_on[side]
        )
    return minutes_by_side

In [55]:
# Time the minutes-played extraction on the example page (the prints are currently disabled).
t0 = time.time()
all_minutes_played_data = get_all_minutes_played_data(page)
t1 = time.time()
#print(all_minutes_played_data)
#print(t1-t0)

In [56]:
# get_all_minutes_played_data(page) gives both full teams, each player with the number of minutes played.
# get_basic_data(page) gives the referee and stadium.
# get_all_goal_scorers(page) gives all the goalscorers.
# These need to be put into ['Data Link','Referee','Stadium','HomeLineup','AwayLineup','HomeScorers','AwayScorers'].
# NOTE(review): format_all_match_data below emits 'HomePlayers'/'AwayPlayers', not 'HomeLineup'/'AwayLineup' -- confirm which names are wanted.
# The goalscorers also still need to be reformatted.

In [57]:
def reformat_goalscorers(goalscorers,all_minutes_played_data):
    """Translate goalscorer names into the player slugs used for minutes played.

    For every scorer name, each player key of the same side in
    ``all_minutes_played_data`` that contains the name as a substring is
    appended to the result. Names that match no player are dropped; a name
    that matches several players contributes all of them.

    Parameters
    ----------
    goalscorers : dict
        ``{'home': [name, ...], 'away': [name, ...]}`` with hyphenated,
        lower-case names, one entry per goal.
    all_minutes_played_data : dict
        ``{'home': {slug: minutes}, 'away': {slug: minutes}}``.

    Returns
    -------
    dict ``{'home': [slug, ...], 'away': [slug, ...]}`` in scorer order.
    """
    resolved = {}
    for side in ('home', 'away'):
        scorer_names = goalscorers[side]
        squad = all_minutes_played_data[side]
        resolved[side] = [
            slug
            for name in scorer_names
            for slug in squad
            if name in slug
        ]
    return resolved

In [58]:
def get_goalscorers_data(page,allMinutesPlayedData):
    """Return the goalscorers of a match page as player slugs.

    The names from ``get_all_goal_scorers(page)`` are resolved against the
    players in ``allMinutesPlayedData`` by ``reformat_goalscorers``.

    Parameters
    ----------
    page : parsed match page accepted by ``get_all_goal_scorers``.
    allMinutesPlayedData : dict
        ``{'home': {slug: minutes}, 'away': {slug: minutes}}``.

    Returns
    -------
    dict ``{'home': [slug, ...], 'away': [slug, ...]}``.
    """
    return reformat_goalscorers(get_all_goal_scorers(page), allMinutesPlayedData)

In [1]:
def get_all_match_data(data_link, timeout=30):
    """Download one match page and collect everything extracted from it.

    Parameters
    ----------
    data_link : str
        Absolute URL of the match page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30), so a stalled connection cannot hang a scraping run.

    Returns
    -------
    list with seven items, in this order: the part of ``data_link`` after
    the first ``"com"``, referee, stadium, home minutes-played dict, away
    minutes-played dict, home goalscorer slugs, away goalscorer slugs.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including a timeout).
    IndexError
        If ``data_link`` contains no ``"com"`` or the page lacks an element
        the extractors require.
    """
    response = requests.get(data_link,headers={'User-Agent': 'Custom'},timeout=timeout)
    page = soup(response.content, "html.parser")
    basicData = get_basic_data(page)
    allMinutesPlayedData = get_all_minutes_played_data(page)
    goalscorersData = get_goalscorers_data(page,allMinutesPlayedData)
    # Split only once: the path itself may contain "com" (e.g. ".../wycombe-..."),
    # and an unlimited split would cut the link short at that point.
    data_url = data_link.split("com", 1)[1]
    match_data = [data_url,basicData['Referee'],basicData['Stadium'],allMinutesPlayedData['home']]
    match_data += [allMinutesPlayedData['away'],goalscorersData['home'],goalscorersData['away']]
    return match_data

In [60]:
# Run the full pipeline on the example URL; t1 is reassigned here and the
# timing print is currently disabled. The last line displays the result.
t1 = time.time()
match_data = get_all_match_data(example_url)
t2 = time.time()
#print("Took {} seconds".format(t2-t1))
match_data

['/matches/arsenal-v-norwich-city-15-august-1992-20787/',
 'Alan Gunn',
 'Highbury',
 {'david-seaman-5': '90',
  'lee-dixon-14': '90',
  'tony-adams-31': '90',
  'steve-bould-484': '90',
  'nigel-winterburn-20': '90',
  'david-hillier-4377': '90',
  'john-jensen-7800': '90',
  'anders-limpar-37097': '90',
  'paul-merson-57': '73',
  'kevin-campbell-196': '90',
  'alan-smith-25601': '90',
  'ian-wright-2700': '17'},
 {'bryan-gunn-38002': '90',
  'ian-culverhouse-24730': '90',
  'ian-butterworth-26985': '90',
  'john-polston-4690': '90',
  'rob-newman-8253': '90',
  'chris-sutton-98': '58',
  'david-phillips-38680': '90',
  'mark-bowen-846': '90',
  'ruel-fox-511': '90',
  'jeremy-goss-27123': '90',
  'gary-megson-32983': '87',
  'mark-robins-2266': '32',
  'ian-crook-35514': '3'},
 ['steve-bould-484', 'kevin-campbell-196'],
 ['mark-robins-2266',
  'david-phillips-38680',
  'ruel-fox-511',
  'mark-robins-2266']]

In [61]:
def format_all_match_data(all_match_data):
    """Turn the list from ``get_all_match_data`` into a flat, labelled dict.

    Parameters
    ----------
    all_match_data : sequence
        Items 0-2 are the data link, referee and stadium; items 3 and 4 are
        ``{slug: minutes}`` dicts for home and away; items 5 and 6 are lists
        of scorer slugs for home and away. A slug's text after its last
        ``-`` is taken as the player id.

    Returns
    -------
    dict with the keys ``'Data Link'``, ``'Referee'``, ``'Stadium'``,
    ``'HomePlayers'``, ``'AwayPlayers'``, ``'HomeScorers'`` and
    ``'AwayScorers'``. Players are strings of the form ``"<id>(<minutes>)"``
    in input order; scorers are id strings sorted by numeric value.

    Raises
    ------
    ValueError
        If a scorer slug does not end in an integer id.
    """
    record = {
        'Data Link': all_match_data[0],
        'Referee': all_match_data[1],
        'Stadium': all_match_data[2],
    }

    # "<id>(<minutes>)" per player, preserving the order of the input dict.
    for key, position in (('HomePlayers', 3), ('AwayPlayers', 4)):
        minutes_by_slug = all_match_data[position]
        record[key] = [
            str(slug.split("-")[-1]) + "(" + str(minutes) + ")"
            for slug, minutes in minutes_by_slug.items()
        ]

    # Scorer ids are ordered numerically before being turned back into strings.
    for key, position in (('HomeScorers', 5), ('AwayScorers', 6)):
        numeric_ids = sorted(int(slug.split("-")[-1]) for slug in all_match_data[position])
        record[key] = [str(player_id) for player_id in numeric_ids]

    return record

In [62]:
# Flatten the example match into the labelled dict form and display it.
data_dict = format_all_match_data(match_data)
data_dict

{'Data Link': '/matches/arsenal-v-norwich-city-15-august-1992-20787/',
 'Referee': 'Alan Gunn',
 'Stadium': 'Highbury',
 'HomePlayers': ['5(90)',
  '14(90)',
  '31(90)',
  '484(90)',
  '20(90)',
  '4377(90)',
  '7800(90)',
  '37097(90)',
  '57(73)',
  '196(90)',
  '25601(90)',
  '2700(17)'],
 'AwayPlayers': ['38002(90)',
  '24730(90)',
  '26985(90)',
  '4690(90)',
  '8253(90)',
  '98(58)',
  '38680(90)',
  '846(90)',
  '511(90)',
  '27123(90)',
  '32983(87)',
  '2266(32)',
  '35514(3)'],
 'HomeScorers': ['196', '484'],
 'AwayScorers': ['511', '2266', '2266', '38680']}