In [1]:
import pandas as pd
import time
import numpy as np
import requests
from bs4 import BeautifulSoup as soup

In [2]:
# Download one example match page and parse it, so the extraction helpers
# defined in the following cells can be tried out on a real document.
example_url = "https://www.11v11.com/matches/manchester-city-v-brighton-and-hove-albion-09-may-2018-357538/"
# A timeout keeps the cell from blocking forever on a stalled connection
# (requests applies no timeout by default).
response = requests.get(example_url, headers={'User-Agent': 'Custom'}, timeout=30)
# Fail loudly on HTTP errors rather than silently parsing an error page.
response.raise_for_status()
page = soup(response.content, "html.parser")

In [6]:
def _extract_side_goalscorers(goalsBlock, side):
    """Return the formatted scorer names found in one side of a goals block.

    Parameters
    ----------
    goalsBlock : element exposing ``findAll`` (the ``div.goals`` element)
    side : str
        CSS class of the side to read, ``'home'`` or ``'away'``.

    Returns
    -------
    list of str
        One lower-cased, hyphen-separated name per table row that has a
        non-empty first cell, in document order.
    """
    sideBlocks = goalsBlock.findAll('div', {'class': side})
    if len(sideBlocks) == 0:
        # No block for this side: report "no scorers" instead of raising.
        return []

    goalscorers = []
    for row in sideBlocks[0].findAll('tr'):
        cells = row.findAll('td')
        if len(cells) == 0:
            # A row without <td> cells carries no scorer name.
            continue
        name = cells[0].text
        # Drop a single leading space in front of the name, if present.
        if name.startswith(" "):
            name = name[1:]
        if name == "":
            # Indexing an empty string used to raise IndexError here; an empty
            # name is also useless downstream, so the row is skipped.
            continue
        goalscorers.append(name.lower().replace(" ", "-"))
    return goalscorers


def get_all_goal_scorers(page):
    """Extract the goalscorer names listed on a parsed match page.

    Parameters
    ----------
    page : parsed document exposing ``findAll`` (e.g. a BeautifulSoup object)

    Returns
    -------
    dict
        ``{'home': [...], 'away': [...]}`` where each list holds the text of
        the first cell of every row in that side's goals table, lower-cased
        and with spaces replaced by hyphens. Both lists are empty when the
        page has no ``div.goals`` element; a side whose block is missing
        yields an empty list.
    """
    goalsBlocks = page.findAll('div', {'class': 'goals'})
    if len(goalsBlocks) == 0:
        return {'home': [], 'away': []}
    goalsBlock = goalsBlocks[0]
    return {
        'home': _extract_side_goalscorers(goalsBlock, 'home'),
        'away': _extract_side_goalscorers(goalsBlock, 'away'),
    }

In [7]:
# Try the scorer-name extraction on the example page parsed earlier and show
# the resulting {'home': [...], 'away': [...]} dictionary.
all_goalscorers = get_all_goal_scorers(page)
print(all_goalscorers)

{'home': ['danilo', 'bernardo-silva', 'fernandinho'], 'away': ['leonardo-ulloa']}


In [8]:
def _extract_side_lineup(lineupBlock, side):
    """Return the player slugs linked from one side of a lineup block.

    Parameters
    ----------
    lineupBlock : element exposing ``findAll`` (the ``div.lineup`` element)
    side : str
        CSS class of the side to read, ``'home'`` or ``'away'``.

    Returns
    -------
    list of str
        The third ``/``-separated component of each player link's ``href``,
        in document order. Player blocks without a usable link are skipped.
    """
    sideBlocks = lineupBlock.findAll('div', {'class': side})
    if len(sideBlocks) == 0:
        # No block for this side: report an empty lineup instead of raising.
        return []

    players = []
    for playerBlock in sideBlocks[0].findAll('div', {'class': 'player flagged'}):
        link = playerBlock.a
        href = link.get('href') if link is not None else None
        if not href:
            # Player entry without a link: there is no slug to extract.
            continue
        parts = href.split("/")
        # The slug is the third path component; ignore hrefs that are too
        # short or have an empty component there.
        if len(parts) > 2 and parts[2]:
            players.append(parts[2])
    return players


def get_lineups_data(page):
    """Extract the home and away player slugs from a parsed match page.

    Parameters
    ----------
    page : parsed document exposing ``findAll`` (e.g. a BeautifulSoup object)

    Returns
    -------
    dict
        ``{'home': [...], 'away': [...]}`` with one slug per
        ``div.player.flagged`` entry of the first ``div.lineup`` element
        (e.g. ``'danilo-239141'``). Both lists are empty when the page has no
        lineup block, mirroring what ``get_all_goal_scorers`` does for a
        missing goals block.
    """
    lineupData = page.findAll('div', {'class': 'lineup'})
    if len(lineupData) == 0:
        return {'home': [], 'away': []}
    lineupBlock = lineupData[0]
    return {
        'home': _extract_side_lineup(lineupBlock, 'home'),
        'away': _extract_side_lineup(lineupBlock, 'away'),
    }

In [9]:
# Try the lineup extraction on the example page and show the player slugs
# ("<name>-<id>") collected for each side.
lineups = get_lineups_data(page)
print(lineups)

{'home': ['claudio-bravo-115855', 'danilo-239141', 'aymeric-laporte-254251', 'vincent-kompany-22665', 'olexsandr-zinchenko-250144', 'bernardo-silva-248892', 'fernandinho-197129', 'ilkay-gündogan-229415', 'yaya-touré-15819', 'leroy-sané-247933', 'gabriel-jesus-253514'], 'away': ['mathew-ryan-242106', 'gaëtan-bong-224927', 'bruno-saltor-236614', 'shane-duffy-230212', 'lewis-dunk-232346', 'pascal-groß-255219', 'davy-pröpper-236275', 'dale-stephens-226609', 'josé-izquierdo-249850', 'anthony-knockaert-239707', 'leonardo-ulloa-240526']}


In [10]:
def _match_goalscorer_ids(goalscorers, players):
    """Map formatted scorer names to the ids of the matching lineup players.

    Parameters
    ----------
    goalscorers : list of str
        Hyphen-separated scorer names (e.g. ``'bernardo-silva'``).
    players : list of str
        Lineup slugs of the form ``'<name>-<id>'``.

    Returns
    -------
    list of str
        For each scorer, in order, the trailing ``-``-separated component of
        every player slug selected for it.

    Matching goes from strict to loose and stops at the first tier that
    finds something, so a precise match is never diluted by looser ones:

    1. the slug without its id equals the scorer name;
    2. the scorer name appears as whole hyphen-delimited words in the slug;
    3. the scorer name is a plain substring of the slug (original rule).
    """
    ids = []
    for goalscorer in goalscorers:
        if not goalscorer:
            # An empty name is a substring of every slug and would wrongly
            # credit the whole lineup with a goal.
            continue
        exact = [p for p in players if p.rsplit("-", 1)[0] == goalscorer]
        whole_words = [
            p for p in players
            if ("-" + goalscorer + "-") in ("-" + p.rsplit("-", 1)[0] + "-")
        ]
        loose = [p for p in players if goalscorer in p]
        matches = exact or whole_words or loose
        ids.extend(p.split("-")[-1] for p in matches)
    return ids


def get_goalscorers_ids(page):
    """Return the player ids of the goalscorers on a parsed match page.

    Combines ``get_all_goal_scorers`` (scorer names) with
    ``get_lineups_data`` (player slugs) and resolves each scorer name
    against the lineup of the same side.

    Parameters
    ----------
    page : parsed match page accepted by both helper functions

    Returns
    -------
    dict
        ``{'home': [...], 'away': [...]}`` with id strings, e.g.
        ``{'home': ['239141', '248892'], 'away': ['240526']}``. A scorer
        that matches no lineup player contributes nothing.
    """
    goalscorers = get_all_goal_scorers(page)
    lineups = get_lineups_data(page)
    return {
        'home': _match_goalscorer_ids(goalscorers.get("home"), lineups.get("home")),
        'away': _match_goalscorer_ids(goalscorers.get("away"), lineups.get("away")),
    }

In [11]:
# Resolve the example page's scorer names to player ids and show the result.
goalscorers_ids = get_goalscorers_ids(page)
print(goalscorers_ids)

{'home': ['239141', '248892', '197129'], 'away': ['240526']}


In [16]:
# Load the matches listed in scores_not_equal.csv and build one 11v11 match
# URL from each row's '11id' value (cast to int so the URL carries no decimal
# part), then preview the first ten URLs.
missing_goalscorers_data = pd.read_csv("scores_not_equal.csv")
eleven_ids = missing_goalscorers_data['11id'].tolist()
urls = list(
    map(
        lambda eleven_id: "https://www.11v11.com/matches/" + str(int(eleven_id)),
        eleven_ids,
    )
)
urls[:10]

['https://www.11v11.com/matches/20852',
 'https://www.11v11.com/matches/20867',
 'https://www.11v11.com/matches/20893',
 'https://www.11v11.com/matches/20931',
 'https://www.11v11.com/matches/20932',
 'https://www.11v11.com/matches/20944',
 'https://www.11v11.com/matches/20958',
 'https://www.11v11.com/matches/20959',
 'https://www.11v11.com/matches/20968',
 'https://www.11v11.com/matches/20971']

In [50]:
def follow_link_get_goalscorers(url, timeout=30):
    """Download a match page and return the ids of its goalscorers.

    Parameters
    ----------
    url : str
        Address of the match page to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would
        block the whole scraping run indefinitely, because requests applies
        no timeout by default.

    Returns
    -------
    dict
        The ``{'home': [...], 'away': [...]}`` result of
        ``get_goalscorers_ids`` for the downloaded page.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, headers={'User-Agent': 'Custom'}, timeout=timeout)
    page = soup(response.content, "html.parser")
    return get_goalscorers_ids(page)

def get_home_goalscorers(row):
    """Fetch the match referenced by ``row`` and return its home scorer ids.

    ``row`` must provide an ``'11id'`` entry convertible to ``int``; it is
    appended to the 11v11 match URL prefix. The id is printed once the page
    has been processed, as a progress trace. Returns the ``'home'`` entry of
    the dict produced by ``follow_link_get_goalscorers``.
    """
    match_id = str(int(row['11id']))
    scorers_by_side = follow_link_get_goalscorers(
        "https://www.11v11.com/matches/" + match_id
    )
    home_ids = scorers_by_side.get("home")
    print(match_id)
    return home_ids

def get_away_goalscorers(row):
    """Fetch the match referenced by ``row`` and return its away scorer ids.

    ``row`` must provide an ``'11id'`` entry convertible to ``int``; it is
    appended to the 11v11 match URL prefix. The id is printed once the page
    has been processed, as a progress trace. Returns the ``'away'`` entry of
    the dict produced by ``follow_link_get_goalscorers``.
    """
    match_id = str(int(row['11id']))
    scorers_by_side = follow_link_get_goalscorers(
        "https://www.11v11.com/matches/" + match_id
    )
    away_ids = scorers_by_side.get("away")
    print(match_id)
    return away_ids

In [51]:
def _fetch_scorers_for_match(eleven_id):
    """Download one match page and return its {'home': [...], 'away': [...]} scorer ids."""
    match_id = str(int(eleven_id))
    scorers = follow_link_get_goalscorers("https://www.11v11.com/matches/" + match_id)
    # Progress trace: one line per processed match.
    print(match_id)
    return scorers


# Fetch every match page once and derive both columns from the same result.
# Running one row-wise apply per column downloaded and parsed each page twice.
scorers_by_match = missing_goalscorers_data['11id'].map(_fetch_scorers_for_match)
missing_goalscorers_data['home_scorers'] = scorers_by_match.map(lambda scorers: scorers.get("home"))
missing_goalscorers_data['away_scorers'] = scorers_by_match.map(lambda scorers: scorers.get("away"))

20852
20867
20893
20931
20932
20944
20958
20959
20968
20971
20977
21028
21049
21053
21060
21064
21096
21117
21119
21138
21170
21172
21180
21192
21204
21224
21231
21240
21266
21283
21330
21331
21347
21362
21372
21396
21397
21440
21441
21449
21468
21469
21470
21486
21539
21565
21574
21590
21594
21596
21605
21619
21623
21631
21632
21649
21672
21673
21684
21703
21718
21739
21745
21748
21749
21753
21762
21774
21790
21800
21807
21808
21825
21839
21867
21881
21935
21942
21947
21963
21991
21999
22003
22051
22058
22066
22069
22072
22081
22086
22144
22150
22187
22216
22231
22232
22249
22264
22313
22323
22337
22348
22374
22379
22386
22391
22419
22425
22442
22452
22458
22462
22468
22486
22525
22547
22556
22568
22573
22575
22578
22581
22584
22589
22594
22596
22602
22605
22609
22616
22621
22622
22623
22624
22637
22641
22642
22648
22649
22657
22659
22663
22672
22676
22680
22696
22703
22704
22713
22719
22722
22729
22737
22738
22744
22747
22750
22760
22765
22766
22767
22769
22773
22786
22788
22794
2279

294647
294651
294906
294943
294946
294988
295027
295033
295066
295104
295420
295459
295460
295461
295486
295487
295490
295527
295556
295671
295677
295738
295772
295775
295811
295814
295816
295881
295882
295883
295918
295919
295984
296042
296044
296083
296107
296112
296167
296208
296227
296261
296266
296270
296279
296280
296320
296324
296335
296336
296374
296414
296415
296613
296629
296630
296633
296642
296643
296648
296670
296739
296742
297070
297161
297162
297163
297456
297500
297680
297812
297817
297836
297896
297900
297921
297958
298021
298025
298085
298086
298118
298126
298132
298133
298140
298645
298729
298733
298801
298840
298888
298894
299015
299018
299038
299041
299042
299128
299130
299170
299222
299225
299224
299262
299263
299348
299349
299364
299367
299468
299473
299526
299528
299529
299531
299569
299603
299604
299666
299665
299680
299681
299685
299687
299767
299768
299806
299857
299860
299899
299960
299961
299967
299968
299969
300011
300012
300014
300015
300059
300063
300179

14593
14595
14596
14600
15232
15436
15098
15422
15147
15280
15301
15213
15410
15283
15388
15269
15356
15337
15304
15369
15431
15151
15358
15453
15252
15152
15273
15253
15154
15311
15342
15435
16284
16282
16280
16320
16324
16364
16368
16394
16411
16453
16646
16647
16652
16807
17074
17571
17573
17847
17845
17901
17980
18035
18075
18120
18115
18183
18234
18237
18276
18283
18279
18291
18285
18293
18287
18377
18381
18401
18407
18408
18431
18485
18487
18491
18576
18584
18586
18630
18642
18690
18692
18732
19115
19458
19464
19496
19519
19522
19583
19619
19634
19659
19720
19721
19742
19743
19858
19909
19911
19949
19988
20072
20348
20495
24923
25008
25047
25049
25052
25215
25381
25382
25383
25386
25387
25676
25681
25719
25721
25851
26013
26053
26159
26217
26306
26309
26437
26585
26588
26589
26643
26661
26667
26746
26785
26834
26837
26923
26924
26995
27031
27052
27100
27149
27150
27192
27234
27237
27285
27352
27466
27468
27469
27485
27490
27495
27547
27585
27718
27974
33721
33727
173513
186565
18

351232
351265
351264
351301
351300
351485
351488
351491
351503
351509
351513
351547
351554
351568
351577
351625
351629
351664
351672
351682
351801
351802
351808
351813
351806
351810
351820
351824
351825
351832
352129
352127
352136
352139
352141
352177
352176
352188
352189
352192
352229
352231
352312
352404
352447
352503
352539
352534
352536
352533
352694
352710
352727
352722
352802
352810
352811
352890
352893
352927
352928
353014
353017
353016
353048
353049
353052
353056
353061
353062
353065
353067
353074
353379
353374
353375
353432
353437
353474
353478
353482
354588
354584
354670
354718
354719
354796
354798
355320
355350
355351
355461
355421
355458
355468
355570
355568
355598
355602
355604
355639
355684
355680
355681
355720
355721
355727
355733
356102
355856
355865
355864
355902
355905
355907
355917
355919
355922
355960
355972
355966
356008
356011
356051
356058
356097
356350
356354
356368
356408
356413
356455
356501
356510
356508
356512
356552
356556
356641
356721
356719
356723
356760

In [52]:
# Persist the frame, including the new home_scorers / away_scorers columns,
# without writing the DataFrame index as an extra column.
missing_goalscorers_data.to_csv("goalscorers_added_fixed_step6_2.csv",index=False)

In [53]:
# Preview the first rows to eyeball the scraped scorer columns.
missing_goalscorers_data.head()

Unnamed: 0,premid2,season2,date2,home2,score2,away2,ref2,11id,season,date,...,h_players,a_players,home_equal,away_equal,refs_equal,final_ref,score_equal,"(home_scorers, away_scorers)",home_scorers,away_scorers
0,66,1,1992-09-05,blackburn,4-1,nottingham forest,bob nixon,20852,1,1992-09-05,...,"['5223(90)', '27669(90)', '3482(90)', '302(90)...","['1044(90)', '38354(90)', '582(90)', '968(90)'...",True,True,True,bob nixon,False,"([362, 11282, 362], [38455])","[362, 11282, 362]",[38455]
1,81,1,1992-09-12,man city,0-1,middlesbrough,peter willis,20867,1,1992-09-12,...,"['37907(90)', '4776(90)', '33650(67)', '1052(9...","['24327(90)', '3723(90)', '34052(90)', '3471(9...",True,True,False,ray lewis,False,"([], [])",[],[]
2,107,1,1992-09-26,middlesbrough,2-3,aston villa,david elleray,20893,1,1992-09-26,...,"['36162(90)', '3723(90)', '34052(90)', '3471(9...","['10259(90)', '759(90)', '291(90)', '2507(90)'...",True,True,True,david elleray,False,"([34656], [71, 22546, 71])",[34656],"[71, 22546, 71]"
3,145,1,1992-10-31,everton,1-3,man city,joe worrall,20931,1,1992-10-31,...,"['2448(90)', '32094(90)', '188(90)', '1523(90)...","['37907(90)', '4776(90)', '32770(90)', '1052(9...",True,True,True,joe worrall,False,"([], [2392, 32874, 2392])",[],"[2392, 32874, 2392]"
4,146,1,1992-10-31,leeds,2-2,coventry,brian hill,20932,1,1992-10-31,...,"['13(90)', '5368(90)', '407(90)', '31079(90)',...","['151(90)', '6177(90)', '32033(90)', '32411(90...",True,True,True,brian hill,False,"([31954, 5368], [2002])","[31954, 5368]",[2002]
