In [1]:
import pandas as pd
from proxy_requests import ProxyRequests
import requests
from bs4 import BeautifulSoup as soup
import time
%run scrape_one_match_code.ipynb

In [6]:
def _combine_match_frames(frames, empty_message):
    """Merge the per-match frames collected so far into a single result.

    Returns the only frame unchanged when exactly one was collected, a
    row-wise concatenation when there are several, and an empty DataFrame
    (after printing ``empty_message``) when there are none.
    """
    if len(frames) == 1:
        return frames[0]
    if len(frames) > 1:
        return pd.concat(frames,axis=0)
    print(empty_message)
    return pd.DataFrame()

def scrape_all_links(link_list):
    """Scrape every match page in ``link_list`` and return the combined data.

    Each link is fetched through ``ProxyRequests``, parsed with BeautifulSoup
    and handed to ``get_all_match_data`` (not defined in this cell; presumably
    provided by the notebook that is ``%run`` at the top - confirm).

    Scraping stops at the first link that raises: the error and the failing
    link are printed, and whatever was collected before the failure is
    returned so a long run is not lost.

    Parameters
    ----------
    link_list : list of str
        Match report URLs to scrape, in order.

    Returns
    -------
    pandas.DataFrame
        The collected match data; an empty DataFrame when nothing was
        collected.
    """
    all_dfs = []
    total = len(link_list)
    print("{} link(s) to scrape".format(total))
    for i, link in enumerate(link_list):
        try:
            r = ProxyRequests(link)
            r.get()
            page = soup(r.get_raw(), "html.parser")
            all_dfs.append(get_all_match_data(page))
            print("{}/{} complete ({})".format(i+1,total,link.split("/")[-1]))
        except Exception as e:
            print("Error Found, returning what has been collected so far")
            # Report what went wrong and where; previously the cause was discarded.
            print("Failed on {}: {}: {}".format(link, type(e).__name__, e))
            return _combine_match_frames(all_dfs, "No Data Collected")
    return _combine_match_frames(all_dfs, "No data collected")

def check_game_data_available(row):
    """Tell whether a schedule-table row has a match report and xG data.

    Parameters
    ----------
    row : bs4 Tag (or any object with a compatible ``find``)
        One ``<tr>`` of the schedule table.

    Returns
    -------
    bool
        True only when the ``match_report`` cell reads exactly
        "Match Report" and the ``xg_a`` cell is non-empty. A row that lacks
        either cell is reported as not available instead of raising.
    """
    report_cell = row.find("td",{"data-stat":"match_report"})
    # find() returns None for rows without the cell; such a row has no match data.
    if report_cell is None or str(report_cell.text) != "Match Report":
        return False
    xg_cell = row.find("td",{"data-stat":"xg_a"})
    if xg_cell is None or str(xg_cell.text) == "":
        return False
    return True

def get_all_match_links(page):
    """Collect absolute match-report URLs from a schedule page.

    Only rows accepted by ``check_game_data_available`` are considered, and a
    link is kept only when the ``match_report`` cell contains an anchor whose
    text is exactly "Match Report".

    Parameters
    ----------
    page : bs4.BeautifulSoup
        Parsed schedule page.

    Returns
    -------
    list of str
        URLs built as "https://fbref.com" + the anchor's href, in table
        order. Empty when the page has no ``<tbody>``.
    """
    main_table_body = page.find("tbody")
    if main_table_body is None:
        # Without a table body there is nothing to read; report it rather
        # than failing with AttributeError on None.
        print("No schedule table found on page")
        return []

    link_list = []
    for row in main_table_body.find_all("tr"):
        if not check_game_data_available(row):
            continue
        anchor = row.find("td",{"data-stat":"match_report"}).a
        if anchor is None:
            continue
        if anchor.text == "Match Report":
            link_list.append("https://fbref.com" + anchor["href"])
    return link_list

def remove_links_already_explored(link_list,df):
    """Drop links whose home/away pairing is already present in ``df``.

    Each (home_name, away_name) pair in ``df`` is turned into a slug such as
    "Brighton-and-Hove-Albion-Aston-Villa" (spaces become "-", "&" becomes
    "and"). Each link's last path segment is reduced to its team part by
    dropping the final five dash-separated tokens, e.g.
    "Leicester-City-Liverpool-February-13-2021-Premier-League" becomes
    "Leicester-City-Liverpool". This assumes a three-token date followed by
    a two-token competition name.

    A link is treated as already explored when its team part matches a
    completed slug, either as-is or after removing everything up to and
    including "Derby-". That prefix is presumably a derby name put in front
    of the teams - confirm. Links whose slug is too short to contain a team
    part are kept.

    Parameters
    ----------
    link_list : list of str
        Candidate match report URLs.
    df : pandas.DataFrame
        Previously scraped data with ``home_name`` and ``away_name`` columns.

    Returns
    -------
    list of str
        The links still to scrape, in their original order. ``link_list``
        itself is returned when ``df`` is empty.
    """
    if df.empty:
        return link_list

    # A set gives constant-time membership checks and removes duplicate pairs.
    completed_matchups = {
        str(home + " " + away).replace(" ","-").replace("&","and")
        for home, away in zip(df["home_name"], df["away_name"])
    }

    output_link_list = []
    for link in link_list:
        relevant_info = link.split("/")[-1]
        teams_text = relevant_info.split("-")[:-5]
        # Normalise "and"/"&" in every token after the first so the link
        # matches the slug built from the stored team names.
        words = teams_text[:1] + [
            "and" if (word.lower() == "and" or word == "&") else word
            for word in teams_text[1:]
        ]
        teams_text_final = "-".join(words)

        # Try the full text first so a team that itself starts with "Derby"
        # still matches, then the text after a "Derby-" prefix.
        candidates = {teams_text_final}
        if "Derby-" in teams_text_final:
            candidates.add(teams_text_final.split("Derby-")[1])

        if candidates.isdisjoint(completed_matchups):
            print("Link to add: {}".format(link))
            output_link_list.append(link)

    return output_link_list
    
# Team id -> team name lookup, built once instead of on every row.
# The ids are presumably fbref team identifiers - confirm against the source.
_TEAM_NAMES_BY_ID = {
    "47c64c55": "Crystal Palace",
    "d07537b9": "Brighton & Hove Albion",
    "822bd0ba": "Liverpool",
    "8602292d": "Aston Villa",
    "18bb7c10": "Arsenal",
    "361ca564": "Tottenham Hotspur",
    "d3fd31cc": "Everton",
    "19538871": "Manchester United",
    "33c895d4": "Southampton",
    "8cec06e1": "Wolverhampton Wanderers",
    "7c21e445": "West Ham United",
    "cff3d9bb": "Chelsea",
    "a2d435b3": "Leicester City",
    "5bfb9659": "Leeds United",
    "60c6b05f": "West Bromwich Albion",
    "b8fd03ef": "Manchester City",
    "b2b47a98": "Newcastle United",
    "1df6b87e": "Sheffield United",
    "fd962109": "Fulham",
    "943e8050": "Burnley",
}

def convert_id_to_name(row):
    """Return the team name for the id stored in ``row["id"]``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide an ``"id"`` entry; it is converted to ``str`` before the
        lookup.

    Returns
    -------
    str
        The team name.

    Raises
    ------
    KeyError
        If the id is not in the lookup table. The message names the
        offending id so the table can be extended.
    """
    id_ = str(row["id"])
    try:
        return _TEAM_NAMES_BY_ID[id_]
    except KeyError:
        raise KeyError(
            "Unknown team id {!r}; add it to _TEAM_NAMES_BY_ID".format(id_)
        ) from None

def clean_df(df):
    """Add a ``team`` column and move the match-level columns to the front.

    The team name is derived per row with ``convert_id_to_name``. The input
    frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped data. Must contain ``id`` and every column listed in
        ``to_front`` other than ``team``.

    Returns
    -------
    pandas.DataFrame
        A frame with the ``to_front`` columns first, followed by the
        remaining columns in their existing order.
    """
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    with_team = df.assign(team=df.apply(convert_id_to_name,axis=1))
    to_front = ['team','id','home_name','home_score','home_xg','home_manager','away_name','away_score','away_xg',
                'away_manager','date','time','matchweek','stadium','ref','var_ref']
    rest = [x for x in with_team.columns if x not in to_front]
    return with_team[to_front+rest]

def get_season_data(season_link,output_file,dataExists=False):
    """Scrape the matches of a season that are not stored yet and save them.

    Parameters
    ----------
    season_link : str
        URL of the season schedule page.
    output_file : str
        CSV path. Read first when ``dataExists`` is True, and written
        (without the index) whenever new data was scraped.
    dataExists : bool, default False
        Whether ``output_file`` already holds previously scraped data.

    Returns
    -------
    pandas.DataFrame
        The cleaned data that was written to ``output_file``, or an empty
        DataFrame when there was nothing to scrape or scraping produced no
        rows (the file is not written in those cases).
    """
    # Previously stored rows, used to decide which matches can be skipped.
    previous = pd.read_csv(output_file) if dataExists else pd.DataFrame()

    request = ProxyRequests(season_link)
    request.get()
    schedule_page = soup(request.get_raw(), "html.parser")

    candidate_links = get_all_match_links(schedule_page)
    pending_links = remove_links_already_explored(candidate_links,previous)
    if len(pending_links) == 0:
        print("No links to scrape. Stopping.")
        return pd.DataFrame()

    scraped = scrape_all_links(pending_links)
    if scraped.empty:
        return pd.DataFrame()

    # Append the new rows to the stored ones only when there are stored ones.
    combined = pd.concat([previous,scraped],axis=0) if dataExists else scraped

    result = clean_df(combined)
    result.to_csv(output_file,index=False)
    return result

In [7]:
# Scrape every match not yet stored in the CSV and report how long the run took.
SEASON_URL = "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures"
OUTPUT_CSV = "C:/Users/joeco/Python/fb-ref-data/data/181220_2021_season_player_data.csv"

t1 = time.time()
df = get_season_data(SEASON_URL, OUTPUT_CSV, dataExists=True)
t2 = time.time()
elapsed_seconds = round(t2-t1,2)
print("Took {} seconds".format(elapsed_seconds))

Link to add: https://fbref.com/en/matches/666f6961/Leicester-City-Liverpool-February-13-2021-Premier-League
Link to add: https://fbref.com/en/matches/34a4b546/Crystal-Palace-Burnley-February-13-2021-Premier-League
Link to add: https://fbref.com/en/matches/5d0cd646/Manchester-City-Tottenham-Hotspur-February-13-2021-Premier-League
Link to add: https://fbref.com/en/matches/c7922348/Brighton-and-Hove-Albion-Aston-Villa-February-13-2021-Premier-League
Link to add: https://fbref.com/en/matches/34682a95/Southampton-Wolverhampton-Wanderers-February-14-2021-Premier-League
Link to add: https://fbref.com/en/matches/80124feb/West-Bromwich-Albion-Manchester-United-February-14-2021-Premier-League
Link to add: https://fbref.com/en/matches/85624e5e/Arsenal-Leeds-United-February-14-2021-Premier-League
Link to add: https://fbref.com/en/matches/46f84386/Everton-Fulham-February-14-2021-Premier-League
Link to add: https://fbref.com/en/matches/d6167896/West-Ham-United-Sheffield-United-February-15-2021-Premi

1/112 complete (Leicester-City-Liverpool-February-13-2021-Premier-League)
2/112 complete (Crystal-Palace-Burnley-February-13-2021-Premier-League)
3/112 complete (Manchester-City-Tottenham-Hotspur-February-13-2021-Premier-League)
4/112 complete (Brighton-and-Hove-Albion-Aston-Villa-February-13-2021-Premier-League)
5/112 complete (Southampton-Wolverhampton-Wanderers-February-14-2021-Premier-League)
6/112 complete (West-Bromwich-Albion-Manchester-United-February-14-2021-Premier-League)
7/112 complete (Arsenal-Leeds-United-February-14-2021-Premier-League)
8/112 complete (Everton-Fulham-February-14-2021-Premier-League)
9/112 complete (West-Ham-United-Sheffield-United-February-15-2021-Premier-League)
10/112 complete (Chelsea-Newcastle-United-February-15-2021-Premier-League)
11/112 complete (Burnley-Fulham-February-17-2021-Premier-League)
12/112 complete (Everton-Manchester-City-February-17-2021-Premier-League)
13/112 complete (Wolverhampton-Wanderers-Leeds-United-February-19-2021-Premier-Lea

108/112 complete (Everton-Aston-Villa-May-1-2021-Premier-League)
109/112 complete (Newcastle-United-Arsenal-May-2-2021-Premier-League)
110/112 complete (Tottenham-Hotspur-Sheffield-United-May-2-2021-Premier-League)
111/112 complete (West-Bromwich-Albion-Wolverhampton-Wanderers-May-3-2021-Premier-League)
112/112 complete (Burnley-West-Ham-United-May-3-2021-Premier-League)
Took 2354.98 seconds


In [49]:
# Rough timings observed (approximate, not benchmarked):
#   - about 10 seconds when there is no new data to add
#   - around 40-70 seconds per match that has to be scraped
# Show the last 10 rows of df (assigned from get_season_data in the run cell).
df.tail(10)

Unnamed: 0,team,id,home_name,home_score,home_xg,home_manager,away_name,away_score,away_xg,away_manager,...,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct,progressive_carries,carries_into_final_third,carries_into_penalty_area,progressive_passes_received
16,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,3,0,1,0.0,1,1,0,1
17,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,2,0,0,,2,2,0,2
18,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,6,0,0,,4,2,0,2
19,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,9,0,2,0.0,2,3,0,0
20,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,12,2,1,66.7,11,3,1,3
21,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,8,2,1,66.7,1,0,0,2
22,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,3,2,2,50.0,2,0,0,0
23,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,10,2,1,66.7,1,0,0,0
24,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,8,1,1,50.0,0,0,0,1
25,Crystal Palace,47c64c55,Leeds United,2,2.8,Marcelo Bielsa,Crystal Palace,0,0.5,Roy Hodgson,...,0,0,0,0,0,,0,0,0,0
