In [6]:
import pandas as pd
from proxy_requests import ProxyRequests
import requests
from bs4 import BeautifulSoup as soup
import json
import time
import pathlib

In [2]:
def urls_to_json_yield(urls):
    for url in urls:
        r = ProxyRequests(url)
        r.get()
        page = soup(r.get_raw(), "html.parser")
        fixture_data = page.find("div",{"class":"mcTabsContainer"})["data-fixture"]
        yield fixture_data

In [3]:
base_url = "https://www.premierleague.com/match/"
urls0607_1415 = [base_url + str(i) for i in range(5567,9991)] # 5567 to 9990 inclusive
urls1516 = [base_url + str(i) for i in range(12115,12495)] # 12115 to 12494 inclusive
urls1617 = [base_url + str(i) for i in range(14040,14420)] # 14040 to 14419 inclusive
urls1718 = [base_url + str(i) for i in range(22342,22722)] # 22342 to 22721 inclusive
urls1819 = [base_url + str(i) for i in range(38308,38688)] # 38308 to 38687 inclusive
urls1920 = [base_url + str(i) for i in range(46605,46985)] # 46605 to 46984 inclusive
urls = urls0607_1415 + urls1516 + urls1617 + urls1718 + urls1819 + urls1920

In [4]:
test_set = [base_url + str(i) for i in range(5567,5571)] # 4 matches, 5567, 5568, 5569, 5570
test_set

['https://www.premierleague.com/match/5567',
 'https://www.premierleague.com/match/5568',
 'https://www.premierleague.com/match/5569',
 'https://www.premierleague.com/match/5570']

In [17]:
test_fixture_data_yielder = urls_to_json_yield(test_set)

In [4]:
def yield_then_store(generator_function):
    json_data_from_generator = next(generator_function)
    json_data = json.loads(json_data_from_generator)
    gameweek_data = json_data["gameweek"]
    season = gameweek_data["compSeason"]["label"]
    season_formatted = season.replace("/","")
    gameweek = gameweek_data["gameweek"]
    id_ = json_data["id"]
    filename = str("data/") + str(season_formatted) + "_" + str(gameweek) + "_" + str(id_) + ".json"
    with open(filename, 'w') as json_file:
        json.dump(json_data, json_file)

In [19]:
for i in range(len(test_set)):
    yield_then_store(test_fixture_data_yielder)

In [12]:
def yield_then_store_by_gameweek(generator_function,first_gameweek):
    current_gameweek = first_gameweek
    final_data = {}
    while True:
        try:
            json_data_from_generator = next(generator_function)
        except:
            break
        json_data = json.loads(json_data_from_generator)
        gameweek_data = json_data["gameweek"]
        # "if not prem, skip" check
        competition = str(gameweek_data["compSeason"]["competition"]["abbreviation"])
        if competition == "EN_PR":
            gameweek = str(gameweek_data["gameweek"])
            if gameweek == current_gameweek:
                season = gameweek_data["compSeason"]["label"]
                season_formatted = season.replace("/","")
                id_ = json_data["id"]
                final_data[str(id_)] = json_data
            else:
                tagline = str("data/") + str(season_formatted) + "_" + str(current_gameweek)
                filename = tagline + ".json"
                path = pathlib.Path(filename)
                while path.exists():
                    new_tagline = tagline + "_"
                    tagline = new_tagline
                    filename = tagline + ".json"
                    path = pathlib.Path(filename)
                with open(filename, 'w') as json_file:
                    json.dump(final_data, json_file)
                print("Gameweek {} complete, stored as {}".format(current_gameweek,filename))
                current_gameweek = gameweek
            
                # Then redo all the work for this current match, which is part of a new gameweek
                season = gameweek_data["compSeason"]["label"]
                season_formatted = season.replace("/","")
                id_ = json_data["id"]
                final_data = {}
                final_data[str(id_)] = json_data
        else:
            print("Non prem game found (Id:{}): Competition = {}".format(json_data["id"],competition))
    # Store all extra data not yet stored if there is any
    if final_data != {}:
        tagline = str("data/") + str(season_formatted) + "_" + str(current_gameweek)
        filename = tagline + ".json"
        path = pathlib.Path(filename)
        while path.exists():
            new_tagline = tagline + "_"
            tagline = new_tagline
            filename = tagline + ".json"
            path = pathlib.Path(filename)
        with open(filename, 'w') as json_file:
            json.dump(final_data, json_file)
    print("Final ID Saved: {}".format(id_))
    print("Done")

In [49]:
test_set_2 = [base_url + str(i) for i in range(5567,5590)] # 23 matches, should have 3 files, 240kb, 240kb and 72 kb ish.

In [46]:
import time
t1 = time.time()
test_2_fixture_data_yielder = urls_to_json_yield(test_set_2)
yield_then_store_by_gameweek(test_2_fixture_data_yielder)
t2 = time.time()
print("Took {} seconds".format(round(t2-t1,2)))

Gameweek 1 complete, stored as data/200607_1.json
Gameweek 2 complete, stored as data/200607_2.json
Done
Took 603.96 seconds


In [50]:
big_test_set = [base_url + str(i) for i in range(5585,6500)]
t1 = time.time()
big_test_yielder = urls_to_json_yield(big_test_set)
yield_then_store_by_gameweek(big_test_yielder)
t2 = time.time()
print("Took {} seconds".format(round(t2-t1,2)))

Gameweek 3 complete, stored as data/200607_3.json
Gameweek 4 complete, stored as data/200607_4.json
Gameweek 5 complete, stored as data/200607_5.json
Gameweek 6 complete, stored as data/200607_6.json
Gameweek 7 complete, stored as data/200607_7.json
Gameweek 8 complete, stored as data/200607_8.json
Gameweek 9 complete, stored as data/200607_9.json
Gameweek 10 complete, stored as data/200607_10.json
Gameweek 11 complete, stored as data/200607_11.json
Gameweek 12 complete, stored as data/200607_12.json
Gameweek 13 complete, stored as data/200607_13.json
Gameweek 14 complete, stored as data/200607_14.json
Gameweek 15 complete, stored as data/200607_15.json
Gameweek 16 complete, stored as data/200607_16.json
Gameweek 17 complete, stored as data/200607_17.json
Gameweek 18 complete, stored as data/200607_18.json
Gameweek 19 complete, stored as data/200607_19.json
Gameweek 20 complete, stored as data/200607_20.json
Gameweek 21 complete, stored as data/200607_21.json
Gameweek 22 complete, stor

In [53]:
chunk2 = [base_url + str(i) for i in range(6500,7500)]
t1 = time.time()
chunk2_yielder = urls_to_json_yield(chunk2)
yield_then_store_by_gameweek(chunk2_yielder)
t2 = time.time()
print("Took {} seconds".format(round(t2-t1,2)))

Gameweek 18 complete, stored as data/200809_18.json
Gameweek 19 complete, stored as data/200809_19.json
Gameweek 20 complete, stored as data/200809_20.json
Gameweek 21 complete, stored as data/200809_21.json
Gameweek 22 complete, stored as data/200809_22.json
Gameweek 23 complete, stored as data/200809_23.json
Gameweek 24 complete, stored as data/200809_24.json
Gameweek 25 complete, stored as data/200809_25.json
Done
Took 2410.47 seconds


In [55]:
chunk3 = [base_url + str(i) for i in range(6576,7500)]
t1 = time.time()
chunk3_yielder = urls_to_json_yield(chunk3)
yield_then_store_by_gameweek(chunk3_yielder,"26")
t2 = time.time()
print("Took {} seconds".format(round(t2-t1,2)))

Gameweek 26 complete, stored as data/200809_26.json
Gameweek 27 complete, stored as data/200809_27.json
Gameweek 28 complete, stored as data/200809_28.json
Gameweek 29 complete, stored as data/200809_29.json
Gameweek 30 complete, stored as data/200809_30.json
Done
Took 1589.21 seconds


In [56]:
chunk4 = [base_url + str(i) for i in range(6626,7500)]
t1 = time.time()
chunk4_yielder = urls_to_json_yield(chunk4)
yield_then_store_by_gameweek(chunk4_yielder,"31")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 31 complete, stored as data/200809_31.json
Gameweek 32 complete, stored as data/200809_32.json
Gameweek 33 complete, stored as data/200809_33.json
Gameweek 34 complete, stored as data/200809_34.json
Gameweek 35 complete, stored as data/200809_35.json
Gameweek 36 complete, stored as data/200809_36.json
Gameweek 37 complete, stored as data/200809_37.json
Gameweek 38 complete, stored as data/200809_38.json
Gameweek 39 complete, stored as data/200809_39.json
Gameweek 1 complete, stored as data/200910_1.json
Done
Took 2764.98 seconds


In [58]:
chunk5 = [base_url + str(i) for i in range(6717,7500)]
t1 = time.time()
chunk5_yielder = urls_to_json_yield(chunk5)
yield_then_store_by_gameweek(chunk5_yielder,"2")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 2 complete, stored as data/200910_2.json
Gameweek 3 complete, stored as data/200910_3.json
Gameweek 4 complete, stored as data/200910_4.json
Gameweek 5 complete, stored as data/200910_5.json
Gameweek 6 complete, stored as data/200910_6.json
Gameweek 7 complete, stored as data/200910_7.json
Gameweek 8 complete, stored as data/200910_8.json
Gameweek 9 complete, stored as data/200910_9.json
Gameweek 10 complete, stored as data/200910_10.json
Gameweek 11 complete, stored as data/200910_11.json
Gameweek 12 complete, stored as data/200910_12.json
Gameweek 13 complete, stored as data/200910_13.json
Gameweek 14 complete, stored as data/200910_14.json
Gameweek 15 complete, stored as data/200910_15.json
Gameweek 16 complete, stored as data/200910_16.json
Gameweek 17 complete, stored as data/200910_17.json
Gameweek 18 complete, stored as data/200910_18.json
Gameweek 19 complete, stored as data/200910_19.json
Gameweek 20 complete, stored as data/200910_20.json
Gameweek 21 complete, stored

In [59]:
chunk6 = [base_url + str(i) for i in range(7496,7900)]
t1 = time.time()
chunk6_yielder = urls_to_json_yield(chunk6)
yield_then_store_by_gameweek(chunk6_yielder,"4")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 4 complete, stored as data/201112_4.json
Gameweek 5 complete, stored as data/201112_5.json
Gameweek 6 complete, stored as data/201112_6.json
Gameweek 7 complete, stored as data/201112_7.json
Gameweek 8 complete, stored as data/201112_8.json
Gameweek 9 complete, stored as data/201112_9.json
Gameweek 10 complete, stored as data/201112_10.json
Gameweek 11 complete, stored as data/201112_11.json
Gameweek 12 complete, stored as data/201112_12.json
Gameweek 13 complete, stored as data/201112_13.json
Gameweek 14 complete, stored as data/201112_14.json
Gameweek 15 complete, stored as data/201112_15.json
Gameweek 16 complete, stored as data/201112_16.json
Gameweek 17 complete, stored as data/201112_17.json
Gameweek 18 complete, stored as data/201112_18.json
Gameweek 19 complete, stored as data/201112_19.json
Gameweek 20 complete, stored as data/201112_20.json
Gameweek 21 complete, stored as data/201112_21.json
Gameweek 22 complete, stored as data/201112_22.json
Gameweek 23 complete, st

KeyError: 'gameweek'

In [68]:
chunk7 = [base_url + str(i) for i in range(7837,9991)]
t1 = time.time()
chunk7_yielder = urls_to_json_yield(chunk7)
yield_then_store_by_gameweek(chunk7_yielder,"37")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Non prem game found (Id:7847): Competition = AFAPL
Non prem game found (Id:7848): Competition = AFAPL
Non prem game found (Id:7849): Competition = AFAPL
Non prem game found (Id:7850): Competition = AFAPL
Non prem game found (Id:7851): Competition = AFAPL
Non prem game found (Id:7852): Competition = AFAPL
Non prem game found (Id:7853): Competition = AFAPL
Non prem game found (Id:7854): Competition = AFAPL
Non prem game found (Id:7855): Competition = AFAPL
Non prem game found (Id:7856): Competition = AFAPL
Non prem game found (Id:7857): Competition = AFAPL
Non prem game found (Id:7858): Competition = AFAPL
Non prem game found (Id:7859): Competition = AFAPL
Non prem game found (Id:7860): Competition = AFAPL
Non prem game found (Id:7861): Competition = AFAPL
Non prem game found (Id:7862): Competition = AFAPL
Non prem game found (Id:7863): Competition = AFAPL
Gameweek 37 complete, stored as data/201112_37.json
Gameweek 1 complete, stored as data/201213_1.json
Gameweek 2 complete, stored as 

In [7]:
chunk1315 = [base_url + str(i) for i in range(9231,9991)]
t1 = time.time()
chunk1315_yielder = urls_to_json_yield(chunk1315)
yield_then_store_by_gameweek(chunk1315_yielder,"1")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 1 complete, stored as data/201314_1.json
Gameweek 2 complete, stored as data/201314_2.json
Gameweek 3 complete, stored as data/201314_3.json
Final ID Saved: 9266
Done
Took 888.38 seconds = 14.81 mins = 0.25 hours


In [8]:
chunk1315_2 = [base_url + str(i) for i in range(9261,9991)]
t1 = time.time()
chunk1315_2_yielder = urls_to_json_yield(chunk1315_2)
yield_then_store_by_gameweek(chunk1315_2_yielder,"4")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 4 complete, stored as data/201314_4.json
Gameweek 5 complete, stored as data/201314_5.json
Gameweek 6 complete, stored as data/201314_6.json
Gameweek 7 complete, stored as data/201314_7.json
Gameweek 8 complete, stored as data/201314_8.json
Gameweek 9 complete, stored as data/201314_9.json
Gameweek 10 complete, stored as data/201314_10.json
Gameweek 11 complete, stored as data/201314_11.json
Gameweek 12 complete, stored as data/201314_12.json
Gameweek 13 complete, stored as data/201314_13.json
Gameweek 14 complete, stored as data/201314_14.json
Gameweek 15 complete, stored as data/201314_15.json
Gameweek 16 complete, stored as data/201314_16.json
Gameweek 17 complete, stored as data/201314_17.json
Gameweek 18 complete, stored as data/201314_18.json
Gameweek 19 complete, stored as data/201314_19.json
Gameweek 20 complete, stored as data/201314_20.json
Gameweek 21 complete, stored as data/201314_21.json
Gameweek 22 complete, stored as data/201314_22.json
Gameweek 23 complete, st

In [11]:
urls1516 = [base_url + str(i) for i in range(12115,12495)] # 12115 to 12494 inclusive
urls1617 = [base_url + str(i) for i in range(14040,14420)] # 14040 to 14419 inclusive
urls1718 = [base_url + str(i) for i in range(22342,22722)] # 22342 to 22721 inclusive
urls1819 = [base_url + str(i) for i in range(38308,38688)] # 38308 to 38687 inclusive
urls1920 = [base_url + str(i) for i in range(46605,46985)] # 46605 to 46984 inclusive
chunk1517 = urls1516 + urls1617
t1 = time.time()
chunk1517_yielder = urls_to_json_yield(chunk1517)
yield_then_store_by_gameweek(chunk1517_yielder,"1")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 1 complete, stored as data/201516_1.json
Gameweek 2 complete, stored as data/201516_2.json
Gameweek 3 complete, stored as data/201516_3.json
Gameweek 4 complete, stored as data/201516_4.json
Gameweek 5 complete, stored as data/201516_5.json
Gameweek 6 complete, stored as data/201516_6.json
Gameweek 7 complete, stored as data/201516_7.json
Gameweek 8 complete, stored as data/201516_8.json
Gameweek 9 complete, stored as data/201516_9.json
Gameweek 10 complete, stored as data/201516_10.json
Gameweek 11 complete, stored as data/201516_11.json
Gameweek 12 complete, stored as data/201516_12.json
Gameweek 13 complete, stored as data/201516_13.json
Gameweek 14 complete, stored as data/201516_14.json
Gameweek 15 complete, stored as data/201516_15.json
Gameweek 16 complete, stored as data/201516_16.json
Gameweek 17 complete, stored as data/201516_17.json
Gameweek 18 complete, stored as data/201516_18.json
Gameweek 19 complete, stored as data/201516_19.json
Gameweek 20 complete, stored a

In [13]:
t1 = time.time()
urls1718_yielder = urls_to_json_yield(urls1718)
yield_then_store_by_gameweek(urls1718_yielder,"1")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 1 complete, stored as data/201718_1.json
Gameweek 2 complete, stored as data/201718_2.json
Gameweek 32 complete, stored as data/201718_32.json
Gameweek 3 complete, stored as data/201718_3.json
Gameweek 4 complete, stored as data/201718_4.json
Gameweek 5 complete, stored as data/201718_5.json
Gameweek 6 complete, stored as data/201718_6.json
Gameweek 7 complete, stored as data/201718_7.json
Gameweek 8 complete, stored as data/201718_8.json
Gameweek 9 complete, stored as data/201718_9.json
Gameweek 10 complete, stored as data/201718_10.json
Gameweek 11 complete, stored as data/201718_11.json
Gameweek 12 complete, stored as data/201718_12.json
Gameweek 13 complete, stored as data/201718_13.json
Gameweek 14 complete, stored as data/201718_14.json
Gameweek 15 complete, stored as data/201718_15.json
Gameweek 16 complete, stored as data/201718_16.json
Gameweek 17 complete, stored as data/201718_17.json
Gameweek 18 complete, stored as data/201718_18.json
Gameweek 19 complete, stored a

In [9]:
urls1718 = [base_url + str(i) for i in range(22342,22722)] # 22342 to 22721 inclusive
urls1819 = [base_url + str(i) for i in range(38308,38688)] # 38308 to 38687 inclusive
urls1920 = [base_url + str(i) for i in range(46605,46985)] # 46605 to 46984 inclusive
urls1718_2 = [base_url + str(i) for i in range(22612,22722)]
t1 = time.time()
urls1718_2_yielder = urls_to_json_yield(urls1718_2)
yield_then_store_by_gameweek(urls1718_2_yielder,"28")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 28 complete, stored as data/201718_28.json
Gameweek 29 complete, stored as data/201718_29.json
Gameweek 30 complete, stored as data/201718_30.json
Gameweek 31 complete, stored as data/201718_31.json
Gameweek 34 complete, stored as data/201718_34.json
Gameweek 31 complete, stored as data/201718_31.json
Gameweek 37 complete, stored as data/201718_37.json
Gameweek 31 complete, stored as data/201718_31.json
Gameweek 37 complete, stored as data/201718_37.json
Gameweek 31 complete, stored as data/201718_31.json
Gameweek 37 complete, stored as data/201718_37.json
Gameweek 32 complete, stored as data/201718_32.json
Gameweek 2 complete, stored as data/201718_2.json
Gameweek 32 complete, stored as data/201718_32.json
Gameweek 33 complete, stored as data/201718_33.json
Gameweek 34 complete, stored as data/201718_34.json
Gameweek 35 complete, stored as data/201718_35.json
Gameweek 34 complete, stored as data/201718_34.json
Gameweek 37 complete, stored as data/201718_37.json
Gameweek 35 co

In [14]:
urls1617 = [base_url + str(i) for i in range(14040,14420)] # 14040 to 14419 inclusive
urls1718 = [base_url + str(i) for i in range(22342,22722)] # 22342 to 22721 inclusive
urls1819 = [base_url + str(i) for i in range(38308,38688)] # 38308 to 38687 inclusive
urls1920 = [base_url + str(i) for i in range(46605,46985)] # 46605 to 46984 inclusive
t1 = time.time()
urls1617_yielder = urls_to_json_yield(urls1617)
yield_then_store_by_gameweek(urls1617_yielder,"1")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 1 complete, stored as data/201617_1.json
Gameweek 2 complete, stored as data/201617_2.json
Gameweek 28 complete, stored as data/201617_28.json
Gameweek 2 complete, stored as data/201617_2_.json
Gameweek 3 complete, stored as data/201617_3.json
Gameweek 4 complete, stored as data/201617_4.json
Gameweek 5 complete, stored as data/201617_5.json
Gameweek 6 complete, stored as data/201617_6.json
Gameweek 7 complete, stored as data/201617_7.json
Gameweek 8 complete, stored as data/201617_8.json
Gameweek 9 complete, stored as data/201617_9.json
Gameweek 10 complete, stored as data/201617_10.json
Gameweek 11 complete, stored as data/201617_11.json
Gameweek 12 complete, stored as data/201617_12.json
Gameweek 13 complete, stored as data/201617_13.json
Gameweek 14 complete, stored as data/201617_14.json
Gameweek 15 complete, stored as data/201617_15.json
Gameweek 16 complete, stored as data/201617_16.json
Gameweek 17 complete, stored as data/201617_17.json
Gameweek 18 complete, stored as

In [16]:
t1 = time.time()
rest_of_data = urls1718 + urls1819 + urls1920
rest_yielder = urls_to_json_yield(rest_of_data)
yield_then_store_by_gameweek(rest_yielder,"1")
t2 = time.time()
print("Took {} seconds = {} mins = {} hours".format(round(t2-t1,2) , round((t2-t1)/60,2) , round((t2-t1)/3600,2) ))

Gameweek 1 complete, stored as data/201718_1.json
Gameweek 2 complete, stored as data/201718_2.json
Gameweek 32 complete, stored as data/201718_32.json
Gameweek 3 complete, stored as data/201718_3.json
Gameweek 4 complete, stored as data/201718_4.json
Gameweek 5 complete, stored as data/201718_5.json
Gameweek 6 complete, stored as data/201718_6.json
Gameweek 7 complete, stored as data/201718_7.json
Gameweek 8 complete, stored as data/201718_8.json
Gameweek 9 complete, stored as data/201718_9.json
Gameweek 10 complete, stored as data/201718_10.json
Gameweek 11 complete, stored as data/201718_11.json
Gameweek 12 complete, stored as data/201718_12.json
Gameweek 13 complete, stored as data/201718_13.json
Gameweek 14 complete, stored as data/201718_14.json
Gameweek 15 complete, stored as data/201718_15.json
Gameweek 16 complete, stored as data/201718_16.json
Gameweek 17 complete, stored as data/201718_17.json
Gameweek 18 complete, stored as data/201718_18.json
Gameweek 19 complete, stored a