In [2]:
import requests
from bs4 import BeautifulSoup
import re
import datetime
import pandas as pd
import json

In [3]:
# Readable labels for the short codes stored under each event's 'type' key
# (looked up by getMatchEvents).
event_dict = {
    'PS': 'Play Start',
    'PE': 'Play End',
    'G': 'Goal',
    'S': 'Substitution',
    'O': 'Own Goal',
    'B': 'Booking',
    'P': 'Penalty',
}

# Readable labels for the short codes stored under each event's optional
# 'description' key (looked up by getMatchEvents).
description_dict = {
    'G': 'Goal',
    'Y': 'Yellow',
    'YR': 'Red (Second Yellow)',
    'R': 'Red',
    'ON': 'Player On',
    'OFF': 'Player Off',
    'P': 'Penalty',
    'O': 'Own Goal',
}

# Season label -> half-open range of match IDs (first ID inclusive, end ID
# exclusive). jsonDump iterates a season's range and passes each ID to getMatch.
season_dict = {
    label: range(first_id, end_id)
    for label, first_id, end_id in (
        ('1992-93', 1, 463),
        ('1993-94', 463, 925),
        ('1994-95', 925, 1387),
        ('1995-96', 1387, 1767),
        ('1996-97', 1767, 2147),
        ('1997-98', 2147, 2527),
        ('1998-99', 2527, 2907),
        ('1999-00', 2907, 3287),
        ('2000-01', 3287, 3667),
        ('2001-02', 3667, 4047),
        ('2002-03', 4047, 4427),
        ('2003-04', 4427, 4807),
        ('2004-05', 4807, 5187),
        ('2005-06', 5187, 5567),
        ('2006-07', 5567, 5947),
        ('2007-08', 5947, 6327),
        ('2008-09', 6327, 6707),
        ('2009-10', 6707, 7087),
        ('2010-11', 7087, 7467),
        ('2011-12', 7467, 7847),
        ('2012-13', 7864, 8244),
        ('2013-14', 9231, 9611),
        ('2014-15', 9611, 9991),
        ('2015-16', 12115, 12495),
        ('2016-17', 14040, 14420),
    )
}

In [4]:
def getMatch(matchID, timeout = 30) :
    """Download a premierleague.com match page and decode its embedded fixture JSON.

    The fixture is read from the ``data-fixture`` attribute of the element
    whose class is ``mcTabsContainer``.

    Parameters
    ----------
    matchID : int or str
        Identifier appended to ``https://www.premierleague.com/match/``.
    timeout : float, optional
        Seconds ``requests`` waits for the server before raising; without a
        timeout a stalled connection would block indefinitely.

    Returns
    -------
    object or bool
        The decoded JSON value (the other helpers in this notebook index it
        as a dict), or ``False`` when the container element or attribute is
        missing, or the attribute does not hold valid JSON.

    Raises
    ------
    requests.RequestException
        Network errors, including timeouts, are not caught here.
    """
    url = "https://www.premierleague.com/match/" + str(matchID)
    response = requests.get(url, timeout = timeout)
    soup = BeautifulSoup(response.content, "lxml")

    container = soup.find(attrs = {"class" : "mcTabsContainer"})
    if container is None :
        return False

    # Take the attribute value straight from the parsed tag instead of slicing
    # it out of the tag's serialized HTML. Serialization re-escapes the value
    # ('&' -> '&amp;', and '"' -> '&quot;' when the JSON also contains an
    # apostrophe), which corrupts the JSON, and str.lstrip/str.rstrip remove
    # any characters from a set rather than a literal prefix/suffix.
    raw_fixture = container.get("data-fixture")
    if raw_fixture is None :
        return False

    try :
        return json.loads(raw_fixture)
    except (TypeError, ValueError) :
        # json.JSONDecodeError is a ValueError; report undecodable data as False
        # so callers can count it as a bad match.
        return False

In [5]:
def getTeams(match_data) :
    """Return ``((home_name, home_id), (away_name, away_id))``.

    The first entry of ``match_data['teams']`` is treated as the home side
    and the second as the away side; name and id are read from each entry's
    ``'team'`` mapping.
    """
    sides = match_data['teams']

    home = sides[0]['team']
    home_pair = (home['name'], home['id'])

    away = sides[1]['team']
    away_pair = (away['name'], away['id'])

    return home_pair, away_pair

In [6]:
def getFullTimeScore(match_data, str_output = False) :
    """Return the final score read from the two entries of ``match_data['teams']``.

    With ``str_output`` falsy the result is the tuple ``(home, away)``;
    otherwise it is the string ``'<home>-<away>'``.
    """
    sides = match_data['teams']
    home_goals = sides[0]['score']
    away_goals = sides[1]['score']

    if not str_output :
        return home_goals, away_goals
    return '-'.join((str(home_goals), str(away_goals)))

In [7]:
def getHalfTimeScore(match_data, str_output = False) :
    """Return the half-time score stored under ``match_data['halfTimeScore']``.

    With ``str_output`` falsy the result is the tuple ``(home, away)``;
    otherwise it is the string ``'<home>-<away>'``.
    """
    interval = match_data['halfTimeScore']
    home_goals = interval['homeScore']
    away_goals = interval['awayScore']

    if not str_output :
        return home_goals, away_goals
    return '-'.join((str(home_goals), str(away_goals)))

In [8]:
def getMatchInfo(match_data) :
    """Collect the general details of a match from its fixture data.

    Returns the tuple ``(competition, season, gameweek, date, stadium, city,
    attendance, match_officials)``, where ``match_officials`` is a list of
    ``(display_name, role)`` pairs. Every key is required; a missing one
    raises ``KeyError``.
    """
    week = match_data['gameweek']
    comp_season = week['compSeason']

    competition = comp_season['competition']['description']
    season = comp_season['label']
    gameweek = week['gameweek']

    # Kick-off is stored in milliseconds; fromtimestamp() expects seconds and
    # produces a naive datetime in the machine's local timezone.
    kickoff = datetime.datetime.fromtimestamp(match_data['kickoff']['millis'] / 1e3)

    ground = match_data['ground']
    stadium = ground['name']
    city = ground['city']

    attendance = match_data['attendance']
    officials = [(official['name']['display'], official['role'])
                 for official in match_data['matchOfficials']]

    return (competition, season, gameweek, kickoff, stadium, city, attendance, officials)

In [9]:
def getTeamsInfo(match_data) :
    """Return ``(home_team, home_formation, away_team, away_formation)``.

    Index 0 of ``match_data['teamLists']`` is treated as the home side and
    index 1 as the away side. Each ``*_team`` value is a list of
    ``(display_name, id, matchPosition)`` tuples, one per entry of that
    side's ``'lineup'``. Each ``*_formation`` value is the formation label,
    or ``None`` when the side has no ``'formation'`` key.
    """
    def formation_label(sheet) :
        # The formation is optional; fall back to None when it is absent.
        return sheet['formation']['label'] if 'formation' in sheet else None

    def lineup_rows(sheet) :
        return [(player['name']['display'], player['id'], player['matchPosition'])
                for player in sheet['lineup']]

    home_formation = formation_label(match_data['teamLists'][0])
    away_formation = formation_label(match_data['teamLists'][1])

    home_team = lineup_rows(match_data['teamLists'][0])
    away_team = lineup_rows(match_data['teamLists'][1])

    return home_team, home_formation, away_team, away_formation

In [49]:
def getMatchEvents(match_data) :
    """Flatten ``match_data['events']`` into a list of tuples.

    Each tuple is ``(gameweek, match_id, minute, event_type,
    event_description, player, team, assister)``:

    * ``minute`` is the event's clock label;
    * ``event_type`` is the event's ``'type'`` code translated through
      ``event_dict`` (an unknown code raises ``KeyError``);
    * ``event_description`` is the ``'description'`` code translated through
      ``description_dict``, or ``None`` when the event has no description;
    * ``player``, ``team`` and ``assister`` are the raw ``'personId'``,
      ``'teamId'`` and ``'assistId'`` values, or ``None`` when absent.
    """
    feed = match_data['events']
    gameweek = match_data['gameweek']['gameweek']
    match_id = match_data['id']

    def to_row(entry) :
        clock_label = entry['clock']['label']
        kind = event_dict[entry['type']]
        detail = description_dict[entry['description']] if 'description' in entry else None
        return (gameweek, match_id, clock_label, kind, detail,
                entry.get('personId'), entry.get('teamId'), entry.get('assistId'))

    return [to_row(entry) for entry in feed]

In [11]:
def jsonDump(season, path) :
    """Fetch every match of ``season`` and save the results as JSON.

    ``season`` must be a key of ``season_dict``. ``path`` is used as a plain
    string prefix (it is concatenated, not joined), so a directory needs its
    trailing separator. The data is written to ``path + season + '.json'``,
    and a one-line summary of how many matches came back as ``False`` is
    appended to ``path + 'stats.txt'``.
    """
    matches = []
    for match_id in season_dict[season] :
        matches.append(getMatch(match_id))

    with open(path + season + '.json', 'w') as season_file :
        json.dump(matches, season_file)

    failed = matches.count(False)
    with open(path + 'stats.txt', 'a') as stats_file :
        stats_file.write('{}, {}/{} Bad Data\n'.format(season, failed, len(matches)))