In [473]:
#importing packages

import os 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [474]:
# unpack single season data
def _read_csv_or_none(csv_fp, **read_kwargs):
    """Return ``pd.read_csv(csv_fp, **read_kwargs)``, or None when the file cannot be read.

    Only I/O failures (OSError) and ValueError-derived failures (pandas parser,
    empty-file and unicode-decode errors) are treated as "not available";
    anything else, including KeyboardInterrupt, propagates to the caller.
    """
    try:
        return pd.read_csv(csv_fp, **read_kwargs)
    except (OSError, ValueError):
        return None


def unpack_season(year_fp, current, teams=None):
    """Load every data file of one season directory into a dictionary.

    Parameters
    ----------
    year_fp : str
        Path of the season directory; its last component is the season name
        (e.g. ``'data/2018-19'``).
    current : str
        Name of the current season (e.g. ``'2019-20'``). It decides which files
        are read: the current season contributes ``fixtures.csv`` and
        ``teams.csv``, any other season contributes ``season-XXYY.csv`` and the
        per-gameweek files in ``gws/``.
    teams : pandas.DataFrame, optional
        Teams table handed to ``get_squads``. When None, the ``teams.csv`` of
        this season is used, which is only loaded for the current season.

    Returns
    -------
    dict
        Always holds ``'players'``; ``get_squads`` replaces ``'players_raw'``
        with ``'squads'``. The current season also holds ``'fixtures'`` and
        ``'teams'``; other seasons hold ``'current_season'`` and ``'gw_data'``.

    Raises
    ------
    KeyError
        If ``teams`` is None and ``year_fp`` is not the current season, because
        no ``'teams'`` table was loaded.
    """
    #dictionaries to store gameweek details and player details
    gw_data = {}
    players = {}

    #dictionary to store season data
    data = {}

    #season name is the last path component, whatever the separator or depth
    season_name = os.path.basename(os.path.normpath(year_fp))

    #generating file paths for the data
    players_raw_fp = os.path.join(year_fp, 'players_raw.csv')
    gw_data_fp = os.path.join(year_fp, 'gws')
    players_fp = os.path.join(year_fp, 'players')

    #'2018-19' -> 'season-1819.csv'
    season = 'season-' + season_name[2:].replace('-', '') + '.csv'
    season_fp = os.path.join(year_fp, season)

    #reading in dataframes and storing in the data dictionary
    data['players_raw'] = pd.read_csv(players_raw_fp)

    #differing data files between the current season and past seasons
    if season_name != current:
        data['current_season'] = clean_results(pd.read_csv(season_fp))

        #gw data only available for past seasons
        for filename in os.listdir(gw_data_fp):
            key_val = filename.split('.')[0]
            gw_frame = _read_csv_or_none(os.path.join(gw_data_fp, filename),
                                         encoding="ISO-8859-1")
            #unreadable entries are skipped, as before
            if gw_frame is not None:
                gw_data[key_val] = gw_frame

        data['gw_data'] = gw_data
    else:
        #read from the directory that was passed in rather than a fixed 'data' root
        data['fixtures'] = pd.read_csv(os.path.join(year_fp, 'fixtures.csv'))
        data['teams'] = pd.read_csv(os.path.join(year_fp, 'teams.csv'))

    #looping through all players and getting their gameweek and past season history
    for player_dir in os.listdir(players_fp):
        players[player_dir] = {}

        # we dont have gameweek data at the start of a season
        player_gw = _read_csv_or_none(os.path.join(players_fp, player_dir, 'gw.csv'))
        if player_gw is not None:
            players[player_dir]['player_gw'] = player_gw

        # new players in the league do not have any history.csv file
        player_history = _read_csv_or_none(os.path.join(players_fp, player_dir, 'history.csv'))
        if player_history is not None:
            players[player_dir]['players_history'] = player_history
    data['players'] = players

    #we only want players that are playing the current season
    if teams is None:
        get_squads(data, data['teams'])
    else:
        get_squads(data, teams)

    print(data.keys())
    print()

    return data

In [475]:
def get_squads(season,teams):
    """Split the season's raw player table into one DataFrame per team.

    ``season['players_raw']`` is joined to ``teams`` on the team code, trimmed
    to the useful columns, filtered to players whose ``status`` is not ``'n'``
    and grouped by team name. The ``'players_raw'`` entry is then removed from
    ``season`` and the result is stored under ``season['squads']``.

    Returns the ``{team name: squad DataFrame}`` dictionary.
    """
    raw = season['players_raw']

    # both helper columns are written onto the frame held by the caller:
    # a unique id matching the keys of the players dictionary, and the web name
    raw['player_name_code'] = raw['first_name'] + '_' + raw['second_name'] + '_' + raw['id'].map(str)
    raw['name'] = raw['web_name']

    # attach the team name; the shared 'name'/'code' columns get _x/_y suffixes
    team_lookup = teams[['name','code']]
    roster = raw.merge(team_lookup, how = 'left', left_on = 'team_code', right_on = 'code')
    roster['team'] = roster['name_y']
    roster['name'] = roster['name_x']

    # columns that are no longer needed once team and name are in place
    redundant = ['photo','first_name','second_name','web_name','code_x','code_y','name_y','name_x','team_code']
    roster = roster.drop(redundant, axis = 1)

    # keep only players that have not left the league
    roster = roster.loc[roster['status'] != 'n']

    # one independent copy of the matching rows per team of the current season
    squads = {
        team_name: roster.loc[roster['team'] == team_name].copy()
        for team_name in teams['name'].values
    }

    # the raw table is superseded by the squads
    del season['players_raw']
    season['squads'] = squads
    return squads
    
    
    

In [482]:

def clean_results(df):
    """Rename result columns to descriptive names and keep the match statistics.

    The mapping is read from ``data/data_dict.txt`` (relative to the working
    directory). For every line of that file, anything from the first ``(`` on
    is discarded; if the remainder contains ``=``, the text left of it is the
    column code and the text right of it is the descriptive name. A left side
    of the form ``X and Y`` maps both ``X`` and ``Y`` to the same name.

    The new names are assigned to ``df.columns`` in place, so the frame passed
    in is renamed as well. Columns without a mapping keep their name.

    Returns the columns from ``'Match Date'`` through ``'Away Team Red Cards'``
    (both included); a KeyError is raised if either label is missing.
    """
    data_dict = os.path.join('data','data_dict.txt')
    column_conversions = {}

    # context manager guarantees the file handle is closed, even on error
    with open(data_dict, 'r') as dict_file:
        for line in dict_file:
            # drop trailing parenthesised remarks
            line = line.split('(')[0]
            if '=' not in line:
                continue
            parts = line.split('=')
            description = parts[1].strip()
            if 'and' in parts[0]:
                # "X and Y = name": both codes share one description
                codes = parts[0].split('and')
                column_conversions[codes[0].strip()] = description
                column_conversions[codes[1].strip()] = description
            else:
                column_conversions[parts[0].strip()] = description

    df.columns = [column_conversions.get(col, col) for col in df.columns]
    #getting rid of betting data for time being
    df = df.loc[:,'Match Date': 'Away Team Red Cards']
    return df

In [485]:
#loop through all season data and return a dictionary of all the data
def get_all_seasons(current):
    """Load the current season and every other season directory under ``data``.

    Parameters
    ----------
    current : str
        Name of the current season directory inside ``data`` (e.g. ``'2019-20'``).

    Returns
    -------
    dict
        Maps each season directory name to the dictionary returned by
        ``unpack_season``. The current season is loaded first because its
        ``'teams'`` table is passed to every other season.
    """
    data_root = 'data'
    seasons = {}
    print(current)

    #getting current season information since we need to know the teams playing
    current_season = unpack_season(os.path.join(data_root, current), current)
    seasons[current] = current_season
    teams = current_season['teams']

    #get all previous season data; sorted because os.listdir order is arbitrary
    for entry in sorted(os.listdir(data_root)):
        season_fp = os.path.join(data_root, entry)
        #only directories are seasons: skips data_dict.txt and any other stray file
        if entry == current or not os.path.isdir(season_fp):
            continue
        print(entry)
        seasons[entry] = unpack_season(season_fp, current, teams)

    return seasons

In [486]:
# Load every season found under data/, treating 2019-20 as the current season.
seasons_data = get_all_seasons('2019-20')

2019-20
dict_keys(['fixtures', 'teams', 'players', 'squads'])

2017-18
dict_keys(['current_season', 'gw_data', 'players', 'squads'])

2016-17
dict_keys(['current_season', 'gw_data', 'players', 'squads'])

2018-19
dict_keys(['current_season', 'gw_data', 'players', 'squads'])



In [487]:
# Shallow copy of the top-level dict only: the per-season dictionaries and the
# DataFrames inside them are still shared with seasons_data.
seasons = seasons_data.copy()

In [511]:
# Cleaned match results of the 2018-19 season.
cop = seasons['2018-19']['current_season']
# Matches in which the away team received at least one red card.
ar = cop.loc[cop['Away Team Red Cards']>0]
# Matches in which the home team received at least one red card.
hr = cop.loc[cop['Home Team Red Cards']>0]

In [513]:
# Distribution of full-time results when the away team had a red card.
ar['Full Time Result'].value_counts()

H    14
A     7
D     7
Name: Full Time Result, dtype: int64

In [514]:
# Distribution of full-time results when the home team had a red card.
hr['Full Time Result'].value_counts()

A    12
H     4
D     2
Name: Full Time Result, dtype: int64

In [520]:
# Matches with half-time result 'A' and full-time result 'H'
# (in the rows shown, the away side led at the break and the home side won).
cop.loc[(cop['Half Time Result'] == 'A') & (cop['Full Time Result'] == 'H')]

Unnamed: 0,Match Date,Home Team,Away Team,Full Time Home Team Goals,Full Time Away Team Goals,Full Time Result,Half Time Home Team Goals,Half Time Away Team Goals,Half Time Result,Match Referee,...,Home Team Shots on Target,Away Team Shots on Target,Home Team Fouls Committed,Away Team Fouls Committed,Home Team Corners,Away Team Corners,Home Team Yellow Cards,Away Team Yellow Cards,Home Team Red Cards,Away Team Red Cards
74,06/10/2018,Man United,Newcastle,3,2,H,0,2,A,A Taylor,...,10,8,16,8,10,6,2,2,0,0
130,30/11/2018,Cardiff,Wolves,2,1,H,0,1,A,A Marriner,...,3,4,3,12,7,6,1,2,0,0
137,02/12/2018,Arsenal,Tottenham,4,2,H,1,2,A,M Dean,...,7,6,15,17,8,5,3,3,0,1
149,05/12/2018,Wolves,Chelsea,2,1,H,0,1,A,J Moss,...,2,3,18,10,1,5,4,4,0,0
157,08/12/2018,West Ham,Crystal Palace,3,2,H,0,1,A,A Taylor,...,6,4,10,8,5,3,1,2,0,0
222,19/01/2019,Liverpool,Crystal Palace,4,3,H,0,1,A,J Moss,...,9,3,6,8,8,3,0,1,1,0
231,29/01/2019,Fulham,Brighton,4,2,H,0,2,A,L Probert,...,7,6,10,5,10,1,2,3,0,0
234,29/01/2019,Newcastle,Man City,2,1,H,0,1,A,P Tierney,...,2,4,9,7,1,8,2,3,0,0
239,30/01/2019,Tottenham,Watford,2,1,H,0,1,A,G Scott,...,3,1,6,9,9,5,0,4,0,0
282,02/03/2019,Man United,Southampton,3,2,H,0,1,A,S Attwell,...,6,3,7,10,9,7,2,1,0,0


In [518]:
# Display the 2018-19 results table.
# NOTE(review): this renders the whole frame; cop.head() would keep the output short.
cop

Unnamed: 0,Match Date,Home Team,Away Team,Full Time Home Team Goals,Full Time Away Team Goals,Full Time Result,Half Time Home Team Goals,Half Time Away Team Goals,Half Time Result,Match Referee,...,Home Team Shots on Target,Away Team Shots on Target,Home Team Fouls Committed,Away Team Fouls Committed,Home Team Corners,Away Team Corners,Home Team Yellow Cards,Away Team Yellow Cards,Home Team Red Cards,Away Team Red Cards
0,10/08/2018,Man United,Leicester,2,1,H,1,0,H,A Marriner,...,6,4,11,8,2,5,2,1,0,0
1,11/08/2018,Bournemouth,Cardiff,2,0,H,1,0,H,K Friend,...,4,1,11,9,7,4,1,1,0,0
2,11/08/2018,Fulham,Crystal Palace,0,2,A,0,1,A,M Dean,...,6,9,9,11,5,5,1,2,0,0
3,11/08/2018,Huddersfield,Chelsea,0,3,A,0,2,A,C Kavanagh,...,1,4,9,8,2,5,2,1,0,0
4,11/08/2018,Newcastle,Tottenham,1,2,A,1,2,A,M Atkinson,...,2,5,11,12,3,5,2,2,0,0
5,11/08/2018,Watford,Brighton,2,0,H,1,0,H,J Moss,...,5,0,10,16,8,2,2,2,0,0
6,11/08/2018,Wolves,Everton,2,2,D,1,1,D,C Pawson,...,4,5,8,7,3,6,0,1,0,1
7,12/08/2018,Arsenal,Man City,0,2,A,0,1,A,M Oliver,...,3,8,11,14,2,9,2,2,0,0
8,12/08/2018,Liverpool,West Ham,4,0,H,2,0,H,A Taylor,...,8,2,14,9,5,4,1,2,0,0
9,12/08/2018,Southampton,Burnley,0,0,D,0,0,D,G Scott,...,3,6,10,9,8,5,0,1,0,0
