In [20]:
import pandas as pd
import time
import ast

In [21]:
def run_data_quality_checks_per_row(row):
    goal_count, outcome = check_goals_and_outcome(row)
    home_lineup, away_lineup = check_lineups(row)
    check = ""
    if goal_count == False:
        check += "goal_count "
    if outcome == False:
        check += "outcome "
    if home_lineup == False:
        check += "home_lineup "
    if away_lineup == False:
        check += "away_lineup"
    
    return check

def check_lineups(row):
    home = False
    away = False
    home_lineup = len(ast.literal_eval(row["home_lineup"]))
    away_lineup = len(ast.literal_eval(row["away_lineup"]))
    if home_lineup == 11:
        home = True
    if away_lineup == 11:
        away = True
    return home, away
        
def check_goals_and_outcome(row):
    goal_count = False
    outcome = False
    expected_home_goals = row["hScore"]
    expected_away_goals = row["aScore"]
    expected_total_goals = expected_home_goals + expected_away_goals
    actual_goals = len(ast.literal_eval(row["goals"]))
    if actual_goals == expected_total_goals:
        goal_count = True
    given_outcome = row["outcome"]
    if expected_home_goals > expected_away_goals:
        if given_outcome == "H":
            outcome = True
    elif expected_home_goals == expected_away_goals:
        if given_outcome == "D":
            outcome = True
    else:
        if given_outcome == "A":
            outcome = True
    return goal_count, outcome

In [24]:
def run_overall_quality_checks(df):
    for season in set(df["season"].tolist()):
        df_season = df.query("season == @season")
        print("Season {} has {} games".format(season,len(df_season)))
        for gameweek in set(df_season["gameweek"].tolist()):
            df_gameweek = df_season.query("gameweek == @gameweek")
            if len(df_gameweek) != 10:
                print("Gameweek {} of season {} has {} games".format(gameweek,season,len(df_gameweek)))

In [25]:
df = pd.read_csv("C:/Users/joeco/Python/fantasy-football-strategy/data-cleaning/data_outputs/all_prem_data_cleaned.csv")
df["checks"] = df.apply(run_data_quality_checks_per_row,axis=1)
run_overall_quality_checks(df)
df.to_csv("all_prem_cleaned_with_initial_quality_checks.csv",index=False)

Season 12/13 has 380 games
Gameweek 1 of season 12/13 has 11 games
Gameweek 2 of season 12/13 has 9 games
Gameweek 3 of season 12/13 has 9 games
Gameweek 16 of season 12/13 has 11 games
Gameweek 17 of season 12/13 has 9 games
Gameweek 19 of season 12/13 has 9 games
Gameweek 22 of season 12/13 has 11 games
Gameweek 23 of season 12/13 has 11 games
Gameweek 26 of season 12/13 has 11 games
Gameweek 27 of season 12/13 has 9 games
Gameweek 29 of season 12/13 has 6 games
Gameweek 33 of season 12/13 has 7 games
Gameweek 34 of season 12/13 has 4 games
Gameweek 37 of season 12/13 has 13 games
Season 11/12 has 380 games
Gameweek 1 of season 11/12 has 9 games
Gameweek 19 of season 11/12 has 21 games
Gameweek 25 of season 11/12 has 9 games
Gameweek 27 of season 11/12 has 11 games
Gameweek 28 of season 11/12 has 4 games
Gameweek 29 of season 11/12 has 5 games
Gameweek 32 of season 11/12 has 20 games
Gameweek 33 of season 11/12 has 6 games
Gameweek 34 of season 11/12 has 11 games
Gameweek 35 of seaso

In [26]:
df["checks"].value_counts()

                5309
home_lineup        4
away_lineup        3
Name: checks, dtype: int64

In [32]:
home_lineup = "home_lineup "
away_lineup = "away_lineup"
df_home_lineups = df.query("checks == @home_lineup")
df_home_lineups.head()

Unnamed: 0,match_id,gameweek,season,kickoff_time,hName,hScore,aScore,aName,outcome,home_lineup,home_bench,away_lineup,away_bench,goals,subs,cards,phases,pen_misses,checks
4023,14267,23,16/17,01-02-2017 19:45,West Ham United,0,4,Manchester City,A,"[{'pos': 'D', 'id': 8897}, {'pos': 'D', 'id': ...","[{'pos': 'F', 'id': 13145}, {'pos': 'G', 'id':...","[{'pos': 'M', 'id': 4316}, {'pos': 'D', 'id': ...","[{'pos': 'M', 'id': 3799}, {'pos': 'D', 'id': ...","[{'time': ""17'00"", 'type': 'Goal', 'scorer_id'...","[{'time': ""64'00"", 'type': 'Sub', 'desc': 'ON'...","[{'time': ""19'00"", 'type': 'Yellow Card', 'pla...","[{'time': ""00'00"", 'type': 'Half Start'}, {'ti...",[],home_lineup
4029,14273,24,16/17,04-02-2017 15:00,Hull City,2,0,Liverpool,H,"[{'pos': 'M', 'id': 6494}, {'pos': 'M', 'id': ...","[{'pos': 'M', 'id': 3066}, {'pos': 'F', 'id': ...","[{'pos': 'F', 'id': 13511}, {'pos': 'M', 'id':...","[{'pos': 'D', 'id': 7219}, {'pos': 'D', 'id': ...","[{'time': ""44'00"", 'type': 'Goal', 'scorer_id'...","[{'time': ""62'00"", 'type': 'Sub', 'desc': 'ON'...","[{'time': ""27'00"", 'type': 'Yellow Card', 'pla...","[{'time': ""00'00"", 'type': 'Half Start'}, {'ti...",[],home_lineup
4040,14287,25,16/17,11-02-2017 15:00,Sunderland,0,4,Southampton,A,"[{'pos': 'D', 'id': 1717}, {'pos': 'M', 'id': ...","[{'pos': 'D', 'id': 4918}, {'pos': 'M', 'id': ...","[{'pos': 'M', 'id': 3811}, {'pos': 'D', 'id': ...","[{'pos': 'G', 'id': 20494}, {'pos': 'F', 'id':...","[{'time': ""30'00"", 'type': 'Goal', 'scorer_id'...","[{'time': ""45'00"", 'type': 'Sub', 'desc': 'ON'...","[{'time': ""90 +4'00"", 'type': 'Yellow Card', '...","[{'time': ""00'00"", 'type': 'Half Start'}, {'ti...",[],home_lineup
4049,14293,26,16/17,25-02-2017 15:00,Hull City,1,1,Burnley,D,"[{'pos': 'M', 'id': 3066}, {'pos': 'D', 'id': ...","[{'pos': 'F', 'id': 10460}, {'pos': 'F', 'id':...","[{'pos': 'D', 'id': 3999}, {'pos': 'D', 'id': ...","[{'pos': 'M', 'id': 8242}, {'pos': 'F', 'id': ...","[{'time': ""72'00"", 'type': 'Pen', 'scorer_id':...","[{'time': ""67'00"", 'type': 'Sub', 'desc': 'ON'...","[{'time': ""40'00"", 'type': 'Yellow Card', 'pla...","[{'time': ""00'00"", 'type': 'Half Start'}, {'ti...",[],home_lineup
