In [1]:
import pandas as pd
import time
from datetime import datetime
import ast

In [2]:
def clean_raw_data(input_filename,output_filename):
    df = pd.read_csv(input_filename)
    new_df = clean_dataframe(df)
    new_df.to_csv(output_filename,index=False)

In [3]:
def clean_dataframe(df):
    df["goals"], df["subs"], df["cards"], df["phases"], df["pen_misses"] = separate_events(df)
    df["home_lineup"], df["home_bench"], df["away_lineup"], df["away_bench"] = clean_lineups(df)
    df["kickoff_time"] = reformat_kickoffs(df)
    df["season"] = reformat_season(df)
    
    duplicate_drop_list = ["all_events","hLineup","hSubs","aLineup","aSubs","kickoff"]
    df.drop(duplicate_drop_list,inplace=True,axis=1)
    
    not_needed_list = ["opta_id","attendance","ref_id","ref_name","stadium","hAbbr","aAbbr"]
    df.drop(not_needed_list,inplace=True,axis=1)
    
    reordered_columns = ["match_id","gameweek","season","kickoff_time","hName","hScore","aScore","aName","outcome",
                         "home_lineup","home_bench","away_lineup","away_bench","goals","subs","cards","phases","pen_misses"]
    df = df[reordered_columns]
    
    renamed_columns = ["match_id","gameweek","season","kickoff_time","home_name","home_score","away_score","away_name",
                       "outcome","home_lineup","home_bench","away_lineup","away_bench","goals","subs","cards","phases",
                       "pen_misses"]
    
    df["date"] = [datetime.strptime(x,"%d-%m-%Y %H:%M") for x in df["kickoff_time"].tolist()]
    df.sort_values(by="date",inplace=True)
    df.drop("date",inplace=True,axis=1)
    return df

In [4]:
def separate_events(df):
    all_events_data = df["all_events"].tolist()
    max_i = len(all_events_data)
    goal_events = [""]*max_i
    sub_events = [""]*max_i
    card_events = [""]*max_i
    phase_events = [""]*max_i
    pen_events = [""]*max_i
    for i in range(max_i):
        event_data = ast.literal_eval(all_events_data[i])
        other_data, goals = extract_goals(event_data)
        other_data, subs = extract_subs(other_data)
        other_data, cards = extract_cards(other_data)
        other_data, phases = extract_phases(other_data)
        other_data, pens = extract_pen_misses(other_data)
        goal_events[i] = goals
        sub_events[i] = subs
        card_events[i] = cards
        phase_events[i] = phases
        pen_events[i] = pens
    return goal_events, sub_events, card_events, phase_events, pen_events

def extract_goals(event_data):
    goal_events = []
    other_events = []
    for i in range(len(event_data)):
        event_dict = event_data[i]
        if event_dict["type"] == "Goal":
            goal_events.append(event_dict)
        elif event_dict["type"] == "Penalty":
            event_dict["type"] = "Pen"
            goal_events.append(event_dict)
        elif event_dict["type"] == "O":
            event_dict["type"] = "Own Goal"
            goal_events.append(event_dict)
        else:
            other_events.append(event_dict)
    return other_events, goal_events

def extract_subs(event_data):
    subs_events = []
    other_events = []
    for i in range(len(event_data)):
        event_dict = event_data[i]
        if event_dict["type"] == "Sub":
            subs_events.append(event_dict)
        else:
            other_events.append(event_dict)
    return other_events, subs_events

def extract_cards(event_data):
    card_events = []
    other_events = []
    for i in range(len(event_data)):
        event_dict = event_data[i]
        if "card" in str(event_dict["type"]).lower():
            card_events.append(event_dict)
        else:
            other_events.append(event_dict)
    return other_events, card_events

def extract_phases(event_data):
    phase_events = []
    other_events = []
    for i in range(len(event_data)):
        event_dict = event_data[i]
        if "half" in str(event_dict["type"]).lower():
            phase_events.append(event_dict)
        else:
            other_events.append(event_dict)
    return other_events, phase_events

def extract_pen_misses(event_data):
    pen_events = []
    other_events = []
    for i in range(len(event_data)):
        event_dict = event_data[i]
        if event_dict["type"] == "SP":
            event_dict["type"] = "Pen Saved"
            pen_events.append(event_dict)
        elif event_dict["type"] == "MP":
            event_dict["type"] = "Pen Missed"
            pen_events.append(event_dict)
        else:
            other_events.append(event_dict)
    return other_events, pen_events

def str_to_list_of_dicts(string_data):
    print("String_data input:")
    print(string_data)
    
    literal_eval = ast.literal_eval(string_data)
    print("literal_eval:")
    print(literal_eval)
    
    output = literal_eval
    return output

In [5]:
def clean_lineups(df):
    name_id_dict = {}
    hLineup = df["hLineup"].tolist()
    hSubs = df["hSubs"].tolist()
    aLineup = df["aLineup"].tolist()
    aSubs = df["aSubs"].tolist()
    home_lineups = []
    home_benches = []
    away_lineups = []
    away_benches = []
    for lineup in hLineup:
        home_lineup = []
        lineup = ast.literal_eval(lineup)
        for player in lineup:
            new_player_dict = {}
            new_player_dict["pos"] = player["pos"]
            new_player_dict["id"] = player["id"]
            name_id_dict[str(player["id"])] = str(player["name"])
            home_lineup.append(new_player_dict)
        home_lineups.append(home_lineup)
    for lineup in aLineup:
        away_lineup = []
        lineup = ast.literal_eval(lineup)
        for player in lineup:
            new_player_dict = {}
            new_player_dict["pos"] = player["pos"]
            new_player_dict["id"] = player["id"]
            name_id_dict[str(player["id"])] = str(player["name"])
            away_lineup.append(new_player_dict)
        away_lineups.append(away_lineup)
    for bench in hSubs:
        home_bench = []
        bench = ast.literal_eval(bench)
        for player in bench:
            new_player_dict = {}
            new_player_dict["pos"] = player["pos"]
            new_player_dict["id"] = player["id"]
            name_id_dict[str(player["id"])] = str(player["name"])
            home_bench.append(new_player_dict)
        home_benches.append(home_bench)
    for bench in aSubs:
        away_bench = []
        bench = ast.literal_eval(bench)
        for player in bench:
            new_player_dict = {}
            new_player_dict["pos"] = player["pos"]
            new_player_dict["id"] = player["id"]
            name_id_dict[str(player["id"])] = str(player["name"])
            away_bench.append(new_player_dict)
        away_benches.append(away_bench)
    df = pd.DataFrame(list(name_id_dict.items()),columns = ['id','name'])
    df.to_csv("data_outputs/all_player_ids_and_names.csv",index=False)
    return home_lineups, home_benches, away_lineups, away_benches

In [6]:
def reformat_kickoffs(df):
    kickoffs = df["kickoff"].tolist()
    new_kickoffs = []
    for kickoff in kickoffs:
        ko_dt = datetime.strptime(kickoff[:-4], "%a %d %b %Y, %H:%M")
        ko_date_str = str(ko_dt.strftime("%d-%m-%Y %H:%M"))
        new_kickoffs.append(ko_date_str)
    return new_kickoffs

In [7]:
def reformat_season(df):
    seasons = df["season"].tolist()
    new_seasons = []
    for season in seasons:
        new_season = str(season[-5:])
        new_seasons.append(new_season)
    return new_seasons

In [8]:
input_filename = "C:/Users/joeco/Python/fantasy-football-strategy/data-cleaning/data_outputs/all_prem_data_raw.csv"
output_filename = "C:/Users/joeco/Python/fantasy-football-strategy/data-cleaning/data_outputs/all_prem_data_cleaned.csv"
t1 = time.time()
clean_raw_data(input_filename,output_filename)
t2 = time.time()
print("Time taken: {} seconds".format(round(t2-t1,2)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Time taken: 16.73 seconds
