# Data Cleaning - Player Data

This notebook performs some data cleaning on the players dataframe. The changes mostly involve parsing the raw strings into a form that is useful for EDA and modelling studies.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

Quick check that we haven't lost or duplicated data somewhere: the number of players should be 4x the number of episodes.

In [2]:
# Load the raw episode and player tables.
df_episodes = pd.read_csv("../data/all_episodes.csv")
df_players = pd.read_csv("../data/all_players.csv")

# Sanity check: each episode is expected to contribute exactly four player rows.
n_episodes = len(df_episodes)
n_players = len(df_players)
print("Episodes: {}".format(n_episodes))
print("Players: {}".format(n_players))
print("Length Check: {}".format(n_episodes * 4 == n_players))

Episodes: 2161
Players: 8644
Length Check: True


## Dataframe Inspection

In [3]:
# Show the first five raw player rows.
df_players.head(n=5)

Unnamed: 0,Date,PlayerNo.,Name,CashBuilder,Chaser,LowerOffer,HigherOffer,ChosenOffer,HTHResult,FC CorrectAnswers,FC Winner,Amount WonBy Player,bgc,Series,isCelebrity
0,29/06/2009,P1,Lisa,"£5,000",Mark Labbett,"£2,000","£10,000","£2,000 \/",Caught -1,,Chaser by 0:07,£0,,1,False
1,29/06/2009,P2,Ian,"£7,000",Mark Labbett,"£2,000","£20,000","£20,000 /\",Home +1,1 (team 2),Chaser by 0:07,£0,C,1,False
2,29/06/2009,P3,Claire,"£8,000",Mark Labbett,"£2,000","£20,000","£8,000 =",Caught -3,,Chaser by 0:07,£0,,1,False
3,29/06/2009,P4,Driss,"£9,000",Mark Labbett,£200,"£20,000",£200 \/,Home +5,15 (team 2),Chaser by 0:07,£0,C,1,False
4,30/06/2009,P1,Bradley,"£8,000",Shaun Wallace,"£4,000","£16,000","£8,000 =",Home +1,14 (team 3),Chaser by 0:02,£0,C,1,False


In [4]:
# Show the last five raw player rows.
df_players.tail(n=5)

Unnamed: 0,Date,PlayerNo.,Name,CashBuilder,Chaser,LowerOffer,HigherOffer,ChosenOffer,HTHResult,FC CorrectAnswers,FC Winner,Amount WonBy Player,bgc,Series,isCelebrity
8639,21/05/2023,P4,Eamonn Holmes,"£6,000",Jenny Ryan,"£1,000","£143,000","£143,000 /\",Caught -2,,Chaser by 0:11,£0,,0,True
8640,09/06/2023,P1,Alex Brooker,"£9,000",Anne Hegerty,"£3,000","£45,000","£9,000 =",Home +3,2 (team 4),Team by 4,"£42,250",T,0,True
8641,09/06/2023,P2,Kyle Walker,"£4,000",Anne Hegerty,"£1,000","£90,000","£4,000 =",Home +2,1 (team 4),Team by 4,"£42,250",T,0,True
8642,09/06/2023,P3,Lynsey Hipgrave,"£6,000",Anne Hegerty,"£1,000","£120,000","£6,000 =",Home +2,2 (team 4),Team by 4,"£42,250",T,0,True
8643,09/06/2023,P4,Dermot O'Leary,"£6,000",Anne Hegerty,"£3,000","£150,000","£150,000 /\",Home +2,12 (team 4),Team by 4,"£42,250",T,0,True


## Drop Columns

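For these studies we don't need to track information on the player's contribution to the final chase (the `FC ...` columns), nor several other columns that are not used, so they are dropped here.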

In [5]:
# Discard the columns that are not used in the rest of this notebook.
df_players = df_players.drop(
    columns=[
        'PlayerNo.',
        'Chaser',
        'FC CorrectAnswers',
        'FC Winner',
        'Amount WonBy Player',
        'bgc',
        'isCelebrity',
    ]
)

In [6]:
# Confirm which columns remain after the drop.
df_players.head(n=5)

Unnamed: 0,Date,Name,CashBuilder,LowerOffer,HigherOffer,ChosenOffer,HTHResult,Series
0,29/06/2009,Lisa,"£5,000","£2,000","£10,000","£2,000 \/",Caught -1,1
1,29/06/2009,Ian,"£7,000","£2,000","£20,000","£20,000 /\",Home +1,1
2,29/06/2009,Claire,"£8,000","£2,000","£20,000","£8,000 =",Caught -3,1
3,29/06/2009,Driss,"£9,000",£200,"£20,000",£200 \/,Home +5,1
4,30/06/2009,Bradley,"£8,000","£4,000","£16,000","£8,000 =",Home +1,1


## PrizeFund string to Float

When a player answers no questions in the cash builder, the Chaser can offer no money for the lower offer, which is marked as "no offer". There are also some entries marked as "unknown". In rarer cases there is no HigherOffer either, with just a blank "£" given.

These are replaced with np.nan, as XGBoost should be able to handle missing values like this.

In [7]:
# Lower offers flagged as "no offer" or "unknown" carry no amount: treat them as missing.
df_players.loc[df_players["LowerOffer"].isin(["no offer", "unknown"]), "LowerOffer"] = np.nan
# A higher offer consisting of only the currency symbol is likewise missing.
df_players.loc[df_players["HigherOffer"] == "£", "HigherOffer"] = np.nan

We want to convert strings such as "£20,200" into numeric values (20200.0) and also remove the selected-offer notation on the ChosenOffer column, i.e. " /\", " \/" or " =". Which offer was chosen can be inferred by column comparisons later in the analysis.

In [8]:
def strip_offer_string(offer_string):
    """Convert a raw offer value such as "£20,200" or "£8,000 =" to a float.

    Values that ``float()`` already accepts (numbers, NaN, plain numeric
    strings) are returned as floats unchanged. Otherwise only the first
    whitespace-separated token is kept, the "£" symbol and thousands
    separators are removed, and a trailing "p" (pence) is rewritten as
    "e-2" so that e.g. "50p" becomes 0.5.

    Parameters
    ----------
    offer_string : str, float or None
        Raw cell value.

    Returns
    -------
    float
        The parsed amount. NaN is returned for ``None`` (and NaN input passes
        through as NaN). 0.0 is returned when nothing is left after
        stripping, e.g. for "£", an empty string or a whitespace-only string.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number (e.g. "no offer").
    """
    # None has no numeric form; report it as missing rather than raising TypeError.
    if offer_string is None:
        return float("nan")

    try:
        return float(offer_string)
    except ValueError:
        pass

    # Keep only the first token; anything after it (e.g. " =") is notation.
    tokens = offer_string.split()
    if not tokens:
        # Empty / whitespace-only input: nothing survived, same as a bare "£".
        return 0.0

    # Remove currency symbol and thousands separators.
    amount_text = tokens[0].replace("£", "").replace(",", "")
    # Express pence as 1/100th of a pound via scientific notation.
    amount_text = amount_text.replace("p", "e-2")

    # If nothing survived, return zero (as a float, like every other path).
    if amount_text == "":
        return 0.0

    return float(amount_text)



# Parse every monetary column with the same helper.
for money_column in ("CashBuilder", "LowerOffer", "HigherOffer", "ChosenOffer"):
    df_players[money_column] = df_players[money_column].apply(strip_offer_string)

## Format HeadToHead Result

We want to format the "HTHResult" column into two separate columns:
- isHome (boolean): Was the player successful in getting back home?  
- HTHResult (int): At the end of the Head to Head, what was the difference between the chaser's and the player's ladder positions? Positive for Home, negative for Caught.  

In [9]:
# Flag rows whose raw head-to-head text mentions "Home".
hth_raw = df_players["HTHResult"]
df_players["isHome"] = hth_raw.str.contains("Home")

def strip_hth_string(offer_string):
    """Turn a head-to-head result string such as "Home +2" into a signed int.

    Only the last whitespace-separated token is inspected. A bare "Home"
    maps to 1 and a bare "Caught" to -1; otherwise any "+" is removed and
    the remainder is parsed as an integer (so "-3" stays negative). A token
    that is empty once "+" is removed also maps to 1.
    """
    # Outcome words that may appear without a margin.
    bare_outcomes = {"Home": 1, "Caught": -1}

    final_token = offer_string.split()[-1]
    if final_token in bare_outcomes:
        return bare_outcomes[final_token]

    margin_text = final_token.replace("+", "")
    return int(margin_text) if margin_text != "" else 1

# Replace the raw head-to-head text with its signed integer margin.
df_players["HTHResult"] = df_players["HTHResult"].apply(strip_hth_string)

In [10]:
# Distribution of the parsed head-to-head margins.
df_players.loc[:, "HTHResult"].value_counts()

 2    1712
 3    1432
 1    1405
-3     780
-2     721
-1     718
-4     631
 4     591
-5     401
 5     139
-6     102
 6      12
Name: HTHResult, dtype: int64

Remove the HTHResult column, keeping only isHome.

In [11]:
# Drop the numeric margin; the isHome flag is kept.
df_players = df_players.drop(columns='HTHResult')

## Final Inspection and Save

In [12]:
# First five rows of the cleaned table.
df_players.head(n=5)

Unnamed: 0,Date,Name,CashBuilder,LowerOffer,HigherOffer,ChosenOffer,Series,isHome
0,29/06/2009,Lisa,5000.0,2000.0,10000.0,2000.0,1,False
1,29/06/2009,Ian,7000.0,2000.0,20000.0,20000.0,1,True
2,29/06/2009,Claire,8000.0,2000.0,20000.0,8000.0,1,False
3,29/06/2009,Driss,9000.0,200.0,20000.0,200.0,1,True
4,30/06/2009,Bradley,8000.0,4000.0,16000.0,8000.0,1,True


In [13]:
# Last five rows of the cleaned table.
df_players.tail(n=5)

Unnamed: 0,Date,Name,CashBuilder,LowerOffer,HigherOffer,ChosenOffer,Series,isHome
8639,21/05/2023,Eamonn Holmes,6000.0,1000.0,143000.0,143000.0,0,False
8640,09/06/2023,Alex Brooker,9000.0,3000.0,45000.0,9000.0,0,True
8641,09/06/2023,Kyle Walker,4000.0,1000.0,90000.0,4000.0,0,True
8642,09/06/2023,Lynsey Hipgrave,6000.0,1000.0,120000.0,6000.0,0,True
8643,09/06/2023,Dermot O'Leary,6000.0,3000.0,150000.0,150000.0,0,True


In [14]:
# Persist the cleaned table without writing the DataFrame index.
# NOTE(review): the "Unnamed: 0" column read from the input CSV is still a
# regular column at this point and is therefore written out too.
df_players.to_csv(
    '../data/cleaned_players.csv',
    index=False,
)