# Data Cleaning - Player Data

This notebook performs some data cleaning on the players dataframe. The changes mostly involve parsing the raw strings into a form that is useful for the EDA and modelling studies.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

Quick check that we haven't lost or duplicated data somewhere: the number of players should be 4x the number of episodes.

In [2]:
# Load the raw episode-level and player-level tables
df_episodes = pd.read_csv("../data/all_episodes.csv")
df_players = pd.read_csv("../data/all_players.csv")

# Sanity check: the player table should hold exactly four rows per episode
print(f"Episodes: {len(df_episodes)}")
print(f"Players: {len(df_players)}")
print(f"Length Check: {4 * len(df_episodes) == len(df_players)}")

Episodes: 2021
Players: 8084
Length Check: True


## Dataframe Inspection

In [3]:
# Preview the first rows of the raw players table
df_players.head()

Unnamed: 0,Date,PlayerNo.,Name,CashBuilder,Chaser,LowerOffer,HigherOffer,ChosenOffer,HTHResult,FC CorrectAnswers,FC Winner,Amount WonBy Player,bgc,Series,isCelebrity
0,29/06/2009,P1,Lisa,"£5,000",Mark Labbett,"£2,000","£10,000","£2,000 \/",Caught -1,,Chaser by 0:07,£0,,1,False
1,29/06/2009,P2,Ian,"£7,000",Mark Labbett,"£2,000","£20,000","£20,000 /\",Home +1,1 (team 2),Chaser by 0:07,£0,C,1,False
2,29/06/2009,P3,Claire,"£8,000",Mark Labbett,"£2,000","£20,000","£8,000 =",Caught -3,,Chaser by 0:07,£0,,1,False
3,29/06/2009,P4,Driss,"£9,000",Mark Labbett,£200,"£20,000",£200 \/,Home +5,15 (team 2),Chaser by 0:07,£0,C,1,False
4,30/06/2009,P1,Bradley,"£8,000",Shaun Wallace,"£4,000","£16,000","£8,000 =",Home +1,14 (team 3),Chaser by 0:02,£0,C,1,False


In [4]:
# Preview the last rows of the raw players table
df_players.tail()

Unnamed: 0,Date,PlayerNo.,Name,CashBuilder,Chaser,LowerOffer,HigherOffer,ChosenOffer,HTHResult,FC CorrectAnswers,FC Winner,Amount WonBy Player,bgc,Series,isCelebrity
8079,10/06/2022,P4,Shaun Williamson,"£8,000",Darragh Ennis,"£2,000","£132,000","£132,000 /\",Home +1,11 (team 4),Team by 5,"£37,500",T,0,True
8080,28/08/2022,P1,Sunetra Sarker,"£5,000",Darragh Ennis,"£1,000","£50,000","£5,000 =",Home +3,3 (team 4),Team by 1,"£4,750",T,0,True
8081,28/08/2022,P2,David Arnold,"£8,000",Darragh Ennis,"£2,000","£80,000","£8,000 =",Home +2,5 (team 4),Team by 1,"£4,750",T,0,True
8082,28/08/2022,P3,Matty Lee,"£3,000",Darragh Ennis,"£1,000","£87,000","£3,000 =",Home +3,1 (team 4),Team by 1,"£4,750",T,0,True
8083,28/08/2022,P4,Basil Brush,"£3,000",Darragh Ennis,"£1,000","£144,000","£3,000 =",Home +4,2 (team 4),Team by 1,"£4,750",T,0,True


## Drop Columns

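For these studies we don't need to track information on the player's contribution to the Final Chase, nor the player number, chaser, series or celebrity metadata, so these columns are dropped.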

In [5]:
# Discard the columns that are not used in the downstream studies
df_players = df_players.drop(
    [
        'PlayerNo.',
        'Chaser',
        'FC CorrectAnswers',
        'FC Winner',
        'Amount WonBy Player',
        'bgc',
        'isCelebrity',
        'Series',
    ],
    axis="columns",
)

In [6]:
# Confirm which columns remain after the drop
df_players.head()

Unnamed: 0,Date,Name,CashBuilder,LowerOffer,HigherOffer,ChosenOffer,HTHResult
0,29/06/2009,Lisa,"£5,000","£2,000","£10,000","£2,000 \/",Caught -1
1,29/06/2009,Ian,"£7,000","£2,000","£20,000","£20,000 /\",Home +1
2,29/06/2009,Claire,"£8,000","£2,000","£20,000","£8,000 =",Caught -3
3,29/06/2009,Driss,"£9,000",£200,"£20,000",£200 \/,Home +5
4,30/06/2009,Bradley,"£8,000","£4,000","£16,000","£8,000 =",Home +1


## PrizeFund string to Float

When a player answers no questions in the cash builder, the Chaser may offer no money for the lower offer; this is marked as "no offer". Some lower offers are also marked as "unknown". In rarer cases there is no HigherOffer either, with just a blank "£" given.

As these are rare cases, we will just assign them a very low sentinel value of -£100,000, which is not expected to be offered in normal gameplay.

In [7]:
# Lower offers recorded as "no offer" or "unknown" are replaced by the sentinel amount
df_players.loc[df_players["LowerOffer"].isin(["no offer", "unknown"]), "LowerOffer"] = "-£100,000"
# A bare "£" in HigherOffer gets the same sentinel amount
df_players.loc[df_players["HigherOffer"].eq("£"), "HigherOffer"] = "-£100,000"


We want to convert a string such as "£20,200" into the numeric value 20200.0, and also remove the selected-offer notation from the ChosenOffer column, e.g. " /\", " \/" or " =". Which offer was chosen can be inferred by column comparisons later in the analysis.

In [8]:
def strip_offer_string(offer_string):
    """Convert a raw money string such as "£20,200 /\\" into a float.

    Only the first whitespace-separated token is parsed, so any trailing
    selected-offer marker is discarded. The "£" symbol and thousands
    separators are removed, and a trailing "p" (pence) is read as 1/100th
    of a pound, e.g. "50p" -> 0.5.

    Parameters
    ----------
    offer_string : str, number or None
        Raw cell value. Values that are already numeric (including NaN) are
        passed through as floats, so the conversion can be re-applied to a
        column that has already been cleaned.

    Returns
    -------
    float
        The amount in pounds; 0.0 when no amount survives the stripping
        (empty string, whitespace only, or a bare "£"); NaN for None/NaN.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Missing values stay missing rather than failing on .split()
    if offer_string is None:
        return float("nan")
    # Already-numeric cells (e.g. on a second run of the cell) are passed through
    if not isinstance(offer_string, str):
        return float(offer_string)

    tokens = offer_string.split()
    # Empty or whitespace-only input has no first token to keep
    if not tokens:
        return 0.0

    # keep only the first token, then remove currency symbol and commas
    output_string = tokens[0].replace("£", "").replace(",", "")
    # format pence to be 1/100th of a £ via scientific notation
    output_string = output_string.replace("p", "e-2")

    # if nothing survived, just return zero (as a float, like every other path)
    if output_string == "":
        return 0.0

    return float(output_string)

# Parse each money column from its raw string form into a numeric amount
df_players["CashBuilder"] = df_players["CashBuilder"].apply(strip_offer_string)
df_players["LowerOffer"] = df_players["LowerOffer"].apply(strip_offer_string)
df_players["HigherOffer"] = df_players["HigherOffer"].apply(strip_offer_string)
df_players["ChosenOffer"] = df_players["ChosenOffer"].apply(strip_offer_string)

## Format HeadToHead Result

We want to format the "HTHResult" column into two separate columns:
- isHome (boolean): Was the player successful in getting back home?  
- LadderDifference (int): At the end of the HTH, what was the difference between the chaser's and the player's ladder positions? Positive for Home, negative for Caught.  

In [9]:
# Flag rows whose head-to-head result text contains "Home"
df_players["isHome"] = df_players["HTHResult"].str.contains("Home")

def strip_hth_string(offer_string):
    """Extract the signed ladder difference from a head-to-head result string.

    Only the last whitespace-separated token is inspected. A result with no
    margin maps to +1 for "Home" and -1 for "Caught"; otherwise the token is
    read as a signed integer (e.g. "Home +5" -> 5, "Caught -3" -> -3). A
    token consisting only of "+" yields 1.
    """
    final_token = offer_string.split()[-1]

    # Results given without a margin default to a difference of one step
    bare_outcomes = {"Home": 1, "Caught": -1}
    if final_token in bare_outcomes:
        return bare_outcomes[final_token]

    # Drop the explicit plus sign; an empty remainder counts as +1
    margin_text = final_token.replace("+", "")
    return int(margin_text) if margin_text != "" else 1

# Derive the signed ladder difference from each head-to-head result string
df_players["LadderDifference"] = df_players["HTHResult"].apply(strip_hth_string)

In [10]:
# Frequency of each parsed ladder difference, as a check on the parsing
df_players["LadderDifference"].value_counts()

 2    1601
 3    1316
 1    1306
-3     739
-2     676
-1     674
-4     595
 4     551
-5     385
 5     132
-6      98
 6      11
Name: LadderDifference, dtype: int64

Remove the HTHResult column

In [11]:
# The raw result string has been replaced by isHome and LadderDifference
df_players = df_players.drop('HTHResult', axis="columns")

# Final Inspection and Save

In [12]:
# Inspect the first rows of the cleaned table
df_players.head()

Unnamed: 0,Date,Name,CashBuilder,LowerOffer,HigherOffer,ChosenOffer,isHome,LadderDifference
0,29/06/2009,Lisa,5000.0,2000.0,10000.0,2000.0,False,-1
1,29/06/2009,Ian,7000.0,2000.0,20000.0,20000.0,True,1
2,29/06/2009,Claire,8000.0,2000.0,20000.0,8000.0,False,-3
3,29/06/2009,Driss,9000.0,200.0,20000.0,200.0,True,5
4,30/06/2009,Bradley,8000.0,4000.0,16000.0,8000.0,True,1


In [13]:
# Inspect the last rows of the cleaned table
df_players.tail()

Unnamed: 0,Date,Name,CashBuilder,LowerOffer,HigherOffer,ChosenOffer,isHome,LadderDifference
8079,10/06/2022,Shaun Williamson,8000.0,2000.0,132000.0,132000.0,True,1
8080,28/08/2022,Sunetra Sarker,5000.0,1000.0,50000.0,5000.0,True,3
8081,28/08/2022,David Arnold,8000.0,2000.0,80000.0,8000.0,True,2
8082,28/08/2022,Matty Lee,3000.0,1000.0,87000.0,3000.0,True,3
8083,28/08/2022,Basil Brush,3000.0,1000.0,144000.0,3000.0,True,4


In [14]:
# Write the cleaned table to disk; index=False leaves the DataFrame index out of the file
df_players.to_csv('../data/cleaned_players.csv', index=False)