In [9]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

# There's a lot of columns in the DF. 
# Therefore, we add this option so that we can see more columns
pd.options.display.max_columns = 100

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data Analysis and Cleaning

In this notebook, we will try to clean and analyse the data before applying the Machine Learning algorithm. 

First, we need to load the data.

In [10]:
df = pd.read_csv('./data/CrowdstormingDataJuly1st.csv')
print(df.shape)
df.head(10)

(146028, 28)


Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,photoID,rater1,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,1,0,0,0,0,95212.jpg,0.25,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,1,0,1,0,0,1663.jpg,0.75,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,1,0,0,1,0,0,,,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,0,0,0,0,0,0,,,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,0,0,0,0,0,0,,,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,1,0,0,0,0,3868.jpg,0.25,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,0,0,0,0,0,47704.jpg,0.0,0.25,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,1,0,0,0,0,22356.jpg,1.0,1.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
8,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193.0,80.0,Goalkeeper,1,0,1,0,0,0,0,0,16528.jpg,0.25,0.25,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
9,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,Right Fullback,1,1,0,0,0,0,0,0,36499.jpg,0.0,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752


As it is explained in the Data Description (see `Data.md` in the *data* folder), the data are in a format called referee-player dyads. In other terms, each row is a dyad between a player and a referee, *i.e.* interaction between a player and a referee. 

## Clean the NaNs values for the colors of players

Let's print the columns and the size;

In [11]:
print(df.columns)
print(df.shape)

Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
       'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT',
       'meanExp', 'nExp', 'seExp'],
      dtype='object')
(146028, 28)


First, we can clean the skin color columns. To do so, we remove all the lines with NaN in *rater1* and *rater2* columns.

In [12]:
df = df.dropna(subset=['rater1', 'rater2'])
print(df.shape)
df.head(10)

(124621, 28)


Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,photoID,rater1,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,1,0,0,0,0,95212.jpg,0.25,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,1,0,1,0,0,1663.jpg,0.75,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,1,0,0,0,0,3868.jpg,0.25,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,0,0,0,0,0,47704.jpg,0.0,0.25,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,1,0,0,0,0,22356.jpg,1.0,1.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
8,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193.0,80.0,Goalkeeper,1,0,1,0,0,0,0,0,16528.jpg,0.25,0.25,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
9,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,Right Fullback,1,1,0,0,0,0,0,0,36499.jpg,0.0,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
10,antonio-rukavina,Antonio Rukavina,Real Valladolid,Spain,26.01.1984,177.0,74.0,Right Fullback,2,2,0,0,0,1,0,0,59786.jpg,0.0,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
11,ashkan-dejagah,Ashkan Dejagah,Fulham FC,England,05.07.1986,181.0,74.0,Left Winger,1,1,0,0,0,0,0,0,23229.jpg,0.5,0.5,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752
12,benedikt-hoewedes,Benedikt Höwedes,FC Schalke 04,Germany,29.02.1988,187.0,80.0,Center Back,1,1,0,0,0,0,0,0,59387.jpg,0.0,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752


In [13]:
# Check for other null values
df.isnull().any()

playerShort      False
player           False
club             False
leagueCountry    False
birthday         False
height            True
weight            True
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
photoID          False
rater1           False
rater2           False
refNum           False
refCountry       False
Alpha_3           True
meanIAT           True
nIAT              True
seIAT             True
meanExp           True
nExp              True
seExp             True
dtype: bool

In [14]:
df[df['meanIAT'].isnull()].shape

(153, 28)

In [15]:
df[df['meanExp'].isnull()].shape

(153, 28)

In [16]:
df[df['meanExp'].isnull()].Alpha_3.unique()

array(['QAT  ', 'ANT', 'Mali', nan, 'NAM'], dtype=object)

Let's just remove the few referees which haven't any values for the *meanExp* and the *meanIAT*. 

In [17]:
df = df[~df['meanExp'].isnull()]
df.isnull().any()

playerShort      False
player           False
club             False
leagueCountry    False
birthday         False
height            True
weight            True
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
photoID          False
rater1           False
rater2           False
refNum           False
refCountry       False
Alpha_3          False
meanIAT          False
nIAT             False
seIAT            False
meanExp          False
nExp             False
seExp            False
dtype: bool

We can see here that there are a lot of columns with Nans values. But we will deal with them later. =)

## Create the skin_colour column

We define the skin_colour column as the mean of the *rater1* and *rater2* columns.

In [18]:
df['skin_colour'] = df[['rater1', 'rater2']].mean(axis=1)
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,photoID,rater1,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,skin_colour
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,1,0,0,0,0,95212.jpg,0.25,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696,0.375
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,1,0,1,0,0,1663.jpg,0.75,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,0.75
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,1,0,0,0,0,3868.jpg,0.25,0.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,0,0,0,0,0,47704.jpg,0.0,0.25,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,1,0,0,0,0,22356.jpg,1.0,1.0,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,1.0


We can remove the columns *rater1* and *rater2*

In [19]:
df = df.drop(['rater1', 'rater2'], axis=1)
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,photoID,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,skin_colour
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,1,0,0,0,0,95212.jpg,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696,0.375
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,1,0,1,0,0,1663.jpg,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,0.75
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,1,0,0,0,0,3868.jpg,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,0,0,0,0,0,47704.jpg,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,1,0,0,0,0,22356.jpg,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,1.0


In [20]:
# Just check the unique values for the skin_colour column
unique_colours = df['skin_colour'].unique()
unique_colours

array([ 0.375,  0.75 ,  0.125,  1.   ,  0.25 ,  0.   ,  0.5  ,  0.875,
        0.625])

This means that we have the 5 values for the colour plus the values in-between.

## Remove other columns

Now, we want to remove some useless columns. 

In [21]:
print(df.columns)

Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'refNum',
       'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp',
       'seExp', 'skin_colour'],
      dtype='object')


We can remove the foloowing columns:
- *photoID*: We don't need it since we don't have the picture. =)
- *refNum*: We will aggregate the values with the name of the player.
- *refCountry*: We will aggregate the values with the name of the player.
- *Alpha_3*: We will aggregate the values with the name of the player.

We can also remove all the following columns since they concern only a specific referee:
- *meanIAT*
- *nIAT*
- *seIAT*
- *meanExp*
- *nExp*
- *seExp*

**We are maybe removing too much columns. So, we will have to decide if we keep them (if we don't aggregate for example). Or we can also scrap some website with stats on football players to get more stats on them**

In [26]:
col_to_remove = ['photoID', 'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']
df_clean = df.drop(col_to_remove, axis=1)

In [27]:
print(df_clean.shape)
df_clean.head(10)

(124468, 17)


Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,skin_colour
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,1,0,0,0,0,0.375
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,1,0,1,0,0,0.75
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,1,0,0,0,0,0.125
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,0,0,0,0,0,0.125
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,1,0,0,0,0,1.0
8,anders-lindegaard,Anders Lindegaard,Manchester United,England,13.04.1984,193.0,80.0,Goalkeeper,1,0,1,0,0,0,0,0,0.25
9,andreas-beck,Andreas Beck,1899 Hoffenheim,Germany,13.03.1987,180.0,70.0,Right Fullback,1,1,0,0,0,0,0,0,0.0
10,antonio-rukavina,Antonio Rukavina,Real Valladolid,Spain,26.01.1984,177.0,74.0,Right Fullback,2,2,0,0,0,1,0,0,0.0
11,ashkan-dejagah,Ashkan Dejagah,Fulham FC,England,05.07.1986,181.0,74.0,Left Winger,1,1,0,0,0,0,0,0,0.5
12,benedikt-hoewedes,Benedikt Höwedes,FC Schalke 04,Germany,29.02.1988,187.0,80.0,Center Back,1,1,0,0,0,0,0,0,0.0


We now have 17 columns. We can now aggregate on the name of the players.

## Aggregation of the table

First, let's check the number of different players. There are players with same first and last name, so we build a unique identifier concatenating name and birthday.

In [28]:
df_clean['name_date'] = df['playerShort']+df['birthday']

In [29]:
print("Total number of players: ", len(df_clean['name_date'].unique()))

Total number of players:  1585


In [30]:
unique_players = df_clean['name_date'].unique()

In [31]:
df_players = pd.DataFrame([], columns = df_clean.columns)
df_players.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,skin_colour,name_date


In [32]:
# Values we want to sum
summed = ['games', 'victories', 'ties', 'defeats', 'goals', 
          'yellowCards', 'yellowReds', 'redCards']

idx = 0
for plyr in unique_players:
    # Get all entries with the same player
    df_player = df_clean[df_clean['name_date'] == plyr]
    df_player.index = np.arange(len(df_player))
    # Sum the numbers
    sum_values = df_player.sum(numeric_only=True)

    # Create the array for the player
    array_player = []
    for i in df_clean.columns:
        if i in summed:
            array_player.append(sum_values[i])
        else:
            array_player.append(df_player[i][0])  
    
    # Add player to new DF
    df_players.loc[idx] = array_player
    idx += 1

In [33]:
df_players.tail()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,skin_colour,name_date
1580,wesley-jobello,Wesley Jobello,Olympique Marseille,France,23.01.1994,179.0,68.0,Left Winger,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.75,wesley-jobello23.01.1994
1581,jerome-sinclair,Jerome Sinclair,Liverpool FC,England,20.09.1996,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,jerome-sinclair20.09.1996
1582,momar-bangoura,Momar Bangoura,Olympique Marseille,France,24.02.1994,176.0,65.0,Defensive Midfielder,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.875,momar-bangoura24.02.1994
1583,kevin-osei,Kevin Osei,Olympique Marseille,France,26.03.1991,173.0,71.0,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.75,kevin-osei26.03.1991
1584,baptiste-aloe,Baptiste Aloe,Olympique Marseille,France,29.06.1994,184.0,77.0,Center Back,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.25,baptiste-aloe29.06.1994


Now, we can save this first DataFrame.

In [34]:
df_players.to_csv('./data/players.csv', index=False)

## Let's check the remaining NaN

In [35]:
print("Percentage of people removed if we drop all na: ", 100*(1-df_players.dropna().shape[0]/df_players.shape[0]), "%%")

Percentage of people removed if we drop all na:  10.473186119873812 %%


Let's check the NA columns

In [36]:
df_players.isnull().any()

playerShort      False
player           False
club             False
leagueCountry    False
birthday         False
height            True
weight            True
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
skin_colour      False
name_date        False
dtype: bool

In [37]:
# Take someone with NA
df_players[df_players['height'] != df_players['height']]

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,skin_colour,name_date
1277,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,58.0,29.0,12.0,17.0,6.0,8.0,0.0,1.0,0.25,filip-malbasic18.11.1992
1393,loic-abenzoar,Loïc Abenzoar,Olympique Lyon,France,14.02.1989,,,Right Fullback,12.0,0.0,3.0,9.0,0.0,2.0,0.0,0.0,0.75,loic-abenzoar14.02.1989
1581,jerome-sinclair,Jerome Sinclair,Liverpool FC,England,20.09.1996,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,jerome-sinclair20.09.1996


In [38]:
# Let's check in the BIG DF
df[df['player'] == 'Filip Malbašić']

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,photoID,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,skin_colour
3788,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,1,1,0,0,0,0,0,0,171196.jpg,101,51,SRB,0.282037,114.0,0.003725,0.420168,119.0,0.017657,0.25
14798,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,2,1,0,1,0,1,0,0,171196.jpg,305,42,KAZ,0.33098,72.0,0.005102,0.240506,79.0,0.038256,0.25
15152,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,1,0,0,1,0,0,0,0,171196.jpg,329,8,DEU,0.336628,7749.0,5.5e-05,0.335967,7974.0,0.000225,0.25
20147,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,1,1,0,0,0,0,0,0,171196.jpg,385,51,SRB,0.282037,114.0,0.003725,0.420168,119.0,0.017657,0.25
20210,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,1,0,0,1,0,0,0,0,171196.jpg,392,51,SRB,0.282037,114.0,0.003725,0.420168,119.0,0.017657,0.25
29617,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,2,0,2,0,0,1,0,0,171196.jpg,548,7,FRA,0.334684,2882.0,0.000151,0.336101,3011.0,0.000586,0.25
34467,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,1,0,0,1,0,0,0,0,171196.jpg,616,48,ITA,0.386174,1761.0,0.000232,0.529815,1895.0,0.001091,0.25
34652,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,1,0,0,1,0,1,0,0,171196.jpg,619,51,SRB,0.282037,114.0,0.003725,0.420168,119.0,0.017657,0.25
34658,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,2,1,0,1,0,0,0,0,171196.jpg,620,51,SRB,0.282037,114.0,0.003725,0.420168,119.0,0.017657,0.25
34966,filip-malbasic,Filip Malbašić,1899 Hoffenheim,Germany,18.11.1992,,,Left Winger,4,2,2,0,0,0,0,0,171196.jpg,627,51,SRB,0.282037,114.0,0.003725,0.420168,119.0,0.017657,0.25


We see that the values are always missing. Therefore, we remove the players with the missing values.

In [39]:
df_players_no_nan = df_players.dropna()

In [40]:
# Check for null values
df_players_no_nan.isnull().any()

playerShort      False
player           False
club             False
leagueCountry    False
birthday         False
height           False
weight           False
position         False
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
skin_colour      False
name_date        False
dtype: bool

No more null values. =)

Therefore, we can save this new csv.

In [41]:
df_players_no_nan.to_csv('./data/players_no_nan.csv', index=False)

## Cross-Featuring

We can introduce some cross-features with the features we removed. We know for example that the columns with
- **IAT** refer to the **race IAT test**, *i.e.* a high value correspond to someone who will prefer a white person (thinks it's a better person) over a black person.
- **Exp** refer to the use of a **racial thermometer task**, *i.e.* a high value correspond to greater feelings of warmth toward whites versus blacks.

If we want to keep these information, we can use the cross these information with the yellow and red cards. 

Let's introduce a **new variable**. We will call it the `gravity_factor`. It is given by:


\begin{equation}
G = Y + \frac{\sum Y}{\sum YR}  YR + \frac{\sum Y }{\sum R} R
\end{equation}
where:
- $Y$ is the number of yellow cards given to a player
- $\sum Y$ is the total number of yellow cards given
- $YR$ is the number of yellow then red cards given to a player
- $\sum YR$ is the total number of yellow then red cards given
- $R$ is the number of red cards given to a player
- $\sum R$ is the total number of red cards given


So, let's add this new value to the big DF.

In [42]:
print("Percentage of yellow cards: ", 100*df['yellowCards'].sum()/
      df['games'].sum(), "%")
print("Percentage of yellow then red cards: ", 100*df['yellowReds'].sum()/
      df['games'].sum(), "%")
print("Percentage of red cards: ", 100*df['redCards'].sum()/
      df['games'].sum(), "%")

Percentage of yellow cards:  13.33592576700279 %
Percentage of yellow then red cards:  0.4030787384681399 %
Percentage of red cards:  0.4261424586998498 %


In [43]:
nbr_yellow = df['yellowCards'].sum()
nbr_yellowRed = df['yellowReds'].sum()
nbr_red = df['redCards'].sum()

In [44]:
gravity = df['yellowCards'] + nbr_yellow/nbr_yellowRed * df['yellowReds'] + nbr_yellow/nbr_red * df['redCards']
gravity.unique()

array([   0.        ,    1.        ,   31.29452486,   33.08516301,
         32.29452486,    2.        ,    4.        ,    3.        ,
         35.08516301,   34.08516301,   34.29452486,    5.        ,
         38.08516301,   36.08516301,   11.        ,   71.17032601,
          6.        ,    8.        ,   43.08516301,   33.29452486,
         42.08516301,   69.37968787,    7.        ,   35.29452486,
         37.08516301,   40.08516301,   39.08516301,  111.46485087,
         65.37968787,   66.17032601,   38.29452486,   67.37968787,
        101.25548902,   36.29452486,   62.58904972,   66.37968787,
         64.37968787,   64.58904972,   67.17032601,    9.        ,
         68.17032601,   69.17032601,   70.17032601,   74.37968787,
         70.37968787,   37.29452486,   10.        ,   68.58904972,
         71.37968787,  101.67421272,   41.08516301,   63.58904972,
         99.67421272,   40.29452486,   39.29452486,   99.25548902,
         73.37968787,   44.08516301,   66.58904972,   46.08516

In [45]:
df_with_features = df
df_with_features['gravity'] = gravity
df_with_features.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,photoID,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,skin_colour,gravity
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,0,1,0,0,0,0,95212.jpg,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696,0.375,0.0
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,0,1,0,1,0,0,1663.jpg,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,0.75,1.0
5,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,1,0,0,1,0,0,0,0,3868.jpg,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125,0.0
6,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,1,1,0,0,0,0,0,0,47704.jpg,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0.125,0.0
7,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,1,0,0,1,0,0,0,0,22356.jpg,4,4,LUX,0.325185,127.0,0.003297,0.538462,130.0,0.013752,1.0,0.0


In [46]:
gravity_players = df_players['yellowCards'] + nbr_yellow/nbr_yellowRed * df_players['yellowReds'] + nbr_yellow/nbr_red * df_players['redCards']

In [47]:
df_players_with_features = df_players
df_players_with_features['gravity'] = gravity_players
df_players_with_features.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,skin_colour,name_date,gravity
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,144.0,65.0,32.0,47.0,10.0,21.0,1.0,2.0,0.375,lucas-wilchez31.08.1983,116.674213
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,427.0,183.0,101.0,143.0,88.0,33.0,0.0,2.0,0.75,john-utaka08.01.1982,95.58905
2,aaron-hughes,Aaron Hughes,Fulham FC,England,08.11.1979,182.0,71.0,Center Back,654.0,247.0,179.0,228.0,9.0,19.0,0.0,0.0,0.125,aaron-hughes08.11.1979,19.0
3,aleksandar-kolarov,Aleksandar Kolarov,Manchester City,England,10.11.1985,187.0,80.0,Left Fullback,285.0,138.0,57.0,90.0,28.0,50.0,4.0,3.0,0.125,aleksandar-kolarov10.11.1985,276.224227
4,alexander-tettey,Alexander Tettey,Norwich City,England,04.04.1986,180.0,68.0,Defensive Midfielder,214.0,88.0,54.0,72.0,11.0,34.0,0.0,0.0,1.0,alexander-tettey04.04.1986,34.0


In [48]:
# Just check that the aggregation is correct
print("Gravity for 1st player in aggregated table: ", df_players_with_features['gravity'][0])
print("Gravity for 1st player in aggregated table: ", 
      df_with_features[df_with_features['player'] == 'Lucas Wilchez']['gravity'].sum())

# It's ok.. =)

Gravity for 1st player in aggregated table:  116.674212724
Gravity for 1st player in aggregated table:  116.6742127241217


We can now introduce another cross-features. The purpose this time is to use the *meanIAT* and *meanExp* values. To do so, we will link them with the 4 following features: *yellowCards*, *yellowReds*, *redCards*, and *gravity*. We will use the following equation:

\begin{equation}
F_{I-C} (P) = \frac{1}{\#R(P)} \sum_{r\in R(P)} \frac{C(r)}{E(r)} \cdot I(r)
\end{equation}

where:
- $F_{I-C} (P)$ defines the new feature.
- $I$ is the indicator. It can be *meanIAT* or *meanExp*
- $C$ defines the types of card received. It can be *yellowCards*, *yellowReds*, *redCards*, or *gravity*
- $P$ is a given player.
- $\#R(P)$ is the number of referees with whom a player played.
- $\sum_{r\in R(P)}$ is the sum over all referees who played with a given player.
- $C(r)$ is the value of $C$ for a specific Player and a specific referee.
- $E(r)$ is the number of times a referee played with a player.
- $I(r)$ is the indicator of the referee.

This will create 8 new features.

In [49]:
indicators = ['meanIAT', 'meanExp']
cards = ['yellowCards', 'yellowReds', 'redCards', 'gravity']

In [None]:
for idcts in indicators:
    for crds in cards:
        feature_name = idcts + "_" + crds
        array_feature = []
        print("Start feature ", feature_name)
        for plyr in df_players['player']:
            feature = 0
            # Get the DF with the player
            df_player = df[df['player'] == plyr]
            # Number of different referee
            nbr_ref = len(df_player)
            # Reindex
            df_player.index = np.arange(nbr_ref)
            # Sum for the new feature
            for i in range(nbr_ref):
                feature += df_player[crds][i]/df_player['games'][i]*df_player[idcts][i]
            # Add it into the array
            array_feature.append(feature/nbr_ref)
        # Add it into the DataFrame
        df_players_with_features[feature_name] = array_feature
df_players_with_features.head()

Start feature  meanIAT_yellowCards


In [None]:
# Let's save the new DF
df_players_with_features.to_csv('./data/players_with features.csv', index=False)

In [None]:
# We can also save the one without the nan
df_players_with_features_no_nan = df_players_with_features.dropna()
df_players_with_features_no_nan.to_csv('./data/players_with features_no_nan.csv', index=False)

In [None]:
# We can remove all the columns with String values instead of numerical values.
non_numeric = ['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'position']
df_players_with_features.drop(non_numeric, axis=1).to_csv('./data/players_with features_only_num_values.csv', index=False)
df_players_with_features_no_nan.drop(non_numeric, axis=1).to_csv('./data/players_with features_no_nan_only_num_values.csv', index=False)

## Visualization of the data

Let's do some scatter plot to see if some features are interesting with the *skin_colour*. We expect that some features such as the number of games played will be irrelevant with the *skin_colour*. 

In [None]:
df_players_with_features_no_nan.shape

In [None]:
features_tmp = df_players_with_features.columns
features = []
label = 'skin_colour'
for i in features_tmp:
    if i != 'skin_colour':
        features.append(i)
len(features)

In [None]:
for ftre in features:
    if ftre not in non_numeric:
        plt.figure()
        plt.scatter(list(df_players_with_features_no_nan[ftre]), list(df_players_with_features_no_nan[label]))
        plt.title('Feature: %s'%(ftre))
        plt.xlabel(ftre)
        plt.ylabel(label)
        plt.show()

In [None]:
# Just plot the values of meanExp because some values are less than 0
plt.figure()
plt.plot(df['meanExp'])
plt.plot(df['meanIAT'])

Mmmmhhhh

Seems that it will be difficul