In [1]:
import pandas as pd

In [2]:
!ls ../../datasets/

elo_rating.csv				 fifa_world_cup_2022
elo_rating_preprocessed.csv		 international_football_results
fifa_ranking_before_wc.csv		 predictions
fifa_ranking_before_wc_preprocessed.csv


## Datasets

In [3]:
df_fifa = pd.read_csv("../../datasets/fifa_ranking_before_wc.csv")
df_fifa.head()

Unnamed: 0,WorldCup,RK,Team,PTS
0,1994,1,GermanyGER,60.0
1,1994,2,NetherlandsNED,59.0
2,1994,3,BrazilBRA,59.0
3,1994,4,ItalyITA,57.0
4,1994,5,SpainESP,56.0


In [20]:
df_fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   WorldCup  1575 non-null   int64  
 1   RK        1575 non-null   int64  
 2   Team      1575 non-null   object 
 3   PTS       1575 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 49.3+ KB


In [4]:
df_elo = pd.read_csv("../../datasets/elo_rating.csv")
df_elo.head()

Unnamed: 0,Year,RK,Team,Rating,AverageRank,AverageRating,YearRank,YearRating,MatchesTotal,MathcesHome,MatchesAway,MatchesNeutral,MatchesWins,MatchesLosses,MatchesDraws,GoalsFor,GoalsAgainst
0,1993,1,Germany,2015,9,1883,+2,+21,612,269,261,82,343,145,124,1378,786
1,1993,2,Italy,2000,8,1901,+2,+20,507,246,192,69,270,107,130,942,557
2,1993,3,Netherlands,1995,18,1798,+2,+27,512,246,219,47,238,170,104,1075,802
3,1993,4,Brazil,1950,5,1975,−2,−49,590,244,192,154,359,108,123,1270,608
4,1993,5,Yugoslavia,1933,19,1775,+2,0,523,192,253,78,252,169,102,1097,816


In [21]:
df_elo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1848 entries, 0 to 1847
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Year            1848 non-null   int64 
 1   RK              1848 non-null   int64 
 2   Team            1848 non-null   object
 3   Rating          1848 non-null   int64 
 4   AverageRank     1848 non-null   int64 
 5   AverageRating   1848 non-null   int64 
 6   YearRank        1848 non-null   object
 7   YearRating      1848 non-null   object
 8   MatchesTotal    1848 non-null   int64 
 9   MathcesHome     1848 non-null   int64 
 10  MatchesAway     1848 non-null   int64 
 11  MatchesNeutral  1848 non-null   int64 
 12  MatchesWins     1848 non-null   int64 
 13  MatchesLosses   1848 non-null   int64 
 14  MatchesDraws    1848 non-null   int64 
 15  GoalsFor        1848 non-null   int64 
 16  GoalsAgainst    1848 non-null   int64 
dtypes: int64(14), object(3)
memory usage: 245.6+ KB


## Preprocess

In [28]:
def get_intermediate(df):
    global _df
    _df = df
    return df

In [17]:
def preprocess_fifa(df):
    """
    Preprocess FIFA Ratings dataset.
    """
    df = (
        df
        .assign(code = df.Team.apply(lambda x: x[-3:]))
        .assign(Team = df_fifa.Team.apply(lambda x: x[:-3]))
    )
    df = df[['WorldCup', 'RK', 'code', 'Team', 'PTS']]
    
    return df

In [19]:
df_fifa_preprocessed = preprocess_fifa(df_fifa)
df_fifa_preprocessed.head()

Unnamed: 0,WorldCup,RK,code,Team,PTS
0,1994,1,GER,Germany,60.0
1,1994,2,NED,Netherlands,59.0
2,1994,3,BRA,Brazil,59.0
3,1994,4,ITA,Italy,57.0
4,1994,5,ESP,Spain,56.0


In [149]:
df_fifa_preprocessed.shape

(1575, 5)

In [153]:
df_fifa_preprocessed.WorldCup.agg(['nunique', 'unique'])

nunique                                                   8
unique     [1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022]
Name: WorldCup, dtype: object

In [163]:
df_fifa_preprocessed.Team.nunique()

227

In [147]:
def preprocess_elo(df):
    """
    Preprocess Elo Ratings dataset.
    """
    
    for col in ['YearRank', 'YearRating']:
        df = (
            df
            .replace(to_replace=r'[\+]',value='', regex=True)
            .replace(to_replace=r'[\-]$',value='', regex=True)
            .pipe(get_intermediate)
            .assign(**{col: _df[col].apply(lambda x: re.sub("−", '-', x))})
            .replace('', '0')
            .pipe(get_intermediate)
            .assign(**{col: _df[col].astype('float')})
        )
        
#         .assign(YearRank = _df.YearRank.astype('int'))
#         .assign(YearRating = _df.YearRating.astype('int'))
#         .rename(columns={'YearRank': 'YearRankChange', 'YearRating': 'YearRatingChange'})
    
    return df

df_elo_preprocessed = preprocess_elo(df_elo)
df_elo_preprocessed.head()

Unnamed: 0,Year,RK,Team,Rating,AverageRank,AverageRating,YearRank,YearRating,MatchesTotal,MathcesHome,MatchesAway,MatchesNeutral,MatchesWins,MatchesLosses,MatchesDraws,GoalsFor,GoalsAgainst
0,1993,1,Germany,2015,9,1883,2.0,21.0,612,269,261,82,343,145,124,1378,786
1,1993,2,Italy,2000,8,1901,2.0,20.0,507,246,192,69,270,107,130,942,557
2,1993,3,Netherlands,1995,18,1798,2.0,27.0,512,246,219,47,238,170,104,1075,802
3,1993,4,Brazil,1950,5,1975,-2.0,-49.0,590,244,192,154,359,108,123,1270,608
4,1993,5,Yugoslavia,1933,19,1775,2.0,0.0,523,192,253,78,252,169,102,1097,816


In [148]:
df_elo_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1848 entries, 0 to 1847
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            1848 non-null   int64  
 1   RK              1848 non-null   int64  
 2   Team            1848 non-null   object 
 3   Rating          1848 non-null   int64  
 4   AverageRank     1848 non-null   int64  
 5   AverageRating   1848 non-null   int64  
 6   YearRank        1848 non-null   float64
 7   YearRating      1848 non-null   float64
 8   MatchesTotal    1848 non-null   int64  
 9   MathcesHome     1848 non-null   int64  
 10  MatchesAway     1848 non-null   int64  
 11  MatchesNeutral  1848 non-null   int64  
 12  MatchesWins     1848 non-null   int64  
 13  MatchesLosses   1848 non-null   int64  
 14  MatchesDraws    1848 non-null   int64  
 15  GoalsFor        1848 non-null   int64  
 16  GoalsAgainst    1848 non-null   int64  
dtypes: float64(2), int64(14), object(

In [159]:
df_elo_preprocessed.Year.agg(['nunique', 'unique'])

nunique                                                   8
unique     [1993, 1997, 2001, 2005, 2009, 2013, 2017, 2021]
Name: Year, dtype: object

In [162]:
df_elo_preprocessed.Team.nunique()

249

# Save

In [164]:
df_fifa_preprocessed.to_csv('../datasets/fifa_ranking_before_wc_preprocessed.csv')
df_elo_preprocessed.to_csv('../datasets/elo_rating_preprocessed.csv')

In [165]:
!ls ../datasets/

elo_rating.csv		     fifa_ranking_before_wc.csv
elo_rating_preprocessed.csv  fifa_ranking_before_wc_preprocessed.csv
