The **FIFA World Cup**, often simply called the World Cup, is an international association football competition contested by the senior men's national teams of the members of the Fédération Internationale de Football Association (FIFA), the sport's global governing body. The championship has been awarded **every four years** since the inaugural tournament in 1930, except in 1942 and 1946 when it was not held because of the Second World War.

The current format of the competition involves a **qualification phase, which currently takes place over the preceding three years**, to determine which teams qualify for the tournament phase, which is often called the World Cup Finals. After this, **32 teams, including the automatically qualifying host nation(s)**, compete in the tournament phase for the title at venues within the host nation(s) over a period of about a month.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline

In [2]:
# Notebook-wide settings: let pandas print up to 500 rows before truncating,
# and suppress every warning raised from here on.
pd.options.display.max_rows = 500
warnings.simplefilter('ignore')

# 1. Importing the Data and Data Exploration

In [3]:
# Load the match results, parse the match date and give two columns clearer names.
wc = pd.read_csv('data/results.csv')
wc['date'] = pd.to_datetime(wc['date'])
wc = wc.rename(columns={'country': 'host_country', 'neutral': 'neutral_site'})

# we're only interested in analysing FIFA World Cup related games
is_fifa_game = wc['tournament'].str.contains('FIFA')

# the current tournament format with 32 teams has been used since 1998; the
# window opens in 1995 so that the qualification games played ahead of the
# 1998 edition are kept as well
match_year = wc['date'].dt.year
in_window = (match_year >= 1995) & (match_year <= 2018)

# filter once and take an explicit copy, so that the columns assigned later on
# are written to an independent frame rather than to a slice of the raw data
wc = wc[is_fifa_game & in_window].copy()
wc.reset_index(drop=True, inplace=True)

In [4]:
# (number of rows, number of columns) of the filtered World Cup frame
wc.shape

(5002, 9)

In [5]:
# peek at the first and the last games of the frame
display(wc.head())
display(wc.tail())

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,host_country,neutral_site
0,1996-03-10,Dominica,Antigua and Barbuda,3,3,FIFA World Cup qualification,Roseau,Dominica,False
1,1996-03-24,Dominican Republic,Aruba,3,2,FIFA World Cup qualification,Santo Domingo,Dominican Republic,False
2,1996-03-29,Guyana,Grenada,1,2,FIFA World Cup qualification,Georgetown,Guyana,False
3,1996-03-31,Antigua and Barbuda,Dominica,1,3,FIFA World Cup qualification,St. John's,Antigua and Barbuda,False
4,1996-03-31,Aruba,Dominican Republic,1,3,FIFA World Cup qualification,Oranjestad,Aruba,False


Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,host_country,neutral_site
4997,2018-07-07,Russia,Croatia,2,2,FIFA World Cup,Sochi,Russia,False
4998,2018-07-10,France,Belgium,1,0,FIFA World Cup,St. Petersburg,Russia,True
4999,2018-07-11,Croatia,England,2,1,FIFA World Cup,Moscow,Russia,True
5000,2018-07-14,Belgium,England,2,0,FIFA World Cup,St. Petersburg,Russia,True
5001,2018-07-15,France,Croatia,4,2,FIFA World Cup,Moscow,Russia,True


In [6]:
# column dtypes and non-null counts
wc.info() # no null values: every column has as many non-null entries as there are rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5002 entries, 0 to 5001
Data columns (total 9 columns):
date            5002 non-null datetime64[ns]
home_team       5002 non-null object
away_team       5002 non-null object
home_score      5002 non-null int64
away_score      5002 non-null int64
tournament      5002 non-null object
city            5002 non-null object
host_country    5002 non-null object
neutral_site    5002 non-null bool
dtypes: bool(1), datetime64[ns](1), int64(2), object(5)
memory usage: 317.6+ KB


In [7]:
# summary statistics of the numeric columns (the two score columns)
wc.describe()

Unnamed: 0,home_score,away_score
count,5002.0,5002.0
mean,1.7495,1.106357
std,1.853952,1.37982
min,0.0,0.0
25%,1.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,31.0,17.0


# 2. Feature Engineering

In [8]:
# WC champions and number of titles
# wc_titles = {'Brazil': 5, 'Italy': 4, 'Germany': 4, 'Uruguay': 2, 'Argentina': 2, 'France': 2, 'England': 1, 'Spain': 1}

# Assign every game to the WC edition it belongs to: a game played strictly
# between two editions counts towards the upcoming one, while a game played
# in an edition year already carries the right year.
wc_years = [1994, 1998, 2002, 2006, 2010, 2014]
match_years = wc['date'].dt.year
years = match_years.copy()
for y in wc_years:
    next_edition = y + 4
    between_editions = match_years.gt(y) & match_years.lt(next_edition)
    years = years.mask(between_editions, next_edition)

    
# Relabel the tournament column so that each game names its WC edition:
# qualification games become 'WC_q <year>' and tournament games 'WC <year>'.
short_labels = {'FIFA World Cup qualification': 'WC_q', 'FIFA World Cup': 'WC'}
tournament_kind = wc['tournament'].map(short_labels)
edition_year = years.astype('str')
wc['tournament'] = tournament_kind.str.cat(edition_year, sep=' ')


# creating a "stage" feature representing at which stage of the competition the game occurred

# Stage labels for the 64 games of one edition, in the order the rows are
# assumed to appear in the frame.
one_edition = (['G'] * 48      # group stage
               + ['R16'] * 8   # round of 16
               + ['QF'] * 4    # quarter-finals
               + ['SF'] * 2    # semi-finals
               + ['LF']        # losers finals
               + ['WF'])       # winners finals

# the same 64-game layout is repeated for each of the six editions
n_editions = 6
stage = np.array(one_edition * n_editions, dtype=object)

# every game starts out labelled as a qualification game ...
wc['stage'] = 'Quals'
# ... and the tournament games (labelled 'WC <year>') are overwritten with their stage
wc_mask = wc['tournament'].str.contains(r'WC \d{4}', regex=True)
wc.loc[wc_mask, 'stage'] = stage


# creating a "winner" feature, labeled according to the winner of the game;
# tied games get an empty string
home_won = wc['home_score'] > wc['away_score']
away_won = wc['home_score'] < wc['away_score']
wc['winner'] = ''
wc.loc[home_won, 'winner'] = wc.loc[home_won, 'home_team']
wc.loc[away_won, 'winner'] = wc.loc[away_won, 'away_team']


# most tie games had no winners; only those in the knockout stage of a WC had a winner decided by a penalty shoot-out
# NOTE: these are row positions in the current frame, so they are only valid
# for this exact version of the dataset and the filtering applied to it
winners_idx = [686, 689, 692, 1512, 1520, 2415, 2418, 2421, 2425, 3276, 
               3279, 4124, 4126, 4135, 4137, 4988, 4989, 4993, 4997]
winners = ['Argentina', 'France', 'Brazil', 'Spain', 'South Korea', 'Ukraine', 'Germany', 'Portugal', 'Italy', 'Paraguay', 
           'Uruguay', 'Brazil', 'Costa Rica', 'Netherlands', 'Argentina', 'Russia', 'Croatia', 'England', 'Croatia']

# assign through a single positional indexer on the frame itself: the chained
# form wc['winner'].iloc[...] = ... may write to a temporary copy and leave wc
# unchanged
wc.iloc[winners_idx, wc.columns.get_loc('winner')] = winners


# cleaning the tournament feature labels: drop the '_q' marker so that
# qualification games carry the same 'WC <year>' label as the tournament games
is_qualifier = wc['tournament'].str.contains(r'WC_q \d{4}', regex=True)
qualifier_labels = wc.loc[is_qualifier, 'tournament']
wc.loc[is_qualifier, 'tournament'] = qualifier_labels.str.split('_q').str.join('')


# TODO: add a feature with the total number of points each team accumulated up until the knockout stage in the WC (not implemented yet)



# dropping unnecessary features and reshaping the dataframe
column_order = ['host_country', 'tournament', 'stage', 'home_team', 'home_score',
                'away_team', 'away_score', 'winner', 'neutral_site']
wc = wc.drop(columns=['date', 'city'])[column_order]

# show the first and the last rows of the reshaped frame
for preview in (wc.head(), wc.tail()):
    display(preview)

Unnamed: 0,host_country,tournament,stage,home_team,home_score,away_team,away_score,winner,neutral_site
0,Dominica,WC 1998,Quals,Dominica,3,Antigua and Barbuda,3,,False
1,Dominican Republic,WC 1998,Quals,Dominican Republic,3,Aruba,2,Dominican Republic,False
2,Guyana,WC 1998,Quals,Guyana,1,Grenada,2,Grenada,False
3,Antigua and Barbuda,WC 1998,Quals,Antigua and Barbuda,1,Dominica,3,Dominica,False
4,Aruba,WC 1998,Quals,Aruba,1,Dominican Republic,3,Dominican Republic,False


Unnamed: 0,host_country,tournament,stage,home_team,home_score,away_team,away_score,winner,neutral_site
4997,Russia,WC 2018,QF,Russia,2,Croatia,2,Croatia,False
4998,Russia,WC 2018,SF,France,1,Belgium,0,France,True
4999,Russia,WC 2018,SF,Croatia,2,England,1,Croatia,True
5000,Russia,WC 2018,LF,Belgium,2,England,0,Belgium,True
5001,Russia,WC 2018,WF,France,4,Croatia,2,France,True


# 3. Data Analysis