First, we need to import the necessary libraries.
*   Pandas, for Data Analysis and handling.
*   Scikit-Learn, for Machine Learning



In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
from scipy.stats import randint

Read the CSV containing matches from all World Cups

In [19]:
matches_path = './WorldCupMatches.csv'
data = pd.read_csv(matches_path)

Drop rows that have only NaN values

In [20]:
data = data.dropna(how='all')
data.tail()

Unnamed: 0,Year,Datetime,Stage,Stadium,City,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Win conditions,Attendance,Half-time Home Goals,Half-time Away Goals,Referee,Assistant 1,Assistant 2,RoundID,MatchID,Home Team Initials,Away Team Initials
847,2014.0,05 Jul 2014 - 17:00,Quarter-finals,Arena Fonte Nova,Salvador,Netherlands,0.0,0.0,Costa Rica,Netherlands win on penalties (4 - 3),51179.0,0.0,0.0,Ravshan IRMATOV (UZB),RASULOV Abduxamidullo (UZB),KOCHKAROV Bakhadyr (KGZ),255953.0,300186488.0,NED,CRC
848,2014.0,08 Jul 2014 - 17:00,Semi-finals,Estadio Mineirao,Belo Horizonte,Brazil,1.0,7.0,Germany,,58141.0,0.0,5.0,RODRIGUEZ Marco (MEX),TORRENTERA Marvin (MEX),QUINTERO Marcos (MEX),255955.0,300186474.0,BRA,GER
849,2014.0,09 Jul 2014 - 17:00,Semi-finals,Arena de Sao Paulo,Sao Paulo,Netherlands,0.0,0.0,Argentina,Argentina win on penalties (2 - 4),63267.0,0.0,0.0,C�neyt �AKIR (TUR),DURAN Bahattin (TUR),ONGUN Tarik (TUR),255955.0,300186490.0,NED,ARG
850,2014.0,12 Jul 2014 - 17:00,Play-off for third place,Estadio Nacional,Brasilia,Brazil,0.0,3.0,Netherlands,,68034.0,0.0,2.0,HAIMOUDI Djamel (ALG),ACHIK Redouane (MAR),ETCHIALI Abdelhak (ALG),255957.0,300186502.0,BRA,NED
851,2014.0,13 Jul 2014 - 16:00,Final,Estadio do Maracana,Rio De Janeiro,Germany,1.0,0.0,Argentina,Germany win after extra time,74738.0,0.0,0.0,Nicola RIZZOLI (ITA),Renato FAVERANI (ITA),Andrea STEFANI (ITA),255959.0,300186501.0,GER,ARG


Create a new column that is Goal Difference (Home Team Goals - Away Team Goals)

In [21]:
data['Goal Difference'] = data['Home Team Goals'] - data['Away Team Goals']

Select the columns that we could use for prediction

In [22]:
X = data[['Year', 'Home Team Name', 'Away Team Name']]
y = data['Goal Difference']

Convert team names to numbers using a dictionary

In [23]:
team_name = {}
index = 0
for idx, row in X.iterrows():
    name = row['Home Team Name']
    if(name not in team_name.keys()):
        team_name[name] = index
        index += 1
    name = row['Away Team Name']
    if(name not in team_name.keys()):
        team_name[name] = index
        index += 1

for key, value in list(team_name.items())[:10]:
    print(f'{key}: {value}')

France: 0
Mexico: 1
USA: 2
Belgium: 3
Yugoslavia: 4
Brazil: 5
Romania: 6
Peru: 7
Argentina: 8
Chile: 9


Replace team names using the dictionary created above

In [24]:
def replace_team_name_by_id(df):
    df['Home Team Name'] = team_name[df['Home Team Name']]
    df['Away Team Name'] = team_name[df['Away Team Name']]
    return df

X = X.apply(replace_team_name_by_id, axis='columns')
X

Unnamed: 0,Year,Home Team Name,Away Team Name
0,1930.0,0,1
1,1930.0,2,3
2,1930.0,4,5
3,1930.0,6,7
4,1930.0,8,0
...,...,...,...
847,2014.0,17,58
848,2014.0,5,19
849,2014.0,17,8
850,2014.0,5,17


Impute missing values

In [25]:
# Imputation
my_imputer = SimpleImputer()
X = pd.DataFrame(my_imputer.fit_transform(X))

# Imputation removed column names; put them back
X.columns = X.columns

Split values for training and testing

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Test different tunning for models

In [28]:
param_distributions = {
    'n_estimators': randint(100, 5000),
    'max_depth': randint(1, 100),
    'random_state': randint(1, 1000),
    'min_samples_leaf': randint(1, 10),
    'min_samples_split': randint(1, 10)
}

search = RandomizedSearchCV(estimator=RandomForestRegressor(),
                            n_iter=40,
                            param_distributions=param_distributions,
                            random_state=0
                            )
search.fit(X, y)
search.best_params_

25 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\JeronimoBalestra\AppData\Roaming\Python\Python39\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\JeronimoBalestra\AppData\Roaming\Python\Python39\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\JeronimoBalestra\AppData\Roaming\Python\Python39\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\JeronimoBalestra\AppData\Roaming\Python\Python39\sit

{'max_depth': 66,
 'min_samples_leaf': 7,
 'min_samples_split': 8,
 'n_estimators': 2235,
 'random_state': 175}

Setup model

In [29]:
search.score(X_test, y_test)

0.2121093960578898

Make prediction and print result

In [62]:
home_team = "Brazil"
away_team = "France"
test_data = {"Year": [2022], "Home Team Name": [team_name[home_team]], "Away Team Name": [team_name[away_team]]}
df = pd.DataFrame(data=test_data)
result = search.predict(df)

def find_keys_by_value(input_dict, value_to_find):
    keys_with_value = []
    for key, value in input_dict.items():
        if value == value_to_find:
            keys_with_value.append(key)
    return keys_with_value

if result[0] > 0:
  print(f'{home_team} won')
elif result[0] < 0:
  print(f'{away_team} won')
else:
  print('Draw')

result[0]



France won


-0.6808939641262512