In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# importing the dataset
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
fifa_data = pd.read_csv("international_matches.csv")

y_trial = fifa_data['home_team_result']       # target
X = fifa_data.drop(['home_team_result'], axis=1)

# encoding the target values
categories = ['Win', 'Draw', 'Lose']
ordinal_encoder = OrdinalEncoder(categories=[categories])
y_new = [[x] for x in y_trial]
y = ordinal_encoder.fit_transform(y_new)
y = pd.Series(y.flatten())

# splitting the data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, test_size=0.33)

# dropping columns with missing values
cols_with_missing = [cols for cols in X_train_full.columns if X_train_full[cols].isnull().any()]
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# selecting categorical columns with relatively low cardinality (convinient but arbitary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

# selecting numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# keeping selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [3]:
# getting list of categorical values
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical Values:")
print(object_cols)

Categorical Values:
['home_team_continent', 'away_team_continent', 'shoot_out']


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# function for comparing two different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return accuracy_score(y_valid, preds)

In [5]:
""" approach 1: dropping categorical values """

drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
# drop_X_train.drop(drop_X_train.index, axis=1, inplace=True)

print("Accuracy Score from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

Accuracy Score from Approach 1 (Drop categorical variables):
0.98416518875095


In [6]:
""" approach 2: ordinal encoding """

# making copy so as to avoid changing the original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# applying ordinal encoding to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("Accuracy Score from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

Accuracy Score from Approach 2 (Ordinal Encoding):
0.9920192551304788
