In [40]:
# Step 0. Load libraries and modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.formula.api as smf

from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, confusion_matrix, classification_report, roc_curve, auc


'''
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
'''

'\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error, r2_score\n'

In [4]:
df_raw = pd.read_csv('../data/raw/results.csv')

In [17]:
# Pasar date a datetime (format: AÑO-MES-DIA --> %Y-%m-%d)
df_interim = df_raw.copy()
df_interim['date'] = pd.to_datetime(df_interim['date'], format='%Y-%m-%d')
df_interim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43752 entries, 0 to 43751
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        43752 non-null  datetime64[ns]
 1   home_team   43752 non-null  object        
 2   away_team   43752 non-null  object        
 3   home_score  43752 non-null  int64         
 4   away_score  43752 non-null  int64         
 5   tournament  43752 non-null  object        
 6   city        43752 non-null  object        
 7   country     43752 non-null  object        
 8   neutral     43752 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(5)
memory usage: 2.7+ MB


In [18]:
# ELIMINAR ROWS CON FECHA < 2000 O 2010
start_date = pd.Timestamp(2015,1,1)
end_date = pd.Timestamp(2018,7,15) # Final del Mundial 2018
df_interim = df_interim[(df_interim['date'] >= start_date)]
df_interim = df_interim[(df_interim['date'] <= end_date)]

In [19]:
df_interim = df_interim.drop(['date', 'city', 'country'], axis='columns')

In [20]:
df_interim = pd.get_dummies(df_interim, dummy_na=True)
df_interim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3323 entries, 36815 to 40137
Columns: 568 entries, home_score to tournament_nan
dtypes: bool(1), int64(2), uint8(565)
memory usage: 1.9 MB


In [22]:
df_interim.head()

Unnamed: 0,home_score,away_score,neutral,home_team_Abkhazia,home_team_Afghanistan,home_team_Albania,home_team_Alderney,home_team_Algeria,home_team_American Samoa,home_team_Andorra,...,tournament_Oceania Nations Cup,tournament_Oceania Nations Cup qualification,tournament_SAFF Cup,tournament_Superclásico de las Américas,tournament_UEFA Euro,tournament_UEFA Euro qualification,tournament_UNCAF Cup,tournament_Windward Islands Tournament,tournament_World Unity Cup,tournament_nan
36815,1,0,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36816,1,0,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36817,2,0,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36818,1,0,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36819,4,1,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# First Model:
'''
X_train = pd.get_dummies(X_train, dummy_na=True)
Y_train = pd.get_dummies(Y_train, dummy_na=True)

X_test = pd.get_dummies(X_test, dummy_na=True)
Y_test = pd.get_dummies(Y_test, dummy_na=True)

# train test split:
Train = df_interim[(df_interim['tournament'] != 'FIFA World Cup')]
Test = df_interim[(df_interim['tournament'] == 'FIFA World Cup')]

X_train = Train.drop(['home_score', 'away_score'], axis='columns')
Y_train = Train[['home_score', 'away_score']]

X_test = Test.drop(['home_score', 'away_score'], axis='columns')
Y_test = Test[['home_score', 'away_score']]
'''

In [24]:
# train test split:
Train = df_interim[(df_interim['tournament_FIFA World Cup'] != 1)]
Test = df_interim[(df_interim['tournament_FIFA World Cup'] == 1)]

X_train = Train.drop(['home_score', 'away_score'], axis='columns')
Y_train = Train[['home_score', 'away_score']]

X_test = Test.drop(['home_score', 'away_score'], axis='columns')
Y_test = Test[['home_score', 'away_score']]


In [28]:
model_DT = MultiOutputClassifier(DecisionTreeClassifier())
model_DT.fit(X_train, Y_train)

In [31]:
Y_pred = model_DT.predict(X_test)

In [39]:
# Get the score of train data just to verify its 1.
score = model_DT.score(X_train, Y_train)
print(f'The score for Decision Tree with X_train & Y_trains is: {score}')

#Get the score for the predictions:
score = model_DT.score(X_test, Y_test)
print(f'The score for Decision Tree with X_test & Y_test is: {score}')

# Tree params
print(f'Tree params: \n {model_DT.get_params()}')

The score for Decision Tree with X_train & Y_trains is: 0.9582694077938018
The score for Decision Tree with X_test & Y_test is: 0.078125
Tree params: 
 {'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': None, 'estimator__max_features': None, 'estimator__max_leaf_nodes': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__random_state': None, 'estimator__splitter': 'best', 'estimator': DecisionTreeClassifier(), 'n_jobs': None}


AttributeError: 'MultiOutputClassifier' object has no attribute 'get_depth'

In [41]:
# Grid Search
dt_parms = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
clf = GridSearchCV(DecisionTreeClassifier(), dt_parms, cv=5)
clf.fit(X_train, Y_train)
#print(clf.best_params_)
#print(clf.best_estimator_)
estimator = clf.best_estimator_
print(f'BEST HYPERPARAMETERS:')
print(f'criterion: {estimator.criterion}')
print(f'max_depth: {estimator.max_depth}')
print(f'min_samples_split: {estimator.min_samples_split}')

Traceback (most recent call last):
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/base.py", line 666, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 211, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 104, in _check_targets
    raise ValueError("{0} is not supported".format(y_type))
ValueError: multiclass-multioutput is not supported

Traceback (most recent call last):
  File "/ho

BEST HYPERPARAMETERS:
criterion: gini
max_depth: 4
min_samples_split: 2


Traceback (most recent call last):
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/base.py", line 666, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 211, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/home/jgayoso/.local/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 104, in _check_targets
    raise ValueError("{0} is not supported".format(y_type))
ValueError: multiclass-multioutput is not supported

 nan nan nan nan nan nan nan nan nan nan nan n

In [42]:
model_best_dt = MultiOutputClassifier(DecisionTreeClassifier(criterion=estimator.criterion, max_depth=estimator.max_depth, min_samples_split=estimator.min_samples_split))
model_best_dt.fit(X_train, Y_train)

In [43]:
# Get the score of train data just to verify its 1.
score = model_DT.score(X_train, Y_train)
print(f'The score for Decision Tree with X_train & Y_trains is: {score}')

#Get the score for the predictions:
score = model_DT.score(X_test, Y_test)
print(f'The score for Decision Tree with X_test & Y_test is: {score}')


The score for Decision Tree with X_train & Y_trains is: 0.9582694077938018
The score for Decision Tree with X_test & Y_test is: 0.078125
