### Carregando os pacotes necessários

In [1]:
from random import random
import pandas as pd
import pygal
import sklearn
from sklearn.preprocessing import StandardScaler
import os

### Carregando os arquivos de resultados dos jogos e pódio das copas

In [2]:
# Resolve the sibling "../input" directory (relative to the current working
# directory) to an absolute path, then build the locations of the two source CSVs.
path = os.path.abspath(os.path.join('..', 'input'))
path_data_matches = os.path.join(path, 'ano_jogo_time_gol.csv')  # match results, read by get_matches()
path_data_winners = os.path.join(path, 'ano_vencedor_copa.csv')  # podium table, read into `winners`

In [3]:
# Podium table. It is kept at module level because get_team_stats() reads
# `winners` directly (its `team`, `year` and `position` columns).
winners = pd.read_csv(path_data_winners)

### Definindo a tabela com os resultados dos jogos

In [4]:
def get_matches(with_team_stats=False, exclude_ties=False):
    """Load the match results and derive the score difference and the winner.

    The CSV at the module-level ``path_data_matches`` is read on every call and
    two columns are added:

    * ``score_diff`` -- ``goals_team_a - goals_team_b``
    * ``winner`` -- ``'a'`` or ``'b'`` for the side that scored more goals, and
      the integer ``0`` for a tie (kept as-is for backward compatibility).

    Parameters
    ----------
    with_team_stats : bool, default False
        When True, the per-team statistics returned by ``get_team_stats`` are
        joined in twice: once for ``team_a`` (columns suffixed ``_a``) and once
        for ``team_b`` (columns suffixed ``_b``), matching on team and year.
    exclude_ties : bool, default False
        When True, rows whose ``winner`` is ``0`` are removed before the
        statistics are computed and joined.

    Returns
    -------
    pandas.DataFrame
        The match table, with a fresh ``0..n-1`` index when statistics are
        joined.
    """
    i_matches = pd.read_csv(path_data_matches)

    def winner_from_score_diff(x):
        if x > 0:
            return 'a'
        elif x < 0:
            return 'b'
        else:
            return 0

    i_matches['score_diff'] = i_matches['goals_team_a'] - i_matches['goals_team_b']
    i_matches['winner'] = i_matches['score_diff'].map(winner_from_score_diff)

    if exclude_ties:
        i_matches = i_matches[i_matches['winner'] != 0]

    if with_team_stats:
        stats = get_team_stats(i_matches)

        def stats_for(side):
            # Suffix every column except the shared 'year' key, so 'team'
            # becomes 'team_a' / 'team_b' and can be used directly as a join key.
            return stats.rename(columns={
                column: column + '_' + side
                for column in stats.columns
                if column != 'year'
            })

        # Renaming up front keeps each merge free of duplicate column labels.
        # The previous approach relied on merge suffixes colliding with the
        # existing team_a/team_b columns and then de-duplicated columns *by
        # value* through a double transpose, which turned every column into
        # object dtype and could silently drop any two columns with equal values.
        i_matches = (
            i_matches
            .merge(stats_for('a'), how='inner', on=['team_a', 'year'])
            .merge(stats_for('b'), how='inner', on=['team_b', 'year'])
            .reset_index(drop=True)
        )

    return i_matches

### Calculando as estatísticas diversas para cada time

In [5]:
def get_team_stats(i_matches, podium=None):
    """Aggregate per-team, per-year statistics plus running totals per team.

    Parameters
    ----------
    i_matches : pandas.DataFrame
        Match table with the columns ``team_a``, ``team_b``, ``year``,
        ``goals_team_a`` and ``goals_team_b``.
    podium : pandas.DataFrame, optional
        Podium table with the columns ``team``, ``year`` and ``position``.
        Defaults to the module-level ``winners`` table.

    Returns
    -------
    pandas.DataFrame
        One row per (team, year), sorted by team then year, with the columns
        ``team``, ``year``, ``matches_played``, ``matches_won``,
        ``podium_score``, ``cups_won``, ``matches_played_cum``,
        ``matches_won_cum``, ``podium_score_cum`` and
        ``matches_won_cum_percent``.  ``cups_won`` holds the running total per
        team (not the per-year count); the ``*_cum`` columns are running totals
        per team in year order.
    """
    if podium is None:
        # Fall back to the podium table loaded at module level.
        podium = winners

    # Keyed by (year, team); dicts keep insertion order, so rows come out in
    # order of first appearance before the final sort.
    records = {}

    def record_for(team, year):
        """Return the counters for (team, year), creating them zeroed on first use."""
        key = (year, team)
        if key not in records:
            records[key] = {
                'team': team,
                'year': year,
                'matches_played': 0,
                'matches_won': 0,
                'podium_score': 0,
                'cups_won': 0,
            }
        return records[key]

    # Iterate the needed columns in parallel instead of materialising one
    # Series per row; this is positional, so the frame's index does not matter.
    match_columns = ('team_a', 'team_b', 'year', 'goals_team_a', 'goals_team_b')
    for team_a, team_b, year, goals_a, goals_b in zip(
            *(i_matches[column] for column in match_columns)):
        pair_a = record_for(team_a, year)
        pair_b = record_for(team_b, year)

        pair_a['matches_played'] += 1
        pair_b['matches_played'] += 1

        # A draw credits neither side with a win.
        if goals_a > goals_b:
            pair_a['matches_won'] += 1
        elif goals_b > goals_a:
            pair_b['matches_won'] += 1

    for team, year, position in zip(podium['team'], podium['year'], podium['position']):
        pair = record_for(team, year)
        # Podium weighting: position 1 -> 16, 2 -> 8, 3 -> 4, 4 -> 2.
        pair['podium_score'] += 2 ** (5 - position)
        if position == 1:
            pair['cups_won'] += 1

    base_columns = ['team', 'year', 'matches_played', 'matches_won',
                    'podium_score', 'cups_won']
    stats = pd.DataFrame(list(records.values()), columns=base_columns)
    stats = stats.sort_values(by=['team', 'year']).reset_index(drop=True)

    # Running totals per team, in year order (rows are already sorted).
    # No `axis` argument: it is deprecated for GroupBy.cumsum in recent pandas.
    running = stats.groupby('team')[
        ['matches_played', 'matches_won', 'podium_score', 'cups_won']
    ].cumsum()
    stats['matches_played_cum'] = running['matches_played']
    stats['matches_won_cum'] = running['matches_won']
    stats['podium_score_cum'] = running['podium_score']
    # Kept under the original name: callers receive the cumulative cup count here.
    stats['cups_won'] = running['cups_won']
    stats['matches_won_cum_percent'] = stats['matches_won_cum'] / stats['matches_played_cum'] * 100.0

    return stats

### Definindo a função que extrai as amostras (inputs e outputs) da tabela de jogos

In [6]:
def extract_samples(matches, origin_features, result_feature):
    """Turn a match table into parallel feature tuples and labels.

    Parameters
    ----------
    matches : pandas.DataFrame
        Table containing every column named in ``origin_features`` and the
        ``result_feature`` column.
    origin_features : iterable of str
        Column names used as inputs, in the order they should appear in each
        tuple.
    result_feature : str
        Column name holding the expected output.

    Returns
    -------
    (list of tuple, tuple)
        ``inputs`` has one tuple per row (values in ``origin_features`` order);
        ``outputs`` has the corresponding ``result_feature`` values.
    """
    feature_columns = list(origin_features)

    if feature_columns:
        # Walk the rows positionally: this avoids one label lookup per cell and
        # stays correct when index labels repeat (a label lookup would then
        # return a Series instead of a scalar).
        inputs = list(matches[feature_columns].itertuples(index=False, name=None))
    else:
        # No features requested: still emit one (empty) tuple per row.
        inputs = [() for _ in range(len(matches))]

    outputs = tuple(matches[result_feature].values)

    assert len(inputs) == len(outputs)

    return inputs, outputs

### Definindo a função de normalização

In [7]:
def normalize(array):
    """Fit a fresh ``StandardScaler`` on ``array`` and scale it in one step.

    Returns the fitted scaler first and the transformed data second, so the
    same scaler can be reused on other data later.
    """
    fitted_scaler = StandardScaler()
    scaled = fitted_scaler.fit_transform(array)
    return fitted_scaler, scaled

### Definindo a função que divide as amostras em treinamento e teste


In [8]:
def split_samples(inputs, outputs, percent=0.75, seed=None):
    """Randomly partition paired samples into two groups.

    Each (input, output) pair is assigned independently: it goes to the first
    group with probability ``percent`` and to the second group otherwise, so
    the group sizes are only approximately ``percent`` / ``1 - percent``.

    Parameters
    ----------
    inputs, outputs : sequences of equal length
        Paired samples; ``inputs[i]`` belongs with ``outputs[i]``.
    percent : float, default 0.75
        Probability that a pair lands in the first group.
    seed : optional
        When given, a private ``random.Random(seed)`` generator is used so the
        split is reproducible.  When omitted, the module-level ``random()`` is
        used, exactly as before.

    Returns
    -------
    (list, list, list, list)
        ``inputs1, outputs1, inputs2, outputs2`` -- first group, then second.
    """
    # Local import: the notebook's import cell only brings in `random()` itself.
    from random import Random

    assert len(inputs) == len(outputs)

    draw = random if seed is None else Random(seed).random

    inputs1, outputs1 = [], []
    inputs2, outputs2 = [], []

    for inputs_row, output in zip(inputs, outputs):
        if draw() < percent:
            inputs1.append(inputs_row)
            outputs1.append(output)
        else:
            inputs2.append(inputs_row)
            outputs2.append(output)

    return inputs1, outputs1, inputs2, outputs2

### O que nós utilizaremos como input e qual será o output da rede neural

In [9]:
#Dados de input
input_features = ['year',
                  'matches_won_cum_percent_a',
                  'podium_score_cum_a',
                  'matches_won_cum_percent_b',
                  'podium_score_cum_b',]

#Definindo o que é desejado como output:
output_feature = 'winner'

### Pegando os dados das partidas e excluindo os empates

In [10]:
# Load the matches joined with both teams' statistics, leaving out tied games.
matches = get_matches(with_team_stats=True, exclude_ties=True)

In [11]:
# Display the merged table as the cell output.
matches

Unnamed: 0,year,match_number,team_a,goals_team_a,goals_team_b,team_b,score_diff,winner,matches_played_a,matches_won_a,...,podium_score_cum_a,matches_won_cum_percent_a,matches_played_b,matches_won_b,podium_score_b,cups_won_b,matches_played_cum_b,matches_won_cum_b,podium_score_cum_b,matches_won_cum_percent_b
0,1991,1,China PR,4,0,Norway,4,a,3,2,...,0,66.666667,6,4,8,0,6,4,8,66.666667
1,1991,23,Sweden,1,4,Norway,-3,b,6,4,...,4,66.666667,6,4,8,0,6,4,8,66.666667
2,1991,13,China PR,4,1,New Zealand,3,a,3,2,...,0,66.666667,3,0,0,0,3,0,0,0.0
3,1991,2,Denmark,3,0,New Zealand,3,a,3,1,...,0,33.333333,3,0,0,0,3,0,0,0.0
4,1991,8,Norway,4,0,New Zealand,4,a,6,4,...,8,66.666667,3,0,0,0,3,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,2019,27,South Africa,0,4,Germany,-4,b,3,0,...,0,0.0,5,4,0,2,39,30,44,76.923077
241,2019,29,Jamaica,1,4,Australia,-3,b,3,0,...,0,0.0,3,2,0,0,20,7,0,35.0
242,2019,33,Netherlands,2,1,Canada,1,a,7,6,...,8,70.0,4,2,0,0,22,8,2,36.363636
243,2019,42,Sweden,1,0,Canada,1,a,7,5,...,20,65.714286,4,2,0,0,22,8,2,36.363636


In [12]:
# Destination for the merged table: the same "../input" directory as the source
# CSVs (this recomputes the value `path` already holds from cell In [2]).
path = os.path.abspath(os.path.join('..', 'input'))
path_df_m = os.path.join(path, 'matches.csv')

In [13]:
# Save the merged matches table as CSV, without writing the row index.
matches.to_csv(path_df_m, index=False)

### Iniciando o processo de aprendizagem, definindo os inputs


In [14]:
# Build one feature tuple per match and the matching `winner` labels.
inputs, outputs = extract_samples(matches, input_features, output_feature)

### Normalizando os dados do input

In [15]:
# Standardise the features: `inputs` is rebound to the scaled values and the
# fitted scaler is kept in `normalizer`.
# NOTE(review): the scaler is fitted on every sample before the train/test
# split in the next cell — confirm this is intended.
normalizer , inputs = normalize(inputs)

### Definindo as amostras de dados para o treinamento

In [16]:
# Randomly split the samples; each one goes to the training lists with
# probability 0.75 (the default `percent`), otherwise to the test lists.
# NOTE(review): no random seed is set in the visible cells, so the split is
# expected to differ between runs — confirm whether reproducibility is needed.
train_inputs , train_outputs , test_inputs , test_outputs = split_samples(inputs, outputs)