Esse notebook é responsável por analisar os dados do arquivo _db/2017/cartola_2017.csv_, fazer as limpezas necessárias nos dados e criar as amostras para treinamento dos modelos.

# 0. Libraries used and Settings

In [1]:
import numpy as np
import pandas as pd

# show up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)

# scout abbreviations, split in two groups; each group is in alphabetical order
cols_scouts_def = 'CA CV DD DP FC GC GS RB SG'.split()
cols_scouts_atk = 'A FD FF FS FT G I PE PP'.split()
cols_scouts = cols_scouts_def + cols_scouts_atk

# weight of each scout, positionally aligned with cols_scouts
# (it is multiplied element-wise with row[cols_scouts] further below)
points = np.array([
    -2.0, -5.0, 3.0, 7.0, -0.5, -6.0, -2.0, 1.7, 5.0,  # CA CV DD DP FC GC GS RB SG
    5.0, 1.0, 0.7, 0.5, 3.5, 8.0, -0.5, -0.3, -3.5,    # A FD FF FS FT G I PE PP
])

# 1. Data Wrangling 

Problemas com os dados que eu percebi analisando-os:

- __Jogadores com todos os scouts NANs__.
- __Jogadores com a coluna 'atletas.clube.id.full.name' = NAN__.
- __Jogadores com scouts que não equivalem a pontuação__.
- __A coluna 'atletas.clube_id' tem campos repetidos e divergentes__: por exemplo, todos os Atléticos (MG, PR, e GO) são ATL. Além disso, há jogadores com siglas diferentes das equipes que eles jogam (por exemplo, Maicosuel [id: 37851]).
- __A coluna 'athletes.atletas.scout' não é informativa__.
- __Os scouts dos jogadores são cumulativos__: ou seja, os scouts dos jogadores vão sendo somados a cada rodada. Entretanto, a pontuação não é.

### 1.1 Import

In [2]:
# load the raw data file and preview its first 10 rows
raw_csv_path = '../../db/2017/cartola_2017.csv'
df = pd.read_csv(raw_csv_path)
df.head(10)

Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa
0,Juan Silveira dos Santos,36540,Juan,https://s.glbimg.com/es/sde/f/2017/04/24/b3a08...,0,FLA,zag,Flamengo,Nulo,0.0,5.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
1,José Roberto da Silva Júnior,36612,Zé Roberto,https://s.glbimg.com/es/sde/f/2017/04/18/83977...,0,PAL,lat,Palmeiras,Provável,0.0,8.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
2,Paulo Autuori,36943,Paulo Autuori,https://s.glbimg.com/es/sde/f/2017/04/23/51bd7...,0,ATL,tec,,Provável,0.0,10.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
3,Augusto Sérgio Ferreira,37245,Guto Ferreira,https://s.glbimg.com/es/sde/f/2017/02/21/f5264...,0,BAH,tec,,Provável,0.0,4.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
4,Ney Franco da Silveira Júnior,37246,Ney Franco,https://s.glbimg.com/es/sde/f/2017/05/01/e3a7e...,0,SPO,tec,,Provável,0.0,4.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
5,Luis Antônio Venker de Menezes,37281,Mano Menezes,https://s.glbimg.com/es/sde/f/2016/08/09/24673...,0,CRU,tec,Cruzeiro,Provável,0.0,12.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
6,Gilson Kleina,37306,Gilson Kleina,https://s.glbimg.com/es/sde/f/2017/04/25/170c2...,0,PON,tec,Ponte Preta,Provável,0.0,4.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
7,Marcelo Ribeiro Cabo,37333,Marcelo Cabo,https://s.glbimg.com/es/sde/f/2017/04/28/b7b4b...,0,ATL,tec,,Provável,0.0,2.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
8,Paulo André Cren Benini,37604,Paulo André,https://s.glbimg.com/es/sde/f/2017/04/23/755b6...,0,ATL,zag,Atlético-PR,Nulo,0.0,6.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
9,Michel Fernandes Bastos,37607,Michel Bastos,https://s.glbimg.com/es/sde/f/2017/04/18/fa3a7...,0,PAL,mei,Palmeiras,Nulo,0.0,8.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,


### 1.2 Cleaning

In [3]:
print("Original data shape: ", df.shape)

# keep only the rows that have at least one scout value (drop rows whose scouts are all NaN)
has_any_scout = df[cols_scouts].notnull().any(axis=1)
df_clean = df[has_any_scout]
print('#players with scouts: ', df_clean.shape[0])

# keep only the rows whose team full name is present
has_team_name = df_clean['atletas.clube.id.full.name'].notnull()
df_clean = df_clean[has_team_name]
print('#players with full team\'s name: ', df_clean.shape[0])

# discard rows from round 0 (sanity check only!)
after_round_zero = df_clean['atletas.rodada_id'] > 0
df_clean = df_clean[after_round_zero]
print("#rows from 1st round: ", df_clean.shape[0])

# discard all coaches (sanity check only!)
is_coach = df_clean['atletas.posicao_id'] == "tec"
df_clean = df_clean[~is_coach]
print("#rows without coachs: ", df_clean.shape[0])

# replace every remaining NaN with 0
df_clean = df_clean.fillna(value=0)

print("Data shape after cleaning: ", df_clean.shape)
df_clean.head(10)

Original data shape:  (8818, 37)
#players with scouts:  3986
#players with full team's name:  3977
#rows from 1st round:  3977
#rows without coachs:  3977
Data shape after cleaning:  (3977, 37)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa
774,Carlos Alberto Gomes de Jesus,37652,Carlos Alberto,https://s.glbimg.com/es/sde/f/2017/04/23/e6ee0...,10,ATL,mei,Atlético-PR,Nulo,0.0,5.72,0.0,1.55,2,0.0,6.0,0.0,2.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa
775,Lins Lima de Britto,53669,Lins,https://s.glbimg.com/es/sde/f/2017/05/02/152a2...,10,PON,ata,Ponte Preta,Provável,-0.8,3.84,-0.16,0.92,8,0.0,16.0,0.0,11.0,6.0,1.0,4.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,Casa
776,Ebert Willian Amâncio,37646,Betão,https://s.glbimg.com/es/sde/f/2017/04/28/89274...,10,AVA,zag,Avaí,Provável,5.0,6.69,0.92,2.06,9,0.0,22.0,3.0,6.0,11.0,0.0,7.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-2.0,Fora
777,Paulo André Cren Benini,37604,Paulo André,https://s.glbimg.com/es/sde/f/2017/04/23/755b6...,10,ATL,zag,Atlético-PR,Nulo,0.0,4.08,0.0,0.87,4,0.0,10.0,0.0,4.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa
778,Juan Silveira dos Santos,36540,Juan,https://s.glbimg.com/es/sde/f/2017/04/24/b3a08...,10,FLA,zag,Flamengo,Dúvida,0.0,5.92,0.0,3.98,5,0.0,6.0,1.0,4.0,4.0,0.0,8.0,2.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,Fora
779,Fábio Deivson Lopes Maciel,37656,Fábio,https://s.glbimg.com/es/sde/f/2017/04/05/8723e...,10,CRU,gol,Cruzeiro,Provável,10.9,14.19,2.53,4.52,10,0.0,6.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,Casa
780,José Roberto da Silva Júnior,36612,Zé Roberto,https://s.glbimg.com/es/sde/f/2017/04/18/83977...,10,PAL,lat,Palmeiras,Nulo,0.0,7.28,0.0,1.77,3,0.0,6.0,1.0,6.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,Fora
781,Rafael Martiniano de Miranda Moura,37655,Rafael Moura,https://s.glbimg.com/es/sde/f/2017/04/03/0262c...,10,ATL,ata,Atlético-MG,Nulo,-1.8,5.27,-1.81,2.66,8,0.0,7.0,0.0,14.0,13.0,7.0,2.0,3.0,0.0,2.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,Fora
783,Michel Fernandes Bastos,37607,Michel Bastos,https://s.glbimg.com/es/sde/f/2017/04/18/fa3a7...,10,PAL,mei,Palmeiras,Contundido,0.0,5.01,0.0,0.44,5,0.0,21.0,0.0,1.0,2.0,2.0,5.0,1.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,Fora
784,Jonathan Cícero Moreira,37662,Jonathan,https://s.glbimg.com/es/sde/f/2017/04/23/2e907...,10,ATL,lat,Atlético-PR,Provável,0.0,8.14,0.0,3.17,8,0.0,20.0,3.0,4.0,8.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa


### 1.3 Create Features

#### 1.3.1 Add opponent to each player

In [4]:
# load the matches file and preview it
matches_csv_path = '../../db/2017/matches-brasileirao-2017.csv'
df_matches = pd.read_csv(matches_csv_path)
print(df_matches.shape)
df_matches.head()

(190, 9)


Unnamed: 0.1,Unnamed: 0,game,round,date,home_team,score,away_team,arena,X
0,1,1,1,14/05/2017 - 11:00,Fluminense - RJ,3 x 2,Santos - SP,Maracanã - Rio de Janeiro - RJ,
1,2,2,1,13/05/2017 - 16:00,Flamengo - RJ,1 x 1,Atlético - MG,Maracanã - Rio de Janeiro - RJ,
2,3,3,1,14/05/2017 - 16:00,Palmeiras - SP,4 x 0,Vasco da Gama - RJ,Allianz Parque - Sao Paulo - SP,
3,4,4,1,13/05/2017 - 19:00,Corinthians - SP,1 x 1,Chapecoense - SC,Arena Corinthians - Sao Paulo - SP,
4,5,5,1,14/05/2017 - 16:00,Cruzeiro - MG,1 x 0,São Paulo - SP,Mineirão - Belo Horizonte - MG,


In [5]:
def clean_team(xs):
    """Turn a raw "<team> - <state>" label into the team name used for merging.

    Rules, applied in order:
      * labels containing "Atlético" keep their state suffix, with every space
        removed ("Atlético - MG" -> "Atlético-MG");
      * labels containing "Vasco da Gama" become "Vasco";
      * any other label has its trailing " - <state>" part removed
        ("São Paulo - SP" -> "São Paulo").

    A label without a " - " separator is returned unchanged, so the function
    can be applied again to already-cleaned names. (The previous
    ``xs[:xs.find("-")-1]`` silently dropped the last two characters in that
    case and cut at the first hyphen inside a name.)

    Parameters
    ----------
    xs : str
        Raw team label.

    Returns
    -------
    str
        Cleaned team name.
    """
    if "Atlético" in xs:
        return xs.replace(" ", "")
    if "Vasco da Gama" in xs:
        return "Vasco"

    # split on the LAST " - " so hyphens inside the team name are preserved
    name, separator, _ = xs.rpartition(" - ")
    return name if separator else xs

# normalise both team columns with clean_team
for team_col in ('home_team', 'away_team'):
    df_matches[team_col] = df_matches[team_col].apply(clean_team)
df_matches.head()

Unnamed: 0.1,Unnamed: 0,game,round,date,home_team,score,away_team,arena,X
0,1,1,1,14/05/2017 - 11:00,Fluminense,3 x 2,Santos,Maracanã - Rio de Janeiro - RJ,
1,2,2,1,13/05/2017 - 16:00,Flamengo,1 x 1,Atlético-MG,Maracanã - Rio de Janeiro - RJ,
2,3,3,1,14/05/2017 - 16:00,Palmeiras,4 x 0,Vasco,Allianz Parque - Sao Paulo - SP,
3,4,4,1,13/05/2017 - 19:00,Corinthians,1 x 1,Chapecoense,Arena Corinthians - Sao Paulo - SP,
4,5,5,1,14/05/2017 - 16:00,Cruzeiro,1 x 0,São Paulo,Mineirão - Belo Horizonte - MG,


In [6]:
# keep only the round and the two team columns
match_cols = ['round', 'home_team', 'away_team']
df_matches = df_matches.loc[:, match_cols]
print(df_matches.shape)
df_matches.head()

(190, 3)


Unnamed: 0,round,home_team,away_team
0,1,Fluminense,Santos
1,1,Flamengo,Atlético-MG
2,1,Palmeiras,Vasco
3,1,Corinthians,Chapecoense
4,1,Cruzeiro,São Paulo


In [7]:
# a player row is matched to a fixture through (team full name, round)
merge_keys = ['atletas.clube.id.full.name', 'atletas.rodada_id']

# rows whose team is the home team of a match: the opponent is the away team
df_merge_home = (df_clean
                 .merge(df_matches, how='inner', left_on=merge_keys, right_on=['home_team', 'round'])
                 .drop(['home_team'], axis=1)
                 .rename(columns={'away_team':'opponent'}))

# rows whose team is the away team of a match: the opponent is the home team
df_merge_away = (df_clean
                 .merge(df_matches, how='inner', left_on=merge_keys, right_on=['away_team', 'round'])
                 .drop(['away_team'], axis=1)
                 .rename(columns={'home_team':'opponent'}))

# stack both sides; 'round' duplicates 'atletas.rodada_id' after the merge, so drop it
df_merge = pd.concat((df_merge_home, df_merge_away)).drop(['round'], axis=1)
print(df_merge.shape)
df_merge.head()

(3977, 38)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa,opponent
0,Carlos Alberto Gomes de Jesus,37652,Carlos Alberto,https://s.glbimg.com/es/sde/f/2017/04/23/e6ee0...,10,ATL,mei,Atlético-PR,Nulo,0.0,5.72,0.0,1.55,2,0.0,6.0,0.0,2.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa,Vitória
1,Paulo André Cren Benini,37604,Paulo André,https://s.glbimg.com/es/sde/f/2017/04/23/755b6...,10,ATL,zag,Atlético-PR,Nulo,0.0,4.08,0.0,0.87,4,0.0,10.0,0.0,4.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa,Vitória
2,Jonathan Cícero Moreira,37662,Jonathan,https://s.glbimg.com/es/sde/f/2017/04/23/2e907...,10,ATL,lat,Atlético-PR,Provável,0.0,8.14,0.0,3.17,8,0.0,20.0,3.0,4.0,8.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa,Vitória
3,Edinaldo Batista Libânio,38230,Grafite,https://s.glbimg.com/es/sde/f/2017/04/23/4fec1...,10,ATL,ata,Atlético-PR,Nulo,2.8,7.44,0.63,0.58,8,0.0,17.0,0.0,7.0,4.0,3.0,3.0,4.0,0.0,0.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa,Vitória
4,Thiago Heleno Henrique Ferreira,38394,Thiago Heleno,https://s.glbimg.com/es/sde/f/2017/04/23/1f11c...,10,ATL,zag,Atlético-PR,Provável,2.1,8.65,-0.2,3.42,8,0.0,17.0,3.0,8.0,5.0,0.0,8.0,0.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,Casa,Vitória


### 1.4 Update scouts

In [8]:
def get_scouts_for_round(df, round_):
    """Return the rows of round ``round_`` with scouts converted to per-round values.

    For every player in ``round_`` the per-column maximum of the scouts over
    all earlier rounds is subtracted from the scouts of ``round_``. Players
    with no earlier row get zeros as their previous scouts. Negative ``SG``
    results are clamped to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'atletas.rodada_id', 'atletas.atleta_id' and every column
        listed in the module-level ``cols_scouts``.
    round_ : int
        Round to extract.

    Returns
    -------
    pandas.DataFrame
        For ``round_ == 1`` the selected rows of ``df`` unchanged (original
        index kept); otherwise a new merged frame with a fresh index and the
        same column names as ``df``.
    """
    suffixes = ('_curr', '_prev')
    cols_current = [col + suffixes[0] for col in cols_scouts]
    cols_prev = [col + suffixes[1] for col in cols_scouts]

    df_round = df[df['atletas.rodada_id'] == round_]
    if round_ == 1:
        # nothing to subtract in the first round
        return df_round

    # per-player maximum of each scout over all earlier rounds
    earlier_rounds = df[df['atletas.rodada_id'] < round_]
    df_round_prev = earlier_rounds.groupby('atletas.atleta_id', as_index=False)[cols_scouts].max()
    df_players = df_round.merge(df_round_prev, how='left', on=['atletas.atleta_id'], suffixes=suffixes)

    # if it is the first round of a player, the scouts of previous rounds are NaN; set them to zero
    df_players = df_players.fillna(value=0)

    # compute the scouts of this round only
    df_players[cols_current] = df_players[cols_current].values - df_players[cols_prev].values

    # drop the helper columns and restore the original scout column names
    df_players = df_players.drop(labels=cols_prev, axis=1)
    df_players = df_players.rename(columns=dict(zip(cols_current, cols_scouts)))

    # clamp negative SG differences to zero
    # (Series.clip_lower was removed in pandas 1.0; clip(lower=...) is the equivalent)
    df_players['SG'] = df_players['SG'].clip(lower=0)

    return df_players

In [9]:
# test of get_scouts_for_round method
# df_players = get_scouts_for_round(df_merge, 4)
# print(df_players.shape)
# df_players.head()

In [10]:
# compute the per-round scouts of every round and stack them in one dataframe.
# The frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and copied the whole accumulated frame on every iteration.
n_rounds = int(df_merge['atletas.rodada_id'].max())

round_frames = []
for i in range(1, n_rounds+1):
    df_round = get_scouts_for_round(df_merge, i)
    print("Shape of round #{0}: {1}".format(i, df_round.shape))
    round_frames.append(df_round)

# ignore_index gives df_scouts a fresh 0..n-1 index
df_scouts = pd.concat(round_frames, ignore_index=True)

print(df_scouts.shape)
df_scouts.head()

Shape of round #1: (0, 38)
Shape of round #2: (270, 38)
Shape of round #3: (379, 38)
Shape of round #4: (413, 38)
Shape of round #5: (440, 38)
Shape of round #6: (463, 38)
Shape of round #7: (487, 38)
Shape of round #8: (495, 38)
Shape of round #9: (507, 38)
Shape of round #10: (523, 38)
(3977, 38)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa,opponent
0,Rafael Marques Pinto,37623,Rafael Marques,https://s.glbimg.com/es/sde/f/2017/04/24/b0576...,2,VAS,zag,Vasco,Provável,-1.2,2.12,-1.98,-1.2,2,0.0,3.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,Casa,Bahia
1,Luis Fabiano Clemente,37919,Luis Fabiano,https://s.glbimg.com/es/sde/f/2017/04/17/37ff0...,2,VAS,ata,Vasco,Provável,13.3,12.08,3.8,7.0,2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,Casa,Bahia
2,Anderson Luiz de Carvalho,38913,Nenê,https://s.glbimg.com/es/sde/f/2017/04/24/45951...,2,VAS,mei,Vasco,Nulo,0.0,16.06,0.0,-0.3,1,0.0,9.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,Casa,Bahia
3,Bruno Vieira Gallo,60802,Bruno Gallo,https://s.glbimg.com/es/sde/f/2017/04/06/68766...,2,VAS,mei,Vasco,Nulo,0.0,1.7,0.0,-0.6,1,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,Casa,Bahia
4,Martín Andrés Silva,69249,Martín Silva,https://s.glbimg.com/es/sde/f/2017/04/06/5380d...,2,VAS,gol,Vasco,Provável,-1.3,9.9,-2.83,1.35,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,Casa,Bahia


In [11]:
def check_scouts(row):
    """Return the score implied by the scouts of ``row`` (scout values weighted by ``points``)."""
    weighted_scouts = points*row[cols_scouts]
    return np.sum(weighted_scouts)

# score recomputed from the scouts, one value per row of df_scouts
players_points = df_scouts.apply(check_scouts, axis=1)

# positions of the rows whose recorded score differs from the recomputed one
recorded_points = df_scouts['atletas.pontos_num'].values
scores_match = np.isclose(recorded_points, players_points)
errors = np.where(~scores_match)[0]
print("#players with wrong pontuation: ", errors.shape)
df_scouts.iloc[errors, :].tail(10)

#players with wrong pontuation:  (536,)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa,opponent
634,Paulo Roberto Valoura Júnior,71937,Juninho,https://s.glbimg.com/es/sde/f/2017/02/21/1185b...,3,BAH,mei,Bahia,Nulo,-1.5,10.49,-1.6,2.57,3,0.0,10.0,0.0,8.0,5.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
636,Edigar Junio Teixeira Lima,74271,Edigar Junio,https://s.glbimg.com/es/sde/f/2017/02/21/707d8...,3,BAH,ata,Bahia,Provável,0.9,9.27,-0.38,3.17,3,0.0,3.0,0.0,3.0,3.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
637,Tiago Pagnussat,79042,Tiago,https://s.glbimg.com/es/sde/f/2017/02/21/dd2cb...,3,BAH,zag,Bahia,Provável,-0.6,9.83,-1.18,3.53,3,0.0,2.0,0.0,5.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
639,Edson Felipe da Cruz,80370,Edson,https://s.glbimg.com/es/sde/f/2017/02/18/786dd...,3,BAH,mei,Bahia,Provável,3.8,17.56,-0.01,7.03,3,0.0,5.0,0.0,1.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
640,José Rafael Vivian,82455,Zé Rafael,https://s.glbimg.com/es/sde/f/2017/02/21/0e763...,3,BAH,mei,Bahia,Provável,5.0,17.37,0.44,8.1,3,0.0,3.0,0.0,1.0,2.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
642,Agustín Lionel Allione,84585,Allione,https://s.glbimg.com/es/sde/f/2017/02/18/b7a33...,3,BAH,mei,Bahia,Provável,1.6,10.92,-0.44,5.03,3,0.0,4.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
643,Gustavo Henrique da Silva Sousa,86932,Gustavo,https://s.glbimg.com/es/sde/f/2017/02/21/e7a9a...,3,BAH,ata,Bahia,Nulo,0.0,7.41,-1.22,3.87,3,0.0,2.0,0.0,2.0,2.0,1.0,2.0,0.0,0.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
645,Matheus de Sales Cabral,89285,Matheus Sales,https://s.glbimg.com/es/sde/f/2017/02/18/4a3b4...,3,BAH,mei,Bahia,Nulo,0.0,2.86,0.0,-1.8,1,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
646,Jean Paulo Fernandes Filho,90933,Jean,https://s.glbimg.com/es/sde/f/2017/02/21/b7de2...,3,BAH,gol,Bahia,Provável,0.7,8.6,-1.08,4.37,3,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo
647,Éder Ferreira Graminho,91203,Éder,https://s.glbimg.com/es/sde/f/2017/02/21/48c32...,3,BAH,zag,Bahia,Nulo,0.0,2.95,0.0,3.4,1,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,Fora,Botafogo


In [12]:
# remove the players whose scouts do not add up to their score.
# `errors` holds row POSITIONS in df_scouts (it comes from np.where), so they are
# translated to labels through df_scouts' own index. The previous code used the
# index of the raw `df`, which only worked because both indexes were 0..n-1.
# The length guard makes re-running this cell a no-op: after the first drop,
# df_scouts no longer has one row per entry of players_points.
if len(df_scouts) == len(players_points):
    df_scouts.drop(df_scouts.index[errors], inplace=True)
df_scouts.head()

Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa,opponent
0,Rafael Marques Pinto,37623,Rafael Marques,https://s.glbimg.com/es/sde/f/2017/04/24/b0576...,2,VAS,zag,Vasco,Provável,-1.2,2.12,-1.98,-1.2,2,0.0,3.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,Casa,Bahia
41,Maicosuel Reginaldo de Matos,37851,Maicosuel,https://s.glbimg.com/es/sde/f/2017/04/03/984fa...,2,ATL,mei,São Paulo,Nulo,1.7,6.25,-0.61,1.7,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,Casa,Avaí
76,Roberto Junior Fernández Torres,74061,Gatito Fernández,https://s.glbimg.com/es/sde/f/2017/04/17/f87ed...,2,BOT,gol,Botafogo,Provável,11.0,19.92,3.65,11.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,Casa,Ponte Preta
130,Ricardo César Dantas da Silva,79334,Ricardo Silva,https://s.glbimg.com/es/sde/f/2017/04/27/e409f...,2,ATL,zag,Atlético-GO,Provável,0.8,3.58,-0.52,0.8,2,0.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,-3.0,Casa,Flamengo
227,Orlando Enrique Berrío Meléndez,85300,Berrío,https://s.glbimg.com/es/sde/f/2017/04/24/7b390...,2,FLA,ata,Flamengo,Contundido,0.0,4.95,0.0,0.0,1,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,-3.0,Fora,Atlético-GO


In [13]:
# report the final shape and write the cleaned scouts to disk (without the index)
print("#scouts after all data cleaning steps: ", df_scouts.shape)
scouts_csv_path = '../../db/2017/cartola_2017_scouts.csv'
df_scouts.to_csv(scouts_csv_path, index=False)

#scouts after all data cleaning steps:  (3441, 38)


# 2. Samples Creator

In [14]:
# reload the scouts file written in the previous section
samples_source_path = '../../db/2017/cartola_2017_scouts.csv'
df_samples = pd.read_csv(samples_source_path)
print(df_samples.shape)
df_samples.head()

(3441, 38)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa,opponent
0,Rafael Marques Pinto,37623,Rafael Marques,https://s.glbimg.com/es/sde/f/2017/04/24/b0576...,2,VAS,zag,Vasco,Provável,-1.2,2.12,-1.98,-1.2,2,0.0,3.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,Casa,Bahia
1,Maicosuel Reginaldo de Matos,37851,Maicosuel,https://s.glbimg.com/es/sde/f/2017/04/03/984fa...,2,ATL,mei,São Paulo,Nulo,1.7,6.25,-0.61,1.7,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,Casa,Avaí
2,Roberto Junior Fernández Torres,74061,Gatito Fernández,https://s.glbimg.com/es/sde/f/2017/04/17/f87ed...,2,BOT,gol,Botafogo,Provável,11.0,19.92,3.65,11.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,Casa,Ponte Preta
3,Ricardo César Dantas da Silva,79334,Ricardo Silva,https://s.glbimg.com/es/sde/f/2017/04/27/e409f...,2,ATL,zag,Atlético-GO,Provável,0.8,3.58,-0.52,0.8,2,0.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,-3.0,Casa,Flamengo
4,Orlando Enrique Berrío Meléndez,85300,Berrío,https://s.glbimg.com/es/sde/f/2017/04/24/7b390...,2,FLA,ata,Flamengo,Contundido,0.0,4.95,0.0,0.0,1,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,-3.0,Fora,Atlético-GO


In [15]:
# select only columns of interest to use as features
cols_of_interest = ['atletas.posicao_id', 
                    'atletas.clube.id.full.name', 
                    'atletas.pontos_num',
                    'atletas.preco_num', 
                    'atletas.variacao_num', 
                    'atletas.media_num', 
                    'atletas.jogos_num',
                    'home_score',
                    'away_score',
                    'goals_dif',
                    'casa',
                    'opponent'] + cols_scouts

# we need atleta_id to know the best players after apply model and rodadas_id will be used to create samples only
cols_info = ['atletas.atleta_id', 'atletas.rodada_id']

df_samples = df_samples[cols_info + cols_of_interest]
df_samples.head()

Unnamed: 0,atletas.atleta_id,atletas.rodada_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,home_score,away_score,goals_dif,casa,opponent,CA,CV,DD,DP,FC,GC,GS,RB,SG,A,FD,FF,FS,FT,G,I,PE,PP
0,37623,2,zag,Vasco,-1.2,2.12,-1.98,-1.2,2,2.0,1.0,1.0,Casa,Bahia,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
1,37851,2,mei,São Paulo,1.7,6.25,-0.61,1.7,2,2.0,0.0,2.0,Casa,Avaí,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,74061,2,gol,Botafogo,11.0,19.92,3.65,11.0,2,2.0,0.0,2.0,Casa,Ponte Preta,0.0,0.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,79334,2,zag,Atlético-GO,0.8,3.58,-0.52,0.8,2,0.0,3.0,-3.0,Casa,Flamengo,1.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,85300,2,ata,Flamengo,0.0,4.95,0.0,0.0,1,0.0,3.0,-3.0,Fora,Atlético-GO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0


In [16]:
# distinct team full names, sorted
distinct_teams = df_samples['atletas.clube.id.full.name'].drop_duplicates()
teams_full = distinct_teams.sort_values().values
print(teams_full.shape, teams_full)

(20,) ['Atlético-GO' 'Atlético-MG' 'Atlético-PR' 'Avaí' 'Bahia' 'Botafogo'
 'Chapecoense' 'Corinthians' 'Coritiba' 'Cruzeiro' 'Flamengo' 'Fluminense'
 'Grêmio' 'Palmeiras' 'Ponte Preta' 'Santos' 'Sport' 'São Paulo' 'Vasco'
 'Vitória']


In [17]:
def dict_positions(to_int = True):
    """Return the position encoding.

    With ``to_int`` true the mapping goes from position abbreviation to its
    integer code; otherwise the inverse mapping (code -> abbreviation) is returned.
    """
    position_to_code = {'gol':1, 'zag':2, 'lat':3, 'mei':4, 'ata':5}
    if to_int:
        return position_to_code
    return {code: position for position, code in position_to_code.items()}

def dict_teams(to_int = True, teams = None):
    """Return the team encoding.

    Teams are numbered from 1 in the order they appear in ``teams``.

    Parameters
    ----------
    to_int : bool
        If true, return ``{team: code}``; otherwise the inverse ``{code: team}``.
    teams : iterable of str, optional
        Team names to encode. Defaults to the module-level ``teams_full``, which
        keeps the original behaviour for existing callers.
    """
    if teams is None:
        teams = teams_full
    teams_map = {team: index for index, team in enumerate(teams, start=1)}
    if to_int:
        return teams_map
    return {index: team for team, index in teams_map.items()}

In [18]:
# show both encodings, one per line
for mapping in (dict_positions(), dict_teams()):
    print(mapping)

{'gol': 1, 'zag': 2, 'lat': 3, 'mei': 4, 'ata': 5}
{'Atlético-GO': 1, 'Atlético-MG': 2, 'Atlético-PR': 3, 'Avaí': 4, 'Bahia': 5, 'Botafogo': 6, 'Chapecoense': 7, 'Corinthians': 8, 'Coritiba': 9, 'Cruzeiro': 10, 'Flamengo': 11, 'Fluminense': 12, 'Grêmio': 13, 'Palmeiras': 14, 'Ponte Preta': 15, 'Santos': 16, 'Sport': 17, 'São Paulo': 18, 'Vasco': 19, 'Vitória': 20}


In [19]:
# maps "casa", "atletas.clube.id.full.name", "opponent" and "atletas.posicao_id" to integer numbers
team_codes = dict_teams(to_int=True)
column_encoders = (
    ('casa', {'Casa':1, 'Fora':0}),
    ('atletas.clube.id.full.name', team_codes),
    ('opponent', team_codes),
    ('atletas.posicao_id', dict_positions(to_int=True)),
)
for column, encoder in column_encoders:
    df_samples[column] = df_samples[column].map(encoder)
df_samples.head()

Unnamed: 0,atletas.atleta_id,atletas.rodada_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,home_score,away_score,goals_dif,casa,opponent,CA,CV,DD,DP,FC,GC,GS,RB,SG,A,FD,FF,FS,FT,G,I,PE,PP
0,37623,2,2,19,-1.2,2.12,-1.98,-1.2,2,2.0,1.0,1.0,1,5,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
1,37851,2,4,18,1.7,6.25,-0.61,1.7,2,2.0,0.0,2.0,1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,74061,2,1,6,11.0,19.92,3.65,11.0,2,2.0,0.0,2.0,1,15,0.0,0.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,79334,2,2,1,0.8,3.58,-0.52,0.8,2,0.0,3.0,-3.0,1,11,1.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,85300,2,5,11,0.0,4.95,0.0,0.0,1,0.0,3.0,-3.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0


In [20]:
# write the encoded samples to disk (without the index)
samples_csv_path = '../../db/2017/cartola_2017_samples.csv'
df_samples.to_csv(samples_csv_path, index=False)