# 0. Libraries used and Settings

In [128]:
import numpy as np
import pandas as pd

# Let pandas display up to 40 columns so the 37-column dataset loaded below
# is shown without column truncation.
pd.set_option('display.max_columns', 40)

# 1. Data Wrangling 

### 1.1 Import

In [129]:
# Load the raw 2017 Cartola player/round dataset, report its dimensions and
# preview the first ten rows.
df = pd.read_csv(filepath_or_buffer='../../db/2017/cartola_2017.csv')
print("Original data shape: ", df.shape)
df.head(n=10)

Original data shape:  (8818, 37)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa
0,Juan Silveira dos Santos,36540,Juan,https://s.glbimg.com/es/sde/f/2017/04/24/b3a08...,0,FLA,zag,Flamengo,Nulo,0.0,5.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
1,José Roberto da Silva Júnior,36612,Zé Roberto,https://s.glbimg.com/es/sde/f/2017/04/18/83977...,0,PAL,lat,Palmeiras,Provável,0.0,8.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
2,Paulo Autuori,36943,Paulo Autuori,https://s.glbimg.com/es/sde/f/2017/04/23/51bd7...,0,ATL,tec,,Provável,0.0,10.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
3,Augusto Sérgio Ferreira,37245,Guto Ferreira,https://s.glbimg.com/es/sde/f/2017/02/21/f5264...,0,BAH,tec,,Provável,0.0,4.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
4,Ney Franco da Silveira Júnior,37246,Ney Franco,https://s.glbimg.com/es/sde/f/2017/05/01/e3a7e...,0,SPO,tec,,Provável,0.0,4.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
5,Luis Antônio Venker de Menezes,37281,Mano Menezes,https://s.glbimg.com/es/sde/f/2016/08/09/24673...,0,CRU,tec,Cruzeiro,Provável,0.0,12.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
6,Gilson Kleina,37306,Gilson Kleina,https://s.glbimg.com/es/sde/f/2017/04/25/170c2...,0,PON,tec,Ponte Preta,Provável,0.0,4.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
7,Marcelo Ribeiro Cabo,37333,Marcelo Cabo,https://s.glbimg.com/es/sde/f/2017/04/28/b7b4b...,0,ATL,tec,,Provável,0.0,2.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
8,Paulo André Cren Benini,37604,Paulo André,https://s.glbimg.com/es/sde/f/2017/04/23/755b6...,0,ATL,zag,Atlético-PR,Nulo,0.0,6.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,
9,Michel Fernandes Bastos,37607,Michel Bastos,https://s.glbimg.com/es/sde/f/2017/04/18/fa3a7...,0,PAL,mei,Palmeiras,Nulo,0.0,8.0,0.0,0.0,0,,,,,,,,,,,,,,,,,,,,,,,


### 1.2 Cleaning

In [130]:
# Sorted arrays of the distinct club short codes and club full names found in
# the raw data. They define the ordering used by dict_teams() to number clubs.
# NOTE(review): the raw preview shows rows with a missing full name, so
# teams_full presumably ends with a NaN entry (pandas sorts NaN last) - confirm.
teams_short = df['atletas.clube_id'].drop_duplicates().sort_values().values
teams_full = df['atletas.clube.id.full.name'].drop_duplicates().sort_values().values

def dict_positions(to_int = True):
    """Return the mapping between position codes and their integer ids.

    Parameters
    ----------
    to_int : bool, default True
        If True, return ``{code: id}`` (e.g. ``'tec' -> 0``); otherwise return
        the inverse ``{id: code}`` mapping.

    Returns
    -------
    dict
        Mapping for the six position codes 'tec', 'gol', 'zag', 'lat', 'mei'
        and 'ata', numbered 0 through 5 in that order.
    """
    code_to_id = dict(zip(('tec', 'gol', 'zag', 'lat', 'mei', 'ata'), range(6)))
    if to_int:
        return code_to_id
    return {pos_id: code for code, pos_id in code_to_id.items()}

def dict_teams(to_int = True, full_name = True):
    """Return the mapping between club names and 1-based integer ids.

    Ids follow the order of the module-level ``teams_full`` (or
    ``teams_short``) array, starting at 1.

    Parameters
    ----------
    to_int : bool, default True
        If True, return ``{name: id}``; otherwise return ``{id: name}``.
    full_name : bool, default True
        If True, use the clubs' full names (``teams_full``); otherwise use
        their short codes (``teams_short``).

    Returns
    -------
    dict
        The requested name/id mapping.
    """
    names = teams_full if full_name else teams_short
    name_to_id = {}
    for team_id, team in enumerate(names, start=1):
        name_to_id[team] = team_id
    if to_int:
        return name_to_id
    return {team_id: team for team, team_id in name_to_id.items()}

In [131]:
# Drop every row whose scout columns are all NaN. The scout columns are the
# contiguous block from 'PE' up to (but not including) 'home_score'.
scouts_columns = df.columns[df.columns.get_loc('PE'):df.columns.get_loc('home_score')]
df_clean = df.dropna(how='all', subset=list(scouts_columns))
print('#players with scouts: ', df_clean.shape[0])

# Remove rows from round 0 (sanity check only!).
# .copy() detaches the filtered frame from `df`, so the column assignments
# below operate on an independent DataFrame instead of a slice (this avoids
# pandas' SettingWithCopyWarning / chained-assignment ambiguity).
df_clean = df_clean[df_clean['atletas.rodada_id'] > 0].copy()
print("#rows from 1st round: ", df_clean.shape[0])

# Map "casa", "atletas.clube_id" and "atletas.posicao_id" to integer numbers.
df_clean['casa'] = df_clean['casa'].map({'Casa':1, 'Fora':0})
# NOTE(review): short club codes are not unique per club - in the displayed
# result both Atlético-PR and Atlético-MG end up with clube_id 1. Consider
# mapping from 'atletas.clube.id.full.name' instead once downstream usage of
# these ids has been checked.
df_clean['atletas.clube_id'] = df_clean['atletas.clube_id'].map(dict_teams(to_int=True, full_name=False))
df_clean['atletas.posicao_id'] = df_clean['atletas.posicao_id'].map(dict_positions(to_int=True))

# Fill the remaining NaNs with 0 (reassigned rather than done in place, so the
# step is explicit). Note this applies to every column, including text ones.
df_clean = df_clean.fillna(value=0)

print()
print("Data shape after cleaning: ", df_clean.shape)
df_clean.head(10)

#players with scouts:  3986
#rows from 1st round:  3986

Data shape after cleaning:  (3986, 37)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa
774,Carlos Alberto Gomes de Jesus,37652,Carlos Alberto,https://s.glbimg.com/es/sde/f/2017/04/23/e6ee0...,10,1,4,Atlético-PR,Nulo,0.0,5.72,0.0,1.55,2,0.0,6.0,0.0,2.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,1.0
775,Lins Lima de Britto,53669,Lins,https://s.glbimg.com/es/sde/f/2017/05/02/152a2...,10,12,5,Ponte Preta,Provável,-0.8,3.84,-0.16,0.92,8,0.0,16.0,0.0,11.0,6.0,1.0,4.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,1.0
776,Ebert Willian Amâncio,37646,Betão,https://s.glbimg.com/es/sde/f/2017/04/28/89274...,10,2,2,Avaí,Provável,5.0,6.69,0.92,2.06,9,0.0,22.0,3.0,6.0,11.0,0.0,7.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-2.0,0.0
777,Paulo André Cren Benini,37604,Paulo André,https://s.glbimg.com/es/sde/f/2017/04/23/755b6...,10,1,2,Atlético-PR,Nulo,0.0,4.08,0.0,0.87,4,0.0,10.0,0.0,4.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,1.0
778,Juan Silveira dos Santos,36540,Juan,https://s.glbimg.com/es/sde/f/2017/04/24/b3a08...,10,8,2,Flamengo,Dúvida,0.0,5.92,0.0,3.98,5,0.0,6.0,1.0,4.0,4.0,0.0,8.0,2.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0
779,Fábio Deivson Lopes Maciel,37656,Fábio,https://s.glbimg.com/es/sde/f/2017/04/05/8723e...,10,7,1,Cruzeiro,Provável,10.9,14.19,2.53,4.52,10,0.0,6.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0
780,José Roberto da Silva Júnior,36612,Zé Roberto,https://s.glbimg.com/es/sde/f/2017/04/18/83977...,10,11,3,Palmeiras,Nulo,0.0,7.28,0.0,1.77,3,0.0,6.0,1.0,6.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,0.0
781,Rafael Martiniano de Miranda Moura,37655,Rafael Moura,https://s.glbimg.com/es/sde/f/2017/04/03/0262c...,10,1,5,Atlético-MG,Nulo,-1.8,5.27,-1.81,2.66,8,0.0,7.0,0.0,14.0,13.0,7.0,2.0,3.0,0.0,2.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0
783,Michel Fernandes Bastos,37607,Michel Bastos,https://s.glbimg.com/es/sde/f/2017/04/18/fa3a7...,10,11,4,Palmeiras,Contundido,0.0,5.01,0.0,0.44,5,0.0,21.0,0.0,1.0,2.0,2.0,5.0,1.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,0.0
784,Jonathan Cícero Moreira,37662,Jonathan,https://s.glbimg.com/es/sde/f/2017/04/23/2e907...,10,1,3,Atlético-PR,Provável,0.0,8.14,0.0,3.17,8,0.0,20.0,3.0,4.0,8.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,1.0


### 1.3 Create Features

In [132]:
# Load the 2017 Brasileirão fixture list (one row per match), then show its
# dimensions and its first five rows.
df_matches = pd.read_csv(filepath_or_buffer='../../db/2017/matches-brasileirao-2017.csv')
print(df_matches.shape)
df_matches.head(n=5)

(190, 9)


Unnamed: 0.1,Unnamed: 0,game,round,date,home_team,score,away_team,arena,X
0,1,1,1,14/05/2017 - 11:00,Fluminense - RJ,3 x 2,Santos - SP,Maracanã - Rio de Janeiro - RJ,
1,2,2,1,13/05/2017 - 16:00,Flamengo - RJ,1 x 1,Atlético - MG,Maracanã - Rio de Janeiro - RJ,
2,3,3,1,14/05/2017 - 16:00,Palmeiras - SP,4 x 0,Vasco da Gama - RJ,Allianz Parque - Sao Paulo - SP,
3,4,4,1,13/05/2017 - 19:00,Corinthians - SP,1 x 1,Chapecoense - SC,Arena Corinthians - Sao Paulo - SP,
4,5,5,1,14/05/2017 - 16:00,Cruzeiro - MG,1 x 0,São Paulo - SP,Mineirão - Belo Horizonte - MG,


In [133]:
def clean_team(xs):
    """Normalise a fixture team label such as ``"Fluminense - RJ"``.

    Rules:
      * labels containing "Atlético" keep their state suffix, with all spaces
        removed (``"Atlético - MG"`` -> ``"Atlético-MG"``);
      * labels containing "Vasco da Gama" become ``"Vasco"``;
      * any other label is cut at the first "-" and stripped of trailing
        whitespace (``"São Paulo - SP"`` -> ``"São Paulo"``).

    The function is idempotent: a label with no "-" is returned unchanged
    (apart from trailing whitespace), so re-running the cleaning cell does not
    truncate names that were already cleaned.

    Parameters
    ----------
    xs : str
        Raw (or already cleaned) team label.

    Returns
    -------
    str
        The normalised team name.
    """
    if "Atlético" in xs:
        return xs.replace(" ", "")
    if "Vasco da Gama" in xs:
        return "Vasco"

    # partition() yields the whole string when there is no "-", unlike
    # slicing with find(), whose -1 "not found" result would chop characters.
    name, _, _ = xs.partition("-")
    return name.rstrip()

# Normalise both team columns of the fixture table with clean_team().
# NOTE: this overwrites df_matches in place, so re-running the cell feeds
# already-cleaned names back into clean_team().
df_matches['home_team'] = df_matches['home_team'].apply(clean_team)
df_matches['away_team'] = df_matches['away_team'].apply(clean_team)
df_matches.head(n=5)

Unnamed: 0.1,Unnamed: 0,game,round,date,home_team,score,away_team,arena,X
0,1,1,1,14/05/2017 - 11:00,Fluminense,3 x 2,Santos,Maracanã - Rio de Janeiro - RJ,
1,2,2,1,13/05/2017 - 16:00,Flamengo,1 x 1,Atlético-MG,Maracanã - Rio de Janeiro - RJ,
2,3,3,1,14/05/2017 - 16:00,Palmeiras,4 x 0,Vasco,Allianz Parque - Sao Paulo - SP,
3,4,4,1,13/05/2017 - 19:00,Corinthians,1 x 1,Chapecoense,Arena Corinthians - Sao Paulo - SP,
4,5,5,1,14/05/2017 - 16:00,Cruzeiro,1 x 0,São Paulo,Mineirão - Belo Horizonte - MG,


In [150]:
def get_opponent_for_player(player_row, matches=None):
    """Look up the opposing team for one player/round row.

    The fixture is found by matching the row's round and the player's team on
    the side indicated by ``casa`` (home side when ``casa == 1``, away side
    otherwise); the team on the other side of that fixture is returned.

    Parameters
    ----------
    player_row : mapping (e.g. a DataFrame row)
        Must provide 'atletas.clube.id.full.name', 'atletas.rodada_id' and
        'casa'.
    matches : pandas.DataFrame, optional
        Fixture table with 'round', 'home_team' and 'away_team' columns.
        Defaults to the notebook-level ``df_matches``, which keeps
        ``df.apply(get_opponent_for_player, axis=1)`` working unchanged.

    Returns
    -------
    str
        The opponent's name, or "" when no matching fixture exists.
    """
    if matches is None:
        matches = df_matches

    player_team = player_row['atletas.clube.id.full.name']
    round_id = player_row['atletas.rodada_id']

    # Pick which fixture column holds the player's team and which the opponent.
    if player_row['casa'] == 1:
        own_side, other_side = 'home_team', 'away_team'
    else:
        own_side, other_side = 'away_team', 'home_team'

    is_fixture = (matches['round'] == round_id) & (matches[own_side] == player_team)
    opponent = matches.loc[is_fixture, other_side].values

    return opponent[0] if len(opponent) > 0 else ""

# Add an 'opponent' column by resolving each player row against the fixture
# table, then report the new dimensions and preview the result.
df_clean['opponent'] = df_clean.apply(get_opponent_for_player, axis='columns')
print(df_clean.shape)
df_clean.head(n=10)

(3986, 38)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,RB,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa,opponent
774,Carlos Alberto Gomes de Jesus,37652,Carlos Alberto,https://s.glbimg.com/es/sde/f/2017/04/23/e6ee0...,10,1,4,Atlético-PR,Nulo,0.0,5.72,0.0,1.55,2,0.0,6.0,0.0,2.0,5.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,1.0,Vitória
775,Lins Lima de Britto,53669,Lins,https://s.glbimg.com/es/sde/f/2017/05/02/152a2...,10,12,5,Ponte Preta,Provável,-0.8,3.84,-0.16,0.92,8,0.0,16.0,0.0,11.0,6.0,1.0,4.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,1.0,Palmeiras
776,Ebert Willian Amâncio,37646,Betão,https://s.glbimg.com/es/sde/f/2017/04/28/89274...,10,2,2,Avaí,Provável,5.0,6.69,0.92,2.06,9,0.0,22.0,3.0,6.0,11.0,0.0,7.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-2.0,0.0,Botafogo
777,Paulo André Cren Benini,37604,Paulo André,https://s.glbimg.com/es/sde/f/2017/04/23/755b6...,10,1,2,Atlético-PR,Nulo,0.0,4.08,0.0,0.87,4,0.0,10.0,0.0,4.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,1.0,Vitória
778,Juan Silveira dos Santos,36540,Juan,https://s.glbimg.com/es/sde/f/2017/04/24/b3a08...,10,8,2,Flamengo,Dúvida,0.0,5.92,0.0,3.98,5,0.0,6.0,1.0,4.0,4.0,0.0,8.0,2.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,Bahia
779,Fábio Deivson Lopes Maciel,37656,Fábio,https://s.glbimg.com/es/sde/f/2017/04/05/8723e...,10,7,1,Cruzeiro,Provável,10.9,14.19,2.53,4.52,10,0.0,6.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,Coritiba
780,José Roberto da Silva Júnior,36612,Zé Roberto,https://s.glbimg.com/es/sde/f/2017/04/18/83977...,10,11,3,Palmeiras,Nulo,0.0,7.28,0.0,1.77,3,0.0,6.0,1.0,6.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,0.0,Ponte Preta
781,Rafael Martiniano de Miranda Moura,37655,Rafael Moura,https://s.glbimg.com/es/sde/f/2017/04/03/0262c...,10,1,5,Atlético-MG,Nulo,-1.8,5.27,-1.81,2.66,8,0.0,7.0,0.0,14.0,13.0,7.0,2.0,3.0,0.0,2.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,Chapecoense
783,Michel Fernandes Bastos,37607,Michel Bastos,https://s.glbimg.com/es/sde/f/2017/04/18/fa3a7...,10,11,4,Palmeiras,Contundido,0.0,5.01,0.0,0.44,5,0.0,21.0,0.0,1.0,2.0,2.0,5.0,1.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,-1.0,0.0,Ponte Preta
784,Jonathan Cícero Moreira,37662,Jonathan,https://s.glbimg.com/es/sde/f/2017/04/23/2e907...,10,1,3,Atlético-PR,Provável,0.0,8.14,0.0,3.17,8,0.0,20.0,3.0,4.0,8.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,3.0,1.0,Vitória


In [152]:
# Keep only the fixture columns needed to join matches onto player rows.
df_matches_clean = df_matches.loc[:, ['round', 'home_team', 'away_team']]
print(df_matches_clean.shape)
df_matches_clean.head(n=5)

(190, 3)


Unnamed: 0,round,home_team,away_team
0,1,Fluminense,Santos
1,1,Flamengo,Atlético-MG
2,1,Palmeiras,Vasco
3,1,Corinthians,Chapecoense
4,1,Cruzeiro,São Paulo


In [153]:
# Inner-join player rows with fixtures on (team, round), matching the player's
# team against the fixture's *away* side only. Rows whose team was not the
# away team in that round are therefore dropped from df_merge.
df_merge = pd.merge(
    df_clean,
    df_matches_clean,
    how='inner',
    left_on=['atletas.clube.id.full.name', 'atletas.rodada_id'],
    right_on=['away_team', 'round'],
)
print(df_merge.shape)
df_merge.head(n=5)

(1995, 41)


Unnamed: 0,atletas.nome,atletas.atleta_id,atletas.apelido,atletas.foto,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.clube.id.full.name,atletas.status_id,atletas.pontos_num,atletas.preco_num,atletas.variacao_num,atletas.media_num,atletas.jogos_num,athletes.atletas.scout,PE,SG,FC,FS,I,...,FD,A,G,FF,DD,CA,GS,FT,CV,PP,DP,GC,home_score,away_score,goals_dif,casa,opponent,round,home_team,away_team
0,Ebert Willian Amâncio,37646,Betão,https://s.glbimg.com/es/sde/f/2017/04/28/89274...,10,2,2,Avaí,Provável,5.0,6.69,0.92,2.06,9,0.0,22.0,3.0,6.0,11.0,0.0,...,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-2.0,0.0,Botafogo,10,Botafogo,Avaí
1,Juan Maldonado Jaimez Júnior,37705,Juan,https://s.glbimg.com/es/sde/f/2017/06/07/4171a...,10,2,4,Avaí,Provável,1.7,4.95,0.14,1.96,7,0.0,25.0,0.0,12.0,15.0,0.0,...,2.0,0.0,1.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-2.0,0.0,Botafogo,10,Botafogo,Avaí
2,Gustavo Franchin Schiavolin,37840,Gustavo,https://s.glbimg.com/es/sde/f/2017/04/28/1a067...,10,2,2,Avaí,Nulo,0.0,4.14,0.0,-0.6,3,0.0,4.0,0.0,2.0,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-2.0,0.0,Botafogo,10,Botafogo,Avaí
3,Marcos Vicente dos Santos,38150,Marquinhos,https://s.glbimg.com/es/sde/f/2017/04/28/16b2b...,10,2,4,Avaí,Nulo,0.0,2.81,0.0,-0.09,8,0.0,33.0,0.0,3.0,9.0,0.0,...,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,-2.0,0.0,Botafogo,10,Botafogo,Avaí
4,Vinícius Pacheco dos Santos,38400,Vinícius Pacheco,https://s.glbimg.com/es/sde/f/2017/04/28/c75c0...,10,2,4,Avaí,Nulo,0.0,2.8,0.0,-0.15,2,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-2.0,0.0,Botafogo,10,Botafogo,Avaí
