In [1]:
import concurrent.futures
import missingno as msno
import traceback
import pandas as pd
from GameFeatures import GameFeatures
from PreGameFeatures import PreGameFeatures
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


In [2]:
# Load the list of games; the `link` column of this frame feeds the crawler further down.
df = pd.read_csv('gamesDB.csv')

In [3]:
# Remove the index column that was saved into the CSV.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Parse the `data` column (match timestamps) into datetime values.
df['data'] = pd.to_datetime(df['data'])

In [5]:
# Boolean selector for games played from 2010 onwards.
mask = df['data'].dt.year.gt(2009)

In [6]:
# Preview the first games selected by the year filter.
df.loc[mask].head()

Unnamed: 0,data,time_casa,time_visitante,resultado,id_partida,link
1140,2010-05-08 19:30:00,Botafogo,Santos,3-3,gsm_id_919738,https://www.academiadasapostasbrasil.com/stats...
1141,2010-05-08 19:30:00,Atlético GO,Grêmio,0-0,gsm_id_919738,https://www.academiadasapostasbrasil.com/stats...
1142,2010-05-08 19:35:00,Palmeiras,Vitória,1-0,gsm_id_919738,https://www.academiadasapostasbrasil.com/stats...
1143,2010-05-09 17:00:00,Flamengo,São Paulo,1-1,gsm_id_919738,https://www.academiadasapostasbrasil.com/stats...
1144,2010-05-09 17:00:00,Atlético MG,Vasco,2-1,gsm_id_919738,https://www.academiadasapostasbrasil.com/stats...


In [7]:
# URLs of the match pages, one per game.
links = df['link']

In [8]:
# Discard the first 3 seasons (380 games x 3 = 1140 rows) because the site has no data for them.
# The slice is taken from df.link rather than from `links` itself so that re-running
# this cell does not drop a further 1140 rows each time.
links = df.link.iloc[1140:]

Executa as chamadas utilizando 6 Threads simultâneas. Cada Thread executa chamada pra duas páginas:
- GameFeatures ({url}/live)
- PreGameFeatures ({url}/prelive)

Após executar o processamento, combina alguns dicts e os adiciona numa lista chamada rows. Essa lista será usada pra criação de um DataFrame.

In [None]:
def fetch_game_row(url, prelive_suffix='prelive'):
    """Scrape one game and return its flattened feature row.

    Both pages of the game are fetched here so that, when this function is
    submitted to the thread pool, the live page and the pre-game page are
    downloaded by the same worker instead of the pre-game page being fetched
    serially in the main thread.

    Parameters
    ----------
    url : str
        Address of the game's live page. The last four characters are
        replaced by ``prelive_suffix`` to build the pre-game address
        (assumes the address ends in 'live' -- verify against the CSV).
    prelive_suffix : str
        Suffix of the pre-game page.

    Returns
    -------
    dict
        Game metadata, event lists, line-ups, the two statistics dicts found
        at positions 13 and 14 of the GameFeatures result, and the pre-game
        features, merged into a single mapping.

    Any exception raised while scraping propagates to the caller through the
    future's ``result()``.
    """
    data = GameFeatures(url)
    game = {
        'url_live': url,
        'home_team': data[0],
        'away_team': data[1],
        'home_team_score': data[2],
        'away_team_score': data[3],
        'referee': data[4],
        'stadium': data[5],
        'city': data[6],
        'datetime': data[7],
        'round': data[8],
    }
    events = {'eventList1st': data[9], 'eventList2nd': data[10]}
    lineup = {'lineupList': data[11], 'sublineupList': data[12]}

    # NOTE(review): PreGameFeatures now runs inside worker threads; confirm it
    # holds no shared mutable state (GameFeatures was already used this way).
    pre_game = PreGameFeatures(url[:-4] + prelive_suffix, data[0], data[1]).build()
    return {**game, **events, **lineup, **data[13], **data[14], **pre_game}


games_len = len(links)
# Plain f-string placeholder: the former '${...}' printed a stray '$'.
print(f'games_len: {games_len}')
sfx = 'prelive'
rows = []
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(fetch_game_row, url, sfx): url for url in links}
    for i, future in enumerate(concurrent.futures.as_completed(future_to_url)):
        url = future_to_url[future]
        try:
            # result() re-raises whatever the worker raised, so a failing game
            # is reported below and simply left out of `rows`.
            rows.append(future.result())
            print('Game number: '+str(i+1)+'/'+str(games_len))
        except Exception:
            print('%r generated an exception' % (url))
            print(traceback.format_exc())

games_len: $3420
Game number: 1/3420
Game number: 2/3420
Game number: 3/3420
Game number: 4/3420
Game number: 5/3420
Game number: 6/3420
Game number: 7/3420
Game number: 8/3420
Game number: 9/3420
Game number: 10/3420
Game number: 11/3420
Game number: 12/3420
Game number: 13/3420
Game number: 14/3420
Game number: 15/3420
Game number: 16/3420
Game number: 17/3420
Game number: 18/3420
Game number: 19/3420
Game number: 20/3420
Game number: 21/3420
Game number: 22/3420
Game number: 23/3420
Game number: 24/3420
Game number: 25/3420
Game number: 26/3420
Game number: 27/3420
Game number: 28/3420
Game number: 29/3420
Game number: 30/3420
Game number: 31/3420
Game number: 32/3420
Game number: 33/3420
Game number: 34/3420
Game number: 35/3420
Game number: 36/3420
Game number: 37/3420
Game number: 38/3420


In [None]:
# One row per game collected in `rows` by the crawler cell.
df_csv = pd.DataFrame(rows)

In [None]:
# Inspect the column names produced by the crawl.
df_csv.columns

In [None]:
# Show every column of the dataset (no truncation of wide frames).
# Note: set_option changes the display setting for the rest of the session.
pd.set_option('display.max_columns', None) 
df_csv.head()

In [None]:
# Persist the crawl so it can be reloaded below without scraping again.
# Note: the index is written as well (index=False is not passed).
df_csv.to_csv('crawler.csv')

In [5]:
# Reload the crawled dataset from disk.
# Note: this rebinds `df`, which earlier in the notebook held gamesDB.csv.
df = pd.read_csv('crawler.csv')

In [None]:
!pip install missingno

In [None]:
# Bar chart of how many non-missing values each column has.
msno.bar(df)

In [None]:
# Column names of the reloaded crawl.
df.columns

In [None]:
# (rows, columns) of the reloaded crawl.
df.shape

In [None]:
# Number of games with a non-missing `shots_woodwork` value.
df['shots_woodwork'].count()

In [None]:
# The `datetime` column holds Portuguese dates such as '08 maio 2010 - 19:30'.
# Month names (%B) are matched according to the time locale, so only LC_TIME is
# switched to pt_BR; the previous LC_ALL call changed every locale category
# (number formatting, collation, ...) for the whole process.
import locale
locale.setlocale(locale.LC_TIME, 'pt_BR.UTF-8')
tempo = pd.to_datetime(df['datetime'], format='%d %B %Y - %H:%M')


In [None]:
# Number of games with a non-missing `shots_target` value.
# NOTE(review): the following cells use `shots_on_target`; confirm that a
# `shots_target` column really exists in crawler.csv.
df.shots_target.count()

In [None]:
# Number of games with a non-missing `shots_on_target` value
# (the closing parenthesis of print() was missing, which is a SyntaxError).
print(df.shots_on_target.count())

In [None]:
# Frequency of each `shots_on_target` value, with missing values counted too.
df['shots_on_target'].value_counts(dropna=False)

In [6]:
# Games for which no possession figure was scraped.
filtro = df['possession'].isna()
colunas_exibidas = ['datetime', 'home_team', 'away_team', 'round', 'possession']
df.loc[filtro, colunas_exibidas]


Unnamed: 0,datetime,home_team,away_team,round,possession
0,08 maio 2010 - 19:30,Botafogo,Santos,1,
1,08 maio 2010 - 19:30,Atlético GO,Grêmio,1,
2,09 maio 2010 - 17:00,Atlético MG,Vasco,1,
3,09 maio 2010 - 17:00,Internacional,Cruzeiro,1,
4,08 maio 2010 - 19:35,Palmeiras,Vitória,1,
5,09 maio 2010 - 17:00,Flamengo,São Paulo,1,
6,09 maio 2010 - 17:00,Corinthians,Atlético PR,1,
7,09 maio 2010 - 19:30,Ceará,Fluminense,1,
8,09 maio 2010 - 19:30,Guarani,Goiás,1,
9,15 maio 2010 - 19:30,Fluminense,Atlético GO,2,


In [None]:
import ast
# `lineupList` comes back from the CSV as text; literal_eval turns the first
# game's value back into a Python object.
lineup = ast.literal_eval(df.lineupList[0])
# Was type(lista): `lista` is not defined at this point of the notebook.
print(type(lineup))
print(f'TIPO: {type(lineup[0])} \t  TITULARES: {lineup[0]}')
# Report the type of the element actually printed (index 1, not 0).
print(f'TIPO: {type(lineup[1])} \t RESERVAS: {lineup[1]}')

In [5]:
# Sample match pages for trying GameFeatures by hand; only `url` is used in this cell.
url = 'https://www.academiadasapostasbrasil.com/stats/match/brasil-stats/brasileirao-serie-a/palmeiras/portuguesa/1269317/1/live'
url2= 'https://www.academiadasapostasbrasil.com/stats/match/brasil-stats/brasileirao-serie-a/figueirense/nautico/1269322/1/live'
url3 = 'https://www.academiadasapostasbrasil.com/stats/match/brasil-stats/brasileirao-serie-a/botafogo/santos/919738/1/live'
# Run the scraper on a single game.
data = GameFeatures(url)
# In the recorded output, positions 13 and 14 are the statistics dicts with
# 'h_'-prefixed and 'a_'-prefixed keys respectively.
print(data[13])
print(data[14])

{'h_Posse de bola': '53%', 'h_Chutes a gol': '16', 'h_Chutes fora': '4', 'h_Impedimentos': '0', 'h_Faltas': '16', 'h_Escanteios': '9'}
{'a_Posse de bola': '47%', 'a_Chutes a gol': '9', 'a_Chutes fora': '9', 'a_Impedimentos': '2', 'a_Faltas': '20', 'a_Escanteios': '11'}


In [6]:
# Statistics of a game whose page has data: 'h_' keys and 'a_' keys.
dict1_jogo_h = {
    'h_Posse de bola': '53%',
    'h_Chutes a gol': '16',
    'h_Chutes fora': '4',
    'h_Impedimentos': '0',
    'h_Faltas': '16',
    'h_Escanteios': '9',
}
dict1_jogo_a = {
    'a_Posse de bola': '47%',
    'a_Chutes a gol': '9',
    'a_Chutes fora': '9',
    'a_Impedimentos': '2',
    'a_Faltas': '20',
    'a_Escanteios': '11',
}

# A game without statistics yields two empty dicts.
dict2_jogo_h = {}
dict2_jogo_a = {}

# Merge each pair into one record ('h_' keys first, then 'a_' keys).
dict_jogo1 = dict(dict1_jogo_h)
dict_jogo1.update(dict1_jogo_a)
dict_jogo2 = dict(dict2_jogo_h)
dict_jogo2.update(dict2_jogo_a)

lista = [dict_jogo1, dict_jogo2]



In [7]:
# Build a frame from the two sample records (one filled, one empty).
dfteste = pd.DataFrame(lista)

In [8]:
# NaN values correspond to the games that have no data.
dfteste.head()

Unnamed: 0,a_Chutes a gol,a_Chutes fora,a_Escanteios,a_Faltas,a_Impedimentos,a_Posse de bola,h_Chutes a gol,h_Chutes fora,h_Escanteios,h_Faltas,h_Impedimentos,h_Posse de bola
0,9.0,9.0,11.0,20.0,2.0,47%,16.0,4.0,9.0,16.0,0.0,53%
1,,,,,,,,,,,,
