# Import Libs

In [15]:
# for scraping data
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# for SQL database
from mysql.connector import MySQLConnection, Error
from configparser import ConfigParser


import time

from db_helpers import read_db_config, execute, insert

# Scrape list of matches

In [2]:
# Tournament results listing on ESPNcricinfo's stats site; the cells below
# parse its first HTML table into one record per row.
url = "https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=12357;type=tournament"

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as a confusing parsing failure.
r = requests.get(url, timeout=30)
r.raise_for_status()

## Extract Table and create dataframe

In [3]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(r.text, 'html.parser')

# Only the first <table> of the page is used by the following cells, so the
# search stops after the first match.
table = soup.find_all('table', limit=1)[0]


In [4]:
SERIES_ID = 1144415  # stored per match; later sent as `seriesId` to the match APIs
STATS_HOST = "https://stats.espncricinfo.com"


def _first_href(cell):
    """Return the href of the first link found inside a table cell."""
    return cell.find('a', href=True)['href']


def _id_from_href(href, segment):
    """Return the integer that precedes the first '.' in one '/'-separated segment of href."""
    return int(href.split('/')[segment].split('.')[0])


tmp_rows = []
# The first <tr> is skipped (presumably the header row); every remaining row
# is parsed as one match.
for i, row in enumerate(table.find_all('tr')[1:]):
    cols = row.find_all('td')
    match_href = _first_href(cols[6])
    tmp_rows.append({
        'match_num': i + 1,
        'date': cols[5].text.strip(),
        'team1': cols[0].text.strip(),
        'team2': cols[1].text.strip(),
        'id_team1': _id_from_href(_first_href(cols[0]), 4),
        'id_team2': _id_from_href(_first_href(cols[1]), 4),
        'winner': cols[2].text.strip(),
        'margin': cols[3].text.strip(),
        # Apostrophes are removed from ground names.
        'ground': cols[4].text.strip().replace("'", ""),
        'series_id': SERIES_ID,
        'match_id': _id_from_href(match_href, -1),
        'url': STATS_HOST + match_href,
    })

# Build the frame in one step; concatenating onto an empty DataFrame adds
# nothing and is deprecated behaviour in recent pandas releases.
data = pd.DataFrame(tmp_rows)

# Split e.g. "104 runs" into number and unit. n=1 caps the result at two
# pieces and reindex() guarantees both columns exist, so the two-column
# assignment cannot fail on an unexpected margin format. Missing or empty
# pieces become the string 'NULL'.
margin_parts = (
    data['margin']
    .str.split(' ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
data[['margin', 'margin_type']] = margin_parts.fillna('NULL').replace('', 'NULL')
data['date'] = pd.to_datetime(data['date']).dt.strftime('%Y-%m-%d')

# Preview only; the full frame stays available in `data`.
data.head(10)

Unnamed: 0,match_num,date,team1,team2,id_team1,id_team2,winner,margin,ground,series_id,match_id,url,margin_type
0,1,2019-05-30,England,South Africa,1,3,England,104.0,The Oval,1144415,1144483,https://stats.espncricinfo.com/ci/engine/match...,runs
1,2,2019-05-31,Pakistan,West Indies,7,4,West Indies,7.0,Nottingham,1144415,1144484,https://stats.espncricinfo.com/ci/engine/match...,wickets
2,3,2019-06-01,New Zealand,Sri Lanka,5,8,New Zealand,10.0,Cardiff,1144415,1144485,https://stats.espncricinfo.com/ci/engine/match...,wickets
3,4,2019-06-01,Afghanistan,Australia,40,2,Australia,7.0,Bristol,1144415,1144486,https://stats.espncricinfo.com/ci/engine/match...,wickets
4,5,2019-06-02,Bangladesh,South Africa,25,3,Bangladesh,21.0,The Oval,1144415,1144487,https://stats.espncricinfo.com/ci/engine/match...,runs
5,6,2019-06-03,England,Pakistan,1,7,Pakistan,14.0,Nottingham,1144415,1144488,https://stats.espncricinfo.com/ci/engine/match...,runs
6,7,2019-06-04,Afghanistan,Sri Lanka,40,8,Sri Lanka,34.0,Cardiff,1144415,1144489,https://stats.espncricinfo.com/ci/engine/match...,runs
7,8,2019-06-05,India,South Africa,6,3,India,6.0,Southampton,1144415,1144490,https://stats.espncricinfo.com/ci/engine/match...,wickets
8,9,2019-06-05,Bangladesh,New Zealand,25,5,New Zealand,2.0,The Oval,1144415,1144491,https://stats.espncricinfo.com/ci/engine/match...,wickets
9,10,2019-06-06,Australia,West Indies,2,4,Australia,15.0,Nottingham,1144415,1144492,https://stats.espncricinfo.com/ci/engine/match...,runs


# Create and Populate the `main` Table

In [6]:
table_name = 'main'

# Drop any existing table of this name before it is recreated in the next cell.
q = "\n" + f"drop table if exists {table_name};" + "\n"

execute(q, multi=False)

Connected...
Executing...
Done...


[]

In [7]:
# Column definitions of the match-results table, in DDL order.
# Only `margin` and `margin_type` are nullable.
column_defs = [
    "match_num int NOT NULL",
    "date date NOT NULL",
    "team1 varchar(30) NOT NULL",
    "team2 varchar(30) NOT NULL",
    "id_team1 int NOT NULL",
    "id_team2 int NOT NULL",
    "winner varchar(30) NOT NULL",
    "margin int",
    "margin_type varchar(10)",
    "ground varchar(50) NOT NULL",
    "series_id int NOT NULL",
    "match_id int NOT NULL",
    "url varchar(500) NOT NULL",
]

# Assemble the CREATE TABLE statement with one column definition per line.
q = "\ncreate table " + table_name + "(\n" + ",\n".join(column_defs) + "\n);\n\n"

execute(q, multi=False)

Connected...
Executing...
Done...


[]

In [8]:
columns = ('match_num', 'date', 'team1', 'team2', 'id_team1', 'id_team2', 'winner',
           'margin', 'margin_type', 'ground', 'series_id', 'match_id', 'url')

# Parameterised INSERT: one %s placeholder per column, joined explicitly
# rather than derived from the tuple's repr.
placeholders = ', '.join(['%s'] * len(columns))
query = f"insert into {table_name} ({', '.join(columns)}) values ({placeholders}) "

values = []
for row in data.to_dict('records'):
    # Rows without a numeric margin carry the 'NULL' placeholder string.
    # Only the conversion errors that placeholder can cause are caught, so
    # any other problem still surfaces instead of being silently swallowed.
    try:
        margin = int(row['margin'])
    except (TypeError, ValueError):
        margin, margin_type = None, None
    else:
        # Never send the literal string 'NULL' to the database.
        margin_type = None if row['margin_type'] == 'NULL' else row['margin_type']

    values.append((
        # int() guards against pandas handing back numpy integer scalars.
        int(row['match_num']),
        row['date'],
        row['team1'],
        row['team2'],
        int(row['id_team1']),
        int(row['id_team2']),
        row['winner'],
        margin,
        margin_type,
        row['ground'],
        int(row['series_id']),
        int(row['match_id']),
        row['url'],
    ))

insert(query, values, commit=True)

Connected...
Executing...
Done...


[]

# Create and Populate the `scorecard` Table

Create the `scorecard` table.

Get the JSON data from ESPNcricinfo's private data API (reverse-engineered using the Network tab of the browser's DevTools).

This data contains all general stats.

Scrape it for all matches.

In [9]:
table_name = 'scorecard'

# Drop any existing table of this name before it is recreated in the next cell.
q = "\n" + f"drop table if exists {table_name}" + "\n"

execute(q)

Connected...
Executing...
Done...


[]

In [10]:
# One row per match: the match number plus a JSON document.
column_defs = [
    "match_num int not null",
    "data JSON not null",
]

# Assemble the CREATE TABLE statement with one column definition per line.
q = "\ncreate table " + table_name + " (\n" + ",\n".join(column_defs) + "\n);\n"

execute(q)

Connected...
Executing...
Done...


[]

In [11]:
columns = ('match_num', 'data')

# Parameterised INSERT with the column list joined explicitly rather than
# derived from the tuple's repr.
query = f"insert into {table_name} ({', '.join(columns)}) values (%s, %s) "

scorecard_api = 'https://hs-consumer-api.espncricinfo.com/v1/pages/match/scorecard'

# Browser-like request headers sent with every API call.
# Accept-Encoding is not overridden here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.espncricinfo.com/',
    'Origin': 'https://www.espncricinfo.com',
    'DNT': '1',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'Sec-GPC': '1',
}

REQUEST_TIMEOUT_S = 30  # seconds before a stalled request is abandoned

match_keys = data[['match_num', 'series_id', 'match_id']].itertuples(index=False, name=None)
for match_num, series_id, match_id in match_keys:
    # Plain ints, in case pandas hands back numpy integer scalars.
    match_num, series_id, match_id = int(match_num), int(series_id), int(match_id)

    params_scorecard = {
        'lang': 'en',
        'seriesId': series_id,
        'matchId': match_id,
    }

    print(match_num, series_id, match_id)
    scorecard = requests.get(
        scorecard_api,
        params=params_scorecard,
        headers=headers,
        timeout=REQUEST_TIMEOUT_S,
    )
    # Stop on an HTTP error instead of storing an error body in the JSON column.
    scorecard.raise_for_status()

    # Insert match by match so rows already fetched survive a later failure.
    insert(query, [(match_num, scorecard.text)], commit=True)
    

    
    

1 1144415 1144483
Connected...
Executing...
Done...
2 1144415 1144484
Connected...
Executing...
Done...
3 1144415 1144485
Connected...
Executing...
Done...
4 1144415 1144486
Connected...
Executing...
Done...
5 1144415 1144487
Connected...
Executing...
Done...
6 1144415 1144488
Connected...
Executing...
Done...
7 1144415 1144489
Connected...
Executing...
Done...
8 1144415 1144490
Connected...
Executing...
Done...
9 1144415 1144491
Connected...
Executing...
Done...
10 1144415 1144492
Connected...
Executing...
Done...
11 1144415 1144493
Connected...
Executing...
Done...
12 1144415 1144494
Connected...
Executing...
Done...
13 1144415 1144495
Connected...
Executing...
Done...
14 1144415 1144496
Connected...
Executing...
Done...
15 1144415 1144497
Connected...
Executing...
Done...
16 1144415 1144498
Connected...
Executing...
Done...
17 1144415 1144499
Connected...
Executing...
Done...
18 1144415 1144500
Connected...
Executing...
Done...
19 1144415 1144501
Connected...
Executing...
Done...
20

# Create and Populate the `overs` Table

Create the `overs` table.

Get the JSON data from ESPNcricinfo's private data API (reverse-engineered using the Network tab of the browser's DevTools).

This data contains all the over-by-over data.

Scrape it for all matches.

In [12]:
table_name = 'overs'

# Drop any existing table of this name before it is recreated in the next cell.
q = "\n" + f"drop table if exists {table_name}" + "\n"

execute(q)

Connected...
Executing...
Done...


[]

In [13]:
# One row per match: the match number plus a JSON document.
column_defs = [
    "match_num int not null",
    "data JSON not null",
]

# Assemble the CREATE TABLE statement with one column definition per line.
q = "\ncreate table " + table_name + " (\n" + ",\n".join(column_defs) + "\n);\n"

execute(q)

Connected...
Executing...
Done...


[]

In [14]:
columns = ('match_num', 'data')

# Parameterised INSERT with the column list joined explicitly rather than
# derived from the tuple's repr.
query = f"insert into {table_name} ({', '.join(columns)}) values (%s, %s) "

overs_api = 'https://hs-consumer-api.espncricinfo.com/v1/pages/match/overs/details'

# Browser-like request headers sent with every API call.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.espncricinfo.com/',
    'Origin': 'https://www.espncricinfo.com',
    'DNT': '1',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'Sec-GPC': '1',
}

REQUEST_TIMEOUT_S = 30  # seconds before a stalled request is abandoned

match_keys = data[['match_num', 'series_id', 'match_id']].itertuples(index=False, name=None)
for match_num, series_id, match_id in match_keys:
    # Plain ints, in case pandas hands back numpy integer scalars.
    match_num, series_id, match_id = int(match_num), int(series_id), int(match_id)

    params_overs = {
        'lang': 'en',
        'seriesId': series_id,
        'matchId': match_id,
        'mode': 'ALL',
    }

    print(match_num, series_id, match_id)
    response = requests.get(
        overs_api,
        params=params_overs,
        headers=headers,
        timeout=REQUEST_TIMEOUT_S,
    )
    # Stop on an HTTP error instead of storing an error body in the JSON column.
    response.raise_for_status()

    # Insert match by match so rows already fetched survive a later failure.
    insert(query, [(match_num, response.text)], commit=True)
    
    

    
    

1 1144415 1144483
Connected...
Executing...
Done...
2 1144415 1144484
Connected...
Executing...
Done...
3 1144415 1144485
Connected...
Executing...
Done...
4 1144415 1144486
Connected...
Executing...
Done...
5 1144415 1144487
Connected...
Executing...
Done...
6 1144415 1144488
Connected...
Executing...
Done...
7 1144415 1144489
Connected...
Executing...
Done...
8 1144415 1144490
Connected...
Executing...
Done...
9 1144415 1144491
Connected...
Executing...
Done...
10 1144415 1144492
Connected...
Executing...
Done...
11 1144415 1144493
Connected...
Executing...
Done...
12 1144415 1144494
Connected...
Executing...
Done...
13 1144415 1144495
Connected...
Executing...
Done...
14 1144415 1144496
Connected...
Executing...
Done...
15 1144415 1144497
Connected...
Executing...
Done...
16 1144415 1144498
Connected...
Executing...
Done...
17 1144415 1144499
Connected...
Executing...
Done...
18 1144415 1144500
Connected...
Executing...
Done...
19 1144415 1144501
Connected...
Executing...
Done...
20