In [1]:
import threading
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [27]:
# Load the cricsheet people register and, in the same chain, discard every
# third-party key column except the two cricinfo ones.
df = (
    pd.read_csv('../data/raw/cricsheet/people.csv')
    .drop(columns=[
        'key_bcci', 'key_bcci_2',
        'key_bigbash', 'key_cricbuzz', 'key_cricheroes', 'key_crichq',
        'key_cricingif', 'key_cricketarchive', 'key_cricketarchive_2',
        'key_cricketworld', 'key_nvplay', 'key_nvplay_2',
        'key_opta', 'key_opta_2', 'key_pulse', 'key_pulse_2',
    ])
)

df

Unnamed: 0,identifier,name,unique_name,key_cricinfo,key_cricinfo_2
0,b4a23876,AAA Amsterdam,AAA Amsterdam,772407.0,
1,482762af,AA Adeoye,AA Adeoye,380588.0,
2,4b0e3049,AA Alleyne,AA Alleyne,661441.0,
3,fd3c5c00,AAA Patel,AAA Patel,1163136.0,
4,bc005f0d,AAA White,AAA White,474307.0,
...,...,...,...,...,...
15867,ee9bdbc8,Zulqarnain Haider,Zulqarnain Haider,43860.0,
15868,2d46e8ed,Zulqarnain Haider,Zulqarnain Haider (2),1046641.0,
15869,b71670a0,Zulufat Mutoniwase,Zulufat Mutoniwase,1380564.0,
15870,d35c1fc7,Zumika Azmi,Zumika Azmi,1050037.0,


In [28]:
# Report whether key_cricinfo is unique (evaluated before missing keys are replaced)
print(df['key_cricinfo'].is_unique)

# For both cricinfo key columns: replace NaN with the sentinel -1, then store as integers
for key_column in ('key_cricinfo', 'key_cricinfo_2'):
    df[key_column] = df[key_column].fillna(-1).astype(int)

False


In [5]:
# URL template used to scrape a player's page from cricinfo; the `{}` placeholder is
# filled in with the player's cricinfo key via str.format:
# https://www.espncricinfo.com/ci/content/player/{}.html
url = 'https://www.espncricinfo.com/ci/content/player/{}.html'

def get_image_url(soup, id):
    """Return the URL of the first uploaded image found on a parsed player page.

    Args:
        soup: Parsed page exposing ``find_all('img')`` (e.g. a BeautifulSoup document).
        id: Player key; only used in the message printed when no image is found.

    Returns:
        The ``src`` of the first ``<img>`` whose ``src`` contains ``'upload'``. When no
        such image exists, a message is printed and the transparent placeholder image
        URL is returned instead.
    """
    placeholder = 'https://wassets.hscicdn.com/static/images/lazyimage-transparent.png'
    for image in soup.find_all('img'):
        # .get() rather than ['src']: an <img> without a src attribute must simply be
        # skipped. Indexing raised KeyError, which aborted the whole search and made
        # the function fall back to the placeholder even if a later image qualified.
        src = image.get('src') or ''
        if 'upload' in src:
            return src
    print('No image found for this player {}'.format(id))
    return placeholder

def save_image(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the image to download. ``None`` makes the call a no-op.
        filename: Destination path; it is opened in binary write mode.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        None. Nothing is written when ``url`` is ``None`` or when the server answers
        with an error status.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (connection
            failure or timeout).
    """
    if url is None:
        return
    # Without a timeout a stalled connection would block the calling thread indefinitely.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Do not store an HTTP error page under an image file name.
        print('Could not download {} (HTTP {})'.format(url, response.status_code))
        return
    with open(filename, 'wb') as file:
        file.write(response.content)

In [None]:
def process_row(row):
    """Download the profile picture for one player row into the ``images`` folder.

    Args:
        row: Mapping with the keys ``'key_cricinfo'``, ``'identifier'`` and ``'name'``.
            The file is saved as ``images/<identifier>-<name>.jpg``.

    Rows whose ``key_cricinfo`` is ``-1`` have no player page, so the placeholder
    image is saved for them and nothing is scraped.
    """
    filename = 'images/{}-{}.jpg'.format(row['identifier'], row['name'])
    if row['key_cricinfo'] == -1:
        save_image('https://wassets.hscicdn.com/static/images/lazyimage-transparent.png', filename)
        # Stop here: previously execution fell through, requested the page for key -1
        # and overwrote the file that had just been written.
        return
    response = requests.get(url.format(row['key_cricinfo']))
    soup = BeautifulSoup(response.text, 'html.parser')
    img = get_image_url(soup, row['key_cricinfo'])
    save_image(img, filename)

In [None]:
# Download the images on four worker threads, starting at row 1000
# (presumably the earlier rows were handled in a previous run — TODO confirm).
# The progress-bar total is the number of rows actually submitted; using
# df.shape[0] overstated it by the skipped rows, so the bar could never complete.
rows_to_process = [row for _, row in df.iloc[1000:].iterrows()]
with ThreadPoolExecutor(max_workers=4) as executor:
    list(tqdm.tqdm(executor.map(process_row, rows_to_process), total=len(rows_to_process)))

In [None]:
# Show the row for Rohit Sharma; the lookup matches the `name` column against 'RG Sharma'
print(df.loc[df['name'].eq('RG Sharma')])

In [None]:
# Save a local copy of the transparent placeholder image as image.jpg; the comparison
# cell further down checks each file in the images folder against this file.
save_image('https://wassets.hscicdn.com/static/images/lazyimage-transparent.png', 'image.jpg')

In [None]:
# Calculate the percentage of images in the folder that are identical to image.jpg
# (the original note ended with an unexplained "8256" — possibly a count from an earlier run)
import os
import cv2
import numpy as np

def compare_images(image1, image2):
    """Tell whether two image files decode to identical pixel arrays.

    Args:
        image1: Path of the first image file.
        image2: Path of the second image file.

    Returns:
        bool: ``True`` when ``cv2.imread`` returns an array for both paths and
        ``np.array_equal`` reports the two arrays equal; ``False`` otherwise,
        including when ``cv2.imread`` returns ``None`` for either path.
    """
    img1 = cv2.imread(image1)
    img2 = cv2.imread(image2)
    if img1 is None or img2 is None:
        # False instead of 0, so that every code path returns a bool.
        return False
    return np.array_equal(img1, img2)

# Count how many files in the images folder are identical to the reference image.jpg
images = os.listdir('images')
print(len(images))
count = 0
for image in tqdm.tqdm(images):
    if compare_images('image.jpg', os.path.join('images', image)):
        # remove the image
        # os.remove('images/{}'.format(image))
        count += 1
# An empty folder would make the division raise ZeroDivisionError; report 0.0 instead.
percentage = count / len(images) * 100 if images else 0.0
print(percentage, count, len(images))

In [None]:
# Player page URL template; `{}` is replaced with a cricinfo key via str.format.
# (Same value as the `url` assigned earlier in this notebook.)
url = 'https://www.espncricinfo.com/ci/content/player/{}.html'

# Disabled scratch snippet: fetches the page of the player named 'RG Sharma' and
# collects its <p> tags, the input that get_data expects.
# id = df[df['name'] == 'RG Sharma']['key_cricinfo'].values[0]
# soup = BeautifulSoup(requests.get(url.format(id)).text, 'html.parser')
# # get all paragraphs
# paragraphs = soup.find_all('p')

# Used to read the full name, batting style, bowling style and playing role
def get_data(paragraphs, data, id):
    """Return the text of the paragraph that follows the first one mentioning ``data``.

    Args:
        paragraphs: Sequence of objects with a ``.text`` attribute.
        data: Label to search for; matched as a substring of each paragraph's text.
        id: Player key. Accepted for the caller's convenience; not read by the lookup.

    Returns:
        The ``.text`` of the paragraph right after the first match, or ``None`` when
        no paragraph contains the label or the match is the last paragraph.
    """
    for position, paragraph in enumerate(paragraphs):
        if data in paragraph.text:
            value_position = position + 1
            if value_position < len(paragraphs):
                return paragraphs[value_position].text
            # The label is the final paragraph, so there is no value to read.
            return None
    return None

In [5]:
# Collect full name, batting style, bowling style and playing role for each player,
# together with the identifier column, as input for a new dataframe.
data = []        # one [identifier, full_name, batting_style, bowling_style, playing_role] list per row
a = 0            # number of rows processed so far
not_found = 0    # number of rows without a cricinfo key (key_cricinfo == -1)
# process_row_data is run from several threads; the lock keeps the counters and the
# progress check consistent with each other.
_progress_lock = threading.Lock()

def process_row_data(row):
    """Scrape one player's details and append them to the module-level ``data`` list.

    Args:
        row: Mapping with the keys ``'key_cricinfo'`` and ``'identifier'``.

    Rows whose ``key_cricinfo`` is ``-1`` are recorded with ``'Not Found'`` in every
    detail field and counted in ``not_found``. For all other rows the player page is
    downloaded and each field is read with ``get_data`` (``None`` when absent).
    A progress line is printed each time the processed-row counter reaches a
    multiple of 100, whichever kind of row caused it.
    """
    global a, not_found
    id = row['key_cricinfo']
    missing_key = id == -1
    if missing_key:
        record = [row['identifier'], 'Not Found', 'Not Found', 'Not Found', 'Not Found']
    else:
        soup = BeautifulSoup(requests.get(url.format(id)).text, 'html.parser')
        paragraphs = soup.find_all('p')
        full_name = get_data(paragraphs, 'Full Name', id)
        batting_style = get_data(paragraphs, 'Batting Style', id)
        bowling_style = get_data(paragraphs, 'Bowling Style', id)
        playing_role = get_data(paragraphs, 'Playing Role', id)
        record = [row['identifier'], full_name, batting_style, bowling_style, playing_role]
    with _progress_lock:
        data.append(record)
        a += 1
        if missing_key:
            not_found += 1
        # Snapshot the counter under the lock so the check below sees this call's value.
        processed = a
    if processed % 100 == 0:
        print("Processed {} rows".format(processed))

In [None]:
# Run process_row_data for every row of df on eight worker threads
player_rows = [record for _index, record in df.iterrows()]
with ThreadPoolExecutor(max_workers=8) as executor:
    pending_results = executor.map(process_row_data, player_rows)
    # Draining the iterator waits for every task to finish.
    list(pending_results)

Processed 100 rows
Processed 200 rows
Processed 300 rows
Processed 400 rows
Processed 600 rows
Processed 700 rows
Processed 800 rows
Processed 900 rows
Processed 1000 rows
Processed 1100 rows
Processed 1200 rows
Processed 1300 rows
Processed 1400 rows
Processed 1500 rows
Processed 1600 rows
Processed 1700 rows
Processed 1800 rows
Processed 1900 rows
Processed 2000 rows
Processed 2100 rows
Processed 2200 rows
Processed 2300 rows
Processed 2400 rows
Processed 2500 rows
Processed 2600 rows
Processed 2700 rows
Processed 2800 rows
Processed 2900 rows
Processed 3000 rows
Processed 3100 rows
Processed 3200 rows
Processed 3300 rows
Processed 3400 rows
Processed 3500 rows
Processed 3600 rows
Processed 3700 rows
Processed 3800 rows
Processed 3900 rows
Processed 4000 rows
Processed 4100 rows
Processed 4200 rows
Processed 4300 rows
Processed 4400 rows
Processed 4500 rows
Processed 4600 rows
Processed 4700 rows
Processed 4800 rows
Processed 4900 rows
Processed 5000 rows
Processed 5100 rows
Processe

KeyboardInterrupt: 

In [24]:
# Compare the number of collected records with the number of rows in df
print(len(data),df.shape[0])
# Drop duplicate records: put each record into a set as a tuple, then turn the
# remaining tuples back into lists
unique_records = {tuple(record) for record in data}
s = list(map(list, unique_records))
len(s)

15872 15872


15872

In [None]:
# Turn the collected records into a dataframe, write it to CSV and display it
player_columns = ['identifier', 'full_name', 'batting_style', 'bowling_style', 'playing_role']
df1 = pd.DataFrame(data=data, columns=player_columns)
df1.to_csv('../data/raw/additional_data/player_data.csv', index=False)
df1

Unnamed: 0,identifier,full_name,batting_style,bowling_style,playing_role
0,bc005f0d,Aneka Akeilia Aresha White,Right hand Bat,Right arm Medium,
1,5c67c7a6,Azam Ali Baig,,,
2,6a58e069,Arun Bamal,Left hand Bat,Slow Left arm Orthodox,Bowling Allrounder
3,b4a23876,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter
4,482762af,Adewale A Adeoye,Right hand Bat,Right arm Medium,
...,...,...,...,...,...
15867,ee9bdbc8,Zulqarnain Haider,Right hand Bat,,Wicketkeeper Batter
15868,d35c1fc7,Zumika Azmi,Right hand Bat,Right arm Offbreak,
15869,78a3ae4b,Zach Xavier Meikle Vukusic,Right hand Bat,Right arm Medium,
15870,2b203501,Akhil Arjunan Saraladevi,Right hand Bat,Legbreak,


In [26]:
# Print and collect every identifier from df that has no record in df1.
# The df1 identifiers are put in a set first: testing membership against
# df1['identifier'].values scanned the whole array once per identifier.
known_identifiers = set(df1['identifier'])
l = []
for i in df['identifier']:
    if i not in known_identifiers:
        print(i)
        l.append(i)

In [23]:
# Re-run the scrape for the first identifier that is missing from df1.
# `l` is empty when nothing is missing, in which case l[0] would raise IndexError.
if l:
    process_row_data(df[df['identifier'] == l[0]].iloc[0])
else:
    print('No missing identifiers to re-process')

In [43]:
# Identifiers of the df1 rows that have no full name.
# isna() matches both None and NaN; the previous `== None` test only matched None,
# so it would find nothing once the missing values are represented as NaN.
identifiers_with_no_full_name = df1.loc[df1['full_name'].isna(), 'identifier'].tolist()

# Records gathered through the secondary cricinfo key, one list per processed row
data_2 = []

def process_row_data_2(row):
    """Scrape one player's details via ``key_cricinfo_2`` and append them to ``data_2``.

    Args:
        row: Mapping with the keys ``'key_cricinfo_2'``, ``'key_cricinfo'`` and
            ``'identifier'``.

    When ``key_cricinfo_2`` is ``-1`` a message is printed and a record with ``None``
    in every detail field is appended. Otherwise the player page is downloaded and
    each field is read with ``get_data``.
    """
    secondary_key = row['key_cricinfo_2']
    if secondary_key == -1:
        print('No data found for this player {} for key_cricinfo_2 {} key_cricinfo {}'.format(row['identifier'], secondary_key, row['key_cricinfo']))
        data_2.append([row['identifier'], None, None, None, None])
        return
    page = requests.get(url.format(secondary_key))
    paragraphs = BeautifulSoup(page.text, 'html.parser').find_all('p')
    # Same field order as the columns of the player dataframe
    labels = ('Full Name', 'Batting Style', 'Bowling Style', 'Playing Role')
    details = [get_data(paragraphs, label, secondary_key) for label in labels]
    data_2.append([row['identifier']] + details)

In [44]:
# Retry every player that has no full name, this time through the secondary key
needs_retry = df['identifier'].isin(identifiers_with_no_full_name)
for _, row in df[needs_retry].iterrows():
    process_row_data_2(row)

No data found for this player 4095f16b for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player 2a5ca58b for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player 7be2bafa for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player f315f713 for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player 56d0ab23 for key_cricinfo_2 -1 key_cricinfo 1156633
No data found for this player 2475b98f for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player c65e0275 for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player 7725cb8a for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player b77d2322 for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player 543b2173 for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player 75a667c0 for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player 6eb27d55 for key_cricinfo_2 -1 key_cricinfo -1
No data found for this player d0a98a96 for key_cricinfo_2 -1 key_cricin

In [None]:
# Replace the records in data with the data_2 records that share their identifier,
# then rebuild and save df1.
# A dict keyed by identifier replaces the nested loop over both lists; for a repeated
# identifier the last data_2 record wins, as it did before.
replacements = {record[0]: record for record in data_2}
# data_og is the same list object as data, so data itself is updated in place.
data_og = data
for position, record in enumerate(data_og):
    if record[0] in replacements:
        data_og[position] = replacements[record[0]]

df1 = pd.DataFrame(data_og, columns=['identifier', 'full_name', 'batting_style', 'bowling_style', 'playing_role'])
df1.to_csv('../data/raw/additional_data/player_data.csv', index=False)
df1

Unnamed: 0,identifier,full_name,batting_style,bowling_style,playing_role
0,bc005f0d,Aneka Akeilia Aresha White,Right hand Bat,Right arm Medium,
1,5c67c7a6,Azam Ali Baig,,,
2,6a58e069,Arun Bamal,Left hand Bat,Slow Left arm Orthodox,Bowling Allrounder
3,b4a23876,Alex Adrian Anthony Amsterdam,Left hand Bat,Right arm Offbreak,Top order Batter
4,482762af,Adewale A Adeoye,Right hand Bat,Right arm Medium,
...,...,...,...,...,...
15867,ee9bdbc8,Zulqarnain Haider,Right hand Bat,,Wicketkeeper Batter
15868,d35c1fc7,Zumika Azmi,Right hand Bat,Right arm Offbreak,
15869,78a3ae4b,Zach Xavier Meikle Vukusic,Right hand Bat,Right arm Medium,
15870,2b203501,Akhil Arjunan Saraladevi,Right hand Bat,Legbreak,
