In [1]:
import numpy as np
import pandas as pd 
import requests
import json
from bs4 import BeautifulSoup, SoupStrainer

import aiohttp
import asyncio
import pickle
PICKLE_FILEPATHS = {'overview':'overview_htmls.pkl', 'player':'player_htmls.pkl'}

def get_constants():
    """Load the scraper's constants from ``constants.json``.

    Returns:
        The object decoded from ``constants.json`` in the current working
        directory.
    """
    with open('constants.json', 'r') as constants_file:
        return json.load(constants_file)

async def fetch(session, url):
    """Download ``url`` through ``session`` and return the response body as text.

    The whole request (connect and read) is limited to 30 seconds with
    ``asyncio.wait_for``. The previous ``with aiohttp.Timeout(30)`` relied on
    a helper that current aiohttp releases no longer provide.

    Args:
        session: Object whose ``get(url)`` is an async context manager yielding
            a response with an awaitable ``text()`` (e.g. ``aiohttp.ClientSession``).
        url: Address to request.

    Returns:
        The decoded response body.

    Raises:
        asyncio.TimeoutError: If the request takes longer than 30 seconds.
    """
    async def _read_body():
        async with session.get(url) as response:
            return await response.text()

    return await asyncio.wait_for(_read_body(), timeout=30)

async def fetch_all(session, urls, loop):
    """Fetch every URL in ``urls`` concurrently through ``session``.

    Args:
        session: Passed through to ``fetch`` for each URL.
        urls: Iterable of addresses to download.
        loop: Accepted for the caller's convenience; not used in this function.

    Returns:
        A list with one entry per URL, in input order. Because
        ``return_exceptions=True`` is used, a failed download appears as the
        exception object instead of aborting the whole batch.
    """
    pending = [fetch(session, target) for target in urls]
    # Failures are returned in-place so the caller can deal with them later.
    return await asyncio.gather(*pending, return_exceptions=True)

def get_htmls_from_pickle(file_key):
    """Load previously cached HTML from the pickle registered under ``file_key``.

    Args:
        file_key: Key into ``PICKLE_FILEPATHS`` selecting the cache file.

    Returns:
        Whatever object was pickled into that file.
    """
    pickle_path = PICKLE_FILEPATHS[file_key]
    # NOTE: unpickling can execute arbitrary code; only load caches you created.
    with open(pickle_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def save_htmls_to_pickle(htmls, file_key):
    """Write ``htmls`` to the pickle file registered under ``file_key``.

    Args:
        htmls: Object to cache (the callers in this notebook pass a
            ``{url: html}`` dict).
        file_key: Key into ``PICKLE_FILEPATHS`` selecting the target file.
    """
    with open(PICKLE_FILEPATHS[file_key], 'wb') as f:
        # Dump the argument itself; the global ``overview_htmls`` was pickled
        # before, so any other data (e.g. player pages) was never saved.
        pickle.dump(htmls, f)
    

def get_htmls(urls, from_file=False, file_key=None):
    """Return the HTML for ``urls``, from the pickle cache or by downloading.

    Args:
        urls: Addresses to download (ignored when ``from_file`` is true).
        from_file: When true, return the cached object for ``file_key``
            instead of touching the network.
        file_key: Key into ``PICKLE_FILEPATHS``; only used with ``from_file``.

    Returns:
        With ``from_file``: the unpickled cache. Otherwise a ``{url: result}``
        dict where ``result`` is the page text, or the exception object if
        that download failed (see ``fetch_all``).
    """
    if from_file:
        return get_htmls_from_pickle(file_key)

    loop = asyncio.get_event_loop()

    async def _download():
        # ClientSession has to be entered with ``async with``; a plain ``with``
        # is rejected by current aiohttp releases.
        async with aiohttp.ClientSession() as session:
            return await fetch_all(session, urls, loop)

    htmls = loop.run_until_complete(_download())
    return dict(zip(urls, htmls))

def get_overview_urls(n_pages=226, offset_increment=80):
    """Build the URLs of the paginated player overview listing.

    Args:
        n_pages: Number of listing pages to generate. WARNING: the default of
            226 may not be invariant, so pass the current page count if the
            listing has changed size.
        offset_increment: Step added to the ``offset`` query value per page.

    Returns:
        A list of ``n_pages`` URLs with offsets ``0, offset_increment, ...``.
    """
    base_url = "https://sofifa.com/players?offset="
    return [base_url + str(page * offset_increment) for page in range(n_pages)]

def get_player_urls(IDs):
    """Map player IDs to their detail-page URLs.

    Args:
        IDs: Iterable of player identifiers; each is converted with ``str``.

    Returns:
        A list of URLs in the same order as ``IDs``.
    """
    base_url = 'https://sofifa.com/player/'
    return [base_url + str(player_id) for player_id in IDs]

def get_player_htmls(IDs, from_file=False):
    """Return player detail pages for ``IDs``, using the 'player' cache key.

    Args:
        IDs: Iterable of player identifiers.
        from_file: Forwarded to ``get_htmls``; true reads the pickle cache.
    """
    return get_htmls(get_player_urls(IDs), from_file, file_key='player')

def get_overview_htmls(from_file=False):
    """Return the overview listing pages, using the 'overview' cache key.

    Args:
        from_file: Forwarded to ``get_htmls``; true reads the pickle cache.
    """
    return get_htmls(get_overview_urls(), from_file, file_key='overview')

In [923]:
# Load the overview pages via the pickle cache (from_file=True) and time the call.
%time overview_htmls = get_overview_htmls(from_file=True)

Wall time: 7.3 s


In [447]:
# Write the overview pages held in `overview_htmls` to the 'overview' pickle file.
save_htmls_to_pickle(overview_htmls, 'overview')

In [1073]:
def parse_single_row(overview_table_row):
    """Extract one player's overview fields from a listing table row.

    Args:
        overview_table_row: Row element exposing ``find_all``; its direct
            ``td`` children are read by position.

    Returns:
        A dict of raw (string) values keyed by field name.
    """
    cells = overview_table_row.find_all('td', recursive=False)
    # Entries are evaluated top to bottom, cell by cell.
    return {
        'photo': cells[0].find('img').get('data-src'),
        'ID': cells[0].find('img').get('id'),
        'nationality': cells[1].find('a').get('title'),
        'flag': cells[1].find('img').get('data-src'),
        'name': cells[1].find_all('a')[1].text,
        'age': cells[2].find('div').text.strip(),
        'overall': cells[3].text.strip(),
        'potential': cells[4].text.strip(),
        'club': cells[5].find('a').text,
        'club_logo': cells[5].find('img').get('data-src'),
        'value': cells[7].text,
        'wage': cells[8].text,
        'special': cells[17].text,
    }

In [1085]:
def parse_single_overview_page(html, strainer):
    """Parse one overview listing page into a list of row dicts.

    Args:
        html: Page markup.
        strainer: ``SoupStrainer`` passed as ``parse_only`` to limit parsing.

    Returns:
        One ``parse_single_row`` result per direct ``tr`` child of the
        page's ``tbody``.
    """
    page = BeautifulSoup(html, 'lxml', parse_only=strainer)
    table_rows = page.tbody.find_all('tr', recursive=False)
    return [parse_single_row(table_row) for table_row in table_rows]

In [936]:
def parse_overview_data(overview_htmls):
    """Parse every cached overview page into a single DataFrame.

    Args:
        overview_htmls: Mapping whose values are overview page markup.

    Returns:
        A DataFrame with one row per parsed table row across all pages.
    """
    tbody_only = SoupStrainer('tbody')
    records = [
        record
        for page_html in overview_htmls.values()
        for record in parse_single_overview_page(page_html, tbody_only)
    ]
    return pd.DataFrame.from_dict(records)

In [14]:
# doesn't work in notebook but should work otherwise
# import multiprocessing as mp
# num_workers = mp.cpu_count()
# pool = mp.Pool(num_workers)

In [None]:
# %%time

# def square(x):
#     return x**2

# if __name__ == '__main__':
#     pool.map(square, [1,3,5,7])

In [None]:
# %%time
# x = pool.map(parse_single_overview_page, overview_htmls)

In [932]:
def convert_currency(curr_col):
    """Convert amounts such as ``'€95.5M'``, ``'€565K'`` or ``'€0'`` to numbers.

    The first character (the currency symbol) is dropped. A trailing ``M`` or
    ``K`` scales the number by 1e6 or 1e3; an amount without a suffix is taken
    at face value. Previously every amount ending in ``0`` was forced to zero
    and the last digit of other suffix-less amounts was used as a multiplier.

    Args:
        curr_col: pandas Series of currency strings.

    Returns:
        A float Series of amounts, aligned with ``curr_col``. Missing inputs
        stay NaN.
    """
    amounts = curr_col.str[1:]
    suffix = amounts.str[-1]
    # Unknown or absent suffix means "no scaling".
    multipliers = suffix.map({'M': 1e6, 'K': 1e3}).fillna(1.0).astype(float)
    has_suffix = suffix.isin(['M', 'K'])
    # Strip the unit letter only where there is one.
    numeric_text = amounts.where(~has_suffix, amounts.str[:-1])
    return pd.to_numeric(numeric_text) * multipliers

def clean_overview_data(df):
    """Deduplicate players and replace the raw money columns with numbers.

    Args:
        df: Overview DataFrame with at least ``ID``, ``value`` and ``wage``.

    Returns:
        A new DataFrame with one row per ``ID``, numeric ``EUR_value`` and
        ``EUR_wage`` columns, and the original ``value``/``wage`` removed.
    """
    unique_players = df.drop_duplicates('ID')
    with_euro_amounts = unique_players.assign(
        EUR_value=lambda frame: convert_currency(frame['value']),
        EUR_wage=lambda frame: convert_currency(frame['wage']),
    )
    return with_euro_amounts.drop(['value', 'wage'], axis=1)

#player_personal_data = df.pipe(clean_personal_data)

In [933]:
def get_overview_data(from_file=False):
    """Fetch, parse and clean the player overview table.

    Args:
        from_file: When true, read the overview pages from the pickle cache
            instead of downloading them.

    Returns:
        The cleaned overview DataFrame (see ``clean_overview_data``).
    """
    # Forward the flag: it used to be dropped, so every call re-downloaded.
    overview_htmls = get_overview_htmls(from_file)
    return parse_overview_data(overview_htmls).pipe(clean_overview_data)

In [916]:
# Sanity check: display the last overview URL that would be requested.
get_overview_urls()[-1]

'https://sofifa.com/players?offset=18000'

In [937]:
# Build the cleaned overview DataFrame, asking for cached pages via from_file=True, and time it.
%time player_overview_data = get_overview_data(from_file=True)

Wall time: 1min 15s


In [938]:
player_overview_data

Unnamed: 0,ID,age,club,club_logo,flag,name,nationality,overall,photo,potential,special,EUR_value,EUR_wage
0,20801,32,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,https://cdn.sofifa.org/flags/38.png,Cristiano Ronaldo,Portugal,94,https://cdn.sofifa.org/48/18/players/20801.png,94,2228,95500000.0,565000.0
1,158023,30,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,https://cdn.sofifa.org/flags/52.png,L. Messi,Argentina,93,https://cdn.sofifa.org/48/18/players/158023.png,93,2158,105000000.0,565000.0
2,190871,25,Paris Saint-Germain,https://cdn.sofifa.org/24/18/teams/73.png,https://cdn.sofifa.org/flags/54.png,Neymar,Brazil,92,https://cdn.sofifa.org/48/18/players/190871.png,94,2100,123000000.0,280000.0
3,176580,30,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,https://cdn.sofifa.org/flags/60.png,L. Suárez,Uruguay,92,https://cdn.sofifa.org/48/18/players/176580.png,92,2291,97000000.0,510000.0
4,167495,31,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,https://cdn.sofifa.org/flags/21.png,M. Neuer,Germany,92,https://cdn.sofifa.org/48/18/players/167495.png,92,1493,61000000.0,230000.0
5,188545,28,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,https://cdn.sofifa.org/flags/37.png,R. Lewandowski,Poland,91,https://cdn.sofifa.org/48/18/players/188545.png,91,2146,92000000.0,355000.0
6,193080,26,Manchester United,https://cdn.sofifa.org/24/18/teams/11.png,https://cdn.sofifa.org/flags/45.png,De Gea,Spain,90,https://cdn.sofifa.org/48/18/players/193080.png,92,1458,64500000.0,215000.0
7,183277,26,Chelsea,https://cdn.sofifa.org/24/18/teams/5.png,https://cdn.sofifa.org/flags/7.png,E. Hazard,Belgium,90,https://cdn.sofifa.org/48/18/players/183277.png,91,2096,90500000.0,295000.0
8,182521,27,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,https://cdn.sofifa.org/flags/21.png,T. Kroos,Germany,90,https://cdn.sofifa.org/48/18/players/182521.png,90,2165,79000000.0,340000.0
9,167664,29,Juventus,https://cdn.sofifa.org/24/18/teams/45.png,https://cdn.sofifa.org/flags/52.png,G. Higuaín,Argentina,90,https://cdn.sofifa.org/48/18/players/167664.png,90,1961,77000000.0,275000.0


In [64]:
#player_personal_data.to_csv('Complete/PlayerPersonalData.csv', index=False)

In [823]:
def headline_attribute_from_line(line):
    """Pull a headline attribute out of a ``...XXX = NN...`` script line.

    The three characters ending one position before the first ``=`` are the
    (lower-cased) name; the two characters starting two positions after it
    are parsed as the integer value.

    Args:
        line: A single line of script text containing ``=``.

    Returns:
        ``{'name': <3-letter name>, 'value': <int>}``.
    """
    eq_pos = line.find('=')
    name_slice = slice(eq_pos - 4, eq_pos - 1)
    value_slice = slice(eq_pos + 2, eq_pos + 4)
    return {
        'name': line[name_slice].lower(),
        'value': int(line[value_slice]),
    }

In [125]:
# URL of a single player page used by the exploratory parsing cells.
player_data_url = 'https://sofifa.com/player/20801'
# The attribute-name lists below are commented out and not used by any cell.
# skill_names = ['ID', 'crossing', 'finishing', 'heading_accuracy','short_passing', 'volleys', 'dribbling', 'curve',
#                'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility',
#                'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
#                'interceptions', 'positioning', 'vision', 'penalties', 'composure', 'marking', 'standing_tackle',
#                'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
# headline_attribute_names = ['PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY']
# all_attribute_names = skill_names + headline_attribute_names
# Seed record holding only the ID that appears in player_data_url.
player_attribute_dict = {'ID': 20801}

In [495]:
# Download two sample player pages synchronously for interactive exploration.
source_code = requests.get(player_data_url)
# Presumably a goalkeeper's page, going by the variable name; TODO confirm ID 228736.
gk_source_code = requests.get('https://sofifa.com/player/228736')

In [810]:
# Parse both sample pages, restricting parsing to <section> and <script> elements.
plain_text = source_code.text
strainer = SoupStrainer(['section', 'script'])
soup = BeautifulSoup(plain_text, 'lxml', parse_only=strainer)
gk_soup = BeautifulSoup(gk_source_code.text, 'lxml', parse_only=strainer)

Wall time: 65.5 ms


In [1182]:
def _get_main_soup(soup):
    return soup.find_all('section', recursive=False)[2]

In [1187]:
def _get_main_article(main_soup):
    return main_soup.section.article

In [1181]:
def _get_col3_divs(main_article):
    col_divs = (main_article
                .find_all('div', class_='columns', recursive=False))
    col3_divs = []
    for sub_div in col_divs:
        col3_divs.extend(sub_div.find_all('div', class_='col-3', recursive=False))
    return col3_divs

In [890]:
def parse_main_attributes(col3_divs):
    """Read ``name -> value`` pairs from every attribute column.

    Args:
        col3_divs: Column divs; the last one (traits and specialities) is
            skipped.

    Returns:
        A dict keyed by the last stripped string of each list item, with the
        first stripped string as its value.
    """
    attribute_dict = {}
    attribute_columns = col3_divs[:-1]  # last one is traits and specialities
    for column in attribute_columns:
        list_items = column.div.ul.find_all('li', recursive=False)
        for item in list_items:
            texts = list(item.stripped_strings)
            name, value = texts[-1], texts[0]
            attribute_dict[name] = value
    return attribute_dict

In [826]:
def parse_headline_attributes(soup):
    """Extract headline attributes from the page's second top-level script.

    Args:
        soup: Strained page whose direct ``script`` children are inspected.

    Returns:
        A dict mapping each attribute name to its integer value, built from
        the script lines that contain ``'point'``.
    """
    script = soup.find_all('script', recursive=False)[1]
    point_lines = (text for text in script.text.split('\r\n') if 'point' in text)
    parsed = map(headline_attribute_from_line, point_lines)
    return {entry['name']: entry['value'] for entry in parsed}

In [1193]:
def parse_player_metadata(main_article):
    """Read preferred positions, birth date, height and weight.

    The stripped strings of the info ``span`` are interpreted as: first entry
    skipped, middle entries are the preferred positions, and the last entry
    is a whitespace-separated line holding birth date, height and weight.

    Args:
        main_article: Article element; the span is reached via ``div.div.div``.

    Returns:
        A dict with ``preferred_positions`` (list), ``Birth date``,
        ``Height_cm`` and ``Weight_kg`` (strings).
    """
    info_block = main_article.div.div.div
    texts = list(info_block.span.stripped_strings)
    tokens = texts[-1].split()
    birth_date = ' '.join(tokens[2:5]).replace(',', '').strip('(').strip(')')
    return {
        'preferred_positions': texts[1:-1],
        'Birth date': birth_date,
        'Height_cm': tokens[5].strip('cm'),
        'Weight_kg': tokens[-1].strip('kg'),
    }

In [1301]:
def _get_traits_and_specialities_dict(player_traits, player_specialities, all_traits, all_specialities):
    trait_dict = {trait: (trait in player_traits) for trait in all_traits}
    speciality_dict = {speciality: (speciality in specialities) for speciality in all_specialities}
    return {**trait_dict, **speciality_dict}

def parse_traits_and_specialities(col3_divs, all_traits, all_specialities):
    """Parse the last column into boolean trait/speciality flags.

    Args:
        col3_divs: Column divs; only the last one is read.
        all_traits: Every trait name to emit a flag for.
        all_specialities: Every speciality name to emit a flag for.

    Returns:
        The dict produced by ``_get_traits_and_specialities_dict``.

    Raises:
        ValueError: If the column has text but no ``ul`` list, or its single
            list is headed by something other than 'Traits'/'Specialities'.
            These cases used to fail with IndexError / UnboundLocalError.
    """
    last_div = col3_divs[-1]
    # [np.nan] marks "nothing listed"; it is also the value for an empty column.
    player_traits, player_specialities = [np.nan], [np.nan]
    if last_div.text.strip():
        uls = last_div.div.find_all('ul', recursive=False)
        if not uls:
            raise ValueError('Traits/specialities column has text but no list')
        if len(uls) == 1:
            # A single list is either traits or specialities; its heading says which.
            ul = uls[0]
            ul_h5 = ul.parent.h5.text
            if ul_h5 == 'Traits':
                player_traits = list(ul.stripped_strings)
            elif ul_h5 == 'Specialities':
                player_specialities = list(ul.stripped_strings)
            else:
                raise ValueError(
                    'Unrecognised traits/specialities heading: {!r}'.format(ul_h5))
        else:
            player_traits = list(uls[0].stripped_strings)
            player_specialities = list(uls[1].stripped_strings)
    return _get_traits_and_specialities_dict(player_traits, player_specialities,
                                             all_traits, all_specialities)

In [1203]:
def parse_player_miscellaneous_data(main_article):
    """Read the label/value list in the ``teams`` block of a player page.

    The list's stripped strings are consumed pairwise as ``label, value``.
    The ``'Work rate'`` entry is split on ``' / '`` into separate attacking
    and defensive entries.

    Args:
        main_article: Article element containing the ``div.teams`` block.

    Returns:
        A dict of the label/value pairs with ``'Work rate att'`` and
        ``'Work rate def'`` in place of ``'Work rate'``.
    """
    teams_div = main_article.div.find('div', class_='teams', recursive=False)
    info_list = teams_div.table.tr.ul
    attribute_dict = {}
    tokens = info_list.stripped_strings
    for label in tokens:
        # The same iterator is advanced here, so labels and values alternate.
        attribute_dict[label] = next(tokens)
    work_rate_parts = attribute_dict.pop('Work rate').split(' / ')
    attribute_dict.update({
        'Work rate att': work_rate_parts[0],
        'Work rate def': work_rate_parts[1],
    })
    return attribute_dict

In [1225]:
def get_position_ratings(main_soup, main_article, all_positions):
    """Return the player's overall rating for each position.

    Outfield players have a 'Real overall rating' table whose ``Position``
    cells list one or more positions sharing one ``OVA`` value. Other pages
    are treated as goalkeepers and only carry a single rating.

    Args:
        main_soup: Main section; its ``aside`` holds the ratings block.
        main_article: Article element; used for the goalkeeper rating.
        all_positions: Every position name; used to fill missing ratings
            with NaN on goalkeeper pages.

    Returns:
        A dict ``{position: rating}``. Outfield players get ``'GK': NaN``;
        goalkeepers get their rating under ``'GK'`` and NaN elsewhere.
    """
    from io import StringIO  # read_html is handed a file-like object below

    position_col_name = 'Position'
    ratings_div = main_soup.aside.find('div', class_='toast mb-20', recursive=False)
    if ratings_div.h5.text == 'Real overall rating':
        ratings_table = ratings_div.table
        position_ratings_df = (pd.read_html(StringIO(str(ratings_table)))[0]
                               [[position_col_name, 'OVA']])
        split_positions = position_ratings_df[position_col_name].str.split(expand=True)
        # Use however many columns the split produced instead of assuming three.
        position_columns = list(split_positions.columns)
        # The rating comes from the parsed table; the undefined ``p`` was used before.
        split_df = split_positions.assign(ova=position_ratings_df['OVA'])
        position_ratings_dict = (
            pd.concat([split_df[[col, 'ova']].rename(columns={col: position_col_name})
                       for col in position_columns])
            .dropna()
            .set_index(position_col_name)
            .to_dict()['ova'])
        position_ratings_dict.update({'GK': np.nan})
    else:
        gk_rating = main_article.div.find('div', class_='stats', recursive=False).td.span.text
        position_ratings_dict = {'GK': gk_rating}
        for pos in all_positions:
            # setdefault keeps the real GK rating even when 'GK' is in all_positions.
            position_ratings_dict.setdefault(pos, np.nan)
    return position_ratings_dict

In [315]:
# this isn't actually any better than hard-coding as these positions are invariant
# def get_unique_positions(position_ratings):
#     return position_ratings.keys()

In [449]:
# put in separate script
# def get_all_traits_and_specialities():
#     url = 'https://sofifa.com/players/top'
#     html = requests.get(url).text
#     strainer = SoupStrainer('form', action='/players', class_='pjax relative')
#     soup = BeautifulSoup(html, 'lxml', parse_only=strainer)
#     traits1 = list(traits_soup.find(attrs={'name':'t1[]'}).stripped_strings)
#     traits2 = list(traits_soup.find(attrs={'name':'t2[]'}).stripped_strings)
#     all_traits = [*traits1, *traits2]
#     all_specialities = list(traits_soup.find(attrs={'name':'sc[]'}).stripped_strings)
#     return {'traits':all_traits, 'specialities':all_specialities}

In [None]:
#x = get_all_traits_and_specialities()

# import json
# constants = {**x, 'positions':unique_positions}
# with open('constants.json', 'w') as f:
#     json.dump(constants, f)

def get_full_position_preferences(preferred_positions_list, all_positions):
    """Flag, for every known position, whether the player prefers it.

    Args:
        preferred_positions_list: Positions the player prefers.
        all_positions: Every position name to emit a flag for.

    Returns:
        A dict ``{'prefers_<pos>': bool}`` with one entry per position.
    """
    preferences = {}
    for position in all_positions:
        preferences['prefers_' + position] = position in preferred_positions_list
    return preferences

def parse_single_player_page(html, strainer, constants):
    """Parse one player detail page into a flat record.

    Args:
        html: Page markup.
        strainer: ``SoupStrainer`` passed as ``parse_only``.
        constants: Dict with ``'traits'``, ``'specialities'`` and
            ``'positions'`` lists.

    Returns:
        A dict merging main attributes, headline attributes, metadata,
        trait/speciality flags, miscellaneous data, position ratings and
        position preferences. Later groups win on duplicate keys.
    """
    soup = BeautifulSoup(html, 'lxml', parse_only=strainer)
    all_traits, all_specialities, all_positions = (
        constants['traits'], constants['specialities'], constants['positions'])

    main_soup = _get_main_soup(soup)
    main_article = _get_main_article(main_soup)
    col3_divs = _get_col3_divs(main_article)

    record = {}
    record.update(parse_main_attributes(col3_divs))
    record.update(parse_headline_attributes(soup))

    metadata = parse_player_metadata(main_article)
    # The raw list is only needed to derive the prefers_* flags below.
    preferred_positions = metadata.pop('preferred_positions')
    record.update(metadata)

    record.update(parse_traits_and_specialities(col3_divs, all_traits, all_specialities))
    record.update(parse_player_miscellaneous_data(main_article))
    record.update(get_position_ratings(main_soup, main_article, all_positions))
    record.update(get_full_position_preferences(preferred_positions, all_positions))
    return record

def id_from_url(url):
    """Return the text after the last ``/`` in ``url`` (the whole string if none)."""
    _, _, last_segment = url.rpartition('/')
    return last_segment

def parse_player_detailed_data(player_htmls, constants):
    """Parse every player page into one DataFrame row.

    Args:
        player_htmls: Mapping of player page URL to page markup.
        constants: Passed through to ``parse_single_player_page``.

    Returns:
        A DataFrame with one row per page and an ``ID`` column taken from the
        last path segment of each URL.
    """
    section_and_script_only = SoupStrainer(['section', 'script'])
    records = [
        {**parse_single_player_page(page_html, section_and_script_only, constants),
         'ID': id_from_url(page_url)}
        for page_url, page_html in player_htmls.items()
    ]
    return pd.DataFrame.from_dict(records)

In [834]:
def get_player_detailed_data(IDs, from_file=False):
    """Fetch and parse the detail pages for the given players.

    Args:
        IDs: Iterable of player identifiers.
        from_file: When true, read the pages from the pickle cache instead of
            downloading them.

    Returns:
        The DataFrame built by ``parse_player_detailed_data``.
    """
    constants = get_constants()
    player_htmls = get_player_htmls(IDs, from_file)
    # Parse what was just fetched; the undefined name ``player_soups`` was used before.
    return parse_player_detailed_data(player_htmls, constants)

In [1298]:
# Draw 50 player IDs from the overview table (no random_state is passed, so
# the sample changes between runs) and download their detail pages.
test_IDs = player_overview_data['ID'].sample(50)
player_htmls = get_player_htmls(test_IDs)

In [1320]:
# Load the constants in this cell so it does not depend on a `constants`
# variable left over in the kernel from a cell that no longer exists.
constants = get_constants()
x = parse_player_detailed_data(player_htmls, constants)

In [528]:
def main(from_file=False):
    """Build the full player table by joining overview and detailed data.

    Args:
        from_file: Forwarded to both data loaders; true requests the pickle
            caches instead of downloading.

    Returns:
        The overview DataFrame merged with the detailed DataFrame on ``ID``.
    """
    overview = get_overview_data(from_file)
    detailed = get_player_detailed_data(overview['ID'], from_file)
    return overview.merge(detailed, on='ID')

In [179]:
# One million identical '1cm' strings: input for the strip/astype timing cells.
heights = ['1cm' for i in range(1000000)]

In [201]:
%%time
# Variant 1: strip the unit with the Series.str accessor, then cast to int.
s = pd.Series(heights)
s2 = s.str.strip('cm').astype('int')

Wall time: 694 ms


In [203]:
%%time
# Variant 2: strip each string in a list comprehension, then build the Series and cast.
heights2 = [item.strip('cm') for item in heights]
s2 = pd.Series(heights2).astype('int')

Wall time: 453 ms


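The `.strip` method is faster when applied to individual strings in a list comprehension than through the pandas `Series.str` accessor (453 ms vs. 694 ms above, both timings including the `int` conversion), presumably because the `.str` methods still loop over Python string objects and add per-element overhead. Type conversion from str to int is still faster with a Series.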