In [1]:
import numpy as np
import pandas as pd 
import requests
import json
from bs4 import BeautifulSoup, SoupStrainer

In [813]:
import aiohttp
import asyncio
import pickle
PICKLE_FILEPATHS = {'overview':'overview_htmls.pkl', 'player':'player_htmls.pkl'}

def get_almost_constants():
    with open('almost_constants.json', 'r') as f:
        almost_constants = json.load(f)
    return almost_constants

async def fetch(session, url):
    with aiohttp.Timeout(30):
        async with session.get(url) as response:
            return await response.text()

async def fetch_all(session, urls, loop):
    results = await asyncio.gather(
        *[fetch(session, url) for url in urls],
        return_exceptions=True  # so we can deal with exceptions later
    )

    return results

def get_htmls_from_pickle(file_key):
    with open(PICKLE_FILEPATHS[file_key], 'rb') as f:
        htmls = pickle.load(f)
    return htmls

def save_htmls_to_pickle(htmls, file_key):
    with open(PICKLE_FILEPATHS[file_key], 'wb') as f:
        pickle.dump(overview_htmls, f)
    

def get_htmls(urls, from_file=False, file_key=None):
    if from_file:
        return get_htmls_from_pickle(file_key)
    else:
        loop = asyncio.get_event_loop()
        with aiohttp.ClientSession(loop=loop) as session:
            htmls = loop.run_until_complete(fetch_all(session, urls, loop))
    return dict(zip(urls, htmls))

In [814]:
def get_overview_urls():
    urls = []
    base_url = "https://sofifa.com/players?offset="
    offset_increment = 80
    for i in range(226): # WARNING: this may not be invariant
        url = base_url + str(i * offset_increment)
        urls.append(url)
    return urls

In [815]:
def get_player_urls(IDs):
    urls = []
    base_url = 'https://sofifa.com/player/'
    for ID in IDs:
        url = base_url + str(ID)
        urls.append(url)
    return urls

In [816]:
def get_player_htmls(IDs, from_file=False):
    urls = get_player_urls(IDs)
    return get_htmls(urls, from_file, file_key='player')

def id_from_url(url):
    return url.split('/')[-1]

In [817]:
def get_overview_htmls(from_file=False):
    urls = get_overview_urls()
    return get_htmls(urls, from_file, file_key='overview')

In [444]:
%time overview_htmls = get_htmls(get_overview_urls())

Wall time: 17.4 s


In [447]:
save_htmls_to_pickle(overview_htmls, 'overview')

In [818]:
def parse_single_row(overview_table_row):
    
    record_dict = {}
    td = overview_table_row.find_all('td')
    record_dict['photo'] = td[0].find('img').get('data-src')
    record_dict['ID'] = td[0].find('img').get('id')
    record_dict['nationality'] = td[1].find('a').get('title')
    record_dict['flag'] = td[1].find('img').get('data-src')
    record_dict['name'] = td[1].find_all('a')[1].text
    record_dict['age'] = td[2].find('div').text.strip()
    record_dict['overall'] = td[3].text.strip()
    record_dict['potential'] = td[4].text.strip()
    record_dict['club'] = td[5].find('a').text
    record_dict['club_logo'] = td[5].find('img').get('data-src')
    record_dict['value'] = td[7].text
    record_dict['wage'] = td[8].text
    record_dict['special'] = td[17].text
    
    return record_dict

In [819]:
def parse_single_overview_page(html, strainer):
    soup = BeautifulSoup(html, 'lxml', parse_only=strainer)
    row_dicts = []
    for row in soup.find_all('tr'):
        row_dicts.append(parse_single_row(row))
    return row_dicts

In [820]:
def parse_overview_data(overview_htmls):
    strainer = SoupStrainer('tbody')
    data = []
    for html in overview_htmls:
        row_dicts = parse_single_overview_page(html, strainer)
        data.extend(row_dicts)
    return pd.DataFrame.from_dict(data)

In [14]:
# doesn't work in notebook but should work otherwise
# import multiprocessing as mp
# num_workers = mp.cpu_count()
# pool = mp.Pool(num_workers)

In [None]:
# %%time

# def square(x):
#     return x**2

# if __name__ == '__main__':
#     pool.map(square, [1,3,5,7])

In [None]:
# %%time
# x = pool.map(parse_single_overview_page, overview_htmls)

In [821]:
def convert_currency(curr_col):
    without_euro_symbol = curr_col.str[1:]
    unit_symbol = without_euro_symbol.str[-1]
    numeric_part = np.where(unit_symbol == '0', 0, without_euro_symbol.str[:-1].pipe(pd.to_numeric))
    multipliers = unit_symbol.replace({'M':1e6, 'K':1e3}).pipe(pd.to_numeric)
    return numeric_part * multipliers

def clean_overview_data(df):
    return (df.assign(EUR_value = lambda df: df['value'].pipe(convert_currency), 
                                EUR_wage = lambda df: df['wage'].pipe(convert_currency))
            .drop(['value', 'wage'], axis=1))

#player_personal_data = df.pipe(clean_personal_data)

In [822]:
def get_overview_data(from_file=False):
    overview_htmls = get_overview_htmls()
    return parse_overview_data(overview_htmls).pipe(clean_overview_data)

In [437]:
player_overview_data = get_overview_data(from_file=True)

In [64]:
#player_personal_data.to_csv('Complete/PlayerPersonalData.csv', index=False)

In [823]:
def headline_attribute_from_line(line):
    equals_sign_loc = line.find('=')
    attribute_name = line[equals_sign_loc - 4: equals_sign_loc - 1].lower()
    attribute_value = int(line[equals_sign_loc+2:equals_sign_loc+4])
    return {'name':attribute_name, 'value':attribute_value}

In [22]:
# def standardise_spelling(player_attribute_name):
#     return player_attribute_name.lower().replace(' ', '_')

In [377]:
def standardise_spelling(x):
    return x

In [125]:
player_data_url = 'https://sofifa.com/player/20801'
# skill_names = ['ID', 'crossing', 'finishing', 'heading_accuracy','short_passing', 'volleys', 'dribbling', 'curve',
#                'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility',
#                'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
#                'interceptions', 'positioning', 'vision', 'penalties', 'composure', 'marking', 'standing_tackle',
#                'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
# headline_attribute_names = ['PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY']
# all_attribute_names = skill_names + headline_attribute_names
player_attribute_dict = {'ID': 20801}

In [495]:
source_code = requests.get(player_data_url)
gk_source_code = requests.get('https://sofifa.com/player/228736')

In [810]:
plain_text = source_code.text
strainer = SoupStrainer(['section', 'script'])
soup = BeautifulSoup(plain_text, 'lxml', parse_only=strainer)
gk_soup = BeautifulSoup(gk_source_code.text, 'lxml', parse_only=strainer)

Wall time: 65.5 ms


In [824]:
def _parse_col3_divs(soup):
    col_divs = (soup.find('section', class_='container', recursive=False)
                .section
                .article
                .find_all('div', class_='columns', recursive=False))
    col3_divs = []
    for sub_div in col_divs:
        col3_divs.extend(sub_div.find_all('div', class_='col-3', recursive=False))
    return col3_divs

In [889]:
list(messi_col3_divs[4].div.ul.find_all('li', recursive=False)[4].stripped_strings)

['78', '+4', 'Penalties']

In [890]:
def parse_main_attributes(col3_divs):
    attribute_dict = {}
    for sub_div in col3_divs[:-1]: # last one is traits and specialities
        for li in sub_div.div.ul.find_all('li', recursive=False):
            stripped_strings = list(li.stripped_strings)
            attribute_name = stripped_strings[-1]
            attribute_value = stripped_strings[0]
            attribute_dict[attribute_name] = attribute_value
    return attribute_dict

In [788]:
col3_divs = _parse_col3_divs(soup)

In [866]:
%time x = parse_main_attributes(col3_divs)

Wall time: 499 µs


headline attributes like PHY: seems to be related to Ultimate Team

In [826]:
def parse_headline_attributes(soup):
    attribute_dict = {}
    headline_attribute_script = soup.find_all('script')[1]
    for line in headline_attribute_script.text.split('\r\n'):
        if 'point' in line:
            attr_subdict = headline_attribute_from_line(line)
            attribute_dict[attr_subdict['name']] = attr_subdict['value']
    return attribute_dict

meta section at top of player page

In [827]:
def parse_player_metadata(soup):
    
    attribute_dict = {}
    player_info_html = soup.find('div', class_='meta').find('span')
    # nationality, age and flag were found in player overview
    attribute_dict['preferred_positions'] = [span.text for span in player_info_html.find_all('span')]
    age_height_weight = player_info_html.contents[-1].split()
    attribute_dict['Birth date'] = ' '.join(age_height_weight[2:5]).replace(',', '').strip('(').strip(')')
    attribute_dict['Height_cm'] = age_height_weight[5].strip('cm')
    attribute_dict['Weight_kg'] = age_height_weight[-1].strip('kg')
    
    return attribute_dict

In [None]:
page_uls = soup.find_all('ul', class_='pl')

In [858]:
def _standardise_ul(ul):
    return list(standardise_spelling(item) for item in ul.stripped_strings)

def _get_traits_and_specialities_dict(player_traits, player_specialities, all_traits, all_specialities):
    trait_dict = {trait: (trait in player_traits) for trait in all_traits}
    speciality_dict = {speciality: (speciality in specialities) for speciality in all_specialities}
    return {**trait_dict, **speciality_dict}

def parse_traits_and_specialities(col3_divs, all_traits, all_specialities):
    uls = col3_divs[-1].div.find_all('ul', recursive=False)
    player_traits = _standardise_ul(uls[0])
    player_specialities = _standardise_ul(page_uls[1])
    result = _get_traits_and_specialities_dict(player_traits, player_specialities, all_traits, all_specialities)
    return result

In [829]:
def parse_player_miscellaneous_data(page_uls):
    data = page_uls[0]
    attribute_dict = {}
    strings = data.stripped_strings
    for key in strings:
        attribute_dict[standardise_spelling(key)] = next(strings)
    work_rates = attribute_dict.pop('Work rate').split(' / ')
    attribute_dict['Work rate att'] = work_rates[0]
    attribute_dict['work rate def'] = work_rates[1]
    return attribute_dict

In [830]:
def get_position_ratings(soup, all_positions):
    position_col_name = 'Position'
    ratings_table = soup.find('table', class_='table table-hover')
    if ratings_table:
        position_ratings_df = (pd.read_html(str(ratings_table))[0][[position_col_name, 'OVA']]
                        .rename(columns=standardise_spelling))
        split_df = (position_ratings_df[position_col_name]
                    .str.split(expand=True)
                    .assign(ova=p['ova']))
        position_ratings_dict = (pd.concat(split_df[[i, 'ova']].rename(columns={i:position_col_name}) for i in range(3))
                                 .dropna()
                                 .set_index(position_col_name)
                                 .to_dict()['ova'])
        position_ratings_dict.update({'GK':np.nan})
    else:
        gk_rating = soup.find('div', class_='stats').td.span.text
        position_ratings_dict = {'GK':gk_rating, **{pos:np.nan for pos in all_positions}}
    return position_ratings_dict

In [315]:
# this isn't actually any better than hard-coding as these positions are invariant
# def get_unique_positions(position_ratings):
#     return position_ratings.keys()

In [449]:
# put in separate script
# def get_all_traits_and_specialities():
#     url = 'https://sofifa.com/players/top'
#     html = requests.get(url).text
#     strainer = SoupStrainer('form', action='/players', class_='pjax relative')
#     soup = BeautifulSoup(html, 'lxml', parse_only=strainer)
#     traits1 = list(standardise_spelling(item) for item in traits_soup.find(attrs={'name':'t1[]'}).stripped_strings)
#     traits2 = list(standardise_spelling(item) for item in traits_soup.find(attrs={'name':'t2[]'}).stripped_strings)
#     all_traits = [*traits1, *traits2]
#     all_specialities = list(standardise_spelling(item) for item in traits_soup.find(attrs={'name':'sc[]'}).stripped_strings)
#     return {'traits':all_traits, 'specialities':all_specialities}

In [450]:
#x = get_all_traits_and_specialities()

# import json
# almost_constants = {**x, 'positions':unique_positions}
# with open('almost_constants.json', 'w') as f:
#     json.dump(almost_constants, f)

In [831]:
def get_full_position_preferences(preferred_positions_list, all_positions):
    return {'prefers_' + pos: (pos in preferred_positions_list) for pos in all_positions}

In [860]:
def parse_single_player_page(html, strainer, almost_constants):
    
    soup = BeautifulSoup(html, 'lxml', parse_only=strainer)
    all_traits = almost_constants['traits']
    all_specialities = almost_constants['specialities']
    all_positions = almost_constants['positions']
    
    col3_divs = _parse_col3_divs(soup)
    main_attributes = parse_main_attributes(col3_divs)
    headline_attributes = parse_headline_attributes(soup)
    metadata = parse_player_metadata(soup)
    _preferred_positions = metadata.pop('preferred_positions')
    _page_uls = soup.find_all('ul', class_='pl')
    traits_and_specialities = parse_traits_and_specialities(col3_divs, all_traits, all_specialities)
    miscellaneous_data = parse_player_miscellaneous_data(_page_uls)
    position_ratings = get_position_ratings(soup, all_positions)
    position_preferences = get_full_position_preferences(_preferred_positions, all_positions)
    return {**main_attributes, **headline_attributes, **metadata, 
            **traits_and_specialities, **miscellaneous_data, **position_ratings,
           **position_preferences}

In [833]:
def parse_player_detailed_data(player_htmls, almost_constants):
    strainer = SoupStrainer(['section', 'script'])
    data = []
    for player_id, html in player_htmls.items():
        row_dict = parse_single_player_page(html, strainer, almost_constants)
        row_dict['ID'] = id_from_url(player_id)
        data.append(row_dict)
    return pd.DataFrame.from_dict(data)

In [834]:
def get_player_detailed_data(IDs, from_file=False):
    almost_constants = get_almost_constants()
    player_htmls = get_player_htmls(IDs, from_file)
    return parse_player_detailed_data(player_soups, almost_constants)

In [549]:
%prun -s "cumulative" z = parse_single_player_page(soup, almost_constants)

 

In [551]:
%time z = parse_single_player_page(soup, almost_constants)

Wall time: 41.5 ms


In [835]:
test_IDs = player_overview_data['ID'].head(10)
player_htmls = get_player_htmls(test_IDs)

In [891]:
x = parse_player_detailed_data(player_htmls, almost_constants)

In [893]:
x['work rate att']

KeyError: 'work rate att'

In [875]:
messi_html = player_htmls['https://sofifa.com/player/158023']

In [876]:
messi_strainer = SoupStrainer(['section', 'script'])
messi_soup = BeautifulSoup(messi_html, 'lxml', parse_only=messi_strainer)

In [896]:
messi_page_uls = messi_soup.find_all('ul', class_='pl')

In [898]:
parse_player_miscellaneous_data(messi_page_uls)

{'Body type': 'Messi',
 'International reputation': '5',
 'Preferred foot': 'Left',
 'Real face': 'Yes',
 'Release clause': '€215.3M',
 'Skill moves': '4',
 'Weak foot': '4',
 'Work rate att': 'Medium',
 'work rate def': 'Medium'}

In [878]:
messi_col3_divs = _parse_col3_divs(messi_soup)
parse_main_attributes(messi_col3_divs)

{'+4': '78',
 '96': 'Penalties',
 'Acceleration': '92',
 'Aggression': '48',
 'Agility': '90',
 'Balance': '95',
 'Ball control': '95',
 'Crossing': '77',
 'Curve': '89',
 'Dribbling': '97',
 'Finishing': '95',
 'Free kick accuracy': '90',
 'GK diving': '6',
 'GK handling': '11',
 'GK kicking': '15',
 'GK positioning': '14',
 'GK reflexes': '8',
 'Heading accuracy': '71',
 'Interceptions': '22',
 'Jumping': '68',
 'Long passing': '87',
 'Long shots': '88',
 'Marking': '13',
 'Positioning': '93',
 'Reactions': '95',
 'Short passing': '88',
 'Shot power': '85',
 'Sliding tackle': '26',
 'Sprint speed': '87',
 'Stamina': '73',
 'Standing tackle': '28',
 'Strength': '59',
 'Vision': '90',
 'Volleys': '85'}

In [888]:
list(messi_col3_divs[4].div.ul.find_all('li', recursive=False)[4].stripped_strings)

['78', '+4', 'Penalties']

In [850]:
col3_divs = _parse_col3_divs(soup)

In [857]:
col3_divs[-1].div.find_all('ul', recursive=False)

[<ul class="pl mb-20">
 <li>Power free kick</li>
 <li>Flair</li>
 <li>Long shot taker</li>
 <li>Skilled dribbling</li>
 </ul>, <ul class="pl">
 <li>Speedster</li>
 <li>Dribbler</li>
 <li>Distance shooter</li>
 <li>Acrobat</li>
 <li>Clinical finisher</li>
 <li>Complete forward</li>
 </ul>]

In [528]:
def main(from_file):
    # download overview htmls. Parse these into a dataframe and save this into a variable.
    # Use the df's ID column to get urls for player personal data.
    # for the first player url only, use the position ratings table to get a sequence of unique positions, and save this as a variable
    # actually just hard code it
    player_overview_data = get_overview_data(from_file)
    IDs = player_overview_data['ID']
    player_detailed_data = get_player_detailed_data(IDs, from_file)
    merged = player_overview_data.merge(player_detailed_data, on='ID')
    return merged

In [None]:
full_data.to_csv('Allplayer.csv', encoding='utf-8')

In [None]:
master_data.to_csv('Complete/PlayerAttributeData.csv', encoding='utf-8')

In [None]:
full_data.to_csv('Complete/Dataset.csv', encoding='utf-8')

In [None]:
full_data

In [None]:
full_data.drop('Unnamed: 0', 1,  inplace=True)

In [None]:
full_data

In [None]:
full_data.drop('ID_x', 1,  inplace=True)

In [None]:
full_data['ID_y']

In [None]:
f = full_data.rename(index=str, columns={"ID_y": "ID"})

In [None]:
f['ID']

In [None]:
f.to_csv('Complete/Dataset.csv', encoding='utf-8')

In [None]:
f

In [179]:
heights = ['1cm' for i in range(1000000)]

In [201]:
%%time
s = pd.Series(heights)
s2 = s.str.strip('cm').astype('int')

Wall time: 694 ms


In [203]:
%%time
heights2 = [item.strip('cm') for item in heights]
s2 = pd.Series(heights2).astype('int')

Wall time: 453 ms


.strip method is faster on individual strings in a loop than in pandas Series, for some reason. Type conversion from str to int is still faster with Series.