In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import os
from ast import literal_eval

In [None]:
# CSV file that step 1 writes the scraped stats to and that step 2 reads back
stats_file_name = 'players_stats_by_season.csv'

### Step 1 - scrape player stats per league and season

In [None]:
# Maps a short league key to a (league_id, league_name) tuple.
# get_players_stats iterates over the values only (the keys are labels for the reader);
# return_url substitutes league_id and league_name into the international stats URL.
# For the two 'NBA' entries the id is not part of the URL: get_players_stats uses it
# (1 vs 9999) to pick the 'Regular_Season' or 'Playoffs' stage.
# NOTE(review): 'Australia' and 'Germany' both carry id 15 - confirm one of them is not a typo.
leagues_dictionary = {
    'NBA':(1,'NBA'),
    'NBA-Playoffs':(9999,'NBA'),
    'Euroleague':(1,'Euroleague'),
    'Israel': (11,'Israeli-BSL'),
    'Argentina':(58,'Argentinian-Liga-A'),
    'Australia': (15,'Australian-NBL'),
    'Austria':(51,'Austrian-A-Bundesliga'),
    'Balkan':(65,'Balkan-BIL'),
    'Belarus':(30,'Belarusian-BPL'),
    'Belgium':(16,'Belgium-Scooore-League'),
    'Bosnia':(19,'Bosnian-BiH-Liga'),
    'Brazil':(59,'Brazilian-NBB'),
    'GB':(69,'British-BBL'),    
    'Bulgaria':(26,'Bulgarian-NBL'),
    'Canada':(74,'Canadian-NBL'),
    'China':(40,'Chinese-CBA'),
    'Croatia':(14,'Croatian-A-1-Liga'),
    'Czech':(24,'Czech-NBL'),
    'Denmark':(67,'Danish-Basketligaen'),
    'Estonia':(123,'Estonian-Latvian-Basketball-League'),
    'Eurocup':(2,'Eurocup'),
    'FIBA-Europe-Cup':(102,'FIBA-Europe-Cup'),
    'Finland':(37,'Finnish-Korisliiga'),
    'French':(12,'French-Jeep-Elite'),
    'Georgia':(33,'Georgian-Super-Liga'),
    'Germany':(15,'German-BBL'),
    'Greece':(8,'Greek-HEBA-A1'),
    'Hungaria':(28,'Hungarian-NBIA'),    
    'Italy':(6,'Italian-Lega-Basket-Serie-A'),
    'Japan':(105,'Japanese-BLeague'),
    'Kosovo':(104,'Kosovo-FBK'),     
    'Lebanon':(95,'Lebanese-Division-A'),
    'Lithuania':(10,'Lithuanian-LKL'),
    'Luxembourg':(92,'Luxembourg-Total-League'),
    'Macedonia':(64,'Macedonian-Superleague'),
    'Mexico':(76,'Mexican-LNBP'),
    'Netherland':(25,'Netherlands-DBL'),
    'New-Zealand':(75,'New-Zealand-NBL'),
    'Norway':(68,'Norwegian-BLNO'),
    'Poland':(21,'Polish-TBL'),    
    'Romania':(31,'Romanian-Divizia-A'),
    'Serbia':(13,'Serbian-KLS'),
    'Slovakia':(29,'Slovakian-Extraliga'),
    'Slovenia':(17,'Slovenian-SKL'),
    'South-Korea':(63,'South-Korean-KBL'),
    'Spain':(4,'Spanish-ACB'),
    'Sweden':(32,'Swedish-Basketligan'),
    'Switzerland':(70,'Swiss-LNA'),     
    'Turkey':(7,'Turkish-BSL'),
    'Ukrainia':(57,'Ukrainian-Superleague')
}


# Column-oriented accumulator: one list per output column.
# append_players_stats appends one value to each list for every scraped table row,
# and the step 1 run cell turns the whole dictionary into a DataFrame.
player_season_data_dic = {
    'League':[],'Season':[],'Stage':[],'Player':[],'Team':[],'GP': [],'MIN': [],
    'FGM': [],'FGA': [],'3PM': [],'3PA': [],'FTM': [],'FTA': [],'TOV': [],'PF': [],
    'ORB': [],'DRB': [],'REB': [],    'AST': [],'STL': [],'BLK': [],'PTS':[],'Player_URL':[]
}

# years to scrape
# get_players_stats loops over range(from_year, to_year), so to_year itself is NOT scraped;
# append_players_stats labels a season value Y as 'Y-1 - Y'.
from_year = 2009
to_year = 2021

In [None]:
# helper used by get_players_stats - builds the stats page URL
def return_url(league_id,league,season,page,stage):
    """Build the basketball.realgm.com stats URL for one league/season/page.

    Parameters
    ----------
    league_id : numeric id placed in the international URL (ignored for the NBA).
    league    : league name; the exact value 'NBA' selects the NBA URL layout.
    season    : season value placed in the URL.
    page      : results page number.
    stage     : stage segment, only used by the NBA URL layout.

    Returns
    -------
    str : the fully formatted URL.
    """
    if league != 'NBA':
        # international leagues: id and name are part of the path, no stage segment
        international_template = 'https://basketball.realgm.com/international/league/{}/{}/stats/{}/Totals/Qualified/All/points/All/desc/{}'
        return international_template.format(league_id, league, season, page)

    # NBA: the stage (regular season / playoffs) is the last path segment
    nba_template = 'https://basketball.realgm.com/nba/stats/{}/Totals/Qualified/points/All/desc/{}/{}'
    return nba_template.format(season, page, stage)

In [None]:
# scrape a league + season page and append its rows to the player_season_data_dic dictionary
def append_players_stats(i_url,i_league,i_season,i_stage = 'All'):
    """Scrape one stats page and append its player rows to ``player_season_data_dic``.

    Parameters
    ----------
    i_url    : URL of the stats page to download.
    i_league : value stored in the 'League' column for every row of the page.
    i_season : season number; stored in the 'Season' column as 'Y-1 - Y'.
    i_stage  : value stored in the 'Stage' column (default 'All').

    Returns
    -------
    bool : True when at least one row was appended and every row of the page was
           well formed; False when the page has no table, no data rows, or a row
           is missing an expected cell. The caller uses False as "stop paging".

    Notes
    -----
    Each row is fully extracted into a local record *before* any shared list is
    touched. A malformed row therefore never leaves the column lists with
    different lengths (which would make ``pd.DataFrame(player_season_data_dic)``
    fail). Complete rows that precede a malformed one are kept.
    """
    # (output column, index of the <td> cell it is read from); cells 7, 10 and 13 are not read
    stat_columns = (
        ('Player', 1), ('Team', 2), ('GP', 3), ('MIN', 4), ('FGM', 5), ('FGA', 6),
        ('3PM', 8), ('3PA', 9), ('FTM', 11), ('FTA', 12), ('TOV', 14), ('PF', 15),
        ('ORB', 16), ('DRB', 17), ('REB', 18), ('AST', 19), ('STL', 20), ('BLK', 21),
        ('PTS', 22),
    )
    full_season = str(i_season-1) + ' - ' + str(i_season)

    page = requests.get(i_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    tables = soup.find_all('table')

    appended_rows = 0
    try:
        # the stats table is the first table of the page; its first <tr> is the header
        rows = tables[0].find_all('tr')[1:]
        for row in rows:
            cols = row.find_all('td')

            # extract everything first - any missing cell raises IndexError here
            record = {'League': i_league, 'Season': full_season, 'Stage': i_stage}
            for column, cell_index in stat_columns:
                record[column] = cols[cell_index].text
            record['Player_URL'] = cols[1].find_all('a', href=True)[0]['href']

            # the row is complete: now it is safe to extend every column list
            for column, value in record.items():
                player_season_data_dic[column].append(value)
            appended_rows += 1
    except IndexError:
        # no table on the page, or a row without the expected cells / player link
        return False

    # a table without data rows means there is nothing more to page through
    return appended_rows > 0

In [None]:
def get_players_stats(i_laegues_dictionary,i_from_year,i_to_year):
    """Scrape every stats page of every league for the requested seasons.

    Parameters
    ----------
    i_laegues_dictionary : dict whose values are (league_id, league) tuples.
    i_from_year          : first season value to scrape (inclusive).
    i_to_year            : end of the season range (exclusive, as in ``range``).

    For each league and season, pages 1..998 are requested in order until
    append_players_stats reports that a page could not be appended.
    Returns None; the data is accumulated by append_players_stats.
    """
    def stage_for(league_id, league):
        # NBA regular season is (1, 'NBA'); id 9999 marks the playoffs entry
        if league_id == 1 and league == 'NBA':
            return 'Regular_Season'
        if league_id == 9999:
            return 'Playoffs'
        return 'International'

    for league_id, league in i_laegues_dictionary.values():
        stage = stage_for(league_id, league)
        for season in range(i_from_year, i_to_year):
            page = 1
            while page < 999:
                player_stats_url = return_url(league_id, league, season, page, stage)
                if not append_players_stats(player_stats_url, league, season, stage):
                    # nothing appended for this page: move on to the next season
                    break
                page += 1
        

In [None]:
# scrape all configured leagues/seasons into player_season_data_dic (long-running: one HTTP request per page)
get_players_stats(leagues_dictionary,from_year,to_year)
    
# build the DataFrame once from the accumulated column lists and save it as a CSV file for step 2
df_player_season_data = pd.DataFrame(player_season_data_dic)
df_player_season_data.to_csv(stats_file_name, index=False)

### Step 2 - add player details

In [None]:
# reload the step 1 CSV so that step 2 can be run on its own, without repeating the scrape
df_player_season_data = pd.read_csv(stats_file_name)

In [None]:
# helper used by get_player_details - parse the profile page paragraphs into a details dictionary
def parse_player_details(i_paragraphs):
    """Extract biographical details from the paragraphs of a player profile page.

    Parameters
    ----------
    i_paragraphs : iterable of parsed ``<p>`` elements. Each must expose ``.text``
                   and ``.find_all('a', href=True)``.

    Returns
    -------
    dict with the keys birth_year, birth_month, birth_date, height, height_cm,
    weight, weight_kg, nationality, high_school, draft_round, draft_pick and
    draft_team. Any detail that is absent or cannot be parsed is None.

    Notes
    -----
    * Paragraphs are recognised by their label ('Born:', 'Height:',
      'Nationality:', 'High School:', 'Drafted:'), checked in that order.
    * Scanning stops once all five distinct labels have been seen, so unrelated
      paragraphs further down the page cannot overwrite the parsed values.
    * A 'Born:' or 'High School:' paragraph without a link yields None for the
      related fields instead of raising IndexError.
    """
    def first_link_text(paragraph):
        # text of the first <a href=...> inside the paragraph, '' when there is none
        links = paragraph.find_all('a', href=True)
        return links[0].text if links else ''

    def item(parts, index):
        # parts[index], or None when the list is too short
        return parts[index] if index < len(parts) else None

    def without_unit(token, unit):
        # '(206cm)' -> '206'; None stays None
        if token is None:
            return None
        return token.replace('(','').replace(unit + ')','')

    def second_word(entry):
        # 'Round 1' -> '1'; None when the entry or its second word is missing
        if entry is None:
            return None
        return item(entry.strip().split(' '), 1)

    full_birth_text, height_weight, nationality_text, high_school = '', '', '', ''
    draft_list = []
    labels_seen = set()
    labels_expected = 5

    for p in i_paragraphs:
        text = p.text
        if 'Born:' in text:
            full_birth_text = first_link_text(p)
            labels_seen.add('Born:')
        elif 'Height:' in text:
            height_weight = text
            labels_seen.add('Height:')
        elif 'Nationality:' in text:
            nationality_text = text
            labels_seen.add('Nationality:')
        elif 'High School:' in text:
            high_school = first_link_text(p)
            labels_seen.add('High School:')
        elif 'Drafted:' in text:
            draft_list = text.split(':')[1].split(',')
            labels_seen.add('Drafted:')
        if len(labels_seen) == labels_expected:
            break

    # the birth link text is split on whitespace: first token = month, third token = year
    birth_parts = full_birth_text.split()

    # height/weight tokens: [label, height, '(..cm)', label, weight, '(..kg)']
    size_parts = height_weight.split()

    nationality_parts = nationality_text.split(':')

    draft_team = item(draft_list, 2)

    output = {
        'birth_year': item(birth_parts, 2),
        'birth_month': item(birth_parts, 0),
        'birth_date': full_birth_text or None,
        'height': item(size_parts, 1),
        'height_cm': without_unit(item(size_parts, 2), 'cm'),
        'weight': item(size_parts, 4),
        'weight_kg': without_unit(item(size_parts, 5), 'kg'),
        'nationality': nationality_parts[1].strip() if len(nationality_parts) > 1 else None,
        'high_school': high_school or None,
        'draft_round': second_word(item(draft_list, 0)),
        'draft_pick': second_word(item(draft_list, 1)),
        'draft_team': draft_team.strip() if draft_team is not None else None
    }

    return output

In [None]:
# helper used by write_player_details_files - scrape and parse one player's details
def get_player_details(i_player_url):
    """Download a player's profile page and return its parsed details.

    ``i_player_url`` is the site-relative path of the profile; it is appended to
    the basketball.realgm.com host. All ``<p>`` elements of the page are handed to
    parse_player_details, whose result is returned unchanged.
    """
    response = requests.get('https://basketball.realgm.com' + i_player_url)
    profile_paragraphs = BeautifulSoup(response.text, 'html.parser').find_all('p')
    return parse_player_details(profile_paragraphs)

In [None]:
# helper method inside write_player_details_files - write player details into text file
def write_file_player_details(i_player_details):
    """Write a one-entry ``{player_url: details}`` dictionary to its own text file.

    The file is ``player_details/<player_url with '/' replaced by '_'>.txt`` and
    contains ``str(i_player_details)``, which read_player_details parses back
    with ``literal_eval``.

    The ``player_details`` directory is created when it does not exist yet, and
    the file is written through a context manager so the handle is closed even
    if the write fails.

    Raises IndexError when ``i_player_details`` is empty.
    """
    # the only key of the dictionary is the player URL that names the file
    player_url = list(i_player_details.keys())[0]

    os.makedirs('player_details', exist_ok=True)
    player_file_name = 'player_details/' + player_url.replace('/','_') + ".txt"

    with open(player_file_name,"w") as f:
        f.write( str(i_player_details) )

In [None]:
# loop over all player details - write each player details to a file
def write_player_details_files(player_url_series):
    """Scrape and store the details of every player that has no details file yet.

    ``player_url_series`` is a pandas Series of player URLs. Each distinct URL is
    looked up under ``./player_details/``; when its file is missing, the details
    are fetched with get_player_details and saved with write_file_player_details.
    Existing files act as a cache and are left untouched.
    """
    for player_url in player_url_series.unique().tolist():
        cached_file = './player_details/' + player_url.replace('/','_') + '.txt'
        if os.path.isfile(cached_file):
            # already scraped in a previous run
            continue

        write_file_player_details({player_url: get_player_details(player_url)})

In [None]:
def read_player_details(player_url_series):
    """Load the stored details of the given players into one dictionary.

    Parameters
    ----------
    player_url_series : iterable of player URLs (list, numpy array, pandas Series, ...).
                        Duplicates and non-string entries (e.g. NaN) are skipped.

    Returns
    -------
    dict : ``{player_url: details_dict}`` merged from every
           ``player_details/<url with '/' replaced by '_'>.txt`` file that exists.
           URLs without a file are simply absent from the result.

    Notes
    -----
    The file name is derived from the URL, the same way the files are written.
    Going the other way (file name back to URL by replacing '_' with '/') is
    lossy for URLs that contain an underscore. Each file is read through a
    context manager so its handle is closed.
    """
    player_details_dictionary = {}

    # dict.fromkeys drops duplicates while keeping the original order
    for player_url in dict.fromkeys(player_url_series):
        if not isinstance(player_url, str):
            continue

        details_path = os.path.join('player_details', player_url.replace('/','_') + '.txt')
        if not os.path.isfile(details_path):
            continue

        with open(details_path,'r') as details_file:
            # the files hold str(dict) written by this notebook; literal_eval only accepts Python literals
            player_details_dictionary.update(literal_eval(details_file.read()))

    return player_details_dictionary

In [None]:
# helper method inside set_player_details_dataframe
def add_player_detail_to_player_df(i_detail):   
    """Add column ``i_detail`` to the global ``df_player_season_data``.

    The value of each row is taken from the dictionary stored in that row's
    'player_details' column, under the key ``i_detail``.
    """
    def pick_detail(details):
        return details[i_detail]

    details_column = df_player_season_data['player_details']
    df_player_season_data[i_detail] = details_column.apply(pick_detail)

In [None]:
def set_player_details_dataframe(player_details_dictionary):
    """Attach the player details to the global ``df_player_season_data``.

    First a 'player_details' column is added that holds, for every row, the
    details dictionary looked up by the row's 'Player_URL' (a URL missing from
    ``player_details_dictionary`` raises KeyError). Then each detail is copied
    into a column of its own via add_player_detail_to_player_df.
    """
    def details_for(row):
        return player_details_dictionary[row['Player_URL']]

    df_player_season_data['player_details'] = df_player_season_data.apply(details_for, axis = 1)

    # one output column per detail, added in this order
    detail_fields = (
        'birth_year', 'birth_month', 'birth_date',
        'height', 'height_cm', 'weight', 'weight_kg',
        'nationality', 'high_school',
        'draft_round', 'draft_pick', 'draft_team',
    )
    for detail_field in detail_fields:
        add_player_detail_to_player_df(detail_field)

In [None]:
## scrape the details of every distinct player URL and store each player in its own file
## (players that already have a file under player_details/ are skipped)
write_player_details_files(df_player_season_data['Player_URL']) 

## read the stored player details files back into one {player_url: details} dictionary
player_details_dictionary = read_player_details(df_player_season_data['Player_URL'].unique())

## add all player details columns to the global df_player_season_data DataFrame
set_player_details_dataframe(player_details_dictionary)

In [None]:
# write the full stats & details into a csv file
# the helper columns are dropped in place, so this cell changes df_player_season_data itself:
# re-run the previous cell before running this one a second time
df_player_season_data.drop(['Player_URL', 'player_details'], axis=1, inplace=True)
df_player_season_data.to_csv('players_stats_by_season_full_details.csv', index=False)