In [1030]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import traceback

pd.set_option('display.max_columns', None)

In [999]:
def cleanText(string):
    """Return ``string`` with every newline and tab character removed.

    ``None`` is handed back unchanged so optional values can be passed in
    without a prior check. Spaces and all other characters are kept.
    """
    if string is None:
        return string

    for unwanted in ('\n', '\t'):
        string = string.replace(unwanted, '')
    return string

def getMatchHeader(source):
    """Extract the match-level summary from a parsed match page.

    Args:
        source: Parsed page exposing the BeautifulSoup ``find`` /
            ``find_all`` API (the notebook passes a ``BeautifulSoup``
            object built from a match URL).

    Returns:
        dict with the keys ``Date``, ``Patch``, ``EventID``, ``EventName``,
        ``EventStage``, ``Team1ID``, ``Team2ID``, ``Team1``, ``Team2``,
        ``Team1_MapScore`` and ``Team2_MapScore``.  ``Date`` is the raw
        ``data-utc-ts`` attribute value, team IDs and map scores are
        converted with ``int``, and ``Patch`` is ``None`` when the patch
        element cannot be found.

    Raises:
        IndexError, AttributeError, TypeError, KeyError: if a mandatory
            element or attribute is absent from the page.
        ValueError: if a team ID or map score is not an integer string.
    """
    # Event name, stage and date all come from the "super" header.
    header = source.find_all("div", class_="match-header-super")[0]
    event_link = header.find(href=True)  # first element carrying an href
    event_url = event_link['href'].lstrip()
    # Third '/'-separated piece of the href; presumably the link has the
    # form '/event/<id>/...' -- TODO confirm against the live markup.
    event_id = event_url.split("/")[2]
    event_name = event_link.div.div.text.lstrip()
    event_stage = event_link.div.div.find_next_sibling('div').text.lstrip()

    date_header = header.find("div", class_="match-header-date")
    date_div = date_header.div
    date = date_div['data-utc-ts']

    # The patch is optional: it is read from the second <div> sibling after
    # the date element.  A missing sibling comes back as None, so the chained
    # lookup fails with AttributeError -- the only error treated as
    # "no patch listed".
    patch = None
    try:
        patch = date_div.find_next_sibling('div').find_next_sibling('div').text.lstrip()
    except AttributeError:
        pass

    # Teams: the first two href-carrying elements of the "vs" header.
    match_header_vs = source.find("div", class_="match-header-vs")
    team_links = match_header_vs.find_all(href=True)

    team1_id = team_links[0]['href'].split("/")[2]
    team2_id = team_links[1]['href'].split("/")[2]
    team1 = team_links[0].div.div.text.lstrip()
    team2 = team_links[1].div.div.text.lstrip()

    # Map score: first and third <span> inside the spoiler block (the span
    # in between is skipped; presumably a separator).
    match_score = source.find("div", class_="js-spoiler")
    team1_mapscore = match_score.span.text.lstrip()
    team2_mapscore = match_score.span.find_next_sibling('span').find_next_sibling('span').text.lstrip()

    # Drop the newlines/tabs that the page layout leaves inside the text.
    event_stage = cleanText(event_stage)
    event_name = cleanText(event_name)
    patch = cleanText(patch)
    team1 = cleanText(team1)
    team2 = cleanText(team2)
    team1_mapscore = cleanText(team1_mapscore)
    team2_mapscore = cleanText(team2_mapscore)

    res = {
        'Date': date,
        'Patch': patch,
        'EventID': event_id,
        'EventName': event_name,
        'EventStage': event_stage,
        'Team1ID': int(team1_id),
        'Team2ID': int(team2_id),
        'Team1': team1,
        'Team2': team2,
        'Team1_MapScore': int(team1_mapscore),
        'Team2_MapScore': int(team2_mapscore)
    }

    return res

In [1061]:
def extractEconomyRoundByRound(cols):
    """Turn the cells of a round-by-round economy table into round records.

    Only cells containing a ``ge-text-light round-num`` div are treated as
    rounds; every other cell is skipped.  For each round the result holds the
    round number, both teams' bank (the cell text minus its last character,
    multiplied by 1000) and both teams' buy type translated from the
    ``$`` markers.

    Returns:
        list of dicts with the keys ``RoundNumber``, ``Team1Bank``,
        ``Team2Bank``, ``Team1BuyType`` and ``Team2BuyType``.
    """
    # Marker text shown in the table -> buy-type label.
    buy_labels = {
        '': 'eco',
        '$': 'semi-eco',
        '$$': 'semi-buy',
        '$$$': 'full-buy'
    }

    rounds = []
    for col in cols:
        round_tag = col.find('div', class_='ge-text-light round-num')
        if round_tag is None:
            continue

        # Team 1's marker sits in the third <div> of the cell, team 2's in
        # the <div> right after it.
        team1_div = col.div.find_next_sibling('div').find_next_sibling('div')
        team1_marker = cleanText(team1_div.text)
        team2_marker = cleanText(team1_div.find_next_sibling('div').text)
        round_text = cleanText(round_tag.text)

        bank_divs = col.find_all('div', class_='bank')
        bank_texts = (cleanText(bank_divs[0].text), cleanText(bank_divs[1].text))
        # Drop the trailing unit character (presumably 'k') and scale up.
        team1_bank, team2_bank = (float(text[:-1]) * 1000 for text in bank_texts)

        rounds.append({
            'RoundNumber': int(round_text),
            'Team1Bank': team1_bank,
            'Team2Bank': team2_bank,
            'Team1BuyType': buy_labels[team1_marker],
            'Team2BuyType': buy_labels[team2_marker]
        })

    return rounds

def extractEconomyRows(row, game_id):
    """Build one game's economy summary from the ten ``stats-sq`` cells.

    Cells 0-4 belong to team 1 and cells 5-9 to team 2.  Within each group
    the first cell is the pistol-rounds-won count; the remaining four (eco,
    semi-eco, semi-buy, full-buy) hold text of the form ``"<n> (<m>)"``,
    stored as ``<Kind>`` and ``<Kind>Won`` respectively.

    Returns:
        dict keyed by ``GameID`` plus the ``Team1_*`` / ``Team2_*`` columns,
        all counts converted with ``int``.
    """
    parsed = []
    for position in range(10):
        text = cleanText(row[position].text)
        if position % 5 == 0:
            # Pistol cell: a single number, used as-is.
            parsed.append(text)
        else:
            # "<n> (<m>)" -> ['<n>', '<m>']
            parsed.append(text.replace('(', ' ').replace(')', ' ').split())

    (t1_pistol, t1_eco, t1_semi_eco, t1_semi_buy, t1_full_buy,
     t2_pistol, t2_eco, t2_semi_eco, t2_semi_buy, t2_full_buy) = parsed

    return {
        'GameID': game_id,
        'Team1_PistolWon': int(t1_pistol),
        'Team1_Eco': int(t1_eco[0]),
        'Team1_EcoWon': int(t1_eco[1]),
        'Team1_SemiEco': int(t1_semi_eco[0]),
        'Team1_SemiEcoWon': int(t1_semi_eco[1]),
        'Team1_SemiBuy': int(t1_semi_buy[0]),
        'Team1_SemiBuyWon': int(t1_semi_buy[1]),
        'Team1_FullBuy': int(t1_full_buy[0]),
        'Team1_FullBuyWon': int(t1_full_buy[1]),
        'Team2_PistolWon': int(t2_pistol),
        'Team2_Eco': int(t2_eco[0]),
        'Team2_EcoWon': int(t2_eco[1]),
        'Team2_SemiEco': int(t2_semi_eco[0]),
        'Team2_SemiEcoWon': int(t2_semi_eco[1]),
        'Team2_SemiBuy': int(t2_semi_buy[0]),
        'Team2_SemiBuyWon': int(t2_semi_buy[1]),
        'Team2_FullBuy': int(t2_full_buy[0]),
        'Team2_FullBuyWon': int(t2_full_buy[1]),
    }

    
def getEconomy(url, game_id):
    """Download the economy tab of a match and parse its tables.

    Args:
        url: Match page URL; ``?game=<first id>&tab=economy`` is appended.
        game_id: Sequence of game-ID strings.  Tables are matched to IDs
            purely by the order in which they appear on the page.

    Returns:
        ``(economy_dict_list, economy_rbr_dict_list)``: a list of per-game
        summary dicts (from ``extractEconomyRows``) and a dict mapping game
        ID to its round-by-round list (from ``extractEconomyRoundByRound``).
    """
    query = '?game=' + game_id[0] + '&tab=economy'
    response = requests.get(url + query)
    soup = BeautifulSoup(response.content, 'html.parser')

    summaries = []
    round_by_round = {}

    # Separate cursors: summary tables and round-by-round tables are each
    # assigned to game IDs in page order, and neither may run past the list.
    summary_cursor = 0
    rbr_cursor = 0
    game_count = len(game_id)

    for table in soup.find_all('table', class_='wf-table-inset mod-econ'):
        squares = table.find_all('div', class_='stats-sq')
        if squares and summary_cursor < game_count:
            # A table with 'stats-sq' cells is a per-game summary.
            summaries.append(extractEconomyRows(squares, game_id[summary_cursor]))
            summary_cursor += 1
        elif rbr_cursor < game_count:
            # Anything else is parsed as a round-by-round table.
            cells = table.find_all('td')
            round_by_round[game_id[rbr_cursor]] = extractEconomyRoundByRound(cells)
            rbr_cursor += 1

    return summaries, round_by_round

In [872]:
def checkIfEmpty(html):
    """Return the first whitespace-separated token of ``html.text`` as an int.

    Yields ``None`` when the text is empty or contains only whitespace.
    """
    tokens = html.text.split()
    return int(tokens[0]) if tokens else None
    

def extractRowPreformanceData(row, game_id):
    """Build one player's advanced-stats record from a performance-table row.

    Args:
        row: The cells of a single player row.  Cell 0 holds the player name
            followed by the team abbreviation, cell 1 the agent image and
            cells 2-13 the numeric columns.
        game_id: Game ID stored in the record.

    Returns:
        dict with ``GameID``, ``PlayerName``, ``TeamAbbreviation``, ``Agent``
        and the twelve stat columns; a blank stat cell is stored as ``None``.
    """
    name_parts = row[0].text.split()
    player = name_parts[0]
    team = name_parts[1]

    # Agent name = sixth '/'-separated piece of the image path, without the
    # '.png' extension.
    agent = row[1].find('img')['src'].split('/')[5].replace('.png', '')

    res = {
        'GameID': game_id,
        'PlayerName': player,
        'TeamAbbreviation': team,
        'Agent': agent,
    }

    # Stat columns in table order, starting at cell 2.
    stat_columns = (
        'Num_2Ks',
        'Num_3Ks',
        'Num_4Ks',
        'Num_5Ks',
        'OnevOne',
        'OnevTwo',
        'OnevThree',
        'OnevFour',
        'OnevFive',
        'Econ',
        'Plants',
        'Defuses',
    )
    for cell_index, column in enumerate(stat_columns, start=2):
        res[column] = checkIfEmpty(row[cell_index])

    return res


def getPerformanceData(url, game_id):
    """Download the performance tab of a match and collect player rows.

    Args:
        url: Match page URL; ``?game=<first id>&tab=performance`` is appended.
        game_id: Sequence of game-ID strings, matched by position to the
            advanced-stats tables that remain after the first is discarded.

    Returns:
        list of per-player dicts produced by ``extractRowPreformanceData``.
    """
    query = '?game=' + game_id[0] + '&tab=performance'
    response = requests.get(url + query)
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.find_all('table', class_='wf-table-inset mod-adv-stats')
    # The first matching table is dropped (presumably the all-maps
    # aggregate -- TODO confirm against the page).
    tables.pop(0)

    player_stats = []
    for position, table in enumerate(tables):
        for tr in table.find_all('tr'):
            # Only rows that contain a 'team' div are player rows.
            if tr.find('div', class_='team') is not None:
                cells = tr.find_all('td')
                player_stats.append(extractRowPreformanceData(cells, game_id[position]))

    return player_stats

In [1092]:
def extractRowData(row):
    """Build one player's scoreboard record from the cells of a table row.

    Cell 0 holds the player name followed by the team abbreviation; cells
    3-12 hold kills, deaths, assists, +/-, KAST, ADR, HS%, first kills,
    first deaths and FK/FD +/-.  ``GameID``, ``PlayerID`` and ``Agent`` are
    set to ``None`` here for the caller to fill in.

    Returns:
        dict of the player's stats; the two percentage columns are stored as
        fractions (value / 100), every other stat as ``int``.
    """
    def cell(index):
        # Cell text with layout newlines/tabs removed.
        return cleanText(row[index].text)

    identity = cell(0).split()
    player = identity[0]
    team = identity[1]

    # Cell 2 (combat score) is read but not part of the record.
    _combat_score = cell(2)

    (kills, deaths_raw, assists, plus_minus_raw, kast_raw, adr,
     hs_raw, first_kills, first_deaths, fkfd_raw) = (cell(i) for i in range(3, 13))

    # Strip decoration so the values parse as integers.
    deaths = deaths_raw.replace('/', '')
    plus_minus = plus_minus_raw.replace('+', '')
    kast = int(kast_raw.replace('%', '')) / 100  # percent -> fraction
    hs_percent = int(hs_raw.replace('%', '')) / 100  # percent -> fraction
    fkfd_plus_minus = fkfd_raw.replace('+', '')

    return {
        'GameID': None,
        'PlayerID': None,
        'PlayerName': player,
        'TeamAbbreviation': team,
        'Agent': None,
        'Kills': int(kills),
        'Deaths': int(deaths),
        'Assists': int(assists),
        'PlusMinus': int(plus_minus),
        'KAST_Percent': kast,
        'ADR': int(adr),
        'HS_Percent': hs_percent,
        'FirstKills': int(first_kills),
        'FirstDeaths': int(first_deaths),
        'FKFD_PlusMinus': int(fkfd_plus_minus)
    }
    

def getScoreboard(url, game_id, team1_id, team2_id):
    """Download the overview tab of a match and parse its per-game data.

    Args:
        url: Match page URL; ``?game=<first id>&tab=overview`` is appended.
        game_id: Sequence of game-ID strings.  Parsed sections are matched to
            IDs by their position on the page.
        team1_id: Copied unchanged into every game record as ``Team1ID``.
        team2_id: Copied unchanged into every game record as ``Team2ID``.

    Returns:
        ``(game_dict, rbr_dict, player_stats_dict)``:

        * ``game_dict`` -- list with one summary dict per game header.
        * ``rbr_dict`` -- dict mapping game ID to its list of round dicts.
        * ``player_stats_dict`` -- list with one dict per player per game.

        The three parts are parsed independently.  If one of them fails, the
        failure is reported (message plus traceback) and that part is
        returned empty, so a single broken section no longer aborts the call
        with ``UnboundLocalError``.
    """
    full_url = url + '?game=' + game_id[0] + '&tab=overview'
    page = requests.get(full_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    def getScoreboardHeader(soup):
        # One summary record per 'vm-stats-game-header' block.
        game_json_list = []
        stats_header = soup.find_all('div', class_='vm-stats-game-header')
        for idx, tab in enumerate(stats_header):
            team_list = tab.find_all('div', class_='team-name')
            team1 = cleanText(team_list[0].text)
            team2 = cleanText(team_list[1].text)
            score_list = tab.find_all('div', class_='score')
            team1_score = int(cleanText(score_list[0].text))
            team2_score = int(cleanText(score_list[1].text))

            # Equal scores fall through to team 2.
            if team1_score > team2_score:
                winner = team1
            else:
                winner = team2

            # Spans in page order: team 1 first/second half, map name,
            # team 2 first/second half.
            half_score_list = tab.find_all('span')
            team1_firsthalf_score = half_score_list[0].text
            team1_secondhalf_score = half_score_list[1].text
            map_name = cleanText(half_score_list[2].text)
            team2_firsthalf_score = half_score_list[3].text
            team2_secondhalf_score = half_score_list[4].text

            # A 'mod-ct' class on the first-half span marks the defending side.
            if 'mod-ct' in half_score_list[0]['class']:
                team1_firsthalf_side = 'defend'
            else:
                team1_firsthalf_side = 'attack'

            if 'mod-ct' in half_score_list[3]['class']:
                team2_firsthalf_side = 'defend'
            else:
                team2_firsthalf_side = 'attack'

            duration_div = tab.find('div', class_='map-duration ge-text-light')
            duration = cleanText(duration_div.text)

            game_json = {
                'GameID': game_id[idx],
                'Map': map_name,
                # Previously parsed but dropped; the Game Table layout in the
                # notes cell lists a Duration column.
                'Duration': duration,
                'Team1ID': team1_id,
                'Team2ID': team2_id,
                'Team1': team1,
                'Team2': team2,
                'Winner': winner,
                'Team1_TotalRounds': team1_score,
                'Team2_TotalRounds': team2_score,
                'Team1_SideFirstHalf': team1_firsthalf_side,
                'Team2_SideFirstHalf': team2_firsthalf_side,
                'Team1_RoundsFirstHalf': int(team1_firsthalf_score),
                'Team1_RoundsSecondtHalf': int(team1_secondhalf_score),
                'Team2_RoundsFirstHalf': int(team2_firsthalf_score),
                'Team2_RoundsSecondtHalf': int(team2_secondhalf_score)
            }

            game_json_list.append(game_json)

        return game_json_list

    def getRoundByRound(soup, game_id):
        # Maps each game ID to the ordered list of rounds that were won.
        rounds_container_all = soup.find_all('div', class_='vlr-rounds-row')
        round_by_round_dict = {}

        for idx, rounds_container in enumerate(rounds_container_all):
            teams = rounds_container.find_all('div', class_='team')
            team1_abr = cleanText(teams[0].text)
            team2_abr = cleanText(teams[1].text)
            cols = rounds_container.find_all('div', class_='vlr-rounds-row-col')
            # The first column is not a round (presumably the team-label
            # column -- TODO confirm).
            cols.pop(0)
            round_by_round = []

            for col in cols:
                # Spacer columns carry a second class
                # ('vlr-rounds-row-col mod-spacing'); real rounds have one.
                if len(col['class']) != 1:
                    continue

                round_number = cleanText(col.find('div', class_='rnd-num').text)
                team1_cell = col.div.find_next_sibling('div')

                if 'mod-win' in team1_cell['class']:
                    round_winner = team1_abr
                    win_type_img = col.find('img')
                else:
                    team2_cell = team1_cell.find_next_sibling('div')
                    if 'mod-win' not in team2_cell['class']:
                        # Neither side won: the round was not played.
                        continue
                    round_winner = team2_abr
                    win_type_img = team2_cell.find('img')

                # Win type = sixth '/'-separated piece of the icon path,
                # without the '.webp' extension.
                win_type = win_type_img['src'].split("/")[5].replace('.webp', '')
                curr_score_tag = col.find('div', class_='rnd-currscore')
                current_score = cleanText(curr_score_tag.text)

                round_by_round.append({
                    'RoundNumber': round_number,
                    'RoundWinner': round_winner,
                    'ScoreAfterRound': current_score,
                    'WinType': win_type,
                })

            # Stored once per game rather than once per column; a game with
            # no parsed rounds is recorded with an empty list.
            round_by_round_dict[game_id[idx]] = round_by_round

        return round_by_round_dict

    def getPlayerStats(soup):
        # One record per player per game, taken from the overview tables.
        containers_all = soup.find_all('div', class_='vm-stats-game')

        # Keep only containers that have a game header; this drops the
        # overall tab.
        containers = [
            check for check in containers_all
            if check.find('div', class_='vm-stats-game-header') is not None
        ]

        player_stats = []
        for idx, container in enumerate(containers):
            tables = container.find_all('table', class_='wf-table-inset mod-overview')
            for table in tables:
                for row in table.find_all('tr'):
                    data_points = row.find_all('td')
                    agent_img = row.find('img')
                    player_id_search = row.find('a')

                    # Header rows have no <td> cells and are skipped.
                    if len(data_points) > 0:
                        agent = agent_img['alt']
                        player_id = player_id_search['href'].split('/')[2]
                        row_data = extractRowData(data_points)
                        row_data['GameID'] = game_id[idx]
                        row_data['Agent'] = agent
                        row_data['PlayerID'] = player_id

                        player_stats.append(row_data)
        return player_stats

    # Defaults keep the return statement valid when a parser fails.
    game_dict = []
    rbr_dict = {}
    player_stats_dict = []

    try:
        game_dict = getScoreboardHeader(soup)
    except Exception:
        print('Unable to execute getScoreboardHeader')
        traceback.print_exc()
    try:
        rbr_dict = getRoundByRound(soup, game_id)
    except Exception:
        print('Unable to execute getRoundByRound')
        traceback.print_exc()
    try:
        player_stats_dict = getPlayerStats(soup)
    except Exception:
        print('Unable to execute getPlayerStats')
        traceback.print_exc()

    return game_dict, rbr_dict, player_stats_dict

                
                
def getMapStats(source, url, team1_id, team2_id):
    """Collect the game IDs of a match and fetch the scoreboard for them.

    The IDs are read from the ``data-game-id`` attribute of every
    ``vm-stats-gamesnav-item js-map-switch`` div inside the ``vm-stats``
    section of ``source``, then passed to ``getScoreboard``.

    Returns:
        ``(game_dict_list, rbr_dict_list, player_stats_dict_list, game_id)``:
        the three values from ``getScoreboard`` followed by the list of IDs.
    """
    stats_section = source.find("div", class_="vm-stats")
    nav_items = stats_section.find_all("div", class_="vm-stats-gamesnav-item js-map-switch")
    game_id = [item['data-game-id'] for item in nav_items]

    scoreboard = getScoreboard(url, game_id, team1_id, team2_id)
    game_dict_list, rbr_dict_list, player_stats_dict_list = scoreboard

    return game_dict_list, rbr_dict_list, player_stats_dict_list, game_id


In [1121]:
url = 'https://www.vlr.gg/61393/akrew-vs-super-squad-knights-monthly-gauntlet-december-ro16/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Match table: a single row describing the whole match.
match_row = getMatchHeader(soup)
match_df = pd.DataFrame(match_row, index=[0])
team1_id = match_row['Team1ID']
team2_id = match_row['Team2ID']

# Scoreboard: basic stats (overview tab) joined with advanced stats
# (performance tab) on game, player and agent.
game_dict_list, rbr_dict_list, player_stats_dict_list, game_id_list = getMapStats(soup, url, team1_id, team2_id)
adv_player_stats = getPerformanceData(url, game_id_list)
player_stats_df = pd.DataFrame(player_stats_dict_list)
adv_player_stats_df = pd.DataFrame(adv_player_stats)


all_player_stats = player_stats_df.merge(adv_player_stats_df, on=['GameID', 'PlayerName', 'Agent'])
# Both frames carry TeamAbbreviation; keep only the overview-tab copy.
all_player_stats = all_player_stats.drop(columns=['TeamAbbreviation_y'])


economy_dict_list, economy_rbr_dict_list = getEconomy(url, game_id_list)

# Game table: per-game header data joined with the economy summary.
game_df = pd.DataFrame(game_dict_list)
econ_df = pd.DataFrame(economy_dict_list)

full_game_df = game_df.merge(econ_df, on=['GameID'])


# Game rounds: attach each round's economy record to its round-history entry.
# The two lists are paired by position.
game_rounds = []
for key in rbr_dict_list:
    econ_info = economy_rbr_dict_list[key]

    # Named 'round_info' rather than 'round': a module-level 'round' would
    # hide the builtin for every later cell.
    for idx, round_info in enumerate(rbr_dict_list[key]):
        round_info.update(econ_info[idx])
    print(rbr_dict_list[key])

    res = {
        'GameID': key,
        'Team1ID': team1_id,
        'Team2ID': team2_id,
        'RoundHistory': rbr_dict_list[key]
    }
    game_rounds.append(res)

#print(game_rounds[1])
#game_rounds_df = pd.DataFrame(game_rounds)

#print(game_rounds_df)
#print('test')
'''
match_df # Match Table
full_game_df # Game Tables
'''
    

[{'RoundNumber': 1, 'RoundWinner': 'AKRE', 'ScoreAfterRound': '1-0', 'WinType': 'defuse', 'Team1Bank': 300.0, 'Team2Bank': 200.0, 'Team1BuyType': 'eco', 'Team2BuyType': 'eco'}, {'RoundNumber': 2, 'RoundWinner': 'AKRE', 'ScoreAfterRound': '2-0', 'WinType': 'elim', 'Team1Bank': 2200.0, 'Team2Bank': 10000.0, 'Team1BuyType': 'semi-buy', 'Team2BuyType': 'eco'}, {'RoundNumber': 3, 'RoundWinner': 'AKRE', 'ScoreAfterRound': '3-0', 'WinType': 'elim', 'Team1Bank': 14800.0, 'Team2Bank': 600.0, 'Team1BuyType': 'semi-buy', 'Team2BuyType': 'full-buy'}, {'RoundNumber': 4, 'RoundWinner': 'AKRE', 'ScoreAfterRound': '4-0', 'WinType': 'elim', 'Team1Bank': 19800.0, 'Team2Bank': 8400.0, 'Team1BuyType': 'full-buy', 'Team2BuyType': 'semi-eco'}, {'RoundNumber': 5, 'RoundWinner': 'AKRE', 'ScoreAfterRound': '5-0', 'WinType': 'defuse', 'Team1Bank': 26400.0, 'Team2Bank': 2500.0, 'Team1BuyType': 'full-buy', 'Team2BuyType': 'full-buy'}, {'RoundNumber': 6, 'RoundWinner': 'SUPE', 'ScoreAfterRound': '5-1', 'WinType': 

'\nmatch_df # Match Table\nfull_game_df # Game Tables\n'

In [None]:
# Match table
#------------
# MatchID
# Date
# Patch
# EventID
# EventStage
# Team1ID
# Team2ID
# Team1
# Team2
# Team1_MapScore
# Team2_MapScore

# Game Table
# ------------
# GameID
# Map
# Duration
# Team1ID
# Team2ID
# Team1
# Team2
# Winner
# Team1_TotalRounds
# Team1_SideFirstHalf
# Team1_RoundsFirstHalf
# Team1_RoundsSecondtHalf
# Team1_PistolWon
# Team1_Eco
# Team1_EcoWon
# Team1_SemiEco
# Team1_SemiEcoWon
# Team1_SemiBuy
# Team1_SemiBuyWon
# Team1_FullBuy
# Team1_FullBuyWon
# Team2_TotalRounds
# Team2_SideFirstHalf
# Team2_RoundsFirstHalf
# Team2_RoundsSecondtHalf
# Team2_PistolWon
# Team2_Eco
# Team2_EcoWon
# Team2_SemiEco
# Team2_SemiEcoWon
# Team2_SemiBuy
# Team2_SemiBuyWon
# Team2_FullBuy
# Team2_FullBuyWon


# Game_Rounds
# ------------
# GameID
# Team1ID
# Team2ID
# Round1      NOTE: Each column contains a dictionary that has a lot more details
# Round2
# ...
# Round45



# Game_Scoreboard
# ------------
# GameID
# PlayerID
# TeamID
# PlayerName
# Agent
# Kills
# Deaths
# Assists
# PlusMinus
# KAST_Percent
# ADR
# HS_Percent
# FirstKills
# FirstDeaths
# FKFD_PlusMinus
# Num_2Ks
# Num_3Ks
# Num_4Ks
# Num_5Ks
# OnevOne
# OnevTwo
# OnevThree
# OnevFour
# OnevFive
# Econ
# Plants
# Defuses

