In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import time
import glob
import os

In [2]:
def misc_data(soup, game_id):
    rounds = soup.find('div', class_='vlr-rounds').text.split()
    teams = soup.findAll('div', class_='team-name', limit=2)
    team_one = teams[0].text.split()
    team_two = teams[1].text.split()
    team_one =' '.join(team_one)
    team_two = ' '.join(team_two)
    patch = soup.findAll(class_='match-header-date')[-1]
    date_and_patch = patch.text.split()
    patch = date_and_patch[-1]
    date = date_and_patch[:-2]
    date = ' '.join(date)
    _map = soup.find('div', class_='map').text.split()[0]
    ct_rounds = soup.find_all(class_="mod-ct")[0:2] # first index is left team, second index is right corresponds to first and second index later
    t_rounds = soup.find_all(class_="mod-t")[0:2]
    match_type = soup.findAll(class_='match-header-vs-note')
    match_type = match_type[-1].text.split()[0]
    t1_ct_rounds = ct_rounds[0].text.split()
    t1_t_rounds = t_rounds[0].text.split()
    t2_ct_rounds = ct_rounds[1].text.split()
    t2_t_rounds = t_rounds[1].text.split()
    t1_ct_rounds =''.join(t1_ct_rounds)
    t1_t_rounds =''.join(t1_t_rounds)
    t2_ct_rounds =''.join(t2_ct_rounds)
    t2_t_rounds =''.join(t2_t_rounds)
    t1_total = t1_t_rounds + t1_ct_rounds
    t2_total = t2_t_rounds + t2_ct_rounds

    data = np.array([team_one, t1_ct_rounds, t1_t_rounds, team_two, t2_ct_rounds, t2_t_rounds, _map, date, patch, game_id, t1_total>t2_total, t2_total>t1_total])
    columns = ['team_one', 'defense_rounds', 'offense_rounds', 'team_two', 'defense_rounds', 'offense_rounds', 'map', 'date', 'patch', 'game_id', 'team_one_win', 'team_two_win']
    misc = pd.DataFrame(data=data).T
    misc.columns = columns
    i, j = -1,-2
    while j > (-1*len(rounds)): # was gonna do while i since I was confident in the break but figured this was better
        if not re.search('[0-9]*-[0-9]*', rounds[i]) and re.search('[0-9]*-[0-9]*', rounds[j]): # found the last round
            rounds = rounds[:i]
            break
        j -= 1
        i -= 1 
    print(rounds)
    return rounds, misc
URL = "https://www.vlr.gg/164064/enigma-gaming-vs-kizuna-esports-challengers-league-malaysia-singapore-split-1-d13/?game=109086&tab=overview"
#misc_data(URL, 2)


In [3]:
def round_parser(round_list):
    parsed_round_list = []
    for item in round_list:
        if re.search('[0-9]?[-][0-9]?', item):
            parsed_round_list.append(item)
    return parsed_round_list

def round_list_to_df(parsed_round_list):
    rounds_stripped = [i.split('-', 1) for i in round_parser(parsed_round_list)]
    df = pd.DataFrame(columns={rounds[0], rounds[1]}, data=rounds_stripped)
    # feature idea: tempo, something to deal with how many rounds were won in succession compared to back to back.
    return df

# round_list_to_df(round_parser(rounds))
# #team on the left appears to be  Offense and team on the right appears to be Defense

In [4]:
def parse_performance_tab(URL):
    page = ''
    while page == '':
        try:
            page = requests.get(URL)
            break
        except:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue
    soup = BeautifulSoup(page.content, "html.parser")
    mod_list = [' mod-normal', ' mod-fkfd', ' mod-op']
    df_list = []
    for mod in mod_list:
        mod_class = 'wf-table-inset mod-matrix'+mod
        new = soup.find(class_='vm-stats-game mod-active')
        table = new.findAll('table', class_=mod_class)
        test = pd.read_html(str(table))[0]
        df = test
        df.columns.name = mod[5:]
        df_list.append(df)
    return (df_list)

parse_performance_tab('https://www.vlr.gg/179442/oxyg3nious-vs-tjd-esports-club-fgc-valorant-invitational-2023-act-1-qualifiers-play-in/?game=118033&tab=performance')

[normal            0          1          2          3        4  \
 0               NaN  YuKai TJD  YanYe TJD  zaner TJD  5 円 TJD   
 1       WawaLee O3O     3 3 +0     3 4 -1     1 3 -2   3 4 -1   
 2        XiMiLu O3O     4 2 +2     4 5 -1     7 2 +5   3 2 +1   
 3         Yusin O3O     4 2 +2     3 3 +0     3 6 -3   3 3 +0   
 4         Lizhi O3O     6 8 -2     3 5 -2     3 1 +2   6 4 +2   
 5          Nico O3O     0 5 -5     3 4 -1     1 3 -2   1 5 -4   
 
 normal                     5  
 0       sinatraachildren TJD  
 1                     2 4 -2  
 2                     3 7 -4  
 3                     4 4 +0  
 4                     3 1 +2  
 5                     5 1 +4  ,
 fkfd            0          1          2          3        4  \
 0             NaN  YuKai TJD  YanYe TJD  zaner TJD  5 円 TJD   
 1     WawaLee O3O     1 0 +1     0 1 -1        NaN      NaN   
 2      XiMiLu O3O     1 0 +1     0 1 -1        NaN      NaN   
 3       Yusin O3O     1 0 +1     2 0 +2     1 0 +1   1

In [5]:
def parse_economy_tab(URL):
    page = ''
    while page == '':
        try:
            page = requests.get(URL)
            break
        except:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue

    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find('div', class_='vm-stats-game mod-active')
    df = pd.read_html(str(table), index_col=0)[0]
    df_two = pd.read_html(str(table), index_col=0)[1]
    return df, df_two


Now that we can scrape all of the data from the page we need to scrape all of the game id's to account for best of threes, best of fives as well as single game matches. Then we will figure out how to grab every match from the lists page. Feature creation and analysis is just around the corner.


In [6]:
def grab_matches_from_results_page(URL):
    page = ''
    while page == '':
        try:
            page = requests.get(URL)
            break
        except:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue
    soup = BeautifulSoup(page.content, features="lxml")
    urls = []
    for a in soup.find_all('a', href=True):
        urls.append(a['href'])
    link_list = []
    words_list = urls
    letters = set('-')

    for word in words_list:
        if letters & set(word):
            link_list.append(word)

    return link_list

In [7]:
def grab_games_from_match(URL):
    page = ''
    while page == '':
        try:
            page = requests.get(URL+'/?game=all&tab=overview')
            break
        except:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue

    soup = BeautifulSoup(page.content, "html.parser")
    divs = soup.findAll('div', class_='vm-stats-gamesnav-item js-map-switch')
    game_ids = []
    for y in divs:
        game_ids.append(URL+'/?game='+y["data-game-id"])
    return game_ids

In [8]:
def extract_game_data(URL, game_id):
    # URL = 'https://www.vlr.gg/165323/rebels-gaming-vs-movistar-riders-challengers-league-spain-rising-split-1-r9/?game=110615&tab=overview'
    # Creating list with all tables
    page = ''
    while page == '':
        try:
            page = requests.get(URL)
            break
        except:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue

    soup = BeautifulSoup(page.content, "html.parser")
    try:
        rounds, misc = misc_data(soup, game_id)
        round_data = pd.DataFrame(data=rounds)
        misc.to_csv("not_clean/misc/"+(str(game_id)+"_misc.csv")) 
        round_data.to_csv("not_clean/rounds/"+(str(game_id)+"_rounds.csv")) 
    except:
        print("Misc Dataframe Error")
        print()
    test = soup.findAll('div', class_='vm-stats-game')
    titles = []
    for td in soup.findAll(class_='vm-stats-game mod-active'):
        for test in td.select('td.mod-agents'):
            img = test.select_one('span.stats-sq img')
            if img:
                titles.append(img.get('title'))


    team_one_agents = titles[0:5]
    team_two_agents = titles[5:10]
    table = soup.findAll('table', class_='wf-table-inset mod-overview') 

    df_one = pd.read_html(str(table))[0].dropna(axis='columns')
    df_two = pd.read_html(str(table))[1].dropna(axis='columns')
    df_one.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
    df_two.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
    df_one['AGENTS'] = team_one_agents
    df_two['AGENTS'] = team_two_agents
    return df_one, df_two

URL = 'https://www.vlr.gg/167391/loud-vs-drx-champions-tour-2023-lock-in-s-o-paulo-sf/?game=113124&tab=overview'
extract_game_data(URL, 113124)

['LOUD', 'DRX', '1', '1-0', '2', '2-0', '3', '2-1', '4', '2-2', '5', '2-3', '6', '3-3', '7', '4-3', '8', '5-3', '9', '6-3', '10', '7-3', '11', '8-3', '12', '9-3', '13', '10-3', '14', '11-3', '15', '12-3', '16', '12-4', '17', '12-5', '18', '13-5']


(             IGN               R          ACS         K           D        A  \
 0     aspas LOUD  1.88 2.25 1.69  366 417 341  24 10 14   / 9 2 7 /    4 3 1   
 1  cauanzin LOUD  1.18 0.88 1.34  224 153 260   16 4 12  / 13 4 9 /    3 1 2   
 2   saadhak LOUD  1.18 1.35 1.09  180 206 167    11 4 7   / 9 2 7 /    4 2 2   
 3      Less LOUD  1.17 1.03 1.25  220 174 243   14 4 10  / 10 4 6 /    6 1 5   
 4      tuyz LOUD  0.86 0.68 0.95   116 96 127     7 2 5  / 11 4 7 /  11 1 10   
 
      P_M_ONE          KAST          ADR           HS     FK     FD     P_M_2  \
 0  +15 +8 +7  83% 100% 75%  211 242 196  27% 33% 22%  4 1 3  1 1 0   +3 0 +3   
 1    +3 0 +3   72% 67% 75%   125 80 148  35% 60% 27%  1 0 1  2 1 1   -1 -1 0   
 2    +2 +2 0   83% 83% 83%  122 125 121  27% 19% 32%  4 2 2  3 0 3  +1 +2 -1   
 3    +4 0 +4   72% 67% 75%  143 116 157  29% 50% 24%  2 0 2  1 1 0  +1 -1 +2   
 4   -4 -2 -2   67% 67% 67%     71 77 68  20% 29% 17%  0 0 0  0 0 0     0 0 0   
 
     AGENTS  
 0     Jet

In [9]:
# def change_results_page(num_pages=450, i_start=1):
#     URL = 'https://www.vlr.gg/matches/results/?page='
#     performance = '&tab=performance'
#     economy = '&tab=economy'
#     import os  
#     os.makedirs('not_clean/', exist_ok=True)  
#     for i in range(i_start, num_pages+1): # currently 389 pages of matches
#         URL = URL+str(i)
#         print()
#         print(i)
#         print()
#         print(URL)
#         print()
#         # now it needs to call function that grabs all links on the page above
#         match_list = grab_matches_from_results_page(URL)
#         #print(match_list)
#         for match in match_list:
#             #now it needs to call function that grabs all game IDs from the match
#             match_link = 'https://www.vlr.gg' + match
#             try:
#                 game_links = grab_games_from_match(match_link)
#             except:
#                 continue
#             for game in game_links:
#                 #now extract all information present on the current game
#                 print(game)
#                 game_id = re.search('(?<=\?game=)(\d+)', game).group()
#                 print(game_id)
#                 try:
#                     game_data_one, game_data_two = extract_game_data(game, game_id)
#                     scoreboard_df = pd.concat([game_data_one, game_data_two])
#                     #scoreboard_df.to_csv("not_clean/scoreboards/"+(str(game_id)+"_scoreboard.csv")) 
#                     ##game_data_one.to_csv("not_clean/"+(str(game_id)+"_one_scoreboard.csv")) 
#                     ##game_data_two.to_csv("not_clean/"+(str(game_id)+"_two_scoreboard.csv"))
#                 except:
#                     print("Scoreboard Tab Error")
#                     continue
#                 try:
#                     economy_tab_one, economy_tab_two = parse_economy_tab(game+economy)
#                     economy_df = pd.concat([economy_tab_one, economy_tab_two])
#                     #economy_df.to_csv("not_clean/economy/"+(str(game_id)+"_economy.csv")) 
#                     ##economy_tab_one.to_csv("not_clean/"+(str(game_id)+'_economy_one.csv'))
#                     ##economy_tab_two.to_csv("not_clean/"+(str(game_id)+'_economy_two.csv'))
#                 except:
#                     print("Economy Tab Error")
#                     print()
#                     continue
#                 try:
#                     performance_tab_normal, performance_tab_op, performance_tab_fkfd = parse_performance_tab(game+performance) 
#                     #performance_tab_normal.to_csv("not_clean/performance/"+(str(game_id)+'_normal_performance.csv'))
#                     #performance_tab_op.to_csv("not_clean/performance/"+(str(game_id)+'_op_performance.csv'))
#                     #performance_tab_fkfd.to_csv("not_clean/performance/"+(str(game_id)+'_fkfd_performance.csv'))
#                 except:
#                     print("Performance Tab Error")
#                     print()
#                     continue

                



# change_results_page()

In [10]:
# def output_game(game_url):
#     performance = '&tab=performance'
#     economy = '&tab=economy'
#     print(game_url)
#     game_id = re.search('(?<=\?game=)(\d+)', game_url).group()
#     print(game_id)
#     try:
#         game_data_one, game_data_two = extract_game_data(game_url, game_id)
#         scoreboard_df = pd.concat([game_data_one, game_data_two])
#         scoreboard_df.to_csv("not_clean/scoreboards/"+(str(game_id)+"_scoreboard.csv")) 
#     except:
#         print("Scoreboard Tab Error")
#         pass
#     try:
#         game_data_one, game_data_two = extract_game_data(game_url, game_id)
#         scoreboard_df = pd.concat([game_data_one, game_data_two])
#         scoreboard_df.to_csv("not_clean/scoreboards/"+(str(game_id)+"_scoreboard.csv")) 
#     except:
#         print("Scoreboard Tab Error")
#         pass
#     try:
#         economy_tab_one, economy_tab_two = parse_economy_tab(game_url+economy)
#         economy_df = pd.concat([economy_tab_one, economy_tab_two])
#         economy_df.to_csv("not_clean/economy/"+(str(game_id)+"_economy.csv")) 
#     except:
#         print("Economy Tab Error")
#         print()
#         pass
#     try:
#         performance_tab_normal, performance_tab_op, performance_tab_fkfd = parse_performance_tab(game_url+performance) 
#         performance_tab_normal.to_csv("not_clean/performance/"+(str(game_id)+'_normal_performance.csv'))
#         performance_tab_op.to_csv("not_clean/performance/"+(str(game_id)+'_op_performance.csv'))
#         performance_tab_fkfd.to_csv("not_clean/performance/"+(str(game_id)+'_fkfd_performance.csv'))
#     except:
#         print("Performance Tab Error")
#         print()
#         pass

In [11]:
# def change_results_page(num_pages=2, i_start=1):
#     #URL = 'https://www.vlr.gg/matches/results/?page='
#     results_links = []
#     performance = '&tab=performance'
#     economy = '&tab=economy'
#     import os  
#     os.makedirs('not_clean/', exist_ok=True)  
#     os.makedirs('not_clean/economy/', exist_ok=True)
#     os.makedirs('not_clean/performance/', exist_ok=True)
#     os.makedirs('not_clean/scoreboards/', exist_ok=True)
#     for i in range(i_start, num_pages+1): # currently 393 pages of matches
#         URL = 'https://www.vlr.gg/matches/results/?page='+str(i)
#         print()
#         print(i)
#         print()
#         print(URL)
#         print()
#         results_links.append(URL)
#     # for URL in results_links:    
#     #     # now it needs to call function that grabs all links on the page above
#     #     match_list = grab_matches_from_results_page(URL)
#     #     #print(match_list)
#     #     for match in match_list:
#     #         #now it needs to call function that grabs all game IDs from the match
#     #         match_link = 'https://www.vlr.gg' + match
#     #         try:
#     #             game_links = grab_games_from_match(match_link)
#     #         except:
#     #             continue
#     match_list = []
#     game_links = []
#     for z in range(len(results_links)):
#         match_list = []
#         if (len(results_links) - z) >= 2:
#             match_list.append(grab_matches_from_results_page(results_links[z]))
#             match_list.append(grab_matches_from_results_page(results_links[z+1]))
#             z += 2
#         else:
#             match_list.append(grab_matches_from_results_page(results_links[z]))
#             z+=1
#         for j in range(len(match_list)):
#             game_list = []
#             if (len(match_list) - j) >= 2:
#                 game_list.append(grab_games_from_match(match_list[j]))
#                 game_list.append(grab_games_from_match(match_list[j+1]))
#                 j+=2
#             else:
#                 game_list.append(grab_games_from_match(match_list[j]))
#                 j+=1
#             for i in range(len(game_links)):
#                 if (len(game_links) - i) >= 2:
#                     output_game(game_links[i])
#                     output_game(game_links[i+1])
#                     i += 2
#                 else:
#                     output_game(game_links[i])
#                     i += 1
# change_results_page()

In [12]:
# import aiohttp 

# import asyncio
# import time

# def grab_games_from_match(text, URL):
    
#     soup = BeautifulSoup(text, "html.parser")
#     divs = soup.findAll('div', class_='vm-stats-gamesnav-item js-map-switch')
#     game_ids = []
#     for y in divs:
#         game_ids.append(URL+'/?game='+y["data-game-id"])
#     return game_ids

# def grab_matches_from_results_page(text, URL):
#     soup = BeautifulSoup(text, features="lxml")
#     urls = []
#     for a in soup.find_all('a', href=True):
#         urls.append(a['href'])
#     link_list = []
#     words_list = urls
#     letters = set('-')

#     for word in words_list:
#         if letters & set(word):
#             link_list.append(word)

#     return link_list

def get_links(num_pages=1, i_start=1):
    #URL = 'https://www.vlr.gg/matches/results/?page='
    results_links = []
    import os  
    os.makedirs('not_clean/', exist_ok=True)  
    os.makedirs('not_clean/economy/', exist_ok=True)
    os.makedirs('not_clean/performance/', exist_ok=True)
    os.makedirs('not_clean/scoreboards/', exist_ok=True)
    os.makedirs('not_clean/rounds/', exist_ok=True)
    for i in range(i_start, num_pages+1): # currently 393 pages of matches
        URL = 'https://www.vlr.gg/matches/results/?page='+str(i)
        print()
        print(i)
        print()
        print(URL)
        print()
        results_links.append(URL)
    return results_links

# # def change_results_page(num_pages=1, i_start=1):
# def output_scoreboard(text, game_url):
#     start_time = time.time()
#     game_id = re.search('(?<=\?game=)(\d+)', game_url).group()
#     soup = BeautifulSoup(text, "html.parser")
#     try:
#         rounds, misc = misc_data(soup, game_id)
#         round_data = pd.DataFrame(data=rounds)
#         print(round_data, misc)
#         misc.to_csv("not_clean/misc/"+(str(game_id)+"_misc.csv")) 
#         round_data.to_csv("not_clean/rounds/"+(str(game_id)+"_rounds.csv")) 
#     except:
#         print("Misc Dataframe Error")
#         print()
#     test = soup.findAll('div', class_='vm-stats-game')
#     titles = []
#     for td in soup.findAll(class_='vm-stats-game mod-active'):
#         for test in td.select('td.mod-agents'):
#             img = test.select_one('span.stats-sq img')
#             if img:
#                 titles.append(img.get('title'))


#     team_one_agents = titles[0:5]
#     team_two_agents = titles[5:10]
#     table = soup.findAll('table', class_='wf-table-inset mod-overview') 

#     df_one = pd.read_html(str(table))[0].dropna(axis='columns')
#     df_two = pd.read_html(str(table))[1].dropna(axis='columns')
#     df_one.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
#     df_two.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
#     df_one['AGENTS'] = team_one_agents
#     df_two['AGENTS'] = team_two_agents
#     scoreboard_df = pd.concat([df_one, df_two])
#     print(f"{(time.time() - start_time):.2f} seconds")
#     print()
#     print(scoreboard_df)
#     print()
#     scoreboard_df.to_csv("not_clean/scoreboards/"+(str(game_id)+"_scoreboard.csv"))

# def output_economy(text, game_id):
#     start_time = time.time()
#     soup = BeautifulSoup(text, "html.parser")
#     table = soup.find('div', class_='vm-stats-game mod-active')
#     df = pd.read_html(str(table), index_col=0)[0]
#     df_two = pd.read_html(str(table), index_col=0)[1]
#     economy_df = pd.concat([df, df_two])
#     print(f"{(time.time() - start_time):.2f} seconds")
#     print()
#     print(economy_df)
#     print()
#     economy_df.to_csv("not_clean/economy/"+(str(game_id)+"_economy.csv"))
 



# async def get_response(session, url):
#     performance = '&tab=performance'
#     economy = '&tab=economy'

#     async with session.get(url) as resp:

#         text = await resp.text()
#         try:
#             output_scoreboard(text, url)
#         except:
#             print("Scoreboard Tab Error")
#             pass
#         # try:
#         #     output_economy(text, url+economy)
#         # except:
#         #     print("Economy Tab Error")
#         #     print()
#         #     pass
#         # try:
#         #     parse_performance_tab(url+performance) 
#         # except:
#         #     print("Performance Tab Error")
#         #     print()
#         #     pass

# async def get_match_links(session, url):

#     async with session.get(url) as resp:

#         text = await resp.text()
#         match_list = grab_matches_from_results_page(text, url)
#         match_list = ["https://www.vlr.gg" + x for x in match_list]

#     return match_list


# async with aiohttp.ClientSession() as session:

#     tasks = []
#     game_links = []
#     for url in get_links():
#         print(url)
#         async with session.get(url) as resp:
#             text = await resp.text()
#             match_list = grab_matches_from_results_page(text, url)
#             match_list = ["https://www.vlr.gg" + x for x in match_list]
#             game_links = np.array([])
#             for match in match_list:
#                 #now it needs to call function that grabs all game IDs from the match
#                 async with session.get(match) as resp:

#                     text = await resp.text()
#                     try:
#                         game_links = np.append(game_links, grab_games_from_match(text, match))
#                     except:
#                         continue
#     for link in game_links:
        
#         tasks.append(asyncio.create_task(get_response(session, link)))

#     results = await asyncio.gather(*tasks)

In [13]:
# import asyncio
# import aiohttp
# import re
# import pandas as pd
# from bs4 import BeautifulSoup

# async def grab_games_from_match(session, URL):
#     async with session.get(URL) as resp:
#         text = await resp.text()
#     soup = BeautifulSoup(text, "lxml")
#     divs = soup.findAll('div', class_='vm-stats-gamesnav-item js-map-switch')
#     game_ids = []
#     for y in divs:
#         game_ids.append(URL+'/?game='+y["data-game-id"])
#     for link in game_ids:
#         await output_scoreboard(session, link)

# async def grab_matches_from_results_page(session, URL):
#     async with session.get(URL) as resp:
#         text = await resp.text()
#     soup = BeautifulSoup(text, "lxml")
#     urls = [a['href'] for a in soup.find_all('a', href=True)]
#     link_list = [word for word in urls if '-' in word]
#     link_list = ["https://www.vlr.gg" + x for x in link_list]
#     for link in link_list:
#         await grab_games_from_match(session, link)

# async def output_scoreboard(session, game_url):
#     start_time = time.time()
#     game_id = re.search('(?<=\?game=)(\d+)', game_url).group()
#     async with session.get(URL) as resp:
#         text = await resp.text()
#     soup = BeautifulSoup(text, "html.parser")
#     try:
#         rounds, misc = misc_data(soup, game_id)
#         round_data = pd.DataFrame(data=rounds)
#         print(round_data, misc)
#         misc.to_csv("not_clean/misc/"+(str(game_id)+"_misc.csv")) 
#         round_data.to_csv("not_clean/rounds/"+(str(game_id)+"_rounds.csv")) 
#     except:
#         print("Misc Dataframe Error")
#         print()
#     test = soup.findAll('div', class_='vm-stats-game')
#     titles = []
#     for td in soup.findAll(class_='vm-stats-game mod-active'):
#         for test in td.select('td.mod-agents'):
#             img = test.select_one('span.stats-sq img')
#             if img:
#                 titles.append(img.get('title'))


#     team_one_agents = titles[0:5]
#     team_two_agents = titles[5:10]
#     table = soup.findAll('table', class_='wf-table-inset mod-overview') 

#     df_one = pd.read_html(str(table))[0].dropna(axis='columns')
#     df_two = pd.read_html(str(table))[1].dropna(axis='columns')
#     df_one.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
#     df_two.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
#     df_one['AGENTS'] = team_one_agents
#     df_two['AGENTS'] = team_two_agents
#     scoreboard_df = pd.concat([df_one, df_two])
#     print(f"{(time.time() - start_time):.2f} seconds")
#     print()
#     print(scoreboard_df)
#     print()
#     scoreboard_df.to_csv("not_clean/scoreboards/"+(str(game_id)+"_scoreboard.csv"))


# async def get_response(session, url):
#     performance = '&tab=performance'
#     economy = '&tab=economy'
#     async with session.get(url) as resp:
#         text = await resp.text()
#         try:
#             await output_scoreboard(session, url)
#         except Exception as e:
#             print(f"Error occurred while outputting scoreboard: {e}")

# async with aiohttp.ClientSession() as session:
#     tasks = []
#     for url in get_links():
#         tasks.append(asyncio.create_task(grab_matches_from_results_page(session, url)))
#     results = await asyncio.gather(*tasks)

In [14]:
import asyncio
import aiohttp
import re
import pandas as pd
from bs4 import BeautifulSoup

async def grab_games_from_match(session, URL):
    async with session.get(URL) as resp:
        text = await resp.text()
    soup = BeautifulSoup(text, "lxml")
    divs = soup.select('div.vm-stats-gamesnav-item.js-map-switch[data-game-id]')
    game_ids = [URL + '/?game=' + y['data-game-id'] for y in divs]
    tasks = [output_scoreboard(session, link) for link in game_ids]
    await asyncio.gather(*tasks)

async def grab_matches_from_results_page(session, URL):
    async with session.get(URL) as resp:
        text = await resp.text()
    soup = BeautifulSoup(text, "lxml")
    urls = [a['href'] for a in soup.select('a[href]')]
    link_list = ["https://www.vlr.gg" + x for x in urls if '-' in x]
    tasks = [grab_games_from_match(session, link) for link in link_list]
    await asyncio.gather(*tasks)

async def output_scoreboard(session, game_url):
    start_time = time.time()
    game_id = re.search('(?<=\?game=)(\d+)', game_url).group()
    async with session.get(game_url) as resp:
        text = await resp.text()
    soup = BeautifulSoup(text, "html.parser")
    try:
        rounds, misc = misc_data(soup, game_id)
        round_data = pd.DataFrame(data=rounds)
        misc.to_csv("not_clean/misc/"+(str(game_id)+"_misc.csv")) 
        round_data.to_csv("not_clean/rounds/"+(str(game_id)+"_rounds.csv")) 
    except Exception as e:
        print(f"Misc Dataframe Error: {e}")
        print()

    titles = [img.get('title') for img in soup.select('div.vm-stats-game.mod-active td.mod-agents span.stats-sq img')]

    team_one_agents = titles[0:5]
    team_two_agents = titles[5:10]
    table = soup.select('table.wf-table-inset.mod-overview') 

    df_one = pd.read_html(str(table))[0].dropna(axis='columns')
    df_two = pd.read_html(str(table))[1].dropna(axis='columns')
    df_one.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
    df_two.columns = ['IGN', 'R', 'ACS', 'K', 'D', 'A', 'P_M_ONE', 'KAST', 'ADR', 'HS', 'FK', 'FD', 'P_M_2']
    df_one['AGENTS'] = team_one_agents
    df_two['AGENTS'] = team_two_agents
    scoreboard_df = pd.concat([df_one, df_two])
    print(f"{(time.time() - start_time):.2f} seconds")
    print()
    print(scoreboard_df)
    print()
    scoreboard_df.to_csv("not_clean/scoreboards/"+(str(game_id)+"_scoreboard.csv"))


async def start():
    async with aiohttp.ClientSession() as session:
        tasks = [grab_matches_from_results_page(session, url) for url in get_links()]
        matches = await asyncio.gather(*tasks)
        tasks = [grab_games_from_match(session, url) for url in matches]
        games = await asyncio.gather(*tasks)
        [output_scoreboard(session, url) for url in games]
asyncio.run(start())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
# def change_results_page(num_pages=1, i_start=1):
#     #URL = 'https://www.vlr.gg/matches/results/?page='
#     results_links = []
#     import os  
#     os.makedirs('not_clean/', exist_ok=True)  
#     os.makedirs('not_clean/economy/', exist_ok=True)
#     os.makedirs('not_clean/performance/', exist_ok=True)
#     os.makedirs('not_clean/scoreboards/', exist_ok=True)
#     for i in range(i_start, num_pages+1): # currently 393 pages of matches
#         URL = 'https://www.vlr.gg/matches/results/?page='+str(i)
#         print()
#         print(i)
#         print()
#         print(URL)
#         print()
#         results_links.append(URL)
#     match_list = np.array([])
#     for URL in results_links:    
#         # now it needs to call function that grabs all links on the page above
#         match_list = np.append(match_list, grab_matches_from_results_page(URL))
#     match_list = ["https://www.vlr.gg" + x for x in match_list]
#     game_links = np.array([])
#     for match in match_list:
#         #now it needs to call function that grabs all game IDs from the match
#         try:
#             game_links = grab_games_from_match(match)
#         except:
#             continue
#         for game in game_links:
#             output_game(game)
# change_results_page()

In [None]:
# def change_results_page(num_pages=2, i_start=1):
#     #URL = 'https://www.vlr.gg/matches/results/?page='
#     results_links = []
#     performance = '&tab=performance'
#     economy = '&tab=economy'
#     import os  
#     os.makedirs('not_clean/', exist_ok=True)  
#     os.makedirs('not_clean/economy/', exist_ok=True)
#     os.makedirs('not_clean/performance/', exist_ok=True)
#     os.makedirs('not_clean/scoreboards/', exist_ok=True)
#     for i in range(i_start, num_pages+1): # currently 393 pages of matches
#         URL = 'https://www.vlr.gg/matches/results/?page='+str(i)
#         print()
#         print(i)
#         print()
#         print(URL)
#         print()
#         results_links.append(URL)
#     for URL in results_links:    
#         # now it needs to call function that grabs all links on the page above
#         match_list = grab_matches_from_results_page(URL)
#         #print(match_list)
#         for match in match_list:
#             #now it needs to call function that grabs all game IDs from the match
#             match_link = 'https://www.vlr.gg' + match
#             try:
#                 game_links = grab_games_from_match(match_link)
#             except:
#                 continue
#             for game in game_links:
#                 output_game(game)
# change_results_page()

In [None]:
# # merging the files
# joined_files = os.path.join("/not_clean/scoreboards", "*.csv")
  
# # A list of all joined files is returned
# joined_list = glob.glob(joined_files)
  
# # Finally, the files are joined
# df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
# print(df)

In [None]:
# # import required module
# import os
# # assign directory
# directory = 'not_clean'
# game_id_array = np.array([])
# # iterate over files in
# # that directory
# for filename in os.listdir(directory):
#     f = os.path.join(directory, filename)
#     # checking if it is a file
#     if os.path.isfile(f):
#         match_id = re.search('[0-9]*', f)
#         game_id_array = np.append(game_id_array, f)
# unique_game_ids = np.unique(game_id_array)


In [None]:
# i = 0
# for game_id in unique_game_ids:
#     team_one_scoreboard = pd.read_csv("not_clean/"+game_id+"_one_scoreboard.csv")
#     team_two_scoreboard = pd.read_csv("not_clean/"+game_id+"_two_scoreboard.csv")
#     both_teams = pd.concat(team_one_scoreboard, team_two_scoreboard)
#     if i == 0:
#         all_games_scoreboard = both_teams
#     else:
#         all_games_scoreboard = pd.concat(all_games_scoreboard, both_teams)


In [None]:
def add_winner(scoreboard_df, misc_df):
    if misc_df['team_one_win'].values: # team one won
        win_label = np.array([1,1,1,1,1,0,0,0,0,0])
    else:
        win_label = np.array([0,0,0,0,0,1,1,1,1,1])
    scoreboard_df['game_result'] = win_label
    return scoreboard_df

In [None]:
def split_tn_ign(scoreboard):
    team_names = np.array([])
    igns = np.array([])
    for team_and_ign in scoreboard['IGN'].values:
        team_names = np.append(team_names, team_and_ign.split()[1])
        igns = np.append(igns, team_and_ign.split()[0])
    scoreboard = scoreboard.drop(columns=['IGN', 'Unnamed: 0'])
    scoreboard.insert(loc = 0,
                      column = 'team',
                      value = team_names)
    scoreboard.insert(loc = 1,
                      column = 'IGN',
                      value = igns)
    #scoreboard['team'] = team_names
    #scoreboard['ign'] = igns
    return scoreboard

In [None]:
def add_full_team(scoreboard, misc):
    t1 = misc['team_one'].values
    t2 = misc['team_two'].values
    team_array = np.array([t1,t1,t1,t1,t1,t2,t2,t2,t2,t2])
    scoreboard['full_tn'] = team_array
    t1_d = misc['defense_rounds'].values
    t1_o = misc['offense_rounds'].values
    t2_d = misc['defense_rounds.1'].values
    t2_o = misc['defense_rounds.1'].values
    d_rounds_array = np.array([t1_d, t1_d, t1_d, t1_d, t1_d, t2_d, t2_d, t2_d, t2_d, t2_d])
    o_rounds_array = np.array([t1_o, t1_o, t1_o, t1_o, t1_o, t2_o, t2_o, t2_o, t2_o, t2_o])
    scoreboard['offense_rounds_won'] = o_rounds_array
    scoreboard['defense_rounds_won'] = d_rounds_array
    return scoreboard

In [None]:
# # import required module
# import os
# import numpy as np
# # assign directory
# scoreboard_dir = 'not_clean/scoreboards'
# economy_dir = 'not_clean/economy'
# performance_dir = 'not_clean/performance'
# game_id_regex = '[0-9]+'
# complete_scoreboard = pd.DataFrame()
# missing_misc = np.array([])
# # iterate over files in
# # that directory
# for filename in os.scandir(scoreboard_dir):
#     if filename.is_file():
#         # print(filename.path) prints the path
#         #print(str(filename.path))
#         scoreboard = pd.read_csv(filename.path) # reads in scoreboard
#         game_id = re.search('[0-9]+', str(filename.path)) # grabs game id from filename
#         if game_id:
#             game_id = game_id.group()
#             game_id_array = np.full(((scoreboard.index).shape), game_id) # creates a game_id key to create database
#             scoreboard['game_id'] = game_id_array
#         try:
#             file_dir = 'not_clean/misc/'+str(game_id)+'_misc.csv'
#             misc = pd.read_csv(file_dir)
#             #print(misc)
#             #scoreboard = split_tn_ign(scoreboard)
#             #scoreboard['patch'] = np.full(((scoreboard.index).shape), misc['patch'].values)
#             #scoreboard['date'] = np.full(((scoreboard.index).shape), misc['date'].values)
#             #scoreboard['map'] = np.full(((scoreboard.index).shape), misc['map'].values)

#             #scoreboard = add_winner(scoreboard, misc)
#             #scoreboard = add_full_team(scoreboard, misc)
#         except OSError as e:
#             print('File does not exist')
#             print()
#             print("Misc Dir: "+file_dir)
#             print()
#             print("Scoreboard Dir: "+filename.path)
#             print()
#             print(game_id)
#             print()
#             print('These should all match.')
#             #print(missing_misc)
#             #missing_misc.append(str(game_id))
#             missing_misc = np.append(missing_misc, str(game_id))
#             continue

#         #complete_scoreboard = pd.concat([complete_scoreboard, scoreboard])
# #complete_scoreboard.to_csv('complete_scoreboard.csv')
# np.savetxt("array.txt", np.array(missing_misc), fmt="%s",)


File does not exist

Misc Dir: not_clean/misc/100079_misc.csv

Scoreboard Dir: not_clean/scoreboards\100079_scoreboard.csv

100079

These should all match.
File does not exist

Misc Dir: not_clean/misc/100080_misc.csv

Scoreboard Dir: not_clean/scoreboards\100080_scoreboard.csv

100080

These should all match.
File does not exist

Misc Dir: not_clean/misc/100081_misc.csv

Scoreboard Dir: not_clean/scoreboards\100081_scoreboard.csv

100081

These should all match.
File does not exist

Misc Dir: not_clean/misc/100082_misc.csv

Scoreboard Dir: not_clean/scoreboards\100082_scoreboard.csv

100082

These should all match.
File does not exist

Misc Dir: not_clean/misc/100083_misc.csv

Scoreboard Dir: not_clean/scoreboards\100083_scoreboard.csv

100083

These should all match.
File does not exist

Misc Dir: not_clean/misc/100085_misc.csv

Scoreboard Dir: not_clean/scoreboards\100085_scoreboard.csv

100085

These should all match.
File does not exist

Misc Dir: not_clean/misc/100086_misc.csv

S

In [None]:
text_file= open('array.txt','r')
data=text_file.read()
print(data)

100079
100080
100081
100082
100083
100085
100086
100087
100088
100089
100091
100093
100094
100095
100096
100097
100098
100099
100100
100101
100103
100104
100106
100107
100110
100111
100112
100113
100181
100182
100183
100233
100234
100350
100351
100352
100353
100354
100355
100356
100357
100358
100360
100361
100362
100363
100364
100365
100366
100367
100368
100369
100370
100371
100372
100373
100374
100375
100376
100377
100378
100379
100380
100381
100382
100383
100384
100385
100386
100387
100388
100389
100451
100452
100454
100455
100457
100458
100459
100460
100461
100463
100464
100466
100467
100469
100470
100472
100473
100474
100475
100476
100477
100478
100479
100480
100481
100482
100483
100484
100485
100487
100488
100490
100491
100492
100493
100494
100496
100497
100498
100499
100501
100502
100504
100505
100506
100507
100508
100509
100510
100511
100513
100514
100515
100516
100517
100519
100520
100522
100523
100524
100525
100526
100528
100529
100531
100532
100533
100534
100535
100537
100538