# Import Libraries

In [20]:
import pandas as pd
import numpy as np
import httpx
from selectolax.parser import HTMLParser
import re
from collections import defaultdict

# Get Data From vlr.gg

## Data Extracting & Cleaning Function

In [2]:
def extract_df(df):
    
    
    def extract_data(data) -> str:
        try:
            list_data = re.findall('[\d+-.]+',data)
            if (ll:=len(list_data)) == 0:
                return 'Nan Nan Nan'
            elif ll < 3:
                list_data += ['0'] * (3-ll)
            return ' '.join(list_data)
        except TypeError:
            return f'{data} Nan Nan'
    
    filled_df =df.fillna('Nan Nan Nan').astype('object')
    
    name = filled_df.iloc[:,0].map(lambda s : s.split(' ')[0])
    team = filled_df.iloc[:,0].map(lambda s : s.split(' ')[1])
    
    filled_df.iloc[:,0] = name
    filled_df.iloc[:,1] = team
    
    formatted_df = filled_df.rename(columns={'Unnamed: 0':'Name','Unnamed: 1':'Team'})
    
    
    
    # formatted_df.iloc[:,2:] = formatted_df.iloc[:,2:].applymap(lambda x :  ' '.join(re.findall("[\d+-.]+",x,re.A)))
    formatted_df.iloc[:,2:] = formatted_df.iloc[:,2:].applymap(extract_data)
    
    
    
    new_columns = []
    for c in formatted_df.columns[2:]:
        for side in ['all','atk','def']:
            new_columns.append((c,side))

    new_columns = np.array(new_columns).reshape(-1,3,2)
    
    
    result_df = pd.DataFrame(formatted_df.iloc[:,:2])
    
    
    result_df.columns = [('Name','Name'),('Team','Team')]
    
    
    for nc,c in zip(new_columns,formatted_df.iloc[:,2:].columns):
        result_df[list(map(lambda x: tuple(x),list(nc)))] = formatted_df[c].str.split(' ',expand=True)
        
    
    result_df.columns = pd.MultiIndex.from_tuples(result_df.columns,name=['Type','Side'])
    
    return result_df

## Scraping Data

### Preparing URL(s)

In [3]:
base_url = 'https://www.vlr.gg'

event_url = '/event/matches/1188/champions-tour-2023-lock-in-s-o-paulo/?series_id=all'

### Scraping All Matches in the Event

In [4]:
res = httpx.get(base_url+event_url) #get https response from vlr.gg

html = HTMLParser(res.text) # parsing into plain text

matches = html.css('a.wf-module-item') # get all matches in events

# find all completed match by css status class
completed_matches = [match for match in matches \
                     if match.css_first('div.ml-status').text() == 'Completed'] 

In [5]:
# incase need a perfect matches with no missing data point(s) for VCT LOCK//IN
# perfect_completed_matches = completed_matches[:21] + completed_matches[22:30] + completed_matches[31:]

### Scraping Map(s) played and players data

In [6]:
re_strip = lambda sp,st : sp.join(re.findall('\S+',st)) # function for normal regex by finding all char

n_matches = 5

for index,match in enumerate(completed_matches[slice(n_matches)]):
    
    # formatting name (tbh i couldn't remember why i did this but for the pretty formatted name)
    match_res = httpx.get(match_url := f'{base_url}{match.attributes["href"]}')
    print(httpx.get(match_url))

    match_html = HTMLParser(match_res.text)

    match_name = match_html.css_first('title').text().strip().split(' | ')[0].split(' vs. ')

    messy_match_result = match_html.css_first('div.js-spoiler').text()
    match_result = ' ' + re_strip('',messy_match_result) + ' '

    print(index, '>> ',end='')
    print(match_sum:=match_result.join(match_name))
    print()
    
    # get date and time of the match
    date = match_html.css_first('div.match-header-date')
    print('\t'+re_strip(' ',date.text()))
    print()
    
    # stage of the match
    stage = match_html.css_first('div.match-header-event-series')
    print('\t'+re_strip(' ',stage.text()))
    print()
    
    #ban pick of the match
    ban_pick = match_html.css_first('div.match-header-note')
    for bp in re_strip(' ',ban_pick.text()).split(';'):
        print('\t'+bp.strip())
#     #scraping all tables from pages w/ css table class
#     tables = match_html.css('table.wf-table-inset')

    
#     for table in tables: # for all table in matches
#         df = pd.read_html(table.html)[0]
#         display(transform_subcolumns_df(df))


    # get data of each map(s) in the match
    maps = match_html.css('div.vm-stats-game')

    # rearrange the map order
    maps = [maps[1],maps[0],maps[2]]
    
    # extract data of each map
    for m in maps:
        # get the current map pick name or overall if it the match summary
        current_map = m.css_first('div.map')
        match_header = 'Overall' if not current_map else\
                        ' '.join(re_strip(' ',current_map.text()).split(' ')[::2])
        
        print(match_header)
        
        
        tables = m.css('table.wf-table-inset')
        
        for table in tables:
            df = pd.read_html(table.html)[0]
            # display(extract_df(df))
        

<Response [200 OK]>
0 >> KOI 0:2 NRG Esports

	Tuesday, February 14th 12:10 AM +07 Patch 6.02

	Bracket Stage: Alpha - Round of 16

	NRG ban Ascent
	KOI ban Split
	NRG pick Icebox
	KOI pick Haven
	NRG ban Fracture
	KOI ban Lotus
	Pearl remains
Overall
Icebox 1:12:59
Haven 53:43
<Response [200 OK]>
1 >> DetonatioN FocusMe 0:2 Giants Gaming

	Tuesday, February 14th 3:05 AM +07 Patch 6.02

	Bracket Stage: Alpha - Round of 16

	DFM ban Split
	GIA ban Fracture
	DFM pick Haven
	GIA pick Icebox
	DFM ban Lotus
	GIA ban Ascent
	Pearl remains
Overall
Haven 50:48
Icebox 51:43
<Response [200 OK]>
2 >> FunPlus Phoenix 1:2 Karmine Corp

	Wednesday, February 15th 12:10 AM +07 Patch 6.02

	Bracket Stage: Alpha - Round of 16

	FPX ban Icebox
	KC ban Ascent
	FPX pick Lotus
	KC pick Haven
	FPX ban Split
	KC ban Fracture
	Pearl remains
Overall
Lotus 55:34
Haven 52:27
<Response [200 OK]>
3 >> BBL Esports 1:2 DRX

	Wednesday, February 15th 3:45 AM +07 Patch 6.02

	Bracket Stage: Alpha - Round of 16

	DRX ba

# OOP Design

In [7]:
class Event:
    def __init__(self,url):
        self.url = url
        # self.dates -> str(maybe pd.datatime)
        # self.players -> List[Player]
        # self.prize_pool -> int,float
        # self.region -> str
        # self.matches -> List[Match]

In [8]:
lock_in_url = 'https://www.vlr.gg/event/matches/1188/champions-tour-2023-lock-in-s-o-paulo/?series_id=all'


class Event_Slow:
    def __init__(self,url):
        self.url = url
        self.info = pd.DataFrame({
            'Name' : ['Dates','Prize Pool','Location'],
            'Value' : [self.dates(),self.prize_pool(),self.location()]
        })
        
        self.player = dict()
        
    def page_html(self):
        res = httpx.get(self.url)
        
        return HTMLParser(res.text)
    
    def prize_pool(self):
        
        return self.page_html().css('div.event-desc-item-value')[1].text().strip()
    
    def location(self):
        return self.page_html().css_first('div.event-desc-item.mod-last')\
                        .css_first('div.event-desc-item-value').text().strip()
    def dates(self):
        dates = self.page_html().css_first('div.event-desc-item-value').text().strip().split(' - ')
        return pd.to_datetime(dates).strftime('%y/%m/%d')
    
    def date_range(self):
        return pd.date_range(start=self.dates()[0],end=self.dates()[1],freq='D')
    
        
        
        
        

In [9]:
lock_in = Event_Slow(lock_in_url)
lock_in.prize_pool()

'$500,000 USD'

In [23]:
class Event:
    def __init__(self,url:str) -> None:
        self.url = url
        self.html = HTMLParser(httpx.get(url).text)
        header = self.html.css('div.event-desc-item-value')
        
        self.info = pd.DataFrame({
            'dates' : header[0].text().strip(),
            'prize_pool' : header[1].text().strip(),
            'location' : header[2].text().strip()
        },index=['Info'])
        self.players = defaultdict(lambda: {"data" : None})
        
        
        
    def get_matches(self,head=None):
        if hasattr(self,'matches'):
            return self.matches
        
            
        completed_matches = [Match(match) for match in self.html.css('a.wf-module-item')[slice(head)]
                                if match.css_first('div.ml-status').text() == 'Completed']
        self.matches = completed_matches
        return completed_matches
        

In [24]:
maps = match_html.css('div.vm-stats-game')

# rearrange the map order
maps = [maps[1],maps[0],maps[2]]

# extract data of each map
for m in maps:
    # get the current map pick name or overall if it the match summary
    current_map = m.css_first('div.map')
    match_header = 'Overall' if not current_map else\
                    ' '.join(re_strip(' ',current_map.text()).split(' ')[::2])

    print(match_header)


    tables = m.css('table.wf-table-inset')

    for table in tables:
        df = pd.read_html(table.html)[0]
        # display(extract_df(df))

Overall
Lotus 52:12
Pearl 39:51


In [44]:
class Match:
    def __init__(self,node) -> None:
        self.match_url = 'https://www.vlr.gg' + node.attributes['href']
        self.html = HTMLParser(httpx.get(self.match_url).text)
        
        match_name = self.html.css_first('title').text().strip().split(' | ')[0].split(' vs. ')
        
        messy_match_result = self.html.css_first('div.js-spoiler').text()
        match_result = ' ' + re_strip('',messy_match_result) + ' '
        
        self.match_sum = match_result.join(match_name)
        
        self.stage = re_strip(' ',self.html.css_first('div.match-header-event-series').text())
        
        # self.ban_pick = re_strip(' ',self.html.css_first('div.match-header-note').text()).split('; ')
        self.ban_pick = None if (h:=self.html.css_first('div.match-header-note')) is None\
        else re_strip(' ',h.text()).split('; ')
        
        self.date = re_strip(' ',self.html.css_first('div.match-header-date').text())
        
        self.players = defaultdict(lambda:{"data":None})
        
        
    def __repr__(self):
        return self.match_sum
    
    def _extract_data(self,data):
        try:
            list_data = re.findall('[\d+-.]+',data)
            if (ll:=len(list_data)) == 0:
                return 'Nan Nan Nan'
            elif ll < 3:
                list_data += ['0'] * (3-ll)
            return ' '.join(list_data)
        except TypeError:
            return f'{data} Nan Nan'
        
    def _extract_df(self,df):
        filled_df =df.fillna('Nan Nan Nan').astype('object')
    
        name = filled_df.iloc[:,0].map(lambda s : s.split(' ')[0])
        team = filled_df.iloc[:,0].map(lambda s : s.split(' ')[1])

        filled_df.iloc[:,0] = name
        filled_df.iloc[:,1] = team

        formatted_df = filled_df.rename(columns={'Unnamed: 0':'Name','Unnamed: 1':'Team'})



        # formatted_df.iloc[:,2:] = formatted_df.iloc[:,2:].applymap(lambda x :  ' '.join(re.findall("[\d+-.]+",x,re.A)))
        formatted_df.iloc[:,2:] = formatted_df.iloc[:,2:].applymap(self._extract_data)



        new_columns = []
        for c in formatted_df.columns[2:]:
            for side in ['all','atk','def']:
                new_columns.append((c,side))

        new_columns = np.array(new_columns).reshape(-1,3,2)


        result_df = pd.DataFrame(formatted_df.iloc[:,:2])


        result_df.columns = [('Name','Name'),('Team','Team')]


        for nc,c in zip(new_columns,formatted_df.iloc[:,2:].columns):
            result_df[list(map(lambda x: tuple(x),list(nc)))] = formatted_df[c].str.split(' ',expand=True)


        result_df.columns = pd.MultiIndex.from_tuples(result_df.columns,name=['Type','Side'])

        return result_df
        
        
    
    def get_scoreboard(self):
        result = {}
        maps = self.html.css('div.vm-stats-game')
          # extract data of each map
        for m in maps:
            # get the current map pick name or overall if it the match summary
            current_map = m.css_first('div.map')
            match_header = 'Overall' if not current_map else\
                            ' '.join(re_strip(' ',current_map.text()).split(' ')[::2])

            print(match_header)


            tables = m.css('table.wf-table-inset')

            for table in tables:
                df = pd.read_html(table.html)[0]
                result[match_header] = self._extract_df(df)
                
        return result
        
        
        
        
        
        

In [45]:
lock_in = Event(lock_in_url)

In [46]:
lock_in.get_matches(3)

[KOI 0:2 NRG Esports,
 DetonatioN FocusMe 0:2 Giants Gaming,
 FunPlus Phoenix 1:2 Karmine Corp]

In [47]:
lock_in.matches

[KOI 0:2 NRG Esports,
 DetonatioN FocusMe 0:2 Giants Gaming,
 FunPlus Phoenix 1:2 Karmine Corp]

In [48]:
lock_in.matches[-2].date

'Tuesday, February 14th 3:05 AM +07 Patch 6.02'

In [49]:
lock_in.matches[0].get_scoreboard()['Icebox 1:12:59']

Icebox 1:12:59
Overall
Haven 53:43


Type,Name,Team,R,R,R,ACS,ACS,ACS,K,K,...,HS%,FK,FK,FK,FD,FD,FD,+/–.1,+/–.1,+/–.1
Side,Name,Team,all,atk,def,all,atk,def,all,atk,...,def,all,atk,def,all,atk,def,all,atk,def
0,s0m,NRG,1.42,1.77,1.07,295,353,238,24,15,...,29,4,2,2,0,0,0,4,2,2
1,crashies,NRG,1.27,1.66,0.87,244,325,163,22,16,...,16,3,1,2,2,0,2,1,1,0
2,Victor,NRG,1.0,1.23,0.76,161,181,143,14,9,...,26,2,2,0,3,2,1,-1,0,-1
3,FiNESSE,NRG,1.0,1.21,0.8,165,158,174,12,6,...,14,4,3,1,0,0,0,4,3,1
4,ardiis,NRG,0.94,1.18,0.69,156,136,177,11,5,...,14,3,2,1,3,0,3,0,2,-2


In [50]:
sample_match = lock_in.matches[0].get_scoreboard()['Icebox 1:12:59']

Icebox 1:12:59
Overall
Haven 53:43


In [51]:
sample_match.iloc[[1]].T

Unnamed: 0_level_0,Unnamed: 1_level_0,1
Type,Side,Unnamed: 2_level_1
Name,Name,crashies
Team,Team,NRG
R,all,1.27
R,atk,1.66
R,def,0.87
ACS,all,244
ACS,atk,325
ACS,def,163
K,all,22
K,atk,16


In [63]:
class Player:
    def __init__(self,info):
        self.name =  info.loc['Name'].loc['Name']
        self.team = info.loc['Team'].loc['Team']
        
        
    
    
    
    

In [64]:
p1 = Player(sample_match.iloc[0])

In [65]:
print(p1.name)
print(p1.team)

s0m
NRG
