## Scrape

In [1]:
import pandas as pd
import requests

from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
base_url = "https://www.hltv.org"
results_url = f"{base_url}/stats/matches"
events_url = f"{base_url}/events/archive"

In [4]:
# Scraping match results by each map
def results(url):
    
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    results = []
    class_lst = ['group-1', 'group-2']
    for c in class_lst:
        matches = soup.find("table", {"class": "matches-table"}).find_all("tr", {"class": c})
    
        for match in matches:
            event = match.find("td", {"class": "event-col"}).text.strip()
            teams = [item.text.strip() for item in match.find_all("td", {"class": "team-col"})]
            result = [item.text.strip() for item in match.find_all("span", {"class": "score"})]
            map_played = [item.text.strip() for item in match.find_all("div", {"class": "dynamic-map-name-full"})]
            date_played = match.find("div", {"class": "time"}).text.strip()
            match_data = {
                "date": date_played,
                "event": event,
                "map": map_played,
                "team_1": teams[0],    
                "team_2": teams[1]
            }
            results.append(match_data)
            df = pd.DataFrame.from_dict(results)

    return df

In [5]:
# Scraping event details
def events(url):
       
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    results = []
    events_lst = soup.find_all("table", {"class": "table"})

    # data stored manually entered - manually split after scraping
    for e in events_lst:
        event_details = [item.text.strip() for item in e.find_all("td", {"class": "col-value"})]
        region = [item.text.strip() for item in e.find_all("span", {"class": "col-desc"})]
        match_data = {
            "event_details": event_details,
            "region_details": region
        }
        results.append(match_data)
        
    # convert output to df
    df = pd.DataFrame.from_dict(results)
    df1 = pd.DataFrame(df["event_details"].to_list(), columns=['event', 'num_teams', 'prize', 'type'])
    df2 = pd.DataFrame(df["region_details"].to_list(), columns=['region', 'start_end'])
    df = pd.concat([df1, df2], axis=1)
    return df

In [6]:
# Scraping team rankings
region_lst = ['/country/United%20States', '/country/Argentina', '/country/Brazil',
             '/country/Bulgaria', '/country/Czech%20Republic', '/country/Denmark',
             '/country/Finland', '/country/France', '/country/Germany', '/country/Kosovo',
             '/country/Norway', '/country/Poland', '/country/Portugal', '/country/Romania',
             '/country/Sweden', '/country/United%20Kingdom', '/country/Russia', 
              '/country/Ukraine', '/country/China', '/country/Mongolia', '/country/Australia'
             ]

def ranks(url):
    results = []
    for region in region_lst:
        r = requests.get(url + region)
        soup = BeautifulSoup(r.text, "html.parser")

        teams_lst = soup.find_all("div", {"class": "bg-holder"})

        for t in teams_lst:
            team = [item.text.strip() for item in t.find_all("span", {"class": "name"})]
            points = [item.text.strip() for item in t.find_all("span", {"class": "points"})]
            players = [item.text.strip() for item in t.find_all("div", {"class": "rankingNicknames"})]
            match_data = {
                "team": team,
                "points": points,
                "players": players            
            }
            results.append(match_data)

    df = pd.DataFrame.from_dict(results)
    return df

### Execution

In [7]:
### update current database
df = pd.read_csv('rank_info.csv')

In [21]:
df2 = ranks('https://www.hltv.org/ranking/teams/2021/september/6')
df = pd.concat([df, df2], ignore_index=True)
df['date_start'] = df['date_start'].fillna('6/9/2021')
df

Unnamed: 0.1,Unnamed: 0,team,points,players,date_start
0,0.0,['Liquid'],['(707 points)'],"['nitr0', 'NAF', 'EliGE', 'oSee', 'YEKINDAR']",24/10/2022
1,1.0,['Nouns'],['(39 points)'],"['cynic', 'Bwills', 'cxzi', 'nosraC', 'jeorges...",24/10/2022
2,2.0,['EG White'],['(30 points)'],"['djay', 'Jonji', 'ben1337', 'PwnAlone', 'viz']",24/10/2022
3,3.0,['EG Black'],['(23 points)'],"['RUSH', 'stanislaw', 'Walco', 'wiz', 'chop']",24/10/2022
4,4.0,['Iron Blood'],['(21 points)'],"['droid', 'shane', 'dare', 'snav', 'intra']",24/10/2022
...,...,...,...,...,...
12320,,[Dynasty],[(3 points)],"[bz, yourwombat, supa, spidok, rhys]",6/9/2021
12321,,[Animal Squad],[(2 points)],"[sibe, busta, RaZ, ajk, moop]",6/9/2021
12322,,[8Ballers],[(2 points)],"[bogeymanh, Pacificdongr, Shoey, Onii]",6/9/2021
12323,,[Caught off Guard],[(2 points)],"[Misfit, mega2f, Rev, viridian, kayoh]",6/9/2021


In [9]:
### Loop to scrape
for i in range(5):
    scraped_dict = results(results_url + '?offset=' + str(i*50))
    df_dict = pd.DataFrame.from_dict(scraped_dict)
    df = pd.concat([df, df_dict], ignore_index=True)
    sleep(5)

In [22]:
### Save csv
df.to_csv('rank_info.csv')