# Essais avec playwright en async

Doc de playwright : https://playwright.dev/python/docs/intro

In [None]:
import asyncio, time
from playwright.async_api import async_playwright

async def scrape_1():
    """Launch Chromium, open playwright.dev and print the page title.

    A start message and an end message are printed around the browser
    session; the browser is closed before the playwright context exits.
    """
    print("scraping_1...")
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch()
        tab = await chromium.new_page()
        await tab.goto("http://playwright.dev")
        title = await tab.title()
        print(title)
        await chromium.close()
    print("finished scraping_1")

async def scrape_2():
    """Second browser job: same steps as the first, with its own messages.

    Launches Chromium, loads playwright.dev, prints the page title and
    closes the browser, printing a message before and after.
    """
    print("scraping_2...")
    async with async_playwright() as pw:
        instance = await pw.chromium.launch()
        current_page = await instance.new_page()
        await current_page.goto("http://playwright.dev")
        page_title = await current_page.title()
        print(page_title)
        await instance.close()
    print("finished scraping_2")

async def scrape_3():
    """Placeholder job: announce itself, sleep three seconds, print two lines."""
    print("scraping_3...")
    pause_seconds = 3
    await asyncio.sleep(pause_seconds)
    for message in ("je fais rien vrai", "finished scraping_3"):
        print(message)

async def main():
    """Run the three scrape coroutines concurrently and wait for all of them."""
    jobs = (scrape_1(), scrape_2(), scrape_3())
    await asyncio.gather(*jobs)

# Script alternative, left disabled here: asyncio.run(main())
# Top-level await: presumably relies on the notebook's already-running event loop.
await main()


# Please don't run this cell: it crashes!
Scraper plusieurs URL de manière asynchrone + multiprocessing.
Fonctionne dans "scripts/async_multiprocessing.py" mais pas dans Jupyter.

In [None]:
import asyncio
import nest_asyncio
from aiohttp import request
from aiomultiprocess import Pool
# Patch asyncio so asyncio.run() can be used while a loop is already running.
# NOTE(review): that is nest_asyncio's stated purpose; the cell is still reported as crashing in Jupyter.
nest_asyncio.apply()

async def get(url):
    """Send a GET request to *url* and return the body decoded as UTF-8."""
    async with request("GET", url) as resp:
        body = await resp.text("utf-8")
    return body

async def main():
    """Fetch each URL through an aiomultiprocess Pool and print every body."""
    targets = ["https://jreese.sh"]
    async with Pool() as workers:
        async for body in workers.map(get, targets):
            print(body)

# Notebook alternative, left disabled here: await main()

# Start main() through asyncio.run(), as a standalone script would.
asyncio.run(main())

Compteur de temps pour scripts (pas besoin dans jupyter mais utile dans les scripts)

In [None]:
# Measure how long main() takes: read the clock before and after the call.
s = time.perf_counter()
await main()
elapsed = time.perf_counter() - s
# Report the duration with two decimals.
print(f"Script executed in {elapsed:0.2f} seconds.")

# Exemple simple avec aiohttp

In [None]:
import aiohttp
import asyncio

async def test():
    """Fetch the NBA advanced player-stats page and print status, content type and body."""
    page_url = 'https://www.nba.com/stats/players/advanced/?sort=TEAM_ABBREVIATION&dir=-1'

    # One statement opens the session and the request; both are released on exit.
    async with aiohttp.ClientSession() as session, session.get(page_url) as response:
        print("Status:", response.status)
        print("Content-type:", response.headers['content-type'])

        body = await response.text()
        print("Body:", body)

# Run the aiohttp example with a top-level await (notebook-style execution).
await test()

In [None]:
import time, random, json
import asyncio, aiohttp
import pandas as pd
from aiomultiprocess import Pool
from pprint import pprint

# Scraping with &redirect=no sets US website, no need to set cookies, main page : https://www.nba.com/?&redirect=no
# https://www.nba.com/stats/?&redirect=no
# https://www.nba.com/stats/teams

# https://www.nba.com/stats/players
# https://www.nba.com/stats/players/advanced/?sort=TEAM_ABBREVIATION&dir=-1

# https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Advanced&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2021-22&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=

#body > main > div > div > div.landing-page-content > div.inner__sidebar.inner__sidebar-next.\[.columns.\/.large-3.\] > section:nth-child(1) > div > div > div:nth-child(2)
#/html/body/main/div/div/div[3]/div[2]/section[1]/div/div/div[2]
# Players stats by seasons
# Remove 'If-Modified-Since' from headers else 304 NOT MODIFIED

# Fixed, browser-like request headers (the User-Agent string describes Edge 97 on Windows 10).
# NOTE(review): no code visible in this cell reads `headers`; the scraper below
# builds its headers with rand_headers() instead.
headers = {
    'Connection': 'keep-alive',
    'DNT': '1',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.20 Safari/537.36 Edg/97.0.1072.21',
    'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-token': 'true',
    'x-nba-stats-origin': 'stats',
    'sec-ch-ua-platform': '"Windows"',
    'Origin': 'https://www.nba.com',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://www.nba.com/',
    'Accept-Language': 'fr,fr-FR;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}

# Candidate "client IP" headers; rand_headers() adds one of them, picked at
# random, to every generated header set. All values point at localhost.
# Each key appears exactly once: a dict literal silently keeps only the last
# value of a repeated key, so repeated entries (and an empty X-Forwarded-For
# that was immediately overridden) had no effect on the resulting dict.
rate_limit_bypass_headers = {
    'X-Originating-IP': '127.0.0.1',
    'X-Remote-IP': '127.0.0.1',
    'X-Remote-Addr': '127.0.0.1',
    'X-Client-IP': '127.0.0.1',
    'X-Host': '127.0.0.1',
    'X-Forwarded': '127.0.0.1',
    'X-Forwarded-By': '127.0.0.1',
    'X-Forwarded-For': '127.0.0.1',
    'X-Forwarder-For': '127.0.0.1',
    'X-Forwarder-For-Original': '127.0.0.1',
    'Forwarded-For': '127.0.0.1',
    'X-Custom-IP-Authorization': '127.0.0.1',
}

# Value pools that rand_headers() samples from.

# Percent-encoded sequences inserted after "q=" in the generated Accept-Language value.
magic_chars = ['%00', '%2e', '%09', '%20', '%0', '%0d', '%0a', '%0C']
# Candidate platform names.
platforms = ['Android', 'Windows', 'Linux', 'macOS']
# User-agent strings, one per row; rand_headers() reads the first column of a random row.
# Read with the default CSV settings: recent pandas versions raise ValueError
# when "\n" is passed as the delimiter, and the later cell of this file already
# loads the same file without a delimiter argument.
user_agents_df = pd.read_csv('../data/user_agents.csv')
# Language tags for Accept-Language ('fr' is listed twice, so it is drawn more often).
lang = ['fr', 'zh', 'en', 'de', 'ja', 'es', 'en-US', 'fr-CH', 'fr']
# Candidate values for the sec-ch-ua-mobile header.
mobile = ['?0', '?1']

def rand_headers():
    """Build one randomized set of request headers for the NBA stats requests.

    The randomized parts are:
      * Accept-Language: two tags from ``lang``, each with a "q" value taken
        from ``magic_chars``;
      * sec-ch-ua-platform: one entry of ``platforms`` (previously the random
        pick was computed but a hard-coded 'Windows' was sent);
      * User-Agent: first column of a random row of ``user_agents_df``;
      * sec-ch-ua-mobile: one entry of ``mobile``;
      * one extra header picked from ``rate_limit_bypass_headers``.

    Returns:
        dict: header name -> header value.
    """
    rand_lang = f"{random.choice(lang)};q={random.choice(magic_chars)},{random.choice(lang)};q={random.choice(magic_chars)}"
    rand_platform = random.choice(platforms)
    rand_agent = user_agents_df.sample().values[0][0]
    rand_mobile = random.choice(mobile)
    rand_header_name, rand_header_val = random.choice(list(rate_limit_bypass_headers.items()))

    # Named `generated` so the local dict no longer shadows this function's name.
    generated = {
        'Connection': 'keep-alive',
        'DNT': '1',
        'sec-ch-ua-mobile': f'{rand_mobile}',
        'User-Agent': f'{rand_agent}',
        'Accept': 'application/json, text/plain, */*',
        'x-nba-stats-token': 'true',
        'x-nba-stats-origin': 'stats',
        'sec-ch-ua-platform': f'{rand_platform}',
        'Origin': 'https://www.nba.com',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://www.nba.com/',
        'Accept-Language': f'{rand_lang}',
        rand_header_name: rand_header_val,
    }

    return generated

# Default query parameters for the leaguedashplayerstats endpoint, written
# directly as a dict (insertion order matches the parameter order used so far).
leaguedashplayerstats_params = {
    'College': '',
    'Conference': '',
    'Country': '',
    'DateFrom': '',
    'DateTo': '',
    'Division': '',
    'DraftPick': '',
    'DraftYear': '',
    'GameScope': '',
    'GameSegment': '',
    'Height': '',
    'LastNGames': '0',
    'LeagueID': '00',
    'Location': '',
    'MeasureType': 'Advanced',
    'Month': '0',
    'OpponentTeamID': '0',
    'Outcome': '',
    'PORound': '0',
    'PaceAdjust': 'N',
    'PerMode': 'PerGame',
    'Period': '0',
    'PlayerExperience': '',
    'PlayerPosition': '',
    'PlusMinus': 'N',
    'Rank': 'N',
    'Season': '2021-22',
    'SeasonSegment': '',
    'SeasonType': 'Regular Season',
    'ShotClockRange': '',
    'StarterBench': '',
    'TeamID': '0',
    'TwoWay': '0',
    'VsConference': '',
    'VsDivision': '',
    'Weight': '',
}

def gen_seasons(start_year=1996, end_year=2021):
    """Return season labels such as ``"1996-97"`` for a range of years.

    Each label is the starting year, a dash, and the last two digits of the
    following year, zero-padded (so 1999 gives ``"1999-00"``).

    Args:
        start_year: First starting year to include (default 1996).
        end_year: Last starting year to include (default 2021).

    Returns:
        list[str]: One label per year from ``start_year`` to ``end_year``
        inclusive; empty when ``start_year`` is greater than ``end_year``.
    """
    return [
        f"{year}-{(year + 1) % 100:02d}"
        for year in range(start_year, end_year + 1)
    ]

# Season labels to request, computed once when this cell runs.
seasons_dates = gen_seasons()
# Module-level list printed at the end of scrape_player_stats_by_season().
# NOTE(review): no active code visible here appends to it.
players_stats_by_seasons = []

async def get(session, url, params):
    """Send a GET request through *session* and return the body as UTF-8 text.

    Args:
        session: An open aiohttp client session.
        url: Address to request.
        params: Query-string parameters forwarded to the request.

    Returns:
        str: The decoded response body.
    """
    # Uses GET rather than HEAD: a HEAD response carries no body, so the
    # text returned to the caller was always empty.
    async with session.get(url, params=params) as response:
        return await response.text("utf-8")

async def scrape_player_stats_by_season():
    """Request the leaguedashplayerstats endpoint once for every season label.

    A single session, created with one set of headers from rand_headers(),
    is shared by all requests. Requests are sent one after another, in the
    order of ``seasons_dates``. A "GOT <season>" line is printed after each
    body has been read, and the module-level ``players_stats_by_seasons``
    list is printed at the end.

    NOTE(review): each response body is read and then dropped; nothing is
    appended to ``players_stats_by_seasons`` here, so the final print shows
    whatever that list already contained. Confirm whether results should be
    stored.
    """
    url = 'https://stats.nba.com/stats/leaguedashplayerstats'
    async with aiohttp.ClientSession(headers=rand_headers()) as session:
        for season in seasons_dates:
            # Per-request copy: the shared module-level defaults are left
            # untouched instead of having their 'Season' entry overwritten.
            params = {**leaguedashplayerstats_params, 'Season': season}
            async with session.get(url, params=params) as response:
                # Read the whole body before reporting the season as fetched.
                await response.text("utf-8")
                print(f"GOT {season}")
                # Author's timing note: "Timeout of 1s : 1m18, no timeout 52s".
    print(players_stats_by_seasons)

            
# Run the scraper with a top-level await; gather() wraps a single coroutine here.
await asyncio.gather(scrape_player_stats_by_season())

In [None]:
async def scrape_save_user_agents(output_path='../data/user_agents.csv'):
    """Collect recent user-agent strings per browser and save them as CSV.

    For each browser name, the matching "latest user agent" guide page on
    whatismybrowser.com is fetched, and the text of every element matching
    ``td li span.code`` is collected. Pages that do not answer with HTTP 200
    are reported and skipped. The collected strings are written as a
    one-column CSV without the index.

    Author's notes: run this only once, or you will quickly get IP banned.
    Mullvad DE servers were not banned; otherwise try a 4G access point or
    rotating proxies.

    Args:
        output_path: Destination CSV file. Defaults to the path the rest of
            this file reads the user agents from ('../data/user_agents.csv');
            the previous hard-coded 'data/user_agents.csv' did not match it.
    """
    # Imported locally because the file only imports lxml further down,
    # after this definition.
    from lxml import html

    url = 'https://www.whatismybrowser.com/guides/the-latest-user-agent/'
    browsers = ('chrome', 'firefox', 'safari', 'edge', 'opera', 'vivaldi', 'yandex')
    user_agents = []
    async with aiohttp.ClientSession() as session:
        for browser in browsers:
            async with session.get(url + browser) as response:
                # Wait one second per request to stay gentle with the site.
                await asyncio.sleep(1)
                if response.status != 200:
                    print(f"HTTP {response.status}")
                    continue
                page = await response.text("utf-8")

            elems = html.fromstring(page).cssselect('td li span.code')
            print(len(elems))
            user_agents.extend(elem.text_content().strip() for elem in elems)

    print(f"Collected : {len(user_agents)} user agents !")
    df = pd.DataFrame(user_agents)
    df.to_csv(output_path, index=False)

#await asyncio.gather(scrape_save_user_agents())    

In [None]:
import time, random
import asyncio, aiohttp
import pandas as pd
from lxml import html

# Minimal fixed request headers (User-Agent, Accept, Origin, Referer).
# NOTE(review): no code visible in this cell reads `headers`; the scraper
# below builds its headers with rand_headers() instead.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.20 Safari/537.36 Edg/97.0.1072.21',
    'Accept': 'application/json, text/plain, */*',
    'Origin': 'https://www.nba.com',
    'Referer': 'https://www.nba.com/',
}

# Default query string for leaguedashplayerstats, declared as a dict literal;
# key order is the same as in the pair-by-pair declaration it replaces.
leaguedashplayerstats_params = {
    'College': '',
    'Conference': '',
    'Country': '',
    'DateFrom': '',
    'DateTo': '',
    'Division': '',
    'DraftPick': '',
    'DraftYear': '',
    'GameScope': '',
    'GameSegment': '',
    'Height': '',
    'LastNGames': '0',
    'LeagueID': '00',
    'Location': '',
    'MeasureType': 'Advanced',
    'Month': '0',
    'OpponentTeamID': '0',
    'Outcome': '',
    'PORound': '0',
    'PaceAdjust': 'N',
    'PerMode': 'PerGame',
    'Period': '0',
    'PlayerExperience': '',
    'PlayerPosition': '',
    'PlusMinus': 'N',
    'Rank': 'N',
    'Season': '2021-22',
    'SeasonSegment': '',
    'SeasonType': 'Regular Season',
    'ShotClockRange': '',
    'StarterBench': '',
    'TeamID': '0',
    'TwoWay': '0',
    'VsConference': '',
    'VsDivision': '',
    'Weight': '',
}

def gen_seasons(start_year=1996, end_year=2021):
    """Build the list of season labels ("1996-97", ..., "2021-22" by default).

    A label is made of the starting year, a dash, and the two-digit,
    zero-padded form of the next year (1999 -> "1999-00", 2000 -> "2000-01").

    Args:
        start_year: First starting year of the range (default 1996).
        end_year: Last starting year of the range, inclusive (default 2021).

    Returns:
        list[str]: Labels in chronological order; an empty list when
        ``start_year`` is greater than ``end_year``.
    """
    labels = []
    for year in range(start_year, end_year + 1):
        next_year_suffix = (year + 1) % 100
        labels.append(f"{year}-{next_year_suffix:02d}")
    return labels

# Pool of "client IP" style headers, all set to localhost; rand_headers()
# adds one randomly chosen entry to each generated header set.
# Keys are listed once each: repeated keys in a dict literal are collapsed
# to their last value, so the repeats (including an empty X-Forwarded-For
# that was overridden right away) never reached the resulting dict.
rate_limit_bypass_headers = {
    'X-Originating-IP': '127.0.0.1',
    'X-Remote-IP': '127.0.0.1',
    'X-Remote-Addr': '127.0.0.1',
    'X-Client-IP': '127.0.0.1',
    'X-Host': '127.0.0.1',
    'X-Forwarded': '127.0.0.1',
    'X-Forwarded-By': '127.0.0.1',
    'X-Forwarded-For': '127.0.0.1',
    'X-Forwarder-For': '127.0.0.1',
    'X-Forwarder-For-Original': '127.0.0.1',
    'Forwarded-For': '127.0.0.1',
    'X-Custom-IP-Authorization': '127.0.0.1',
}

# Value pools that rand_headers() samples from.

# Percent-encoded sequences inserted after "q=" in the generated Accept-Language value.
magic_chars = ['%00', '%2e', '%09', '%20', '%0', '%0d', '%0a', '%0C']
# Candidate values for the sec-ch-ua-platform header.
platforms = ['Android', 'Windows', 'Linux', 'macOS']
# User-agent strings loaded from CSV; rand_headers() reads the first column of a random row.
user_agents_df = pd.read_csv('../data/user_agents.csv')
# Language tags for Accept-Language ('fr' is listed twice, so it is drawn more often).
lang = ['fr', 'zh', 'en', 'de', 'ja', 'es', 'en-US', 'fr-CH', 'fr']
# Candidate values for the sec-ch-ua-mobile header.
mobile = ['?0', '?1']

def rand_headers():
    """Return a freshly randomized header dict for one request.

    Random draws, in order: two (language, q-value) pairs for
    Accept-Language, a platform, a user agent (first column of a random row
    of ``user_agents_df``), a mobile flag, and one extra header taken from
    ``rate_limit_bypass_headers``.
    """
    first_lang = random.choice(lang)
    first_q = random.choice(magic_chars)
    second_lang = random.choice(lang)
    second_q = random.choice(magic_chars)
    accept_language = f"{first_lang};q={first_q},{second_lang};q={second_q}"

    platform_name = random.choice(platforms)
    agent = user_agents_df.sample().values[0][0]
    mobile_flag = random.choice(mobile)
    extra_name, extra_value = random.choice(list(rate_limit_bypass_headers.items()))

    generated = {
        'Connection': 'keep-alive',
        'DNT': '1',
        'sec-ch-ua-mobile': f'{mobile_flag}',
        'User-Agent': f'{agent}',
        'Accept': 'application/json, text/plain, */*',
        'x-nba-stats-token': 'true',
        'x-nba-stats-origin': 'stats',
        'sec-ch-ua-platform': f'{platform_name}',
        'Origin': 'https://www.nba.com',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://www.nba.com/',
        'Accept-Language': f'{accept_language}',
    }
    # The randomly chosen extra header goes last, as in the literal form.
    generated[extra_name] = extra_value
    return generated

async def get(session, url, params):
    """Fetch *url* with a GET request on *session* and return the UTF-8 body.

    Args:
        session: An open aiohttp client session.
        url: Address to request.
        params: Query-string parameters for the request.

    Returns:
        str: The response body decoded as UTF-8.
    """
    # GET instead of HEAD: a HEAD response has no body, which made the
    # returned text always empty.
    async with session.get(url, params=params) as response:
        body = await response.text("utf-8")
    return body

async def scrape_player_stats_by_season():
    """Request the leaguedashplayerstats endpoint once per season label.

    Seasons come from gen_seasons(). A new session with a new set of
    rand_headers() is opened for every request, and requests run one after
    another. A "GOT <season>" line is printed after each body has been read,
    then ``players_stats_by_seasons`` is printed.

    NOTE(review): ``players_stats_by_seasons`` is not defined in this cell
    (an earlier cell assigns it) and nothing here appends to it; the response
    bodies are read and dropped. Confirm whether results should be stored.
    """
    url = 'https://stats.nba.com/stats/leaguedashplayerstats'

    for season in gen_seasons():
        # Per-request copy so the shared module-level defaults are not
        # modified through an alias.
        params = {**leaguedashplayerstats_params, 'Season': season}
        async with aiohttp.ClientSession(headers=rand_headers()) as session:
            async with session.get(url, params=params) as response:
                # Read the whole body before reporting the season as fetched.
                await response.text("utf-8")
                print(f"GOT {season}")
    print(players_stats_by_seasons)


# Run the scraper with a top-level await; gather() wraps a single coroutine here.
await asyncio.gather(scrape_player_stats_by_season())