In [1]:
#
# This small example shows you how to access JS-based requests via Selenium
# Like this, one can access raw data for scraping, 
# for example on many JS-intensive/React-based websites
#
import time
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import WebDriverException
import json
from datetime import datetime
import pandas as pd

In [2]:
def process_browser_log_entry(entry):
    response = json.loads(entry['message'])['message']
    return response

def log_filter(log_):
    return (
        # is an actual response
        log_["method"] == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"]["mimeType"]
    )

In [3]:
def init_page():
    #fetch a site that does xhr requests
    driver.get("https://www.youtube.com/watch?v=DWcJFNfaw9c")

    main_content_wait = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, '//iframe[@id="chatframe"]'))
    )
    time.sleep(3)
    video_box = driver.find_element_by_xpath('//div[@id="movie_player"]')
    video_box.click()

    frame = driver.find_elements_by_xpath('//iframe[@id="chatframe"]')
    # switch the webdriver object to the iframe.
    driver.switch_to.frame(frame[0])

    try:
        #enable 'all' livechat
        try:
            driver.find_element_by_xpath('//div[@id="label-text"][@class="style-scope yt-dropdown-menu"]').click()
        except ElementNotInteractableException:
            init_page()
        time.sleep(2.1)
        driver.find_element_by_xpath('//a[@class="yt-simple-endpoint style-scope yt-dropdown-menu"][@tabindex="-1"]').click()
    except ElementClickInterceptedException:
        print('let\'s try again...')
        init_page()

In [4]:
# make chrome log requests
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}  # newer: goog:loggingPrefs
driver = webdriver.Chrome(
    desired_capabilities=capabilities
)

In [5]:
init_page()

In [None]:
iter_num = 0
while True:
    iter_num += 1
    if iter_num >= 100:
        iter_num = 0
        init_page()
    # extract requests from logs
    logs_raw = driver.get_log("performance")
    logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

    json_list = []
    for log in filter(log_filter, logs):
        request_id = log["params"]["requestId"]
        resp_url = log["params"]["response"]["url"]
        #print(f"Caught {resp_url}")
        try:
            if 'https://www.youtube.com/youtubei/v1/live_chat/get_live_chat?key=' in resp_url:
                body = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
                json_list.append(body)
        except WebDriverException:
            print('web driver exception!!!')
            continue
            '''
            with open('look.txt', 'a', encoding='utf-8') as text_file:
                body = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
                text_file.write(str(body))
                json_list.append(body)
            '''

    #print(len(json_list))

    message_list = []
    self_message_list = []

    for i in range(len(json_list)):
        json_data = json.loads(json_list[i]['body'].replace('\n','').strip())
        try:
            actions = (json_data['continuationContents']['liveChatContinuation']['actions'])
        except:
            continue
        for j in range(len(actions)):
            try:
                item = actions[j]['addChatItemAction']['item']['liveChatTextMessageRenderer']
                author_channel_id = item['authorExternalChannelId']
                author_name = item['authorName']['simpleText']
                text = item['message']['runs'][0]['text']
                post_time = item['timestampUsec']
                post_time = post_time[0:10]
                post_time = int(post_time)
                author_photo = item['authorPhoto']['thumbnails'][0]['url']
                post_time = datetime.utcfromtimestamp(post_time)

                post_item = {
                    "Author" : author_name,
                    "Message" : text,
                    "Date" : post_time,
                    "Channel ID" : author_channel_id,
                    "Channel" : f'https://youtube.com/channel/{author_channel_id}'
                }
                message_list.append(post_item)
                if 'biss' in text.lower():
                    self_message_list.append(post_item)
                #print(post_item)
            except Exception as e:
                print(str(e))
                continue

    #message_list = list(set(message_list))
    df = pd.DataFrame(message_list)
    df = df.drop_duplicates()
    #print(df)

    df.to_csv('./data/youtube_lofi/test_run.csv', index=False, mode='a')
    reply_df = pd.DataFrame(self_message_list)
    reply_df = reply_df.drop_duplicates()
    if len(self_message_list) > 0 :
        reply_df.to_csv('./data/youtube_lofi/reply_runs_cumulative.csv', index=False, mode='a')
        reply_df.to_csv('./data/youtube_lofi/reply_runs.csv', index=False, mode='a')
    if len(message_list) < 1:
        print('The world is ending!')
    time.sleep(30)

'addChatItemAction'
'text'
'liveChatTextMessageRenderer'
'text'
'addChatItemAction'
'addChatItemAction'
'addChatItemAction'
'addChatItemAction'
'text'
'addChatItemAction'
'addChatItemAction'
'text'
'text'
'text'
'text'
'text'
'addChatItemAction'
'addChatItemAction'
'text'
'text'
'text'
'text'
