In [1]:
import traceback
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from browsermobproxy import Client
from selenium import webdriver
from typing import List

from util import is_json, parse_har, write_har, get_driver, get_proxy
from crawler import process_event
from feature_gen import gen_feature
from gen_types import Event, System

%load_ext autoreload
%autoreload 2

In [3]:
def worker(sys: System, drivers: List[webdriver.Firefox], proxy: Client):
    """Consume events from ``sys`` until its queue is observed to be empty.

    Each iteration pops one event via ``sys.pop_event()``, logs it, and hands
    it to ``process_event`` together with the driver selected by the event's
    ``user`` attribute.

    Args:
        sys: Shared state object; this function reads ``sys.mq`` and calls
            ``sys.pop_event()``, and forwards ``sys`` to ``process_event``.
        drivers: Browser drivers, indexed by ``event.user``.
        proxy: Proxy client, forwarded unchanged to ``process_event``.

    NOTE(review): the emptiness check and the pop are two separate steps, so
    when several workers share ``sys`` another worker could take the last
    event in between — confirm that ``pop_event`` tolerates an empty queue.
    NOTE(review): workers handling events for the same user share one driver
    object — confirm that concurrent use of a driver is acceptable.
    """
    while True:
        # Stop as soon as the queue reports no pending events.
        if len(sys.mq) == 0:
            break
        event = sys.pop_event()
        print("Processing event {}".format(event))
        user_driver = drivers[event.user]
        process_event(user_driver, proxy, event, sys)

In [4]:
# random_walk_page(driver, "https://cs.uchicago.edu", 5)
# har = proxy.har # returns a HAR JSON blob

# Crawl configuration: size of the worker thread pool and number of simulated
# users (one browser driver is created per user).
# NOTE(review): there are more workers than drivers, and `worker` picks a
# driver by `event.user`, so several threads may use the same driver object —
# confirm that this is intended.
num_worker = 24
num_user = 2
proxy, server = get_proxy(8080)

drivers = []
try:
    for _ in range(num_user):
        drivers.append(get_driver(proxy))

    # `sys` is kept as the variable name because later cells read
    # `sys.har_buffer`; note that it would shadow the stdlib `sys` module if
    # that were ever imported in this notebook.
    sys = System(5*60, "https://cs.uchicago.edu")
    sys.init_mq(num_user)

    # The context manager shuts the pool down (waiting for running workers)
    # even when a worker raises, so no thread is still using a driver by the
    # time the `finally` clause below quits it.
    with ThreadPoolExecutor(num_worker) as pool:
        futures = [
            pool.submit(worker, sys, drivers, proxy)
            for _ in range(num_worker)
        ]

        # `result()` re-raises any exception that occurred inside a worker.
        for fut in futures:
            fut.result()
finally:
    # Always release the browsers and the proxy server, including on failure;
    # otherwise their processes are left running after an error.
    try:
        for driver in drivers:
            driver.quit()
    finally:
        server.stop()

The geckodriver version (0.31.0) detected in PATH at /usr/local/bin/geckodriver might not be compatible with the detected firefox version (119.0); currently, geckodriver 0.33.0 is recommended for firefox 119.*, so it is advised to delete the driver in PATH and retry


The geckodriver version (0.31.0) detected in PATH at /usr/local/bin/geckodriver might not be compatible with the detected firefox version (119.0); currently, geckodriver 0.33.0 is recommended for firefox 119.*, so it is advised to delete the driver in PATH and retry


Processing event time: 25.05218042667699, user 0, click 3, url https://cs.uchicago.edu
Processing event time: 37.578745431146906, user 1, click 2, url https://cs.uchicago.edu
Visiting page https://cs.uchicago.edu
Visiting page https://cs.uchicago.edu
There are 161 clickable links on this page
Processing event time: 26.9034612928544, user 0, click 2, url https://cs.uchicago.edu/news/uchicago-scientists-develop-new-tool-to-protect-artists-from-ai-mimicry/
Visiting page https://cs.uchicago.edu/news/uchicago-scientists-develop-new-tool-to-protect-artists-from-ai-mimicry/
There are 161 clickable links on this page
Processing event time: 37.229554036938524, user 1, click 1, url https://cs.uchicago.edu/multimedia/is-it-ethical-to-use-facial-imaging-in-decision-making/
Visiting page https://cs.uchicago.edu/multimedia/is-it-ethical-to-use-facial-imaging-in-decision-making/
There are 126 clickable links on this page
Processing event time: 28.558696461857803, user 0, click 1, url https://cs.uchic

In [18]:
# Parse the HAR blob stored at index 1 of `sys.har_buffer` and preview it.
# Depends on the crawl cell above having run, since that is where `sys` is
# created and (presumably) where `har_buffer` is filled — verify in `crawler`.
test_pdf = parse_har(sys.har_buffer[1])
# Last expression of the cell: display up to the first 100 rows.
test_pdf.head(100)

Unnamed: 0,url,start_time,response_code,body_size,rtt
0,https://cs.uchicago.edu/news/uchicago-scientis...,2023-11-08 16:54:54.805726-06:00,0,-1,0
1,https://cs.uchicago.edu/wp-includes/css/dashic...,2023-11-08 16:54:54.827726-06:00,200,59016,1071
2,https://cs.uchicago.edu/wp-content/themes/csuc...,2023-11-08 16:54:54.827726-06:00,200,188368,1077
3,https://cs.uchicago.edu/wp-includes/js/jquery/...,2023-11-08 16:54:54.827726-06:00,200,89521,1079
4,https://www.googletagmanager.com/gtag/js?id=G-...,2023-11-08 16:54:54.834726-06:00,200,85846,1107
...,...,...,...,...,...
95,https://cs.uchicago.edu/people/affiliated-facu...,2023-11-08 17:49:59.590753-06:00,200,113845,269
96,https://cs.uchicago.edu/wp-content/uploads/202...,2023-11-08 17:49:59.917753-06:00,200,3300,3
97,https://cs.uchicago.edu/wp-content/uploads/201...,2023-11-08 17:49:59.917753-06:00,200,3114,3
98,https://cs.uchicago.edu/wp-content/uploads/202...,2023-11-08 17:49:59.917753-06:00,200,21653,5


In [None]:
# Run `gen_feature` on the data directory below and preview the result.
# NOTE(review): the path is relative to the notebook's working directory;
# presumably it holds the HAR output of an earlier crawl — confirm.
load_path = "../data/cs.uchicago.edu/"
fpd = gen_feature(load_path)
# Last expression of the cell: display the first 5 rows.
fpd.head(5)