# WebPageTest

**Imports**

In [1]:
import concurrent.futures
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# Key sent to WebPageTest (None when the variable is not set) and the
# directory that holds the per-website dataset folders.
API_KEY = os.environ.get('API_KEY')
base_dir = '../data'

In [None]:
def run_webpagetest(url, api_key=API_KEY):
    """Submit ``url`` to the WebPageTest ``runtest.php`` endpoint.

    The test is requested as a single, first-view-only Chrome run at the
    ``Dulles:Chrome`` location, with the response formatted as JSON.

    Args:
        url: Address of the page to test.
        api_key: Value sent in the ``X-WPT-API-KEY`` header.

    Returns:
        ``(test_id, json_url)`` read from the response's ``data`` object, or
        ``None`` when the request fails, the status code is not 200, or the
        body does not contain the expected fields.
    """
    base_url = "https://www.webpagetest.org/"
    endpoint = "runtest.php"
    params = {
        "url": url,
        "runs": 1,
        "f": "json",
        "browser": "Chrome",  # Specify Chrome as the browser
        "location": "Dulles:Chrome",  # Choose a Chrome-capable location
        "fvonly": 1,
    }

    headers = {
        "X-WPT-API-KEY": api_key  # Include the API key in the header
    }

    try:
        # The timeout keeps one stalled connection from blocking the whole
        # sequential submission loop.
        response = requests.get(
            base_url + endpoint, params=params, headers=headers, timeout=60
        )
    except requests.RequestException as exc:
        print(f"Error submitting test: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error submitting test: {response.text}")
        return None

    try:
        result = response.json()["data"]
        return result["testId"], result["jsonUrl"]
    except (ValueError, KeyError, TypeError):
        # A 200 status does not guarantee a usable body: it may not be JSON,
        # or it may lack the test id / result URL. Report it the same way as
        # any other failed submission instead of raising.
        print(f"Error submitting test: {response.text}")
        return None

Retrieve article URLs from the websites in the dataset (where applicable)

In [None]:
# Build one bookkeeping row per article from every <website>/<category>
# image_data.csv that has an `article_url` column.
websites = os.listdir(base_dir)
data_frames = []

for website in websites:
    website_dir = os.path.join(base_dir, website)
    # Stray files next to the website folders would make os.listdir raise.
    if not os.path.isdir(website_dir):
        continue

    for category in os.listdir(website_dir):
        csv_path = os.path.join(website_dir, category, 'image_data.csv')
        # Skip entries that are not category folders containing the CSV.
        if not os.path.isfile(csv_path):
            continue

        df = pd.read_csv(csv_path)
        if 'article_url' in df.columns:
            df['website'] = website
            df['category'] = category
            df['page_weight_in_bytes'] = ''
            # bookkeeping of result urls for get requests
            df['id'] = ''
            df['json_url'] = ''
            df = df.loc[:, ['website', 'category', 'article_number', 'article_url', 'page_weight_in_bytes', 'id', 'json_url']]
            data_frames.append(df)

# ignore_index / reset_index give every row a unique label. Each CSV carries
# its own 0..n index, and the later cells write results back with .loc, which
# would hit rows of other websites if labels repeated.
combined_df = pd.concat(data_frames, ignore_index=True)
combined_df = combined_df.drop_duplicates().reset_index(drop=True)
# index=False matches the later writes of this file and avoids a stray
# "Unnamed: 0" column when the CSV is read back.
combined_df.to_csv('wpt_data.csv', index=False)

Send Requests to API

In [None]:
# NOTE(review): `sample_df` is not used below -- every row of `combined_df`
# is submitted. Confirm whether only the sample was meant to be tested.
# The size is capped so a frame with fewer than 1000 rows does not raise.
sample_df = combined_df.sample(min(1000, len(combined_df)))
sample_websites = combined_df['website'].unique().tolist()

for website in sample_websites:
    # A boolean mask (rather than index labels) stays correct even when the
    # frame's index contains repeated labels.
    website_mask = combined_df['website'] == website
    urls = combined_df.loc[website_mask, 'article_url'].tolist()

    # sequential requests due to API rate limits
    test_ids = []
    json_urls = []
    for url in urls:
        result = run_webpagetest(url)
        if result is None:
            # The string 'None' is the placeholder that
            # get_page_weight_in_bytes checks for.
            test_ids.append('None')
            json_urls.append('None')
        else:
            test_ids.append(result[0])
            json_urls.append(result[1])

    combined_df.loc[website_mask, 'id'] = test_ids
    combined_df.loc[website_mask, 'json_url'] = json_urls

combined_df.to_csv('wpt_data.csv', index=False)

Fetch Page Weights

In [None]:
def create_result_url(id):
    """Return the WebPageTest ``jsonResult.php`` URL for the test ``id``."""
    url_template = 'https://www.webpagetest.org/jsonResult.php?test={}&highlight=1'
    return url_template.format(id)

def get_page_weight_in_bytes(id):
    """Fetch a test's result page and return its first ``bytesIn`` value.

    Args:
        id: WebPageTest test id, or a placeholder (``'None'``, ``None`` or an
            empty string) for a test that was never submitted.

    Returns:
        The first ``bytesIn`` number found in the result page, truncated to
        an ``int``. ``-1`` when there is no test id, the request fails, or
        the page contains no ``bytesIn`` value.
    """
    # Placeholders mean there is nothing to fetch.
    if id is None or id == '' or id == 'None':
        return -1

    try:
        # The timeout keeps a stalled download from blocking a worker thread.
        response = requests.get(create_result_url(id), timeout=60)
    except requests.RequestException as exc:
        print(f"Error fetching result for test {id}: {exc}")
        return -1

    # Round-trip the body through BeautifulSoup, as before, then search the
    # serialised text for the first `bytesIn` key.
    soup_str = str(BeautifulSoup(response.text, 'html.parser'))

    # Requiring the closing quote and colon right after `bytesIn` avoids
    # matching longer keys that merely start with it (e.g. `bytesInDoc`), and
    # capturing only the number works whether or not a comma follows.
    match = re.search(
        r'bytesIn(?:"|&quot;)?\s*:\s*(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)',
        soup_str,
    )
    if match is None:
        # No page weight in the response; use the same sentinel as for a
        # missing test id instead of raising on an unparsable slice.
        return -1
    return int(float(match.group(1)))

Parallelizing response fetching with concurrent threads

In [None]:
def read_all_responses(outputs, website):
    """Fetch page weights for ``outputs`` on worker threads, keeping input order.

    Args:
        outputs: Sequence of test ids; entries that are ``None`` are not fetched.
        website: Label shown in the progress-bar description.

    Returns:
        A list as long as ``outputs``. Position ``i`` holds the value returned
        by ``get_page_weight_in_bytes(outputs[i])``, or ``None`` when
        ``outputs[i]`` is ``None``.
    """
    results = [None] * len(outputs)

    # (position, id) pairs that actually need a request.
    pending = [(pos, tok) for pos, tok in enumerate(outputs) if tok is not None]
    if not pending:
        return results

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        # Remember where each future's result belongs, since completion order
        # is not submission order.
        position_of = {}
        for pos, tok in pending:
            position_of[pool.submit(get_page_weight_in_bytes, tok)] = pos

        progress = tqdm(
            concurrent.futures.as_completed(position_of),
            total=len(position_of),
            desc=f"Fetching page weights: {website}",
        )
        for finished in progress:
            results[position_of[finished]] = finished.result()

    return results

In [None]:
# Iterate over the websites present in combined_df rather than every folder
# under base_dir: folders whose CSVs lack `article_url` contributed no rows,
# and groupby(...).get_group() raises KeyError for them.
for website in combined_df['website'].unique().tolist():
    # A boolean mask stays correct even if index labels repeat.
    website_mask = combined_df['website'] == website
    token_ids = combined_df.loc[website_mask, 'id'].tolist()
    responses = read_all_responses(token_ids, website)
    combined_df.loc[website_mask, 'page_weight_in_bytes'] = responses

# The request bookkeeping columns are not part of the published dataset.
combined_df = combined_df.drop(columns=['id', 'json_url'])
combined_df.to_csv('wpt_data.csv', index=False)

Sample from the page_weights dataset

In [18]:
# Display five randomly chosen rows of the page-weight table.
combined_df.sample(n=5)

Unnamed: 0,website,category,article_number,article_url,page_weight_in_bytes
210,www.nytimes.com,Music,1,https://www.nytimes.com/2024/10/04/arts/music/...,5835625
786,edition.cnn.com,Politics Congress,12,https://edition.cnn.com/2023/10/23/politics/ho...,5322836
809,auone.com,International,4,https://article.auone.jp/detail/1/4/8/407_8_r_...,1974
784,edition.cnn.com,Politics Congress,9,https://edition.cnn.com/2023/10/24/politics/to...,4841045
456,www.usatoday.com,Travel,8,https://www.usatoday.com/story/travel/cruises/...,6201197


In [26]:
# get_page_weight_in_bytes returns -1 when no weight could be obtained, and
# unfetched rows may hold None or an empty string. None of those are real
# measurements, so they are left out of the average instead of dragging it down.
page_weights = pd.to_numeric(combined_df['page_weight_in_bytes'], errors='coerce')
mean_page_weight = page_weights[page_weights >= 0].mean()

print(f"Average page_weight in bytes: {mean_page_weight}")
# Divides by 1024**2, i.e. the figure is in binary megabytes (MiB).
print(f"Average page_weight in MB: {mean_page_weight / 1024**2}")

Average page_weight in bytes: 4777977.88
Average page_weight in MB: 4.5566347885131835
