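## Fetch Webpage Content

This notebook exports the extracted page content of selected batches from the database to a JSON Lines file (`../../data/raw/pages_with_text.json`).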

In [1]:
import os
import sys

# Put the parent directory on the import path so that the project's `utils`
# package (imported in the next cell) can be resolved from this notebook.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [2]:
from utils.database import *
from utils.files import *
from tqdm import tqdm
#from bson import ObjectId
import pandas as pd 
import numpy as np

## Setup

In [3]:
# `load_dotenv` is not imported explicitly in this notebook; it presumably
# arrives through one of the `utils` star-imports (python-dotenv) -- confirm.
load_dotenv()

# Connection settings are taken from the environment; either value is None
# when the corresponding variable is not set.
CONNECTION_STRING = os.environ.get("CONNECTION_STRING")
DATABASE_NAME = os.environ.get("DATABASE_NAME")

**Connect to database:**

In [6]:
# Open the connection. `getConnection` is not defined in this notebook; it
# presumably comes from the `utils.database` star-import above (confirm).
# `db` is the handle every query below goes through; `fs` is not used in the
# visible cells (possibly a file-storage handle -- TODO confirm).
fs, db = getConnection(CONNECTION_STRING, DATABASE_NAME)

## Fetch Page Content

In [7]:
def fetchPages(
    db,
    limit: int = 0,
    skip: int = 0,
    query: dict = None,
    fields: dict = None,
    collection: str = 'pages.content.extracted.evaluation'
):
    """Run one `find` query against a collection and return the matches as a list.

    Parameters
    ----------
    db
        Database handle that supports `db[collection_name]` lookups and whose
        collections expose a PyMongo-style `find(filter, projection)`.
    limit : int, default 0
        Value handed to the cursor's `limit()`. NOTE(review): in PyMongo a
        limit of 0 means "no limit" -- confirm for the driver in use.
    skip : int, default 0
        Value handed to the cursor's `skip()`.
    query : dict, optional
        Filter document; an empty filter is used when omitted.
    fields : dict, optional
        Projection document; an empty projection is used when omitted.
        NOTE(review): check how the installed driver treats an empty
        projection before relying on the default.
    collection : str
        Name of the collection to query.

    Returns
    -------
    list
        Every document produced by the cursor, fully materialised in memory.

    Notes
    -----
    No sort is applied. Separate calls with different `skip` values are
    therefore independent queries whose relative order is whatever the server
    happens to return, so `skip`/`limit` windows are not guaranteed to be
    disjoint or complete.
    """
    # Build fresh dicts per call instead of sharing mutable default arguments.
    query = {} if query is None else query
    fields = {} if fields is None else fields

    cursor = db[collection].find(query, fields).skip(skip).limit(limit)
    return list(cursor)

**Fetch Webpage Content in Batches:**

In [8]:
# `batch_id` values whose pages are counted and exported in the cells below
batches = [15, 16, 17]

In [9]:
# Count the matching documents up front; the export cell uses this number as
# the total of its progress bar.
batch_filter = {'batch_id': {'$in': batches}}
evaluation_pages = db['pages.content.extracted.evaluation']
total_count = evaluation_pages.count_documents(batch_filter)
print(f'Number of documents where batch is in {batches}: {total_count}')

Number of documents where batch is in [15, 16, 17]: 3437316


In [7]:
from itertools import islice

# Select every page that belongs to one of the chosen batches and project only
# the fields that are written to the export.
query = {"batch_id": { "$in": batches }}
fields = {'_id': 1,
    'batch_id': 1,
    'domain': 1,
    'encoding': 1,
    'file_id': 1,
    'lang': 1,
    'text': 1,
    'text_length': 1,
    'url': 1,
    'word_count': 1,
    'view_url': 1}

# Number of documents buffered in memory and appended to the file at a time
batch_size = 100_000

# Output file. It is written in JSON Lines format (one document per line)
# even though the path keeps its ".json" extension.
file_path = "../../data/raw/pages_with_text.json"

# Make sure the target directory exists, then start from an empty file because
# every chunk below is appended.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
if os.path.exists(file_path):
    os.remove(file_path)

# Stream the result set through ONE cursor instead of re-running the query
# with a growing `skip`. Repeated skip/limit queries without a sort have no
# guaranteed relative order (pages can be duplicated or missed between
# batches), and the server has to walk past all skipped documents every time.
cursor = db['pages.content.extracted.evaluation'].find(query, fields)
pages_written = 0

try:
    with tqdm(total=total_count, desc="Processing Documents", unit=" pages", dynamic_ncols=True) as pbar:
        while True:
            # Pull the next chunk of at most `batch_size` documents
            pages = list(islice(cursor, batch_size))

            # An empty chunk means the cursor is exhausted
            if not pages:
                break

            # Append the chunk to the JSONL file; `default_handler=str`
            # stringifies values pandas cannot serialise on its own
            df_pages = pd.DataFrame(pages)
            df_pages.to_json(file_path, orient="records", force_ascii=True, lines=True,
                             default_handler=str, mode='a')

            pages_written += len(pages)
            pbar.update(len(pages))
finally:
    # Release the server-side cursor even if the export is interrupted
    cursor.close()

# Surface a mismatch between the earlier count and what was actually exported
if pages_written != total_count:
    print(f"Warning: wrote {pages_written} documents but expected {total_count}.")

print("Data fetching and saving complete.")


Processing Documents: 100%|██████████| 3437316/3437316 [11:09<00:00, 5132.29 pages/s]

Data fetching and saving complete.



