## Classify Webpages: Iterate over Pages

In [1]:
from utils.database import *
from utils.files import *
from tqdm import tqdm
from bson import ObjectId
import pandas as pd 
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import InferenceClient
from transformers import BertTokenizer
from utils.preprocessing import *
from utils.accelerators import *
from utils.multithreading import *
from utils.database import *
from utils.model import *
from utils.files import *
from datasets import Dataset
from tqdm import tqdm
import statistics
import hashlib
import random
import time
import math
import re

  from .autonotebook import tqdm as notebook_tqdm


## Setup Database Connection

In [2]:
# Load environment configuration, then read the MongoDB settings from it.
# NOTE(review): `load_dotenv` and `os` are not imported explicitly in this
# notebook; they presumably arrive through the `from utils.* import *` lines.
load_dotenv()
CONNECTION_STRING = os.environ.get("CONNECTION_STRING")
DATABASE_NAME = os.environ.get("DATABASE_NAME")

**Connect to database:**

In [3]:
# `getConnection` is a project helper (star-imported); it returns a pair of
# which only `db` is used below. NOTE(review): `fs` is presumably a GridFS
# handle -- confirm against utils.database.
fs, db = getConnection(CONNECTION_STRING, DATABASE_NAME)

## Topics and Keywords

In [4]:
# Topic -> list of keyword fragments.
# NOTE(review): `keywords` is not referenced anywhere else in the visible part
# of this notebook; entries such as 'frderung' and the capitalised 'Waerme'
# are kept exactly as found -- confirm whether they are intentional.
keywords = {
    'kinder': [
        'kinder', 'kindergr', 'paus', 'familie',
        'bundestag.de', 'arbeitsagentur.de',
        'kindergrundsicherung', 'kindergeld', 'kindersicherung',
        'kinderzuschlag', 'gesetz',
    ],
    'energie': [
        'energie', 'eeg', 'grün', 'gruen', 'habeck', 'climate',
        'strom', 'Waerme', 'wende', 'frderung', 'förderung',
        'windkraft', 'windrad', 'photovoltaik', 'photovoltaic',
        'solar', 'heizung', 'heiz', 'gesetz', 'erneuer',
        'geothermie', 'pv', 'geg',
    ],
    'cannabis': [
        'cannabis', 'canabis', 'cannabic', 'gras', 'cbd', 'droge',
        'hanf', 'thc', 'canbe', 'legal', 'legalisierung', 'gesetz',
        'verein', 'entkriminali',
    ],
}

***

## Query Database

In [5]:
def fetchPages(
    db,
    limit: int = 0,
    skip: int = 0,
    query={},
    fields: dict = {},
    collection = 'pages.content.extracted'
):
    """Query ``db[collection]`` and return the matching documents as a list.

    Args:
        db: Database handle that supports ``db[collection].find(...)``.
        limit: Value passed to the cursor's ``limit()``.
        skip: Value passed to the cursor's ``skip()``.
        query: Filter document handed to ``find``.
        fields: Projection document handed to ``find``.
        collection: Name of the collection to query.

    Returns:
        list: Every document produced by the cursor, fully materialised.
    """
    cursor = db[collection].find(query, fields)
    cursor = cursor.limit(limit)
    cursor = cursor.skip(skip)
    return list(cursor)

In [6]:
# batch_id values to select; used in the "$in" filter of the queries below.
batches = [15, 16, 17]

In [7]:
# Projection: only these fields are requested for each page.
fields = {"_id": 1, "batch_id": 1, "view_url": 1, "lang": 1, "domain": 1, "text": 1}
# Filter: pages from the selected batches that have no "classes" field yet,
# have a "text" field, fewer than 100,000 words, and lang == "de".
query = {"batch_id": { "$in": batches }, "classes": {"$exists": False}, 
        "text": {"$exists": True}, "word_count": {"$lt": 100_000}, "lang": "de"}
# Small sample (10 pages) used for the worked examples that follow.
pages = fetchPages(db, limit = 10, skip = 0, query = query, fields = fields)

**Example page:**

In [8]:
# Use the first fetched page as the running example; this raises IndexError
# if the query above returned no pages.
example_page = pages[0]
print(example_page)

{'_id': ObjectId('648c2ad88e8cadbd29004e04'), 'batch_id': 15, 'domain': 'insektenstop.net', 'lang': 'de', 'text': ' Um Insektenstop in vollem Umfang nutzen zu können, empfehlen wir Ihnen Javascript in Ihrem Browser zu aktiveren. 4,6 / 5 Menü Suchen Suchen Infocenter Warenkorb Fenster Dachfenster Türen Maßanfertigung Lichtschachtabdeckungen Gewebe Zubehör Zur Kategorie Fenster Spannrahmen Klettband Set Dachfenster Rollladen mit Insektenschutz Rollosystem Fertig montiert/Zuschnitt Zur Kategorie Dachfenster Zur Kategorie Türen Drehtüren Schiebetüren Vorhang Rollladen mit Insektenschutz Fertig montiert/Zuschnitt Zur Kategorie Maßanfertigung Zur Kategorie Lichtschachtabdeckungen Zur Kategorie Gewebe Zur Kategorie Zubehör Ersatzteile mobiler Insektenschutz Fenster Spannrahmen Fliegengitter Fenster "START", Zuschnitt - Alurahmen Zurück Vor Menü schließen Kategorien Fenster Spannrahmen Klettband Set Dachfenster Rollladen mit Insektenschutz Rollosystem Fertig montiert/Zuschnitt Dachfenster Türe

## Split Pages into Chunks

In [9]:
# Tokenizer of the base checkpoint; it is rebound further down to the
# tokenizer stored alongside the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained("deepset/gbert-large")

In [10]:
def splitText(text, n_tokens, tokenizer, overlap=100):
    """Split ``text`` into overlapping chunks of at most ``n_tokens - 4`` tokens.

    The text is tokenized once, cut into windows with a sliding stride, and
    every window is converted back to a plain string. The returned chunks do
    NOT contain special tokens; four token positions per chunk are merely held
    back as head-room below ``n_tokens`` (e.g. for [CLS]/[SEP] added later).

    Args:
        text: The text to split.
        n_tokens: Token budget per chunk, including the 4 reserved positions.
        tokenizer: Object providing ``tokenize(text, truncation=False)`` and
            ``convert_tokens_to_string(tokens)``.
        overlap: Number of tokens by which consecutive chunks overlap.

    Returns:
        list[str]: The chunk strings in document order; ``[]`` if the text
        produces no tokens.

    Raises:
        ValueError: If ``n_tokens``/``overlap`` leave no room for content or
            would give a non-positive stride (this used to loop forever).
    """
    reserved = 4  # positions held back per chunk (head-room for special tokens)
    chunk_len = n_tokens - reserved
    step = chunk_len - overlap
    if chunk_len <= 0 or step <= 0:
        raise ValueError(
            f"n_tokens ({n_tokens}) must exceed overlap ({overlap}) + {reserved} "
            "so that every chunk advances through the text"
        )

    # Tokenize the whole text without truncation; windows are cut manually.
    tokens = tokenizer.tokenize(text, truncation=False)
    total = len(tokens)

    chunks = []
    for start in range(0, total, step):
        end = min(start + chunk_len, total)
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == total:
            # The window reached the end of the text. Any further window would
            # be a pure suffix of this one and only duplicate content.
            break

    return chunks


In [11]:
# Demo with deliberately tiny chunks (n_tokens=20, overlap=10) so the overlap
# between consecutive chunks is visible in the printed output.
example_page_chunks = splitText(example_page.get("text", ""), n_tokens = 20, tokenizer = tokenizer, overlap=10)
print(example_page_chunks[:2])

Token indices sequence length is longer than the specified maximum sequence length for this model (2063 > 512). Running this sequence through the model will result in indexing errors


['Um Insektenstop in vollem Umfang nutzen zu können, empfehlen wir Ihnen Javasc', 'Umfang nutzen zu können, empfehlen wir Ihnen Javascript in Ihrem Browser zu aktiver']


## Tokenize Chunks 

In [12]:
def tokenizeInputs(examples, tokenizer, padding="max_length", truncation=True, return_tensors="pt"):
    """Encode ``examples`` by calling ``tokenizer`` with fixed encoding options.

    Args:
        examples: Text or list of texts handed to the tokenizer.
        tokenizer: Callable tokenizer.
        padding: Forwarded as ``padding``.
        truncation: Forwarded as ``truncation``.
        return_tensors: Forwarded as ``return_tensors``.

    Returns:
        Whatever ``tokenizer(...)`` returns.
    """
    # NOTE(review): special tokens are explicitly disabled here, so the encoded
    # sequences carry no [CLS]/[SEP]. Confirm this matches how the classifier
    # was fine-tuned.
    encode_kwargs = {
        "padding": padding,
        "truncation": truncation,
        "add_special_tokens": False,
        "return_tensors": return_tensors,
    }
    return tokenizer(examples, **encode_kwargs)

# Example: tokenize the demo chunks and show the input ids of the first chunk,
# then of the first two chunks. With padding="max_length" each row is padded
# with 0s, as visible in the printed tensor.
example_page_chunks_tokenized = tokenizeInputs(example_page_chunks, tokenizer)
print(example_page_chunks_tokenized.get("input_ids")[0])
print(example_page_chunks_tokenized.get("input_ids")[:2])

tensor([  607, 14287,  3403, 30903,   153, 21830,  7661,  4964,   205,   618,
          818, 10558,   268,  1609, 19204,  3753,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

## Compute Class Probabilities

In [13]:
def computeClassProbas(model, examples, tokenizer, device=None):
    """Return softmax class probabilities of ``model`` for ``examples``.

    Args:
        model: Callable model whose output exposes a ``logits`` attribute.
        examples: Texts to classify; all of them are encoded and passed to the
            model in a single forward call.
        tokenizer: Tokenizer forwarded to ``tokenizeInputs``.
        device: Optional device; when given, every encoded tensor is moved to
            it before the forward pass.

    Returns:
        Tensor of probabilities obtained by a softmax over the last dimension
        of the logits.
    """
    encoded = tokenizeInputs(examples, tokenizer)

    if device is not None:
        # Rebuild the inputs as a plain dict of tensors living on `device`.
        on_device = {}
        for name, tensor in encoded.items():
            on_device[name] = tensor.to(device)
        encoded = on_device

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded)

    return torch.nn.functional.softmax(model_output.logits, dim=-1)

In [14]:
# Load the classifier and its tokenizer from a local directory. This rebinds
# `tokenizer` (previously the base "deepset/gbert-large" tokenizer).
model_path = "../models/deepset_gbert-large_kinder_model" # Path to the model e.g. "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# .eval() switches the model to inference mode.
model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

In [15]:
# Example of computing the classifications
example_page_chunks_classifications = computeClassProbas(model, example_page_chunks[:2], tokenizer)
print(example_page_chunks_classifications)

tensor([[0.9969, 0.0031],
        [0.9880, 0.0120]])


In [16]:
# Predicted classes
example_page_chunks_classes = example_page_chunks_classifications.argmax(dim=1)
print(example_page_chunks_classes)

tensor([0, 0])


In [17]:
# Predict class for positive example
example_page_kinder = """
Eltern mit Kindern sollen mehr Geld bekommen. Das fordert die Partei Die Linke. Kindergeld leistet einen wichtigen Beitrag zur Bekämpfung von Kinderarmut. Die Linke will das Kindergeld um 75 Euro monatlich erhöhen.
"""

# Split into chunks
example_page_kinder_chunks = splitText(example_page_kinder, n_tokens=100, tokenizer=tokenizer, overlap=10)
print("Number of chunks:", len(example_page_kinder_chunks))
print(example_page_kinder_chunks[:2])

# Tokenize the input
#example_page_kinder_tokenized = tokenizeInputs(example_page_kinder_chunks, tokenizer)

# Compute the classification
example_page_kinder_classification = computeClassProbas(model, example_page_kinder_chunks, tokenizer)
print(example_page_kinder_classification)

# Predicted class
example_page_kinder_class = example_page_kinder_classification.argmax(dim=1)
print(example_page_kinder_class)



Number of chunks: 1
['Eltern mit Kindern sollen mehr Geld bekommen. Das fordert die Partei Die Linke. Kindergeld leistet einen wichtigen Beitrag zur Bekämpfung von Kinderarmut. Die Linke will das Kindergeld um 75 Euro monatlich erhöhen.']
tensor([[8.1902e-05, 9.9992e-01]])
tensor([1])


## Update Pages in Database

In [18]:
def updatePage(db, id: str, values: dict = {}, collection="pages.content.extracted"):
    """Apply a ``$set`` update to the single document with the given ``_id``.

    Args:
        db: Database handle that supports ``db[collection].update_one(...)``.
        id: Document id; it is wrapped in ``ObjectId`` before being used.
        values: Field/value pairs to write under ``$set``.
        collection: Name of the collection that holds the document.

    Returns:
        The result object returned by ``update_one``.
    """
    selector = {"_id": ObjectId(id)}
    # Copy the caller's mapping so the update document owns its own dict.
    update_doc = {"$set": dict(values)}
    return db[collection].update_one(selector, update_doc)

In [18]:
from tqdm import tqdm

def updatePages(db, pages, field_name, collection="pages.content.extracted", dry_run=True):
    """Store each page's prediction under ``field_name`` in the database.

    Args:
        db: Database handle passed through to ``updatePage``.
        pages: Iterable of page dicts carrying ``"_id"`` and, after
            classification, a ``"prediction"`` entry.
        field_name: Name of the document field that receives the prediction.
        collection: Name of the collection to update.
        dry_run: When True (the default, matching the previous behaviour where
            the write call was commented out) nothing is written; the loop only
            assembles the values. Pass ``dry_run=False`` to persist them.

    Returns:
        None
    """
    print("Field name:", field_name)
    for page in tqdm(pages, desc="Uploading results"):
        page_id = page.get("_id")  # document id of the page
        # The prediction holds per-chunk info, averaged class probabilities
        # and the overall predicted class; a missing prediction becomes {}.
        values = {
            field_name: page.get("prediction", {}),
        }
        if not dry_run:
            updatePage(db, page_id, values, collection)


***

## Process Pages in Batches

In [19]:
def processBatch(
    articles, 
    model, 
    tokenizer, 
    chunk_size=512, 
    overlap=64, 
    show_progress=False,
    device=None
):
    """
    Classifies every article of a batch chunk by chunk and attaches the result.

    For each article the text is split with ``splitText``, class probabilities
    are computed for all chunks with ``computeClassProbas``, and the per-chunk
    probabilities are averaged to pick an overall class. The result is stored
    in place under ``article["prediction"]`` with the keys ``"chunks"``,
    ``"average_class_probas"`` and ``"overall_predicted_class"``.

    Args:
        articles: List of dicts with a ``"text"`` entry; modified in place.
        model: Classification model forwarded to ``computeClassProbas``.
        tokenizer: Tokenizer used for both splitting and encoding.
        chunk_size: Token budget per chunk (``n_tokens`` of ``splitText``).
        overlap: Token overlap between consecutive chunks.
        show_progress: Show a tqdm progress bar when True.
        device: Optional device the encoded inputs are moved to.

    Returns:
        tuple: ``(articles, runtimes)`` where ``runtimes`` holds the processing
        time in seconds for each article, in order.
    """
    
    runtimes = []  # per-article processing times in seconds
    
    for article in tqdm(articles, disable=not show_progress, desc="Processing articles"):
        start_time = time.perf_counter()
        
        # A missing or null text is treated as an empty string.
        article_text = article.get("text") or ""
        
        chunks = splitText(article_text, n_tokens=chunk_size, tokenizer=tokenizer, overlap=overlap)
        
        if not chunks:
            # Nothing to classify: record an explicit empty prediction instead
            # of crashing in the tokenizer/argmax on zero chunks.
            article["prediction"] = {
                "chunks": [],
                "average_class_probas": [],
                "overall_predicted_class": None
            }
            runtimes.append(time.perf_counter() - start_time)
            continue
        
        # One row of class probabilities per chunk
        probas = computeClassProbas(model, chunks, tokenizer, device=device)
        
        # Predicted class for each chunk
        predicted_classes = probas.argmax(dim=1)
        
        chunks_info = [{
            "chunk": chunk,
            "class_probas": proba.tolist(),
            "predicted_class": predicted_class.item()
        } for chunk, proba, predicted_class in zip(chunks, probas, predicted_classes)]
        
        # Average over the chunk dimension directly in torch. This works on any
        # device, whereas np.mean() on a non-CUDA tensor forwards numpy-style
        # keyword arguments to Tensor.mean and fails.
        average_class_probas = [float(proba) for proba in probas.mean(dim=0).tolist()]

        # Overall class = argmax of the averaged probabilities, as a Python int
        overall_predicted_class = int(np.argmax(average_class_probas))
        
        article["prediction"] = {
            "chunks": chunks_info,
            "average_class_probas": average_class_probas,
            "overall_predicted_class": overall_predicted_class
        }

        runtimes.append(time.perf_counter() - start_time)

    return articles, runtimes


In [20]:
# Re-create tokenizer and model from `model_path` as set in the earlier cell
# (the alternative path below is left disabled).
#model_path = "../models/bert_kinder_model_buff" # Path to the model e.g. "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

In [21]:
# Wrap the model for multi-GPU inference when more than one GPU is visible.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"Using {gpu_count} GPUs!")
    model = torch.nn.DataParallel(model)

# Pick CUDA when available, otherwise the CPU, and move the model there.
# The call is the cell's last expression, so the model repr is displayed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Using 2 GPUs!


DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(31102, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-23): 24 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_fea

In [22]:
# Run configuration for the batch loop below.
LIMIT = 10 # Number of articles to process in each batch
SKIP = 0 # Number of articles to skip in each batch
CHUNK_SIZE = 512 # Number of tokens in each chunk
OVERLAP = 64 # Number of overlapping tokens between chunks
# Topic name; used to build the result field name f"prediction_{TOPIC}".
TOPIC = "kinder"    

In [23]:
collection = "pages.content.extracted"
# Projection: only these fields are requested for each page.
fields = {"_id": 1, "batch_id": 1, "view_url": 1, "lang": 1, "domain": 1, "text": 1}
# Filter: pages of the selected batches that do not yet have a
# prediction_<TOPIC> field, have a "text" field, fewer than 100,000 words,
# and lang == "de".
query = {"batch_id": { "$in": batches }, f"prediction_{TOPIC}": {"$exists": False}, 
        "text": {"$exists": True}, "word_count": {"$lt": 100_000}, "lang": "de"}

In [24]:
batch_id = 0

# NOTE: SKIP never changes, so the loop only advances because pages that
# received a prediction field drop out of `query`. While the database write is
# disabled, removing the final `break` would fetch the same pages forever.
while True:
    print(f"------ Batch {batch_id} ------")

    # Fetch the next batch of pages that still lack a prediction
    pages = fetchPages(db, limit = LIMIT, skip = SKIP, query = query, fields = fields)
    
    # Stop if no more pages are available
    if not pages:
        break
    
    # Classify the batch; each page gets a "prediction" entry
    pages, runtimes = processBatch(pages, model, tokenizer, chunk_size=CHUNK_SIZE, overlap=OVERLAP, show_progress=True, device=device)

    # Write the predictions back under prediction_<TOPIC>
    updatePages(db, pages, field_name = f"prediction_{TOPIC}", collection=collection)
    
    # Timing summary for the batch
    print(f"Average processing time: {statistics.mean(runtimes):.2f} seconds")
    if len(runtimes) > 1:
        print(f"Standard deviation: {statistics.stdev(runtimes):.2f} seconds")
    else:
        # statistics.stdev needs at least two values; a final batch with a
        # single page would otherwise raise StatisticsError.
        print("Standard deviation: n/a (single page in batch)")

    batch_id += 1
    break # TODO: Remove to process all batches

------ Batch 0 ------


Processing articles:   0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1714 > 512). Running this sequence through the model will result in indexing errors


Processing articles: 100%|██████████| 10/10 [00:11<00:00,  1.14s/it]


Field name: prediction_kinder


Uploading results: 100%|██████████| 10/10 [00:00<00:00, 195995.51it/s]

Average processing time: 1.14 seconds
Standard deviation: 1.02 seconds





**Runtime Napkin Math:**
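>21,136,872 pages × 0.08 seconds per page ≈ 1,690,949 seconds
>1,690,949 seconds / 60 / 60 / 24 / 2 ≈ 10 days (the final division by 2 presumably reflects splitting the work across the 2 GPUs)
>
>Note: the 0.08 s/page figure is not derived in this notebook; the batch run above averaged 1.14 s/page.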

### Document Classification Aggregation Strategies

For document classification tasks where documents are split into chunks and each chunk is classified separately, several strategies can be employed to aggregate these chunk-level predictions to determine a single label for the entire document:

**Majority Voting:**
The label that occurs most frequently among all chunk predictions is assigned to the entire document.

**Weighted Voting:**
Similar to majority voting, but each vote (chunk prediction) is weighted based on confidence, chunk length, or position in the document.

**Threshold-Based Aggregation:**
Aggregate the prediction probabilities across chunks and choose the label with the highest cumulative probability, or set a threshold for label validity.
