## Train Multiple Random Forrest Models

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Set Random Seed for Reproducibility

In [2]:
# Set a seed for random module
random.seed(42)

# Set a seed for numpy module
np.random.seed(42)

# Set a seed for torch module
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Define Parameters

In [3]:
SAMPLING = "random" # "random", "stratified", "clustered", "shared_domain"
SUFFIX = "_extended" #"", "_holdout", "_extended"
SPLIT = "extended" # "train", "test", "holdout", "extended"
MAX_CONTENT_LENGTH = 384 # 496, 192
OVERLAP = 64
FEATURES = "url" # "url", "content", "url_and_content"

In [4]:
TOPICS = ["cannabis", "kinder", "energie"]
#TOPICS = ["cannabis"]

**Extract URL-path:**

In [5]:
from urllib.parse import urlparse, urlunparse

def extract_url_path(example):
    view_url = example['view_url']
    if "://" not in view_url:
        view_url = "http://" + view_url  # Assume http if no protocol specified
    parsed_url = urlparse(view_url)
    new_url = urlunparse(('', '', parsed_url.path, parsed_url.params, parsed_url.query, parsed_url.fragment))
    example['url_path'] = new_url.lstrip('/')  # Store the result in a new field
    return example


extract_url_path({"view_url": "https://www.google.com/search?q=python+url+path"})

{'view_url': 'https://www.google.com/search?q=python+url+path',
 'url_path': 'search?q=python+url+path'}

## Train Model

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

german_stop_words = stopwords.words('german')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jschelb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [8]:
def prepare_input_data(train_data, features):
    """Prepares input data based on the specified features."""
    X_train = []
    for row in train_data:
        if features == "content":
            input = row["text"]
        elif features == "url":
            input = row["url_path"]
        elif features == "url_and_content":
            input = row["url_path"] + " " + row["text"]
        else:
            raise ValueError("Invalid value for features. Expected 'content', 'url', or 'url_and_content'.")
        X_train.append(input)
    return X_train


def train_model_rf(train_data, german_stop_words, features, max_features=10000):
    """Trains an RF model and returns the model and vectorizer."""
    
    X_train = prepare_input_data(train_data, features)
        
    # Create a TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=german_stop_words, max_features=max_features)
    X_train = vectorizer.fit_transform(X_train)
    y_train = np.array(train_data['label'])
    
    # Train an SVM classifier
    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  
    rf_classifier.fit(X_train, y_train)
    return rf_classifier, vectorizer

def get_predictions_rf(model, vectorizer, new_data):
    """Gets predictions for new data using the trained model and vectorizer."""
    X_new = vectorizer.transform(new_data)
    return model.predict(X_new)

## Evaluate Models

In [9]:
def get_predictions(tokenized_datasets, model, vectorizer, features, split="test"):
    """Use the trained model to make predictions on the test set."""
    
    preds = []
    labels = []
    input_data = prepare_input_data(tokenized_datasets[split], features)
    
    for i, input in enumerate(tqdm(input_data)):
        predicted_class = get_predictions_rf(model, vectorizer, [input])[0]
        preds.append(predicted_class)
        labels.append(tokenized_datasets[split][i]["label"])
    
    return preds, labels

In [10]:
def calc_metrics(labels, preds):
    """
    Calculates the accuracy, precision, recall, and F1 score for the given labels and predictions and returns them in a dictionary.
    """
    
    metrics = {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, average='binary'),
        'recall': recall_score(labels, preds, average='binary'),
        'f1': f1_score(labels, preds, average='binary'),
    }

    return metrics

In [11]:
def sample_random_from_dataset(dataset, n=5, subset='test'):
    """
    Samples n random examples from a specified subset of the dataset.
    """
    n = min(n, len(dataset[subset]))
    random_indices = random.sample(range(len(dataset[subset])), n)
    sampled_dataset = dataset[subset].select(random_indices)
    return sampled_dataset

**Get chunk level predictions:**

In [12]:
from collections import defaultdict
eval_results = defaultdict(dict)

for topic in TOPICS: # ----------------------------------------------------------------------

    print(f"\n\n###### Evaluating model on {topic} ###### \n\n")
    
    if FEATURES == "url":
        dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_{SAMPLING}{SUFFIX}")
        
        if SPLIT == "holdout":
                dataset["holdout"] = concatenate_datasets(
                    [dataset["holdout"], dataset["test"]])
        dataset = dataset.map(extract_url_path, num_proc=8) # Extract the path from the URL
    else:
        dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}")
        
        if SPLIT == "holdout":
                dataset["holdout"] = concatenate_datasets(
                    [dataset["holdout"], dataset["test"]])
                
        dataset = dataset.map(extract_url_path, num_proc=8) # Extract the path from the URL
        

    # Train Model
    model, vectorizer = train_model_rf(dataset['train'], german_stop_words, FEATURES)
    
    # Use the trained model to make predictions on the test set
    preds, labels = get_predictions(dataset, model, vectorizer, FEATURES, split=SPLIT)
    metrics = calc_metrics(labels, preds)
    print(f"Metrics for {topic}: {metrics}")
    
    # Add answers to the dataset
    dataset[SPLIT] = dataset[SPLIT].add_column("preds", preds)
    dataset.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_s_RF_{FEATURES}_{SPLIT}")
    
    # Update the eval_results dictionary
    eval_results[topic] = metrics
    
    # Clear GPU memory to avoid memory errors
    del model, vectorizer



###### Evaluating model on cannabis ###### 




Map (num_proc=8): 100%|██████████| 3448/3448 [00:00<00:00, 11158.17 examples/s]
100%|██████████| 44432/44432 [04:04<00:00, 181.42it/s]


Metrics for cannabis: {'accuracy': 0.9984470651782499, 'precision': 0.2872340425531915, 'recall': 0.9310344827586207, 'f1': 0.43902439024390244}


Saving the dataset (1/1 shards): 100%|██████████| 410/410 [00:00<00:00, 26753.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46/46 [00:00<00:00, 4460.58 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3448/3448 [00:00<00:00, 65420.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 44432/44432 [00:00<00:00, 212242.36 examples/s]




###### Evaluating model on kinder ###### 




100%|██████████| 53253/53253 [04:53<00:00, 181.56it/s]


Metrics for kinder: {'accuracy': 0.07729142020167878, 'precision': 0.000914968891057704, 'recall': 1.0, 'f1': 0.0018282649765372662}


Saving the dataset (1/1 shards): 100%|██████████| 384/384 [00:00<00:00, 26622.58 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 44/44 [00:00<00:00, 4202.52 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3722/3722 [00:00<00:00, 73213.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 53253/53253 [00:00<00:00, 230302.90 examples/s]




###### Evaluating model on energie ###### 




100%|██████████| 45925/45925 [04:14<00:00, 180.40it/s]


Metrics for energie: {'accuracy': 0.20651061513336963, 'precision': 0.0008499670980478175, 'recall': 1.0, 'f1': 0.0016984905350245186}


Saving the dataset (1/1 shards): 100%|██████████| 408/408 [00:00<00:00, 26987.90 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46/46 [00:00<00:00, 4473.41 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4164/4164 [00:00<00:00, 68431.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 45925/45925 [00:00<00:00, 225815.06 examples/s]


In [13]:
print(eval_results)

defaultdict(<class 'dict'>, {'cannabis': {'accuracy': 0.9984470651782499, 'precision': 0.2872340425531915, 'recall': 0.9310344827586207, 'f1': 0.43902439024390244}, 'kinder': {'accuracy': 0.07729142020167878, 'precision': 0.000914968891057704, 'recall': 1.0, 'f1': 0.0018282649765372662}, 'energie': {'accuracy': 0.20651061513336963, 'precision': 0.0008499670980478175, 'recall': 1.0, 'f1': 0.0016984905350245186}})


### Save Chunk Level Predictions and Output Results

In [14]:
from IPython.display import display, HTML
from tabulate import tabulate
import json

In [15]:
# Define the file path to save the dictionary
file_path =f"eval_results_rf_{FEATURES}_{SPLIT}_chunks.json"

In [16]:
# Save the dictionary to disk as JSON
with open(file_path, "w") as file:
    json.dump(eval_results, file)

In [17]:
with open(file_path, "r") as file:
    eval_results = json.load(file)

## Page Level Predictions

In [18]:
from collections import Counter

In [19]:
def majority_voting(answers):
    """Apply majority voting to a list of arbitrary classification answers."""
    count = Counter(answers)
    most_common = count.most_common()  # Get all common answers sorted by frequency

    if not most_common:
        return 0 # Handle empty input scenario

    # Check for ties at the highest count
    max_votes = most_common[0][1]
    tied_classes = [cls for cls, votes in most_common if votes == max_votes]

    if len(tied_classes) > 1:
        return max(tied_classes)  # Return the maximum class label in case of a tie
    return tied_classes[0]  # Return the class with the most votes

majority_voting([1, 1, 2, 2, 2, 3])

2

In [20]:
from collections import defaultdict
eval_results_pages = defaultdict(dict)

for topic in TOPICS: # ----------------------------------------------------------------------

    print(f"\n\n###### Evaluating on {topic} ###### \n\n")
    dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_s_RF_{FEATURES}_{SPLIT}")
    
    print(dataset)
    
    # Group dataset examples by URL, with a fallback to domain
    grouped_dataset = {}
    for example in tqdm(dataset[SPLIT]):
        url = example.get("view_url") or example.get("domain")
        example_filtered = {k: example[k] for k in ["text", "domain", "preds", "label", "category", "annotation_type", "lang"]}
        grouped_dataset.setdefault(url, []).append(example_filtered)
        
    # Extract labels
    labels = []
    for url, chunks in grouped_dataset.items():
        preds = [chunk["label"] for chunk in chunks]
        labels.append(max(preds))
        
    # Merge chunk level predictions
    predictions = []
    for url, chunks in grouped_dataset.items():
        preds = [chunk["preds"] for chunk in chunks]
        pred = majority_voting([pred for pred in preds if pred > 0]) if max(preds) > 0 else 0
        predictions.append(pred)

    # Use the trained model to make predictions on the test set
    metrics = calc_metrics(labels, predictions)
    print(f"Metrics for {topic}: {metrics}")
    
    # Update the eval_results dictionary
    eval_results_pages[topic] = metrics
    



###### Evaluating on cannabis ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 410
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 46
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 3448
    })
    extended: Dataset({
        features: ['_id', 'batch_id', 'domain', '

100%|██████████| 44432/44432 [00:08<00:00, 5458.41it/s]


Metrics for cannabis: {'accuracy': 0.9984865255596467, 'precision': 0.29347826086956524, 'recall': 0.9310344827586207, 'f1': 0.4462809917355372}


###### Evaluating on kinder ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 384
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 44
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic

100%|██████████| 53253/53253 [00:09<00:00, 5639.42it/s]


Metrics for kinder: {'accuracy': 0.07793833200591649, 'precision': 0.0009246132034765456, 'recall': 1.0, 'f1': 0.0018475181672619782}


###### Evaluating on energie ###### 


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 408
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'url_path'],
        num_rows: 46
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label'

100%|██████████| 45925/45925 [00:08<00:00, 5643.45it/s]


Metrics for energie: {'accuracy': 0.20651061513336963, 'precision': 0.0008499670980478175, 'recall': 1.0, 'f1': 0.0016984905350245186}


In [21]:
print(eval_results_pages)

defaultdict(<class 'dict'>, {'cannabis': {'accuracy': 0.9984865255596467, 'precision': 0.29347826086956524, 'recall': 0.9310344827586207, 'f1': 0.4462809917355372}, 'kinder': {'accuracy': 0.07793833200591649, 'precision': 0.0009246132034765456, 'recall': 1.0, 'f1': 0.0018475181672619782}, 'energie': {'accuracy': 0.20651061513336963, 'precision': 0.0008499670980478175, 'recall': 1.0, 'f1': 0.0016984905350245186}})


### Save Chunk Level Predictions and Output Results

In [22]:
# Define the file path to save the dictionary
file_path = f"eval_results_rf_{FEATURES}_{SPLIT}_pages.json"

In [23]:
# Save the dictionary to disk as JSON
with open(file_path, "w") as file:
    json.dump(eval_results_pages, file)

In [24]:
with open(file_path, "r") as file:
    eval_results_pages = json.load(file)