## Get Predictions

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score
import random
from tqdm import tqdm
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Set Random Seed for Reproducibility

In [2]:
# Single seed shared by every random number generator seeded below;
# change it here to vary a run.
SEED = 42

# Python's built-in RNG
random.seed(SEED)

# NumPy's legacy global RNG
np.random.seed(SEED)

# PyTorch: default generator, plus every CUDA device explicitly.
# manual_seed_all (rather than manual_seed) covers all GPUs, not only the
# current one; it is silently ignored when CUDA is unavailable.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic algorithms and switch off its autotuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Define Parameters

In [3]:
# Sampling strategy name; interpolated into the dataset directory name that is
# loaded and saved by the prediction cell.
SAMPLING = "random" # "random", "stratified", "clustered", "shared_domain"
# Dataset-variant suffix; also interpolated into the dataset directory name.
SUFFIX = "_extended" #"", "_holdout", "_extended"
# Names of the dataset splits that receive predictions.
SPLITS = ['train', 'test', 'holdout', 'extended', 'holdout_url', 'extended_url']
# Interpolated into the dataset directory name. Presumably the chunk length
# used when the dataset was preprocessed - TODO confirm.
MAX_CONTENT_LENGTH = 384 # 496, 192
# Not referenced by the cells visible in this notebook. Presumably the chunk
# overlap used during preprocessing - TODO confirm.
OVERLAP = 64
# Selects which row fields are tokenized for prediction; also part of the
# fine-tuned model directory name and of the output dataset path.
FEATURES = "url_and_content" # "url", "content", "url_and_content"

In [4]:
# Topic name; part of the dataset path and of the fine-tuned model directory.
TOPIC = "cannabis" # "cannabis", "kinder", "energie"
# Base checkpoint identifier: the tokenizer is loaded from this name, and the
# fine-tuned model directory is derived from it with "/" replaced by "_".
MODEL = "deepset/gbert-large"

**Extract URL-path:**

In [5]:
from urllib.parse import urlparse, urlunparse

def extract_url_path(example):
    """Add a 'url_path' field holding the URL without its scheme and host.

    The path, params, query and fragment of ``example['view_url']`` are kept
    and re-assembled; leading slashes are stripped from the result.

    Args:
        example: Mapping with a 'view_url' string. It is modified in place.

    Returns:
        The same ``example`` object, now carrying a 'url_path' key.
    """
    view_url = example['view_url']
    parsed_url = urlparse(view_url)
    if not parsed_url.netloc:
        # No host was recognised, so the URL lacks a usable "scheme://" prefix
        # (e.g. "example.com/a"): assume http. Deciding from the parse result,
        # rather than searching for "://" anywhere in the string, keeps the
        # host out of the path when "://" only occurs later on, e.g. in a
        # query value such as "example.com/go?next=http://other.org".
        parsed_url = urlparse("http://" + view_url)
    # Empty scheme and netloc: only the part after the host is re-assembled.
    new_url = urlunparse(('', '', parsed_url.path, parsed_url.params, parsed_url.query, parsed_url.fragment))
    example['url_path'] = new_url.lstrip('/')  # Store the result in a new field
    return example


extract_url_path({"view_url": "https://www.google.com/search?q=python+url+path"})

{'view_url': 'https://www.google.com/search?q=python+url+path',
 'url_path': 'search?q=python+url+path'}

## Evaluate Models

In [6]:
def get_predictions(tokenized_datasets, tokenizer, model, device, features, split="test"):
    """Run the model over one split, row by row, and collect its predictions.

    Each row is tokenized and sent through the model on its own (one forward
    pass per row) with gradient tracking disabled. The model's train/eval mode
    is not changed by this function.

    Args:
        tokenized_datasets: Mapping from split name to an iterable of rows.
            Every row must provide "label" and the fields required by
            `features` ("text" and/or "url_path").
        tokenizer: Callable that encodes a text (or a pair of texts) and
            returns an object whose `.to(device)` result can be unpacked as
            keyword arguments for `model`.
        model: Callable returning an object with a `.logits` attribute.
        device: Device passed to the encoded inputs' `.to()`.
        features: "content" (row["text"]), "url" (row["url_path"]) or
            "url_and_content" (both, encoded as a pair with the URL path first).
        split: Name of the split to predict on.

    Returns:
        A tuple `(preds, labels, probabilities)` of equally long lists: the
        predicted class index, the row's "label", and the softmax probability
        at index 1 (the positive class) for each row.

    Raises:
        ValueError: If `features` is not one of the three supported values.
    """
    # Validate once, before touching any data, so that a bad value is reported
    # even when the split is empty.
    if features == "content":
        text_columns = ("text",)
    elif features == "url":
        text_columns = ("url_path",)
    elif features == "url_and_content":
        text_columns = ("url_path", "text")
    else:
        raise ValueError("Invalid value for FEATURES. Expected 'content', 'url', or 'url_and_content'.")

    preds = []
    labels = []
    probabilities = []

    for row in tqdm(tokenized_datasets[split]):
        # Encode the text input(s); two columns are encoded as a sequence pair
        texts = [row[column] for column in text_columns]
        inputs = tokenizer(*texts, padding="max_length", truncation=True, return_tensors="pt")

        with torch.no_grad():
            # Forward pass
            outputs = model(**inputs.to(device))
            # Apply softmax to logits to get probabilities
            class_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Exactly one row was encoded, so its probabilities sit at batch index 0.
        # Indexing the row explicitly keeps argmax a per-row class index.
        row_probs = class_probs[0]

        # Store the predictions, labels, and probabilities
        preds.append(torch.argmax(row_probs, dim=-1).item())
        labels.append(row["label"])
        probabilities.append(row_probs[1].item())  # Probability of the positive class

    return preds, labels, probabilities

In [7]:
def calc_metrics(labels, preds):
    """
    Score predictions against labels.

    Returns a dictionary with the keys 'accuracy', 'precision', 'recall' and
    'f1'; the last three are computed with binary averaging.
    """
    scores = {'accuracy': accuracy_score(labels, preds)}

    # The remaining scorers share one call signature
    binary_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, scorer in binary_scorers:
        scores[metric_name] = scorer(labels, preds, average='binary')

    return scores

In [8]:
def sample_random_from_dataset(dataset, n=5, subset='test'):
    """
    Draw up to n distinct rows, chosen at random, from one subset of the dataset.

    When the subset holds fewer than n rows, all of them are returned (in
    random order). Row indices come from the global `random` module.
    """
    split_rows = dataset[subset]
    total = len(split_rows)
    # Never ask for more rows than the subset contains
    sample_size = total if total < n else n
    chosen = random.sample(range(total), sample_size)
    return split_rows.select(chosen)

**Get chunk-level predictions:**

In [9]:
from collections import defaultdict
eval_results = defaultdict(dict)

# Number of rows drawn at random from each split before predicting. The dataset
# saved at the end of this cell contains only these rows. Set to None to predict
# on (and save) every row of every split.
PREDICTION_SAMPLE_SIZE = 100


print(f"\n\n###### Evaluating model {MODEL} on {TOPIC} ###### \n\n")


dataset = load_from_disk(
    f"../../data_ccu/tmp/processed_dataset_{TOPIC}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_with_urls")

# Extract the path from the URL
dataset = dataset.map(extract_url_path)

# Load the tokenizer by its base checkpoint name and the fine-tuned weights
# from the local model directory
model_name_local = f"../../models_ccu/{MODEL.replace('/','_')}_{TOPIC}_model_{FEATURES}/"
print(f"Loading model from {model_name_local}")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(model_name_local, num_labels=2, local_files_only=True)

# Use multiple GPUs if available
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)

# Move model to GPU if available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE)


for split in SPLITS:
    print(f"Get Predictions for Split: {split}")

    # Optionally restrict the split to a random sample to keep the run short
    if PREDICTION_SAMPLE_SIZE is not None:
        dataset[split] = sample_random_from_dataset(dataset, n=PREDICTION_SAMPLE_SIZE, subset=split)

    # Predict on the (possibly sampled) split and attach the results as columns
    preds, labels, probas = get_predictions(dataset, tokenizer, model, DEVICE, FEATURES, split=split)
    dataset[split] = dataset[split].add_column("preds", preds)
    dataset[split] = dataset[split].add_column("probas", probas)

# Use the last path component of the model id so that an id without a
# namespace (no "/") does not raise an IndexError.
model_short_name = MODEL.split('/')[-1]
dataset.save_to_disk(f"../../data_ccu/tmp/processed_dataset_{TOPIC}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_with_urls_{model_short_name}_{FEATURES}_with_predictions")

#metrics = calc_metrics(labels, preds)
#print(f"Metrics for {MODEL} on {TOPIC}: {metrics}")

# # Add answers to the dataset
# dataset[SPLIT] = dataset[SPLIT].add_column("preds", preds)
# dataset[SPLIT] = dataset[SPLIT].add_column("probas", probas)
# dataset.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_s_{model_name.split('/')[1]}_{FEATURES}_{SPLIT}")

# # Update the eval_results dictionary
# eval_results[model_name][topic] = metrics

# # Clear GPU memory to avoid memory errors
# del model, tokenizer
# torch.cuda.empty_cache()



###### Evaluating model deepset/gbert-large on cannabis ###### 




Map: 100%|██████████| 3815/3815 [00:00<00:00, 4760.82 examples/s]
Map: 100%|██████████| 507/507 [00:00<00:00, 5652.61 examples/s]
Map: 100%|██████████| 33702/33702 [00:06<00:00, 5041.05 examples/s]
Map: 100%|██████████| 224737/224737 [00:44<00:00, 5101.51 examples/s]
Map: 100%|██████████| 1094/1094 [00:00<00:00, 4643.09 examples/s]
Map: 100%|██████████| 39424/39424 [00:07<00:00, 4978.16 examples/s]


Loading model from ../../models_ccu/deepset_gbert-large_cannabis_model_url_and_content/
Using 2 GPUs!
Get Predictions for Split: train


100%|██████████| 100/100 [00:09<00:00, 10.63it/s]
Flattening the indices: 100%|██████████| 100/100 [00:00<00:00, 11172.60 examples/s]


Get Predictions for Split: test


100%|██████████| 100/100 [00:08<00:00, 12.19it/s]
Flattening the indices: 100%|██████████| 100/100 [00:00<00:00, 10309.47 examples/s]


Get Predictions for Split: holdout


100%|██████████| 100/100 [00:08<00:00, 12.07it/s]
Flattening the indices: 100%|██████████| 100/100 [00:00<00:00, 10328.00 examples/s]


Get Predictions for Split: extended


100%|██████████| 100/100 [00:08<00:00, 12.18it/s]
Flattening the indices: 100%|██████████| 100/100 [00:00<00:00, 9499.91 examples/s]


Get Predictions for Split: holdout_url


100%|██████████| 100/100 [00:08<00:00, 12.00it/s]
Flattening the indices: 100%|██████████| 100/100 [00:00<00:00, 9535.11 examples/s]


Get Predictions for Split: extended_url


100%|██████████| 100/100 [00:08<00:00, 11.84it/s]
Flattening the indices: 100%|██████████| 100/100 [00:00<00:00, 9621.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 13060.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 12981.04 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 13662.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 15613.68 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 12829.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 12164.10 examples/s]
