## Classifier: In-Context Learning

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

import numpy as np
import os
import time
import random
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sampling strategy; interpolated into the processed-dataset directory name when datasets are loaded.
SAMPLING = "random" # "random", "stratified", "clustered", "shared_domain"
# Dataset variant suffix; appended directly after SAMPLING in the dataset directory name.
SUFFIX = "_extended" #"", "_holdout", "_extended"
# Last component of the dataset directory name (presumably the chunk length in tokens — TODO confirm).
MAX_CONTENT_LENGTH = 384 # 496, 192
# NOTE(review): not referenced in the visible cells; presumably the overlap between consecutive chunks — confirm.
OVERLAP = 64
# NOTE(review): not referenced in the visible cells; presumably selects the input fields given to the classifier — confirm.
FEATURES = "url" # "url", "content", "url_and_content"

In [3]:
# Topics to process; the topic name is part of both the dataset directory name and the index file name.
TOPICS = ["cannabis", "energie", "kinder"]

## Build Index

**Load Model:**

In [4]:
# Load the transformer-based model
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex

# Sentence encoder shared by the indexing and query cells.
encoder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Report the embedding size; the Annoy indices are created with this dimension.
embedding_dim = encoder.get_sentence_embedding_dimension()
print("Sentence embedding dimension", embedding_dim)

Sentence embedding dimension 768


**Create Index:**

In [5]:
# Mapping function that attaches sentence embeddings to a record or batch
def encode_to_embedding(example):
    """Add an 'embeddings' entry computed from the 'text' entry.

    Written for use with ``Dataset.map``. When mapped with ``batched=True``
    the 'text' entry holds all texts of the batch and they are encoded in a
    single ``encoder.encode`` call.

    Args:
        example: Mapping with a 'text' entry. It is modified in place.

    Returns:
        The same mapping, now also carrying 'embeddings' (whatever the
        module-level ``encoder.encode`` returned for the text).
    """
    texts = example['text']
    embeddings = encoder.encode(texts)
    example['embeddings'] = embeddings
    return example

# Build and persist an Annoy index from the pre-computed embeddings of a dataset
def build_document_index(encoder, dataset, topic, num_trees=10, index_dir='../../data/indices'):
    """Build an AnnoyIndex over the 'embeddings' of every record and save it to disk.

    The encoder is only used to look up the embedding dimension; the vectors
    themselves must already be present in the dataset.

    Args:
        encoder: Object exposing ``get_sentence_embedding_dimension()``.
        dataset: Sized iterable of records, each with an 'embeddings' vector.
            The position of a record in this iterable becomes its item id in
            the index, so lookups must use the same ordering.
        topic: Topic name; used in the index file name and the log message.
        num_trees: Number of trees passed to ``AnnoyIndex.build``.
        index_dir: Directory the index file is written to. It is created if
            it does not exist yet.

    Returns:
        None. The index is written to ``<index_dir>/page_index_<topic>.ann``.
    """

    # Initiate index
    dim = encoder.get_sentence_embedding_dimension()
    article_index = AnnoyIndex(dim, 'angular')

    # Add articles to index. tqdm wraps the dataset itself rather than the
    # enumerate object, so it can read len(dataset) and show real progress.
    for page_id, page in enumerate(tqdm(dataset, desc="Indexing articles")):
        article_index.add_item(page_id, page["embeddings"])

    # Build and save index; make sure the target directory exists first so
    # the work is not lost to a missing folder on a fresh checkout.
    article_index.build(num_trees)
    os.makedirs(index_dir, exist_ok=True)
    index_path = os.path.join(index_dir, f'page_index_{topic}.ann')
    article_index.save(index_path)
    print(f"Article index for topic '{topic}' saved successfully.")


In [6]:
for topic in TOPICS:
    # Every topic has its own processed dataset directory on disk.
    print(f"Loading dataset for {topic}")
    dataset_path = f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}"
    dataset = load_from_disk(dataset_path)

    # Only the train split is embedded and indexed.
    train_split = dataset["train"]
    dataset = train_split.map(encode_to_embedding, batched=True)
    build_document_index(encoder, dataset, topic, num_trees=5)

Loading dataset for cannabis


Indexing articles: 3815it [00:01, 2956.39it/s]


Article index for topic 'cannabis' saved successfully.
Loading dataset for energie


Map: 100%|██████████| 4227/4227 [00:08<00:00, 502.94 examples/s]
Indexing articles: 4227it [00:01, 2916.39it/s]


Article index for topic 'energie' saved successfully.
Loading dataset for kinder


Map: 100%|██████████| 3628/3628 [00:07<00:00, 516.80 examples/s]
Indexing articles: 3628it [00:01, 2921.93it/s]

Article index for topic 'kinder' saved successfully.





## Test Index

In [7]:
# Topic whose dataset and index are inspected in this section.
topic = "cannabis"

# Load all splits of that topic's processed dataset.
print(f"Loading dataset for {topic}")
dataset_path = f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}"
dataset = load_from_disk(dataset_path)

# Display the split overview.
dataset

Loading dataset for cannabis


DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 33702
    })
    extended: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length

In [8]:
# Re-open the saved index for the selected topic; dimension and metric are
# given again when constructing the AnnoyIndex before loading the file.
embedding_dim = encoder.get_sentence_embedding_dimension()
article_index = AnnoyIndex(embedding_dim, "angular")
index_path = f'../../data/indices/page_index_{topic}.ann'
article_index.load(index_path)

True

In [9]:
# Show the text of the first training record; the next cell uses this same text as the query.
dataset["train"][0]["text"]

'Dein sicherer Marktplatz Kleinanzeigen durchsuchen Kostenlos inserieren ✔ über 1.3 Mio Anzeigen ✔ über 210.000 Besucher pro Tag ✔ über 75.000 Anfragen pro Tag Was suchst Du? Wellness & Gesundheit Rubrik Bitte warte bis alle Daten geladen sind. PLZ oder Ort Dein Standort +25 km Umkreis im Ort +5 km +10 km +25 km +50 km +100 km +150 km +250 km maximal Finden Gemerkt Kleinanzeigen Düsseldorf Wellness & Gesundheit Massage in Düsseldorf Meine Suche massage Wellness & Gesundheit Düsseldorf Suche speichern Übergabe Abholung 79 Versand 15 Rubriken Alle Rubriken Wellness & Gesundheit 84 Kosmetik und Schönheit 63 Natürlich Leben 18 Medizinische Hilfsmittel, Rollstühle 2 Esoterik 1 Preis eingrenzen Preis von - Preis bis abschicken zu verschenken Anbieter nur Private 18 nur Gewerbliche 66 Angebotstyp nur Angebote 83 nur Gesuche 1 Ort Alle Städte Düsseldorf 38 Krefeld 16 Mönchengladbach 5 Mülheim an der Ruhr 5 Haan 3 Jüchen 3 weitere Städte Neuss 3 Solingen 3 Duisburg 2 Leverkusen 2 Hilden 1 Lange

In [10]:
# Encode query and search for similar articles.
# The query is encoded with the same default output as the indexed embeddings
# (no convert_to_tensor): Annoy reads the vector as a plain sequence of floats,
# and a torch tensor would only add a framework (and possibly GPU) round trip.
query_text = dataset["train"][0]["text"]
inferred_vector = encoder.encode(query_text, show_progress_bar=False)

# Ask for the 2 nearest items together with their angular distances.
# Querying with an indexed text should return that item first at distance ~0.
sims = article_index.get_nns_by_vector(inferred_vector, 2, search_k=-1, include_distances=True)
print(sims)

([0, 2763], [0.0, 0.3817378580570221])
