In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import evaluate
import torch
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [2]:
def split_into_paragraphs(text, max_sentences=7):
    """
    Splits a text into pseudo-paragraphs of at most `max_sentences` sentences.
    Sentences are detected with `sent_tokenize` and re-joined with a single
    space, so the original line breaks of `text` are not preserved.
    Parameters:
        text: text to split
        max_sentences: maximum number of sentences per paragraph (must be >= 1)
    Returns:
        list: non-empty paragraph strings in their original order
              (empty list when no sentence is found)
    Raises:
        ValueError: if max_sentences is smaller than 1
    """
    # A negative step would make range() yield nothing and silently return no
    # paragraphs; a zero step fails with an unrelated-looking range() error.
    if max_sentences < 1:
        raise ValueError(f"max_sentences must be >= 1, got {max_sentences}")

    sentences = sent_tokenize(text)
    chunks = (
        " ".join(sentences[start : start + max_sentences]).strip()
        for start in range(0, len(sentences), max_sentences)
    )
    # Drop chunks that are empty after stripping whitespace.
    return [chunk for chunk in chunks if chunk]

In [3]:
def predict_whole_text(text, tokenizer, model):
    """
    Predicts if a text is human (label = 0) or AI (label = 1)
    Parameters:
        text: text to predict
        tokenizer: callable that encodes `text` into a mapping of PyTorch
                   tensors which is passed to `model` as keyword arguments
        model: classification model whose output exposes `.logits`;
               class index 0 is read as human, class index 1 as AI
    Returns:
        dict: "prediction" (predicted class index), "confidence" (the larger
              of the two class probabilities) and "probabilities"
              ({"human": ..., "ai": ...})
    Note:
        The model is switched to eval mode and left in that mode.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # The tokenizer builds its tensors on the CPU. If the model reports a
    # device, move the inputs there so the forward pass does not fail with a
    # device mismatch when the model lives on an accelerator.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    model.eval()

    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = F.softmax(logits, dim=-1)

        predicted_class = torch.argmax(probabilities, dim=-1).item()

        # Only the first row is read: a single text yields a single prediction.
        human_prob = probabilities[0][0].item()
        ai_prob = probabilities[0][1].item()

    return {
        "prediction": predicted_class,
        "confidence": max(human_prob, ai_prob),
        "probabilities": {
            "human": human_prob,
            "ai": ai_prob
        }
    }

In [4]:
def predict_paragraph(text, tokenizer, model, max_sentences=7):
    """
    Classifies a text paragraph by paragraph and aggregates by majority vote.
    The text is cut into paragraphs with `split_into_paragraphs`, every
    paragraph is classified on its own, and the text is labelled 1 only when
    strictly more than half of the paragraphs are predicted as 1. Ties and
    texts that yield no paragraph therefore resolve to 0.
    Parameters:
        text: text to predict
        tokenizer: callable that encodes a paragraph into a mapping of PyTorch
                   tensors which is passed to `model` as keyword arguments
        model: classification model whose output exposes `.logits`
        max_sentences: maximum number of sentences per paragraph
    Returns:
        dict: "prediction" (majority label), "paragraph_predictions" (one
              label per paragraph) and "num_paragraphs"
    Note:
        The model is switched to eval mode and left in that mode.
    """
    paragraphs = split_into_paragraphs(text, max_sentences)

    model.eval()

    # The tokenizer builds its tensors on the CPU. If the model reports a
    # device, inputs are moved there to avoid a device mismatch.
    device = getattr(model, "device", None)

    predictions = []
    with torch.no_grad():
        for paragraph in paragraphs:
            inputs = tokenizer(paragraph, return_tensors="pt", truncation=True, padding=True, max_length=512)
            if device is not None:
                inputs = {
                    name: value.to(device) if hasattr(value, "to") else value
                    for name, value in inputs.items()
                }
            probs = F.softmax(model(**inputs).logits, dim=-1)
            predictions.append(torch.argmax(probs, dim=-1).item())

    # Strict majority: an exact tie (or no paragraphs at all) gives 0.
    majority = int(sum(predictions) > len(predictions) / 2)
    return {
        "prediction": majority,
        "paragraph_predictions": predictions,
        "num_paragraphs": len(paragraphs),
    }

In [5]:
def predict_sentence(text, tokenizer, model, context_size=2):
    """
    Classifies a text sentence by sentence and aggregates by majority vote.
    The text is split with `sent_tokenize`. Each sentence is classified
    together with up to `context_size` neighbouring sentences on either side;
    the sentence being classified is wrapped in "[TARGET] ... [/TARGET]"
    markers. The text is labelled 1 only when strictly more than half of the
    sentences are predicted as 1, so ties and texts without sentences resolve
    to 0.
    Parameters:
        text: text to predict
        tokenizer: callable that encodes a string into a mapping of PyTorch
                   tensors which is passed to `model` as keyword arguments
        model: classification model whose output exposes `.logits`
        context_size: number of context sentences kept before and after the
                      target sentence
    Returns:
        dict: "prediction" (majority label), "sentence_predictions" (one
              label per sentence) and "num_sentences"
    Note:
        The model is switched to eval mode and left in that mode.
    """
    sentences = sent_tokenize(text)

    model.eval()

    # The tokenizer builds its tensors on the CPU. If the model reports a
    # device, inputs are moved there to avoid a device mismatch.
    device = getattr(model, "device", None)

    predictions = []
    with torch.no_grad():
        for i, sentence in enumerate(sentences):
            # Clamp the context window to the bounds of the text.
            start = max(0, i - context_size)
            end = min(len(sentences), i + context_size + 1)
            context = sentences[start:i] + [f"[TARGET] {sentence} [/TARGET]"] + sentences[i+1:end]
            input_text = " ".join(context)

            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            if device is not None:
                inputs = {
                    name: value.to(device) if hasattr(value, "to") else value
                    for name, value in inputs.items()
                }
            probs = F.softmax(model(**inputs).logits, dim=-1)
            predictions.append(torch.argmax(probs, dim=-1).item())

    # Strict majority: an exact tie (or no sentences at all) gives 0.
    majority = int(sum(predictions) > len(predictions) / 2)
    return {
        "prediction": majority,
        "sentence_predictions": predictions,
        "num_sentences": len(sentences),
    }

In [6]:
def aggregation_strategy1_prediction(
    input,
    whole_text_classification_model,
    whole_text_classification_tokenizer,
    paragraph_classification_model,
    paragraph_classification_tokenizer,
    sentence_classification_model,
    sentence_classification_tokenizer,
):
    """
    Classifies a text by combining the whole-text, paragraph-level and
    sentence-level classifiers. Each classifier casts one vote and the text
    is labelled AI when at least two of the three votes are AI.
        Parameters:
            - input: input text to be classified
            - whole_text_classification_model: model for whole text classification
            - whole_text_classification_tokenizer: tokenizer for whole text model
            - paragraph_classification_model: model for paragraph classification
            - paragraph_classification_tokenizer: tokenizer for paragraph model
            - sentence_classification_model: model for sentence classification
            - sentence_classification_tokenizer: tokenizer for sentence model
        Returns:
            - prediction: prediction of the input text (0 = human, 1 = AI)
    """
    # Note the argument order: the predict_* helpers take (text, tokenizer, model).
    whole_text_result = predict_whole_text(
        input, whole_text_classification_tokenizer, whole_text_classification_model
    )
    paragraph_result = predict_paragraph(
        input, paragraph_classification_tokenizer, paragraph_classification_model
    )
    sentence_result = predict_sentence(
        input, sentence_classification_tokenizer, sentence_classification_model
    )

    ai_votes = sum(
        result["prediction"]
        for result in (whole_text_result, paragraph_result, sentence_result)
    )
    if ai_votes >= 2:
        return 1
    return 0

In [7]:
# TODO - load all 3 models/tokenizers

In [8]:
# TODO - load test data

In [9]:
# TODO - evaluate