Integrantes del equipo:
- Alonso Cañas Rico
- Hugo Jiménez García

## Paths

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import os

# Root folder of the PAN24 generative-authorship news dataset.
# The default is the original Colab/Drive location; set the PAN24_DATA_DIR
# environment variable to run the notebook against a copy stored elsewhere.
DATA_DIR = os.environ.get(
    "PAN24_DATA_DIR",
    "/content/drive/MyDrive/HUGO/Master/NLP/pan24-generative-authorship-news",
)

# Directory of machine-generated JSONL files and the single human-written JSONL file
ai_generated_path = os.path.join(DATA_DIR, "machines")
human_path = os.path.join(DATA_DIR, "human.jsonl")

## Libraries

In [None]:
import os
import json
import pandas as pd

## Import data

In [None]:
# One record per JSONL line, gathered across every machine-generated file.
# Records are kept in a dedicated list so the builtin `id` is not shadowed
# and no generic module-level name such as `model` is bound to plain data.
generated_records = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of df_generated deterministic across machines and runs
for filename in sorted(os.listdir(ai_generated_path)):
    # Only JSONL files hold samples
    if not filename.endswith('.jsonl'):
        continue
    filepath = os.path.join(ai_generated_path, filename)
    with open(filepath, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            # Each non-empty line is a separate JSON object
            data = json.loads(line)
            # The source file name is used as the generator ("model") label
            generated_records.append({'model': filename, 'id': data['id'], 'text': data['text']})

df_generated = pd.DataFrame(generated_records, columns=['model', 'id', 'text']).assign(ai_generated=1)
df_generated

In [None]:
# One record per JSONL line of the human-written file; a dedicated list avoids
# shadowing the builtin `id`
human_records = []

with open(human_path, 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        if not line.strip():
            continue
        # Each non-empty line is a separate JSON object
        data = json.loads(line)
        human_records.append({'id': data['id'], 'text': data['text']})

# Same column layout as the machine-generated frame: model, id, text, ai_generated
df_human = (
    pd.DataFrame(human_records, columns=['id', 'text'])
    .assign(model='Human', ai_generated=0)
    [['model', 'id', 'text', 'ai_generated']]
)
df_human

In [None]:
# Stack machine-generated and human samples into one frame.
# ignore_index=True gives every row a unique label; without it both source
# frames contribute their own 0..n-1 labels and the index contains duplicates.
df = pd.concat([df_generated, df_human], ignore_index=True)
df

In [None]:
from sklearn.model_selection import train_test_split

test_size = 0.2
val_size = 0.1
# val_size is expressed relative to the full dataset; rescale it to the share
# of the rows that remain once the test split has been taken out
_adjusted_val_size = val_size / (1 - test_size)

# First cut: hold out the test set, stratified on the target column
X_train_val, X_test, y_train_val, y_test = train_test_split(
    df.drop(columns=['ai_generated']),
    df['ai_generated'],
    test_size=test_size,
    stratify=df['ai_generated'],
    shuffle=True,
    random_state=1337,
)
# Second cut: carve the validation set out of the remaining rows
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=_adjusted_val_size,
    stratify=y_train_val,
    shuffle=True,
    random_state=1337,
)

# Give every split a fresh 0..n-1 index
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
X_val, y_val = X_val.reset_index(drop=True), y_val.reset_index(drop=True)
X_test, y_test = X_test.reset_index(drop=True), y_test.reset_index(drop=True)

# Report the shape and class balance of each split
for split_name, split_features, split_target in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"X_{split_name} shape: {split_features.shape} / AI generated count: {split_target.value_counts()[1]} - Human count: {split_target.value_counts()[0]}")
    print(f"y_{split_name} shape: {split_target.shape}")

## Model

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Off-the-shelf text-classification pipeline built from the published checkpoint
pipe = pipeline(
    task="text-classification",
    model="Lau123/distilbert-base-uncased-detect_ai_generated_text",
)

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score

# Custom dataset compatible with PyTorch
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Iterable of input strings (list, pandas Series, ...).
        labels: Iterable of integer class labels, in the same order as ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")`` whose result exposes
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch dimension of size 1.
        max_len: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise both inputs as plain lists so __getitem__ is strictly
        # positional. Indexing a pandas Series with an integer is label-based
        # and fails (or picks the wrong row) when its index is not exactly 0..n-1.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenized sample at position ``index`` as a dict of tensors."""
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # Drop the leading batch dimension so the DataLoader can stack samples itself
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Tokenizer and sequence-classification model, both loaded from the same checkpoint
model_name = "Lau123/distilbert-base-uncased-detect_ai_generated_text"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# One dataset per split, built in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_features['text'], split_target, tokenizer)
    for split_features, split_target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; the evaluation loaders keep dataset order
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=256) for eval_dataset in (val_dataset, test_dataset)
)

# Use CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# NOTE(review): the AdamW exported by transformers is deprecated in favour of
# torch.optim.AdamW - confirm against the installed transformers version
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is a dict whose values are moved to ``device`` and passed to the
    model as keyword arguments; the model output must expose ``.loss``.
    """
    model.train()
    running_loss = 0
    for raw_batch in loader:
        # Move every entry of the batch onto the target device
        on_device = {}
        for key, value in raw_batch.items():
            on_device[key] = value.to(device)

        batch_loss = model(**on_device).loss
        running_loss += batch_loss.item()

        # Backpropagate, update the weights, then clear gradients for the next batch
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return running_loss / len(loader)

# Evaluation function
def evaluate(model, loader, device):
    """Return the accuracy of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and run without gradient tracking; the
    predicted class is the argmax of the logits along the last dimension.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for raw_batch in loader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**on_device).logits
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())
            expected.extend(on_device["labels"].cpu().numpy())
    return accuracy_score(expected, predicted)

# Training and validation: fine-tune for a fixed number of epochs and report
# the mean training loss and validation accuracy after each one
epochs = 3
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_accuracy = evaluate(model, val_loader, device)
    print(
        f"Epoch {epoch + 1}/{epochs}",
        f"Train Loss: {train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}",
        sep="\n",
    )

# Save the fine-tuned model and its tokenizer side by side in the same directory
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

# Final evaluation on the held-out test set
test_accuracy = evaluate(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Display the test-set labels for a quick visual check
y_test

In [None]:
import torch
from itertools import product
import pandas as pd
from sklearn.metrics import accuracy_score

# Asumimos que el modelo y el tokenizador ya están cargados
# model y tokenizer ya están definidos previamente

# Split the test texts by their true label (1 and 0 respectively)
label_1_texts = X_test['text'][y_test == 1].reset_index(drop=True)
label_0_texts = X_test['text'][y_test == 0].reset_index(drop=True)

# Every (label-1 text, label-0 text) pair, with the label-1 text varying slowest
combinations = [
    (text_with_label_1, text_with_label_0)
    for text_with_label_1 in label_1_texts
    for text_with_label_0 in label_0_texts
]

# Function that produces pairwise predictions in batches
def batch_predict(model, tokenizer, text_pairs, device, batch_size=16):
    """Decide, for every pair of texts, which one the model considers label 0.

    Each distinct text is tokenized and scored exactly once, however many
    pairs it appears in; the pairs are then resolved from the cached
    class-0 probabilities.

    Args:
        model: Callable classifier; ``model(**inputs).logits`` must be a
            ``(batch, num_classes)`` tensor. It is switched to eval mode.
        tokenizer: Callable invoked on a list of strings with
            ``return_tensors="pt", max_length=128, padding="max_length",
            truncation=True``; its result must support ``.to(device)`` and
            ``**`` unpacking.
        text_pairs: Sequence of ``(text1, text2)`` tuples of hashable texts.
        device: Device the tokenized inputs are moved to.
        batch_size: Number of distinct texts scored per forward pass.

    Returns:
        A list with one dict per pair, in input order, with keys ``"text1"``,
        ``"text2"``, ``"predicted_label_0"`` and ``"predicted_label_1"``.
        ``text1`` is predicted as label 0 only when its class-0 probability is
        strictly greater than that of ``text2``; ties go to ``text2``.
    """
    # Distinct texts in first-seen order (dict keys preserve insertion order)
    unique_texts = list(dict.fromkeys(text for pair in text_pairs for text in pair))

    # Make sure dropout and similar layers are disabled during inference
    model.eval()

    # Cache of class-0 probability per distinct text
    prob_label_0 = {}
    for start in range(0, len(unique_texts), batch_size):
        chunk = unique_texts[start:start + batch_size]

        # Batched tokenization
        inputs = tokenizer(chunk, return_tensors="pt", max_length=128,
                           padding="max_length", truncation=True).to(device)

        # Logits -> probabilities; only the class-0 column is needed
        with torch.no_grad():
            logits = model(**inputs).logits
        chunk_probs = torch.softmax(logits, dim=-1)[:, 0].cpu().tolist()
        prob_label_0.update(zip(chunk, chunk_probs))

    # Resolve every pair from the cached probabilities
    predictions = []
    for text1, text2 in text_pairs:
        first_is_label_0 = prob_label_0[text1] > prob_label_0[text2]
        predictions.append({
            "text1": text1,
            "text2": text2,
            "predicted_label_0": text1 if first_is_label_0 else text2,
            "predicted_label_1": text2 if first_is_label_0 else text1,
        })
    return predictions

# Score every pair in batches
batch_size = 256
results = batch_predict(model, tokenizer, combinations, device, batch_size=batch_size)

# One row per pair, one column per key of the prediction dicts
results_df = pd.DataFrame(data=results)

# Evaluate pairwise accuracy.
# Build the text -> true-label lookup once instead of scanning the whole
# X_test['text'] column for every result row. setdefault keeps the label of
# the FIRST occurrence of a text, matching a "first matching row" lookup.
# Assumes X_test and y_test are row-aligned (same order, same length).
true_label_by_text = {}
for test_text, test_label in zip(X_test['text'], y_test):
    true_label_by_text.setdefault(test_text, test_label)

# A pair is correct when the text predicted as label 0 really is label 0
# and the text predicted as label 1 really is label 1
correct_predictions = sum(
    1
    for predicted_0, predicted_1 in zip(results_df['predicted_label_0'], results_df['predicted_label_1'])
    if true_label_by_text[predicted_0] == 0 and true_label_by_text[predicted_1] == 1
)

accuracy = correct_predictions / len(results_df)
print(f"Pairwise Accuracy: {accuracy:.4f}")


## Pruebas

In [None]:
# from huggingface_hub import HfApi, HfFolder

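# # Read the Hugging Face token from the environment; never paste a real token into the notebook.
# # NOTE: a real-looking token was previously committed on this line - treat it as leaked and revoke it.
# api_key = os.environ["HF_TOKEN"]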

# # Login programmatically
# HfFolder.save_token(api_key)

# # Now you can use the HfApi with the token
# api = HfApi()
# user_info = api.whoami()
# print("Logged in as:", user_info)


In [None]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("text-classification", model="meta-llama/Llama-3.2-1B", device="cuda:0")

In [None]:
def prediction(index, X, y, pipe):
  """Classify one sample with ``pipe``, print label and prediction, return the prediction.

  Args:
    index: Zero-based position of the sample in ``X`` and ``y``.
    X: DataFrame with a ``text`` column.
    y: Labels aligned row-by-row with ``X`` (pandas Series or plain sequence).
    pipe: Callable applied to the first 512 characters of the text.

  Returns:
    Whatever ``pipe`` returns for the truncated text.
  """
  # Only the first 512 characters are sent to the classifier
  x = X.iloc[index]['text'][:512]
  # Read the label by position, like the text above; a label-based lookup
  # would break as soon as the index of y is not exactly 0..n-1
  label = y.iloc[index] if hasattr(y, "iloc") else y[index]
  y_predict = pipe(x)
  print(f"Label: {label}")
  print(f"Prediction: {y_predict}")
  print()
  return y_predict

# Spot-check a handful of training samples
for sample_index in (0, 3, 5):
    prediction(sample_index, X_train, y_train, pipe)
# Kept as the trailing expression so its return value is shown as the cell output
prediction(78, X_train, y_train, pipe)

In [None]:
# Count hits (aciertos) and misses (errores) of the pipeline over the test set
aciertos, errores = 0, 0
for i in range(len(X_test)):
  y_predict = prediction(i, X_test, y_test, pipe)
  # Decode into a separate variable: rebinding y_predict to an int would make
  # any later `y_predict[0]` fail with "'int' object is not subscriptable"
  predicted_label = 1 if y_predict[0]['label'] == 'LABEL_1' else 0
  # Positional lookup, consistent with how the sample text is selected by position
  if predicted_label == y_test.iloc[i]:
    aciertos += 1
  else:
    errores += 1
print(f"Aciertos: {aciertos}")
print(f"Errores: {errores}")

In [None]:
# y_predict can be either the raw pipeline output (a list of dicts) or an
# already-decoded integer label, depending on what last assigned it;
# subscript it only in the former case to avoid a TypeError
y_predict[0] if isinstance(y_predict, (list, tuple)) else y_predict