Integrantes del equipo:
- Alonso Cañas Rico
- Hugo Jiménez García

In [None]:
import torch

# Show which device torch would pick: CUDA when available, otherwise CPU.
# The value is only displayed as the cell output here; it is not stored.
torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Paths

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths configured in this
# notebook all live under that mount point.
drive.mount('/content/drive')

In [None]:
# Dataset locations on the mounted Drive:
#   ai_generated_path -> directory holding the machine-generated *.jsonl files
#   human_path        -> single JSONL file with the human-written texts
# The commented-out pairs are alternative Drive layouts; keep exactly one pair active.
# ai_generated_path = "/content/drive/MyDrive/HUGO/Master/NLP/pan24-generative-authorship-news/machines"
# human_path = "/content/drive/MyDrive/HUGO/Master/NLP/pan24-generative-authorship-news/human.jsonl"
ai_generated_path = "/content/drive/MyDrive/pan24-generative-authorship-news/machines"
human_path = "/content/drive/MyDrive/pan24-generative-authorship-news/human.jsonl"
# ai_generated_path = "/content/drive/MyDrive/UPM/Master/NLP/pan24-generative-authorship-news/machines"
# human_path = "/content/drive/MyDrive/UPM/Master/NLP/pan24-generative-authorship-news/human.jsonl"

## Libraries

In [None]:
import os
import json
import pandas as pd

## Import data

In [None]:
# Load every machine-generated text. Each *.jsonl file in the directory is
# recorded under its own file name in the `model` column.
generated_records = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore any seeded shuffle/split downstream) reproducible across runs.
for filename in sorted(os.listdir(ai_generated_path)):
    # Only JSONL files are data files.
    if not filename.endswith('.jsonl'):
        continue
    filepath = os.path.join(ai_generated_path, filename)
    with open(filepath, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing newline), which json.loads rejects.
            if not line.strip():
                continue
            # Each non-empty line is a separate JSON object.
            data = json.loads(line)
            generated_records.append(
                {'model': filename, 'id': data['id'], 'text': data['text']}
            )

# Building from records avoids module-level lists named `id`/`model`, which
# would shadow the `id` builtin for the rest of the notebook.
df_generated = pd.DataFrame(generated_records, columns=['model', 'id', 'text'])
df_generated['ai_generated'] = 1  # 1 marks machine-generated text
df_generated

In [None]:
# Load the human-written texts from the single JSONL file.
human_records = []

with open(human_path, 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        # Tolerate blank lines (e.g. a trailing newline), which json.loads rejects.
        if not line.strip():
            continue
        # Each non-empty line is a separate JSON object.
        data = json.loads(line)
        human_records.append({'id': data['id'], 'text': data['text']})

# Building from records avoids a module-level list named `id`, which would
# shadow the builtin. Column order matches df_generated.
df_human = pd.DataFrame(human_records, columns=['id', 'text'])
df_human.insert(0, 'model', 'Human')
df_human['ai_generated'] = 0  # 0 marks human-written text
df_human

In [None]:
# Stack machine and human rows, keeping only the text and its binary label.
df = pd.concat([df_generated, df_human]).loc[:, ['text', 'ai_generated']]
df

In [None]:
# Split the combined frame into human-written and AI-generated texts,
# each with a fresh 0..n-1 index and only the `text` column.
df_human = df.loc[df['ai_generated'] == 0, ['text']].reset_index(drop=True)
df_ia = df.loc[df['ai_generated'] == 1, ['text']].reset_index(drop=True)

In [None]:
# Cartesian product: every human text paired with every AI text.
# NOTE(review): each text ends up in many pairs, so a later row-wise split of
# these pairs places the same texts in several splits — confirm this is intended.
cartesian_df = df_human.merge(df_ia, how='cross', suffixes=('_human', '_ia'))

# Shuffle with a fixed seed so the two halves below (and everything derived
# from them) are reproducible; 1337 is the seed used for the data split.
cartesian_df = cartesian_df.sample(frac=1, random_state=1337).reset_index(drop=True)

# Total number of pairs and the point where the frame is cut in two.
total_combinations = len(cartesian_df)
midpoint = total_combinations // 2

# First half: human text in comment_text_1, AI text in comment_text_2.
half_1 = cartesian_df.iloc[:midpoint].copy()
half_1['comment_text_1'] = half_1['text_human']
half_1['comment_text_2'] = half_1['text_ia']
half_1['list'] = 0  # label 0 for this arrangement

# Second half: AI text in comment_text_1, human text in comment_text_2.
half_2 = cartesian_df.iloc[midpoint:].copy()
half_2['comment_text_1'] = half_2['text_ia']
half_2['comment_text_2'] = half_2['text_human']
half_2['list'] = 1  # label 1 for this arrangement



In [None]:
# Stack both halves into a single frame with a fresh 0..n-1 index.
balanced_df = pd.concat((half_1, half_2), ignore_index=True)

In [None]:
# Preview both ends of the frame, then check the label distribution.
display(balanced_df.head(), balanced_df.tail())
label_counts = balanced_df['list'].value_counts()
print(label_counts)

In [None]:
# Keep only the two model inputs and the label. errors='ignore' makes the cell
# safe to re-run after the helper columns have already been dropped.
balanced_df = balanced_df.drop(columns=['text_human', 'text_ia'], errors='ignore')
# Seeded shuffle so the row order is reproducible (same seed as the data split).
balanced_df = balanced_df.sample(frac=1, random_state=1337).reset_index(drop=True)
balanced_df

In [None]:
# 0 representa que comment_text_1 es humano, y 1 representa que comment_text_2 es humano

In [None]:
from sklearn.model_selection import train_test_split

test_size = 0.2
val_size = 0.1
# The validation share is expressed relative to what remains after the test split.
_adjusted_val_size = val_size / (1 - test_size)

# Two stratified splits on the `list` label: first carve out test, then validation.
train_val, test = train_test_split(
    balanced_df,
    test_size=test_size,
    stratify=balanced_df['list'],
    shuffle=True,
    random_state=1337,
)
train, val = train_test_split(
    train_val,
    test_size=_adjusted_val_size,
    stratify=train_val['list'],
    shuffle=True,
    random_state=1337,
)
for _split in (train, val, test):
    _split.reset_index(drop=True, inplace=True)

# Report each split's shape and per-label counts.
print(f"train shape: {train.shape} / Text on comment_text_1 is human-generated: {train['list'].value_counts()[0]} - Text on comment_text_2 is human-generated: {train['list'].value_counts()[1]}")
print(f"val shape: {val.shape} / Text on comment_text_1 is human-generated: {val['list'].value_counts()[0]} - Text on comment_text_2 is human-generated: {val['list'].value_counts()[1]}")
print(f"test shape: {test.shape} / Text on comment_text_1 is human-generated: {test['list'].value_counts()[0]} - Text on comment_text_2 is human-generated: {test['list'].value_counts()[1]}")

## Model

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Tokenizer and classifier are loaded from the same pretrained checkpoint.
model_name = "Lau123/distilbert-base-uncased-detect_ai_generated_text"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
# Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("text-classification", model="Lau123/distilbert-base-uncased-detect_ai_generated_text")
# model = pipe.model
# model

In [None]:
# Freeze every parameter except the final classifier's weight and bias.
_TRAINABLE_PARAMS = ("classifier.weight", "classifier.bias")
for name, param in model.named_parameters():
    if name not in _TRAINABLE_PARAMS:
        param.requires_grad = False

# Print each parameter's flag so the freeze can be checked by eye.
for name, param in model.named_parameters():
    print(f"{name}: requires_grad = {param.requires_grad}")


In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Torch dataset that tokenizes (comment_text_1, comment_text_2) pairs.

    Args:
        dataframe: frame with ``comment_text_1``, ``comment_text_2`` and ``list``
            (the integer label) columns.
        tokenizer: callable that encodes a text pair and exposes ``input_ids``
            and ``attention_mask`` on its result.
        max_len: length every encoded pair is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Bracket access is used for `list`, a name that attribute access makes ambiguous.
        self.comment_text_1 = dataframe['comment_text_1']
        self.comment_text_2 = dataframe['comment_text_2']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded pair and label for the row at position ``index``."""
        # .iloc is positional, matching how a DataLoader indexes (0..len-1),
        # so the dataset also works when the frame's index was not reset.
        comment_text_1 = " ".join(str(self.comment_text_1.iloc[index]).split())
        comment_text_2 = " ".join(str(self.comment_text_2.iloc[index]).split())
        # One fixed-length encoding per pair. Overflowing tokens are not
        # requested: for a text pair they either raise in the tokenizer or
        # produce several chunks per item, which the default collate cannot batch.
        inputs = self.tokenizer(comment_text_1, comment_text_2,
                                max_length=self.max_len,
                                padding="max_length",
                                truncation=True)
        return {
            'input_ids': torch.tensor(inputs.input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(inputs.attention_mask, dtype=torch.long),
            'labels': torch.tensor(int(self.targets.iloc[index]), dtype=torch.long)
        }


In [None]:
from transformers import BertModel
import torch

class TransformerClass(torch.nn.Module):
    """Scores a pair of encoded texts by a learned cosine similarity.

    Both inputs go through the same encoder; the first-token vectors are
    projected, regularised, compared with cosine similarity, and the similarity
    is rescaled by a 1->1 linear layer.

    Args:
        encoder: module whose output exposes ``last_hidden_state``. Defaults to
            the notebook-level ``model`` so ``TransformerClass()`` keeps working.
    """

    def __init__(self, encoder=None):
        super().__init__()
        if encoder is None:
            encoder = model
        # A sequence-classification wrapper returns logits, not hidden states.
        # Hugging Face models expose their underlying encoder as `base_model`;
        # any other module is used as given.
        self.l1 = getattr(encoder, "base_model", encoder)
        self.l2 = torch.nn.Linear(768, 768)
        self.l3 = torch.nn.Dropout(0.1)
        self.l4 = torch.nn.CosineSimilarity(dim=1)
        self.l5 = torch.nn.Linear(1, 1)

    def forward(self, ids_0, mask_0, token_type_ids_0, ids_1, mask_1, token_type_ids_1):
        """Return one similarity score per pair, shape ``(batch,)``.

        ``token_type_ids_*`` are accepted for interface compatibility but unused.
        """
        # First-token representation of each sequence.
        cls_a = self.l1(ids_0, attention_mask=mask_0).last_hidden_state[:, 0]
        cls_b = self.l1(ids_1, attention_mask=mask_1).last_hidden_state[:, 0]
        x_a = torch.nn.functional.gelu(self.l3(self.l2(cls_a)))
        x_b = torch.nn.functional.gelu(self.l3(self.l2(cls_b)))
        sem_sim = self.l4(x_a, x_b)  # (batch,)
        # Linear(1, 1) needs a trailing feature dimension of size 1; add it for
        # the layer and drop it again so the result stays one score per pair.
        weighted_sem_sim = self.l5(sem_sim.unsqueeze(-1)).squeeze(-1)
        return weighted_sem_sim

In [None]:
# Build one dataset per split; all share the tokenizer and a 128-token length.
train_dataset = CustomDataset(train, tokenizer, 128)
val_dataset = CustomDataset(val, tokenizer, 128)
test_dataset = CustomDataset(test, tokenizer, 128)

In [None]:
from torch.utils.data import DataLoader

# One loader per split; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)
test_loader = DataLoader(test_dataset, batch_size=256)

# The AdamW shipped with transformers is deprecated (and no longer importable
# in recent releases), so use the PyTorch implementation instead. eps and
# weight_decay are set to the values the transformers class used by default,
# keeping the optimisation settings unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Entrenamiento y validación

In [None]:
# Training function
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is a dict of tensors; it is moved to ``device`` and unpacked
    into ``model`` as keyword arguments. The model's output must expose ``loss``.
    """
    model.train()
    running_loss = 0
    for raw_batch in loader:
        on_device = {}
        for key, tensor in raw_batch.items():
            on_device[key] = tensor.to(device)
        step_loss = model(**on_device).loss
        running_loss += step_loss.item()
        step_loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
    return running_loss / len(loader)

# Evaluation function
def evaluate(model, loader, device):
    """Return the fraction of examples in ``loader`` that ``model`` labels correctly.

    Each batch is a dict of tensors that includes ``labels``; it is moved to
    ``device`` and unpacked into ``model``, whose output must expose ``logits``.
    Accuracy is computed directly from the tensors, so the function does not
    depend on an import made in a later cell. Returns NaN for an empty loader.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            predicted = torch.argmax(model(**batch).logits, dim=-1)
            n_correct += (predicted == batch["labels"]).sum().item()
            n_seen += batch["labels"].numel()
    if n_seen == 0:
        return float("nan")
    return n_correct / n_seen

In [None]:
# Train for `epochs` passes, reporting validation accuracy after each one.
epochs = 1
for epoch in range(epochs):
    train_loss, val_accuracy = (
        train_epoch(model, train_loader, optimizer, device),
        evaluate(model, val_loader, device),
    )
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Save the model and its tokenizer side by side.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("fine_tuned_model")

# Final evaluation on the held-out test split.
test_accuracy = evaluate(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")


In [None]:
# NOTE(review): `y_test` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel — confirm where it is defined.
y_test

In [None]:
import torch
from itertools import product
import pandas as pd
from sklearn.metrics import accuracy_score

# The model and tokenizer are expected to be loaded by earlier cells.
# NOTE(review): X_test / y_test are not defined in the visible cells — confirm
# where they come from before running this.

# Separate the test texts by label.
label_1_texts = X_test.loc[y_test == 1, 'text'].reset_index(drop=True)
label_0_texts = X_test.loc[y_test == 0, 'text'].reset_index(drop=True)

# Every (label-1 text, label-0 text) pair, label-1 text varying slowest.
combinations = [
    (text_1, text_0)
    for text_1 in label_1_texts
    for text_0 in label_0_texts
]

# Batched pairwise prediction
def batch_predict(model, tokenizer, text_pairs, device, batch_size=16, max_length=128):
    """Decide, for each (text1, text2) pair, which text is label 0 and which is label 1.

    Both texts of a pair are scored independently by ``model``; the text with
    the higher class-0 probability is reported as ``predicted_label_0`` and the
    other as ``predicted_label_1`` (ties go to ``text2``).

    Args:
        model: callable returning an object with ``logits``; put into eval mode here.
        tokenizer: callable whose result has ``.to(device)`` and unpacks with ``**``.
        text_pairs: sequence of ``(text1, text2)`` tuples.
        device: device the encoded inputs are moved to.
        batch_size: number of pairs scored per forward pass.
        max_length: padded/truncated token length of every encoded text.

    Returns:
        List of dicts with keys ``text1``, ``text2``, ``predicted_label_0`` and
        ``predicted_label_1``, in input order. Empty input gives an empty list.
    """
    # Inference must not depend on whatever mode the caller left the model in.
    model.eval()
    predictions = []
    for start in range(0, len(text_pairs), batch_size):
        batch = text_pairs[start:start + batch_size]
        texts1, texts2 = zip(*batch)

        # Tokenize each side of the pairs as its own batch.
        inputs1 = tokenizer(list(texts1), return_tensors="pt", max_length=max_length,
                            padding="max_length", truncation=True).to(device)
        inputs2 = tokenizer(list(texts2), return_tensors="pt", max_length=max_length,
                            padding="max_length", truncation=True).to(device)

        with torch.no_grad():
            logits1 = model(**inputs1).logits
            logits2 = model(**inputs2).logits

        # Class-0 probability of each text.
        probs1 = torch.softmax(logits1, dim=-1).cpu().numpy()
        probs2 = torch.softmax(logits2, dim=-1).cpu().numpy()

        for idx in range(len(batch)):
            first_is_label_0 = probs1[idx][0] > probs2[idx][0]
            if first_is_label_0:
                predicted_label_0, predicted_label_1 = texts1[idx], texts2[idx]
            else:
                predicted_label_0, predicted_label_1 = texts2[idx], texts1[idx]

            predictions.append({
                "text1": texts1[idx],
                "text2": texts2[idx],
                "predicted_label_0": predicted_label_0,
                "predicted_label_1": predicted_label_1
            })
    return predictions

# Predict every pair in batches.
batch_size = 256
results = batch_predict(model, tokenizer, combinations, device, batch_size)

# One row per pair.
results_df = pd.DataFrame(results)

# Every pair was built as (label-1 text, label-0 text), so a prediction is
# correct exactly when the text chosen as label 0 is the pair's second element.
# Comparing the two columns directly replaces a per-row search of the whole
# test frame by text, which was quadratic and picked the wrong row whenever the
# same text occurred more than once.
if len(results_df) > 0:
    correct_predictions = int((results_df['predicted_label_0'] == results_df['text2']).sum())
    accuracy = correct_predictions / len(results_df)
else:
    # No pairs to score (e.g. one label is missing from the test data).
    correct_predictions = 0
    accuracy = float('nan')
print(f"Pairwise Accuracy: {accuracy:.4f}")



## Pruebas

In [None]:
# NOTE(review): this disabled cell used to contain a literal Hugging Face access
# token. It has been redacted here; the original token should be revoked, since
# it remains in the file history.
# from huggingface_hub import HfApi, HfFolder

# # Read the Hugging Face API key from the environment; never hardcode it
# api_key = os.environ["HF_TOKEN"]

# # Login programmatically
# HfFolder.save_token(api_key)

# # Now you can use the HfApi with the token
# api = HfApi()
# user_info = api.whoami()
# print("Logged in as:", user_info)


In [None]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("text-classification", model="meta-llama/Llama-3.2-1B", device="cuda:0")

In [None]:
def prediction(index, X, y, pipe):
  """Run ``pipe`` on one example and print its label next to the prediction.

  Args:
    index: zero-based position of the example in ``X`` and ``y``.
    X: DataFrame with a ``text`` column.
    y: labels aligned with ``X`` by position (pandas object or plain sequence).
    pipe: callable applied to the first 512 characters of the text.

  Returns:
    Whatever ``pipe`` returns for that text.
  """
  text = X.iloc[index]["text"][:512]
  # X is read by position, so the label must be too: plain y[index] on a pandas
  # Series is a lookup by index label, which picks a different row (or raises
  # KeyError) when the index has not been reset.
  label = y.iloc[index] if hasattr(y, "iloc") else y[index]
  y_predict = pipe(text)
  print(f"Label: {label}")
  print(f"Prediction: {y_predict}")
  print()
  return y_predict

# Spot-check a few training examples; the last call stays a bare expression so
# its return value is still shown as the cell output.
for _sample_index in (0, 3, 5):
  prediction(_sample_index, X_train, y_train, pipe)
prediction(78, X_train, y_train, pipe)

In [None]:
# Count correct (aciertos) and wrong (errores) predictions over the test set.
aciertos, errores = 0, 0
for i in range(len(X_test)):
  y_predict = prediction(i, X_test, y_test, pipe)
  # Keep `y_predict` as the raw pipeline output and store the decoded class
  # separately; rebinding it to an int would make a later `y_predict[0]` fail.
  predicted_label = 1 if y_predict[0]['label'] == 'LABEL_1' else 0
  # Read the true label by position, the same way `prediction` reads X_test.
  true_label = y_test.iloc[i] if hasattr(y_test, "iloc") else y_test[i]
  if predicted_label == true_label:
    aciertos += 1
  else:
    errores += 1
print(f"Aciertos: {aciertos}")
print(f"Errores: {errores}")

In [None]:
# Show the first entry of the most recent `y_predict`. This only works while
# `y_predict` is still a subscriptable pipeline result (not a plain int).
y_predict[0]