In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input and print the full path of every file found beneath it
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, set_seed
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fix the random seed through transformers' set_seed helper
random_seed = 0
set_seed(random_seed)

# JSON Lines files for the train split and the dev split (used here for evaluation)
train_path =  '/kaggle/input/d/dimitris2sot/semeval2024task8/subtaskA_train_multilingual.jsonl'
test_path =  '/kaggle/input/d/dimitris2sot/semeval2024task8/subtaskA_dev_multilingual.jsonl'

# Read both splits with identical options; train first, then eval
train_df, eval_df = (
    pd.read_json(split_path, lines=True)
    for split_path in (train_path, test_path)
)


In [None]:
# Prefer the GPU when PyTorch reports CUDA support; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
model_name = 'xlm-roberta-base'

# Class index <-> class name maps stored in the model config
id2label = {0: "human", 1: "machine"}
label2id = {label: index for index, label in id2label.items()}

# Initialize the tokenizer, then the two-class classification model on the selected device
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

In [None]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig, AdamW

# Reuse the base-model name defined earlier instead of repeating the literal
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

# Build the config from the base model but carry over the label maps, so the
# reloaded classifier reports "human"/"machine" just like the model created
# earlier rather than the generic LABEL_0/LABEL_1 names.
config = XLMRobertaConfig.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Define the path to your partially trained model
model_path = "/kaggle/input/mini-updated-code"

# Load the partially trained model (this replaces the freshly initialised `model`)
model = XLMRobertaForSequenceClassification.from_pretrained(model_path, config=config)


In [None]:
# Move the most recently loaded model onto the selected device (GPU if available, else CPU)
model = model.to(device)

In [None]:
# Tokenize the 'text' column of each split (train first, then eval), truncating
# over-long inputs and padding every sequence within a split to a common length
train_encodings, eval_encodings = (
    tokenizer(frame['text'].tolist(), truncation=True, padding=True)
    for frame in (train_df, eval_df)
)

In [None]:
# Persist both tokenized splits under /kaggle/working/ so they can be reloaded later;
# each entry is (file name, object to save, confirmation message)
_encoding_dumps = (
    ("train_encodings.pt", train_encodings, "Train encodings saved successfully."),
    ("test_encodings.pt", eval_encodings, "Test encodings saved successfully."),
)
for _file_name, _encoded, _message in _encoding_dumps:
    torch.save(_encoded, os.path.join("/kaggle/working/", _file_name))
    print(_message)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-sample values; ``labels`` is an indexable collection of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label list
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors placed on the global ``device``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx]).to(device)
        sample['labels'] = torch.tensor(self.labels[idx]).to(device)
        return sample

In [None]:
# Training hyperparameters: samples per batch, optimizer step size, passes over the data
batch_size, learning_rate, num_epochs = 16, 2.0e-5, 3

In [None]:
# Wrap each tokenized split together with its 'label' column
train_labels = train_df['label'].tolist()
eval_labels = eval_df['label'].tolist()
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
eval_dataset = CustomDataset(encodings=eval_encodings, labels=eval_labels)

# Only the training loader shuffles; evaluation keeps the dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(dataset=eval_dataset, batch_size=batch_size)

In [None]:
# Define optimizer and loss function
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers implementation
# used by default (1e-6 and 0.0) so the update rule stays the same; torch's own
# defaults (1e-8 and 0.01) would silently change training.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# Note: train() takes the loss from the model output, so this criterion is only passed through.
criterion = nn.CrossEntropyLoss()

In [None]:
# Function for training
def train(model, optimizer, criterion, dataloader, num_epochs):
    """Fine-tune ``model`` on ``dataloader`` for ``num_epochs`` passes.

    Args:
        model: Called as ``model(input_ids, attention_mask=..., labels=...)``;
            its output must expose a ``loss`` attribute.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Unused. The loss comes from the model output; the parameter
            is kept so existing call sites keep working.
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors, and exposes a sized ``.dataset``.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        None. The sample-weighted mean loss of each epoch is printed.
    """
    num_samples = len(dataloader.dataset)
    for epoch in range(num_epochs):
        # Re-assert training mode every epoch in case the model was switched to eval mode.
        model.train()
        running_loss = 0.0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the epoch mean
            running_loss += loss.item() * input_ids.size(0)

            # Drop references so the tensors can be freed before the next batch
            del input_ids, attention_mask, labels, outputs

        # Release cached GPU blocks once per epoch. Doing this after every batch
        # forces the allocator to re-request memory from the driver on each step,
        # which slows training without making more memory available to PyTorch.
        torch.cuda.empty_cache()

        # Guard against an empty dataset instead of raising ZeroDivisionError
        epoch_loss = running_loss / num_samples if num_samples else float("nan")
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")

        
# Function for evaluation
def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and report accuracy.

    Args:
        model: Called as ``model(input_ids, attention_mask=...)``; its output
            must expose ``logits`` with the class scores along dimension 1.
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.

    Returns:
        ``(all_preds, all_labels)``: two flat lists with one predicted class
        index and one true label per evaluated sample. The accuracy is printed.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move inputs to the model's device explicitly, as train() does,
            # instead of relying on the dataset having already placed them there.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            # Collect on the CPU so the lists can be handed to the sklearn metrics
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy:.4f}")
    return all_preds,all_labels

In [None]:
def save_checkpoint(model, optimizer, output_dir, epoch):
    """Write the model and optimizer state for ``epoch`` to disk.

    The file is stored as ``<output_dir>/checkpoints/model_epoch_<epoch>.pt``;
    the ``checkpoints`` folder is created when missing. The saved dict holds
    the keys ``epoch``, ``model_state_dict`` and ``optimizer_state_dict``.
    """
    ckpt_folder = os.path.join(output_dir, "checkpoints")
    os.makedirs(ckpt_folder, exist_ok=True)

    target_file = os.path.join(ckpt_folder, f"model_epoch_{epoch}.pt")
    snapshot = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(snapshot, target_file)
    print(f"Checkpoint saved at {target_file}")
# Function for loading the model and optimizer state
def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore ``model`` and ``optimizer`` from a file written by ``save_checkpoint``.

    Args:
        model: Module whose ``state_dict`` layout matches the saved one.
        optimizer: Optimizer whose state is overwritten from the checkpoint.
        checkpoint_path: Path to a dict saved with the keys ``epoch``,
            ``model_state_dict`` and ``optimizer_state_dict``.

    Returns:
        The epoch index to resume from, i.e. the stored epoch plus one.
    """
    # Remap stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU still loads in a CPU-only session.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded from {checkpoint_path}. Resuming training from epoch {epoch + 1}.")
    return epoch + 1

In [None]:
# Fine-tune the XLM-RoBERTa model
# Base directory passed to save_checkpoint, which writes into its "checkpoints" subfolder
output_dir = "/kaggle/working/"

In [None]:
# Optionally resume training from a saved checkpoint
# start_epoch is assigned here only when the checkpoint file exists; load_checkpoint
# restores the model/optimizer state and returns the stored epoch plus one.
checkpoint_path = "/kaggle/working/checkpoints/model_epoch_1.pt"
if os.path.exists(checkpoint_path):
    start_epoch = load_checkpoint(model, optimizer, checkpoint_path)

In [None]:
# Fall back to epoch index 1 only when the resume cell did not load a checkpoint.
# An unconditional assignment here would discard the epoch returned by
# load_checkpoint and retrain epochs the checkpoint already covers.
# NOTE(review): the default of 1 presumably reflects that the weights loaded from
# model_path already completed epoch 0 — confirm.
if "start_epoch" not in globals():
    start_epoch = 1

In [None]:
# Outer epoch loop: one training pass, a checkpoint, then an evaluation pass per epoch
for epoch in range(start_epoch, num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # train() is asked for a single pass because this loop owns the epoch counter
    train(model, optimizer, criterion, train_loader, num_epochs=1)
    save_checkpoint(model, optimizer, output_dir, epoch=epoch)
    all_preds, all_labels = evaluate(model, eval_loader)

In [None]:
# Save the final state under the index of the last epoch. Deriving it from
# num_epochs (instead of a hard-coded 2) keeps the file name right if num_epochs changes.
save_checkpoint(model, optimizer, output_dir, num_epochs - 1)

In [None]:
# Evaluate the current model on the eval split; keep predictions and labels for the confusion matrix
all_preds,all_labels = evaluate(model, eval_loader)

In [None]:
# Plot confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(all_labels, all_preds)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix")
plt.show()