# In [None]:  (Jupyter notebook cell marker left over from the export)
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerModel, LongformerTokenizer
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.optim import AdamW
from tqdm import tqdm  # Import tqdm for progress bars
import os

# Load the dataset
df = pd.read_csv(r'/Volume2/kitmay/pandora2/m-b+_v1.csv', encoding='latin-1')

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Preprocess the text: tokenize and pad/truncate
max_length = 64  # The max length is chosen after trying multiple values


def _encode_text(text):
    """Tokenize one text, truncating or padding it to ``max_length`` tokens."""
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated ``encode_plus``; the returned encoding still exposes
    # 'input_ids' and 'attention_mask'.
    return tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )


# After this step every cell of 'body' holds the tokenizer output, not raw text.
df['body'] = df['body'].apply(_encode_text)

# Drop 'author' column
df = df.drop(columns=['author'])

# NOTE(review): rows are split at random after 'author' is dropped. If one
# author contributes several rows, they can end up in both train and test --
# confirm that this is intended.

# Split into train and test first (70% / 30%)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

# Split df_train into train and validation (80% / 20% of the training part,
# i.e. 56% / 14% of the full dataset)
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

# Define a PyTorch dataset
class PersonalityDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with their target rows.

    Args:
        tweets: Indexable collection whose items provide 'input_ids' and
            'attention_mask' entries.
        targets: Indexable collection of numeric target rows, aligned with
            ``tweets`` by position.
    """

    def __init__(self, tweets, targets):
        self.tweets = tweets
        self.targets = targets

    def __len__(self):
        """Return the number of samples (one per entry in ``tweets``)."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        'input_ids' and 'attention_mask' are long tensors; 'targets' is a
        float tensor.
        """
        encoded = self.tweets[idx]
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        sample['targets'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample

# Define the model
class RoBERTaForPersonalityTraits(torch.nn.Module):
    """RoBERTa encoder followed by dropout and a linear regression head.

    Args:
        model_name: Checkpoint name or path passed to
            ``RobertaModel.from_pretrained``.
        num_traits: Number of values predicted per input (5 personality
            traits by default).
        dropout: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, model_name: str = 'roberta-large', num_traits: int = 5,
                 dropout: float = 0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding
        # 1024, so other RoBERTa checkpoints work without further edits.
        self.linear = torch.nn.Linear(self.roberta.config.hidden_size, num_traits)

    def forward(self, input_ids, attention_mask):
        """Return the raw (unbounded) trait predictions for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Output of the linear head; its last dimension is ``num_traits``.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.linear(pooled_output)

# Prepare data loaders
batch_size = 16
# Columns used as regression targets; shared by all three splits.
trait_columns = ['ext', 'neu', 'agr', 'con', 'ope']
# os.cpu_count() returns None when the CPU count cannot be determined, and
# DataLoader requires an integer; fall back to loading in the main process.
num_workers = os.cpu_count() or 0

train_dataset = PersonalityDataset(df_train['body'].tolist(), df_train[trait_columns].values)
val_dataset = PersonalityDataset(df_val['body'].tolist(), df_val[trait_columns].values)
test_dataset = PersonalityDataset(df_test['body'].tolist(), df_test[trait_columns].values)

# Only the training data is shuffled; evaluation splits keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# Instantiate the model and define optimizer and loss function
model = RoBERTaForPersonalityTraits()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.MSELoss()

# Training loop
epochs = 10
# Early-stopping bookkeeping.
# NOTE(review): patience, best_val_loss and patience_counter are not read by
# the training loop that follows in this section -- confirm whether a
# validation pass is still planned.
patience = 3
best_val_loss = float('inf')
patience_counter = 0
# Number of batches whose gradients are accumulated before each optimizer step.
accumulation_steps = 16

log_file = r'/Volume2/kitmay/pandora2/results/roberta_10epoch.txt'  # Output log file

# Train for a fixed number of epochs with gradient accumulation, logging the
# mean training loss per epoch and saving the final weights.
# NOTE(review): val_loader and the early-stopping variables defined above are
# not used here; no validation pass is run.
with open(log_file, 'w') as f:  # Open the log file for writing
    f.write("Training started\n")

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        # Begin every epoch with cleared gradients. Within the epoch they are
        # cleared again right after each optimizer step.
        optimizer.zero_grad()

        train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1} - Training")
        for i, batch in enumerate(train_progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, targets)

            # Track the unscaled loss so the reported value is the real MSE,
            # not the MSE divided by accumulation_steps.
            total_loss += loss.item()

            # Backward pass: scale so the accumulated gradient is the average
            # over the accumulation window.
            (loss / accumulation_steps).backward()

            # Update every accumulation step, and once more on the last batch
            # so a trailing partial window is not dropped.
            if (i + 1) % accumulation_steps == 0 or i + 1 == len(train_loader):
                optimizer.step()
                optimizer.zero_grad()

            train_progress_bar.set_postfix({'loss': total_loss / (i + 1)})

        avg_train_loss = total_loss / len(train_loader)
        f.write(f"Epoch {epoch + 1}: Train Loss: {avg_train_loss}\n")
        # The file stays open for the whole run; flush so each epoch's result
        # is visible on disk while training continues.
        f.flush()

    torch.save(model.state_dict(), r'/Volume2/kitmay/pandora2/r_checkpoints/roberta_final_model.pth')
    f.write("Final model saved.\n")
