In [None]:
# Colab-only: mount Google Drive at /content/drive, where the CSV paths used
# later in this notebook point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install wandb

In [None]:
# Log in to Weights & Biases before any wandb.init()/wandb.log() call below.
import wandb
wandb.login()

In [None]:
import re
import os
import sys
import tqdm
import random
import pandas as pd
import numpy as np
import copy
import argparse
import logging
import time
from datetime import datetime
from urllib.request import urlretrieve
import matplotlib.pyplot as plt
from matplotlib import rcParams
import joblib
from functools import partial
import wandb
from collections import Counter

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

# Module-level device: CUDA when PyTorch can see a GPU, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# set seed for reproducibility
def set_seed(seed_value=42):
    """Seed the NumPy, Python and PyTorch random number generators.

    When CUDA is available the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled. The seed is
    also written to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed_value: integer seed applied to every generator (default 42).
    """
    # CPU-side generators: NumPy, the stdlib `random` module, then PyTorch.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)      # current GPU
        torch.cuda.manual_seed_all(seed_value)  # every GPU
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False

    # Exported as a string for hash-based operations.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed every RNG once, up front, so the cells below start from a fixed state.
seed = 0
set_seed(seed)

# dataset

In [None]:
def calculate_length_data(texts, tokenizer):
  """Print the largest token count in `texts` and plot a histogram of all counts.

  Args:
    texts: iterable of strings; each one is passed to ``tokenizer.tokenize``.
    tokenizer: object exposing ``tokenize(text)`` that returns a sized
      sequence of tokens.

  Returns:
    None. The function only prints and draws a matplotlib figure.
  """
  token_counts = [len(tokenizer.tokenize(text)) for text in texts]

  if not token_counts:
    # max() raises ValueError on an empty list and there is nothing to plot.
    print('No texts to measure.')
    return

  print('max_length', max(token_counts))
  plt.hist(token_counts, bins=30, alpha=0.5, color='blue', edgecolor='black')
  plt.title('Histogram of token length')
  plt.xlabel('Length')
  plt.ylabel('Frequency')
  plt.show()


# Token-length statistics for the IMDB62 training CSV (read from the mounted Drive).
# NOTE: `df` and `tokenizer` are module-level names that later cells reassign.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv')
print(df)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
calculate_length_data(df['text'], tokenizer)


In [None]:
def calculate_length_data(texts, tokenizer):
  """Print the largest token count in `texts` and plot a histogram of all counts.

  Args:
    texts: iterable of strings; each one is passed to ``tokenizer.tokenize``.
    tokenizer: object exposing ``tokenize(text)`` that returns a sized
      sequence of tokens.

  Returns:
    None. The function only prints and draws a matplotlib figure.
  """
  token_counts = [len(tokenizer.tokenize(text)) for text in texts]

  if not token_counts:
    # max() raises ValueError on an empty list and there is nothing to plot.
    print('No texts to measure.')
    return

  print('max_length', max(token_counts))
  plt.hist(token_counts, bins=30, alpha=0.5, color='blue', edgecolor='black')
  plt.title('Histogram of token length')
  plt.xlabel('Length')
  plt.ylabel('Frequency')
  plt.show()


# Token-length statistics for the blogs50 training CSV (read from the mounted Drive).
# NOTE: this reassigns the module-level `df` and `tokenizer`.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_train.csv')
print(df)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
calculate_length_data(df['text'], tokenizer)


In [None]:
# def preprocess_data(df, tokenizer, max_length, encoder_path, train=False):

#     texts = df['prompt'].tolist()
#     labels = df['user_name'].tolist()

#     input_ids = []
#     attention_masks = []
#     for text in texts:
#         encoded = tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=max_length,
#             padding='max_length',
#             truncation=True,
#             return_attention_mask=True,
#             return_tensors='pt',
#         )
#         input_ids.append(encoded['input_ids'])
#         attention_masks.append(encoded['attention_mask'])
#     input_ids = torch.cat(input_ids, dim=0)
#     attention_masks = torch.cat(attention_masks, dim=0)

#     if train:
#       label_encoder = LabelEncoder()
#       labels = label_encoder.fit_transform(labels)
#       joblib.dump(label_encoder, encoder_path)
#     else:
#       labels = label_encoder.transform(labels)

#     labels = torch.tensor(labels)

#     return TensorDataset(input_ids, attention_masks, labels)


class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Args:
        texts: iterable of strings (list, NumPy array or pandas Series).
        labels: iterable of integer class ids, aligned with `texts` by position.
        tokenizer: object exposing ``encode_plus`` with the Hugging Face
            keyword arguments used in ``__getitem__``.
        max_len: length every encoded example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise both inputs as lists so __getitem__ indexes by POSITION.
        # A pandas Series is indexed by label, so a frame whose index is not
        # 0..n-1 (e.g. after filtering without reset_index) would raise KeyError.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, its encoded tensors and its label for position `item`."""
        text = self.texts[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # return_tensors='pt' yields a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[item], dtype=torch.long)
        }


## DiffusionDB

In [None]:
#############################################
#### RUN THIS TO GET THE DATASET WE NEED

# df_initial = pd.read_csv('cleaned_diffusionDB_large_data.csv', encoding='utf-8')
# Load the preprocessed DiffusionDB prompts from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/preprocessed_diffusiondb.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/cleaned_diffusionDB_dataFinal_T_25.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/authorship_inference_attack/cleaned_diffusionDB_large_data_50_chars_final.csv', encoding='utf-8')


# Cast every column to str (note: missing values become the string 'nan').
df = df.astype(str)


# Drop the placeholder account. .copy() gives an independent frame, so the
# column assignment below does not write into a slice of `df`
# (this avoids pandas' SettingWithCopyWarning).
df_initial = df[df['user_name'] != 'deleted-account'].copy()

print('Columns :', df_initial.columns)
print(len(df_initial))

# Step 1: Eliminate Leading or Trailing Whitespaces
df_initial['prompt'] = df_initial['prompt'].str.strip()
print('Rows (Step 1) :', len(df_initial))

In [None]:
# Count the number of unique values in the 'user_name' column
# Count prompts per user and the number of distinct users.
user_counts = df_initial['user_name'].value_counts()
unique_user_names_count = df_initial['user_name'].nunique()
print('Unique Users (without deleted_user):', unique_user_names_count)

# Minimum number of prompts a user needs to be kept.
# NOTE: the variable names below still mention 250/100 because later cells
# reference them; the threshold actually applied is MIN_PROMPTS_PER_USER.
MIN_PROMPTS_PER_USER = 120
users_more_than_250_prompts = user_counts[user_counts >= MIN_PROMPTS_PER_USER]
users_with_100_prompts = users_more_than_250_prompts.index

# Number of users that pass the threshold
num_users_more_than_250_prompts = len(users_more_than_250_prompts)

# Print the result
print(f"Number of users with at least {MIN_PROMPTS_PER_USER} prompts: {num_users_more_than_250_prompts}")

# Keep every prompt of the users that pass the threshold.
df_at_least_250_prompts_per_user = df_initial[df_initial['user_name'].isin(users_with_100_prompts)]

# # Create a DataFrame to store 250 prompts for each user
# df_at_least_250_prompts_per_user = pd.DataFrame()

# # Loop through each user and sample 250 prompts
# for user in users_more_than_250_prompts.index:
# #    user_prompts = df_initial[df_initial['user_name'] == user].sample(250, random_state=42) wrong, because it selects only 250 from the beginning
#     user_prompts = df_initial[df_initial['user_name'] == user]
#     df_at_least_250_prompts_per_user = pd.concat([df_at_least_250_prompts_per_user, user_prompts])

# df_at_least_250_prompts_per_user

In [None]:
# df_at_least_250_prompts_per_user.to_csv('/content/drive/MyDrive/authorship_inference_attack/df_real_data 1.csv', index=False)

In [None]:
def get_random_100_users(number_of_users, prompts_per_user=100):
    """Sample users at random and draw a fixed number of prompts for each.

    Reads the module-level ``users_more_than_250_prompts`` (its index is the
    pool of eligible user names) and ``df_at_least_250_prompts_per_user``
    (the prompts of those users).

    Args:
        number_of_users: how many distinct users to draw (without replacement,
            using NumPy's global random state).
        prompts_per_user: number of prompts sampled per user (default 100).
            A user with fewer prompts than this keeps all of them.

    Returns:
        DataFrame with the selected prompts, indexed 0..n-1, users stored in
        the order they were drawn.
    """
    selected_users = np.random.choice(users_more_than_250_prompts.index, size=number_of_users, replace=False)

    # Collect per-user frames and concatenate once: this avoids the private
    # DataFrame._append API and the quadratic cost of growing a frame in a loop.
    per_user_frames = []
    for user in selected_users:
        user_prompts = df_at_least_250_prompts_per_user[df_at_least_250_prompts_per_user['user_name'] == user]

        # Fixed random_state, so the same rows are chosen for a user on every run.
        if len(user_prompts) >= prompts_per_user:
            user_prompts = user_prompts.sample(n=prompts_per_user, random_state=42, replace=False)

        per_user_frames.append(user_prompts)

    if per_user_frames:
        df_100_random_users_number = pd.concat(per_user_frames, ignore_index=True)
    else:
        # No users requested: return an empty frame that keeps the source columns.
        df_100_random_users_number = df_at_least_250_prompts_per_user.iloc[0:0].reset_index(drop=True)

    # Sanity check: number of users and rows actually selected.
    user_counts = df_100_random_users_number['user_name'].value_counts()

    print(f'Final Number of users (that we will use) with at least {prompts_per_user} prompts :', len(user_counts))
    print('Length of DF :', len(df_100_random_users_number))
    print('Result (proof) :', number_of_users, f' * {prompts_per_user} = ', number_of_users * prompts_per_user, '\n')

    return df_100_random_users_number

In [None]:
# Draw the prompts of 200 randomly selected users.
number_of_users = 200
df_100_random_users = get_random_100_users(number_of_users)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Step 2: Fit the LabelEncoder and replace user names with integer ids.
# Guard against re-running this cell: once `user_name` holds integers, fitting
# again would learn an int -> int mapping and overwrite the saved encoder,
# losing the original user names.
if pd.api.types.is_integer_dtype(df_100_random_users['user_name']):
    print('user_name is already label-encoded; keeping the previously saved encoder.')
else:
    label_encoder = LabelEncoder()
    df_100_random_users['user_name'] = label_encoder.fit_transform(df_100_random_users['user_name'])

    # Step 3: Save the LabelEncoder for later use
    with open('/content/drive/MyDrive/msc_project/data/diffusiondb/vary/label_encoder_100_200.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)

# Step 4: Display the encoded DataFrame
print(df_100_random_users)

train_list = []
val_list = []
test_list = []

# Split every author separately so each split holds the same share of each author.
for author in df_100_random_users['user_name'].unique():
    # Filter data for the current author
    author_data = df_100_random_users[df_100_random_users['user_name'] == author]

    # Split the author's data into train (60%) and temp (40%)
    train_data, temp_data = train_test_split(author_data, test_size=0.4, random_state=42)

    # Halve temp into validation and test (20% of the author's data each)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

    # Append the splits to the respective lists
    train_list.append(train_data)
    val_list.append(val_data)
    test_list.append(test_data)

# Concatenate all splits into final train, validation, and test DataFrames
train_df = pd.concat(train_list).reset_index(drop=True)
val_df = pd.concat(val_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# Persist the three splits next to the encoder on the mounted Drive.
train_df.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random100_200_label_1.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/vary/val_random100_200_label_1.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random100_200_label_1.csv', index=False)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")


## IMDB62

# model

In [None]:
class BertClassifier(nn.Module):
  """Pretrained BERT encoder followed by one linear classification layer."""

  def __init__(self, pretrained_model, num_classes):
    """Load the encoder and size the linear head from its hidden size.

    Args:
      pretrained_model: name or path passed to ``BertModel.from_pretrained``.
      num_classes: number of output logits.
    """
    super().__init__()
    encoder = BertModel.from_pretrained(pretrained_model)
    self.bert = encoder
    self.classifier = nn.Linear(encoder.config.hidden_size, num_classes)

    # To train only the head, freeze the encoder:
    # for param in self.bert.parameters():
    #   param.requires_grad = False

  def forward(self, input_ids, attention_mask):
    """Return the classification logits for a batch of encoded inputs."""
    encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of the encoder output (the pooled output for a Hugging Face
    # BertModel) is what the linear head consumes.
    return self.classifier(encoder_outputs[1])

In [None]:
def get_model(model_path, num_labels, use_cuda=False, cuda_device=0, only_train_classifier=False):
    """Load a BERT sequence classifier, optionally freezing its encoder.

    Args:
        model_path: name or path given to ``from_pretrained``.
        num_labels: number of output classes.
        use_cuda: accepted for interface compatibility; not read here.
        cuda_device: accepted for interface compatibility; not read here.
        only_train_classifier: when True, gradients are disabled for every
            parameter under ``model.bert``.

    Returns:
        The loaded model (its structure is also printed).
    """
    classifier = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)

    # Nothing to freeze unless the caller asked for head-only training.
    frozen_params = classifier.bert.parameters() if only_train_classifier else ()
    for weight in frozen_params:
        weight.requires_grad = False

    print(classifier)
    return classifier


# training

In [None]:
def aa_metrics(labels, predictions, raw_outputs=None, prefix='', no_auc=False, special=False):
    """Compute authorship-attribution metrics as a flat ``{name: value}`` dict.

    Args:
        labels: true class ids, one per example.
        predictions: predicted class ids, aligned with `labels`.
        raw_outputs: optional per-class scores, one row per example. Needed for
            the top-k and AUC metrics. Defaults to None (no scores).
        prefix: string prepended to every key of the result.
        no_auc: when True, skip the ROC-AUC and top-2..top-10 metrics.
        special: when True, return right after accuracy / macro accuracy
            (plus top-5 accuracy if scores were given).

    Returns:
        dict mapping ``f'{prefix}<metric>'`` to its value.

    Raises:
        ValueError: if ``no_auc`` is False but no `raw_outputs` were supplied.
    """
    has_scores = raw_outputs is not None and len(raw_outputs) != 0

    results = {
        f'{prefix}accuracy': metrics.accuracy_score(labels, predictions),
        f'{prefix}macro_accuracy': metrics.balanced_accuracy_score(labels, predictions),
    }

    if has_scores:
        # Indices of the five highest scores in every row.
        top5_indices = np.argsort(np.asarray(raw_outputs), axis=1)[:, -5:]
        hits = sum(1 for true_author, row in zip(labels, top5_indices) if true_author in row)
        results[f'{prefix}top5_accuracy'] = hits / len(labels)

    if special:
        return results

    for score_name, score_fn in (
        ('recall', metrics.recall_score),
        ('precision', metrics.precision_score),
        ('f1', metrics.f1_score),
    ):
        for average in ('micro', 'macro'):
            results[f'{prefix}{average}_{score_name}'] = score_fn(labels, predictions, average=average)

    if not no_auc:
        if not has_scores:
            # Fail with a clear message instead of an obscure error from sklearn.
            raise ValueError('raw_outputs (per-class scores) are required when no_auc=False')

        for multi_class in ('ovr', 'ovo'):
            for average in ('weighted', 'macro'):
                results[f'{prefix}{multi_class}_{average}_auc'] = metrics.roc_auc_score(
                    labels, raw_outputs, average=average, multi_class=multi_class)

        for k in range(2, 11):
            results[f'{prefix}top{k}'] = metrics.top_k_accuracy_score(labels, raw_outputs, k=k)

    return results



def chunk_text(text, tokenizer, max_len=512):
    """Split `text` into token-id chunks that each fit into `max_len` positions.

    The text is encoded WITHOUT special tokens, cut into pieces of at most
    ``max_len - 2`` ids, and every piece is wrapped in the tokenizer's
    [CLS] ... [SEP] ids. Previously the special tokens were added once to the
    whole sequence, so only the first chunk started with [CLS] and only the
    last one ended with [SEP].

    Args:
        text: string to encode.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``,
            ``cls_token_id`` and ``sep_token_id``.
        max_len: maximum length of a chunk, special tokens included.

    Returns:
        Non-empty list of lists of token ids; an empty text yields one chunk
        holding just the two special tokens.

    Raises:
        ValueError: if `max_len` leaves no room for a content token.
    """
    body_len = max_len - 2
    if body_len < 1:
        raise ValueError('max_len must be at least 3 to fit [CLS], one token and [SEP]')

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id

    # `or [[]]` keeps one (empty) body for an empty text, so callers always get a chunk.
    bodies = [token_ids[i:i + body_len] for i in range(0, len(token_ids), body_len)] or [[]]
    return [[cls_id] + list(body) + [sep_id] for body in bodies]

def majority_vote(predictions):
    """Return the most frequent item of `predictions`.

    Ties go to the item that appears first; an empty input raises IndexError.
    """
    winner, _votes = Counter(predictions).most_common(1)[0]
    return winner

def evaluate_with_chunking(text, model, tokenizer, max_len=512):
    """Classify one text chunk by chunk and return the majority label.

    Args:
        text: string to classify.
        model: callable ``model(input_ids, attention_mask=...)`` whose result
            has a ``logits`` attribute; must expose ``device``.
        tokenizer: forwarded to ``chunk_text``.
        max_len: maximum chunk length, forwarded to ``chunk_text``.

    Returns:
        The class id predicted most often across the chunks.
    """
    votes = []
    for piece in chunk_text(text, tokenizer, max_len):
        # Batch of one; every position is a real token, so the mask is all ones.
        ids = torch.tensor([piece]).to(model.device)
        mask = torch.ones_like(ids)

        with torch.no_grad():
            chunk_logits = model(ids, attention_mask=mask).logits

        votes.append(torch.argmax(chunk_logits, dim=1).item())

    return majority_vote(votes)


def evaluate_chunk(model, dataloader, tokenizer, device):
  """Evaluate `model` on raw texts using chunked majority-vote prediction.

  Args:
    model: classifier; switched to eval mode here.
    dataloader: yields dict batches with 'text' (list of strings) and
      'labels' (tensor of class ids).
    tokenizer: forwarded to ``evaluate_with_chunking``.
    device: device the label tensor is moved to.

  Returns:
    dict with accuracy, macro_accuracy and micro/macro recall, precision and F1.
  """
  model.eval()
  y_pred, y_true = [], []

  with torch.no_grad():
    for batch in dataloader:
      batch_texts = batch['text']
      batch_labels = batch['labels'].to(device)

      # One prediction per text, each obtained by voting over its chunks.
      y_pred.extend([evaluate_with_chunking(text, model, tokenizer) for text in batch_texts])
      y_true.extend(batch_labels.cpu().numpy())

  eval_results = {
    'accuracy': metrics.accuracy_score(y_true, y_pred),
    'macro_accuracy': metrics.balanced_accuracy_score(y_true, y_pred),
  }
  for score_name, score_fn in (
    ('recall', metrics.recall_score),
    ('precision', metrics.precision_score),
    ('f1', metrics.f1_score),
  ):
    for average in ('micro', 'macro'):
      eval_results[f'{average}_{score_name}'] = score_fn(y_true, y_pred, average=average)

  return eval_results


def test_chunk(model, dataloader, tokenizer, device):
  model.eval()
  predictions, true_labels = [], []

  with torch.no_grad():
    for batch in dataloader:
      texts = batch['text']  # Assuming dataloader yields batches with 'text' and 'labels'
      labels = batch['labels'].to(device)

      batch_predictions = []
      for text in texts:
        prediction = evaluate_with_chunking(text, model, tokenizer)
        batch_predictions.append(prediction)

      predictions.extend(batch_predictions)
      true_labels.extend(labels.cpu().numpy())

  test_results = aa_metrics(true_labels, predictions, prefix='test_chunk/', no_auc=True)

  return test_results


def evaluate(model, val_loader, device):
  """Run `model` over `val_loader` and return the loss and classification metrics.

  Args:
    model: classifier called with ``input_ids``, ``attention_mask`` and
      ``labels``; its output must expose ``loss`` and ``logits``.
    val_loader: yields dict batches with 'input_ids', 'attention_mask', 'labels'.
    device: device the batch tensors are moved to.

  Returns:
    dict with 'eval_loss' (mean batch loss), accuracy, macro_accuracy and
    micro/macro recall, precision and F1.
  """
  model.eval()

  y_pred, y_true = [], []
  running_loss = 0
  with torch.no_grad():
    for batch in val_loader:
      ids = batch['input_ids'].to(device)
      mask = batch['attention_mask'].to(device)
      gold = batch['labels'].to(device)

      out = model(input_ids=ids, attention_mask=mask, labels=gold)

      batch_logits = out.logits.detach().cpu().numpy()
      # Predicted class = index of the largest logit in each row.
      y_pred.extend(np.argmax(batch_logits, axis=1).flatten())
      y_true.extend(gold.to('cpu').numpy().flatten())

      running_loss += out.loss.item()

  eval_results = {
    'eval_loss': running_loss / len(val_loader),
    'accuracy': metrics.accuracy_score(y_true, y_pred),
    'macro_accuracy': metrics.balanced_accuracy_score(y_true, y_pred),
  }
  for score_name, score_fn in (
    ('recall', metrics.recall_score),
    ('precision', metrics.precision_score),
    ('f1', metrics.f1_score),
  ):
    for average in ('micro', 'macro'):
      eval_results[f'{average}_{score_name}'] = score_fn(y_true, y_pred, average=average)

  return eval_results


def test(model, test_loader, device):
  model.eval()
  predictions, true_labels = [], []
  predictions_1 = []

  with torch.no_grad():
    for batch in test_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

      logits = outputs.logits.detach().cpu().numpy()
      label_ids = labels.to('cpu').numpy()

      predictions.extend(np.argmax(logits, axis=1).flatten())
      true_labels.extend(label_ids.flatten())
      predictions_1.extend(logits)

  print(predictions)
  print(true_labels)
  probabilities = F.softmax(torch.tensor(predictions_1), dim=1).numpy()
  test_results = aa_metrics(true_labels, predictions, probabilities, prefix='test/', no_auc=False)

  return test_results


def train(model, train_loader, optimizer, scheduler, device):
  """Train `model` for one pass over `train_loader`.

  Args:
    model: classifier called with ``input_ids``, ``attention_mask`` and
      ``labels``; its output must expose ``loss`` and ``logits``.
    train_loader: yields dict batches with 'input_ids', 'attention_mask', 'labels'.
    optimizer: stepped once per batch.
    scheduler: stepped once per batch, right after the optimizer.
    device: device the batch tensors are moved to.

  Returns:
    Tuple ``(mean batch loss, training accuracy)``.
  """
  model.train()
  running_loss = 0
  y_pred, y_true = [], []

  for batch in tqdm.tqdm(train_loader, total=len(train_loader)):
    optimizer.zero_grad()

    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)
    gold = batch['labels'].to(device)

    out = model(input_ids=ids, attention_mask=mask, labels=gold)
    batch_loss = out.loss

    # Backward pass and parameter / learning-rate update.
    batch_loss.backward()
    optimizer.step()
    scheduler.step()

    # Bookkeeping for the epoch-level loss and accuracy.
    running_loss += batch_loss.item()
    batch_logits = out.logits.detach().cpu().numpy()
    y_pred.extend(np.argmax(batch_logits, axis=1).flatten())
    y_true.extend(gold.to('cpu').numpy().flatten())

  epoch_accuracy = accuracy_score(y_true, y_pred)
  return running_loss / len(train_loader), epoch_accuracy



In [None]:
def _bertaa_make_loader(df, labels, tokenizer, params, shuffle):
  """Wrap the 'prompt' column of one split in a TextDataset and a DataLoader."""
  dataset = TextDataset(df['prompt'], labels, tokenizer, params['max_seq_len'])
  return DataLoader(dataset, batch_size=params['batch_size'], shuffle=shuffle)


def _bertaa_start_run(params, train_loader):
  """Create model, optimizer and linear-warmup scheduler, then open the wandb run."""
  import ast  # local import: only needed to parse the tag list below

  model = get_model(params['model_path'], params['num_labels'], use_cuda=torch.cuda.is_available(), cuda_device=0, only_train_classifier=params['only_train_classifier'])
  model.to(device)

  optimizer = AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
  # The scheduler is stepped once per batch, so it spans batches * epochs steps.
  total_steps = len(train_loader) * params['epochs']
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=params['warmup_steps'], num_training_steps=total_steps)

  # params['wandb_tags'] is a string holding a Python literal (e.g. "['a', 'b']").
  # literal_eval parses it without executing arbitrary code, unlike eval().
  tags = ast.literal_eval(params['wandb_tags'])
  wandb.init(project=params['wandb_project'], config=params, reinit=True, name=params['wandb_name'], tags=tags)

  return model, optimizer, scheduler


def run_bertaa(train_df, val_df, test_df, params):
  """Fine-tune a BERT classifier for authorship attribution and test it.

  Two modes, selected by `val_df`:
    * empty `val_df`: load a previously saved label encoder, train for all
      epochs and save the final model;
    * non-empty `val_df`: fit and save a new label encoder on the training
      users, train with early stopping on validation macro accuracy, and
      reload the best checkpoint before testing.

  Args:
    train_df, val_df, test_df: DataFrames with 'prompt' (text) and
      'user_name' (author) columns.
    params: dict read for the keys 'model_path', 'encoder_path',
      'max_seq_len', 'batch_size', 'num_labels', 'only_train_classifier',
      'lr', 'weight_decay', 'epochs', 'warmup_steps', 'wandb_project',
      'wandb_name', 'wandb_tags', 'output_dir', 'evaluate_chunk' and, with
      validation data, 'early_stopping_delta' / 'early_stopping_patience'.

  Returns:
    None. Metrics are printed and logged to wandb; models are written under
    ``params['output_dir']``.
  """
  tokenizer = BertTokenizer.from_pretrained(params['model_path'])

  if len(val_df) == 0 :
    # No validation data: reuse the stored encoder and train for every epoch.
    label_encoder = joblib.load(params['encoder_path'])
    train_labels = label_encoder.transform(train_df['user_name'])
    test_labels = label_encoder.transform(test_df['user_name'])

    train_loader = _bertaa_make_loader(train_df, train_labels, tokenizer, params, shuffle=True)
    test_loader = _bertaa_make_loader(test_df, test_labels, tokenizer, params, shuffle=False)

    model, optimizer, scheduler = _bertaa_start_run(params, train_loader)

    for epoch in tqdm.tqdm(range(params['epochs']), total=params['epochs']):
      train_loss, train_acc = train(model, train_loader, optimizer, scheduler, device)
      print(f"Epoch {epoch}, Loss: {train_loss}, Accuracy: {train_acc}")

      wandb.log({
          'epoch': epoch,
          'train_loss': train_loss,
          'train_acc' : train_acc,
      })

    # Plain string concatenation: 'output_dir' must end with a path separator.
    save_path = params['output_dir'] + 'final_model'
    model.save_pretrained(save_path)

  else:
    # Validation data available: fit a fresh encoder on the training users.
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(train_df['user_name'])
    joblib.dump(label_encoder, params['encoder_path'])
    val_labels = label_encoder.transform(val_df['user_name'])
    test_labels = label_encoder.transform(test_df['user_name'])

    train_loader = _bertaa_make_loader(train_df, train_labels, tokenizer, params, shuffle=True)
    val_loader = _bertaa_make_loader(val_df, val_labels, tokenizer, params, shuffle=False)
    test_loader = _bertaa_make_loader(test_df, test_labels, tokenizer, params, shuffle=False)

    model, optimizer, scheduler = _bertaa_start_run(params, train_loader)
    # Checkpoint directory is named after the part of wandb_name before the first '_'.
    save_path = params['output_dir'] + params['wandb_name'].split('_')[0] + '/best_model'

    best_score = None
    epochs_no_improve = 0

    for epoch in range(params['epochs']):
      print(epoch)
      train_loss, train_acc = train(model, train_loader, optimizer, scheduler, device)
      if params['evaluate_chunk']:
        eval_metrics = evaluate_chunk(model, val_loader, tokenizer, device)
        print(f"Epoch {epoch}, Loss: {train_loss}, Accuracy: {train_acc}, Evaluation Accuracy: {eval_metrics['accuracy']}, Evaluation Macro Accuracy: {eval_metrics['macro_accuracy']}")
      else:
        eval_metrics = evaluate(model, val_loader, device)
        print(f"Epoch {epoch}, Loss: {train_loss}, Accuracy: {train_acc}, Evaluation Loss: {eval_metrics['eval_loss']}, Evaluation Accuracy: {eval_metrics['accuracy']}, Evaluation Macro Accuracy: {eval_metrics['macro_accuracy']}")

      wandb.log({
          'epoch': epoch,
          'train_loss': train_loss,
          'train_acc' : train_acc,
          **eval_metrics
      })

      current_score = eval_metrics['macro_accuracy']

      # Early stopping on validation macro accuracy: an epoch counts as an
      # improvement only if it beats the best score by more than the delta.
      if best_score is None or current_score > best_score + params['early_stopping_delta']:
          best_score = current_score
          epochs_no_improve = 0
          # Keep the best checkpoint on disk; it is reloaded after the loop.
          model.save_pretrained(save_path)
      else:
          epochs_no_improve += 1

      if epochs_no_improve >= params['early_stopping_patience']:
          print(f"Early stopping at epoch {epoch + 1}")
          break

    # Test with the best checkpoint rather than the weights of the last epoch.
    model = BertForSequenceClassification.from_pretrained(save_path)
    model.to(device)

  if params['evaluate_chunk']:
    test_metrics = test_chunk(model, test_loader, tokenizer, device)
  else:
    test_metrics = test(model, test_loader, device)
  # NOTE(review): test()/test_chunk() already prefix their keys with 'test/' or
  # 'test_chunk/', so the logged names become e.g. 'test_test/accuracy'. Left
  # unchanged to stay comparable with existing wandb runs - confirm intent.
  wandb.log({'test_' + k: v for k, v in test_metrics.items()})
  print(test_metrics)
  wandb.finish()

In [None]:
def test_bertaa(train_df, val_df, test_df, params):
  """Evaluate a BERT sequence classifier on `test_df` and log the metrics to wandb.

  The mode is selected by `val_df`:

  * `len(val_df) == 0`: a fresh model from `get_model` is trained on `train_df`
    for `params['epochs']` epochs (no validation, no early stopping), saved to
    `params['output_dir'] + 'final_model'`, and then evaluated. Labels are
    encoded with the encoder stored at `params['encoder_path']`.
  * otherwise: nothing is trained. The checkpoint at
    `params['output_dir'] + params['wandb_name'].split('_')[0] + '/best_model'`
    is loaded and evaluated. Labels are encoded with a `LabelEncoder` fitted on
    `train_df['user_name']`; the contents of `val_df` are not used.
    NOTE(review): this presumes the checkpoint was trained with the same label
    mapping -- confirm against run_bertaa.

  Args:
    train_df: frame with 'user_name' and 'prompt' columns.
    val_df: only its length is inspected (see above).
    test_df: frame with 'user_name' and 'prompt' columns.
    params: dict of settings; keys read here are 'model_path', 'encoder_path',
      'max_seq_len', 'batch_size', 'num_labels', 'only_train_classifier', 'lr',
      'weight_decay', 'epochs', 'warmup_steps', 'output_dir', 'evaluate_chunk',
      'wandb_project', 'wandb_name' and 'wandb_tags'.

  Returns:
    None. Test metrics are printed and logged to wandb with a 'test_' prefix.
  """
  tokenizer = BertTokenizer.from_pretrained(params['model_path'])

  if len(val_df) == 0:
    # NOTE(review): joblib.load unpickles the file; only point encoder_path at trusted files.
    label_encoder = joblib.load(params['encoder_path'])
    train_labels = label_encoder.transform(train_df['user_name'])
    test_labels = label_encoder.transform(test_df['user_name'])

    train_dataset = TextDataset(train_df['prompt'], train_labels, tokenizer, params['max_seq_len'])
    test_dataset = TextDataset(test_df['prompt'], test_labels, tokenizer, params['max_seq_len'])

    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

    model = get_model(params['model_path'], params['num_labels'], use_cuda=torch.cuda.is_available(), cuda_device=0, only_train_classifier=params['only_train_classifier'])

    model.to(device)

    optimizer = AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    total_steps = len(train_loader) * params['epochs']
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=params['warmup_steps'], num_training_steps=total_steps)

    # NOTE(review): eval() runs whatever expression is stored in params['wandb_tags'];
    # kept as-is to match run_bertaa, but ast.literal_eval would be the safer parser.
    wandb.init(project=params['wandb_project'], config=params, reinit=True, name=params['wandb_name'], tags=eval(params['wandb_tags']))

    for epoch in tqdm.tqdm(range(params['epochs']), total=params['epochs']):
      train_loss, train_acc = train(model, train_loader, optimizer, scheduler, device)
      print(f"Epoch {epoch}, Loss: {train_loss}, Accuracy: {train_acc}")

      wandb.log({
          'epoch': epoch,
          'train_loss': train_loss,
          'train_acc' : train_acc,
      })

    save_path = params['output_dir'] + 'final_model'
    model.save_pretrained(save_path)

  else:
    # Evaluation only: the training frame is needed solely to fit the label
    # encoder, so no train/validation datasets or loaders are built here.
    label_encoder = LabelEncoder()
    label_encoder.fit(train_df['user_name'])
    test_labels = label_encoder.transform(test_df['user_name'])

    test_dataset = TextDataset(test_df['prompt'], test_labels, tokenizer, params['max_seq_len'])
    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)

    # NOTE(review): eval() runs whatever expression is stored in params['wandb_tags'];
    # kept as-is to match run_bertaa, but ast.literal_eval would be the safer parser.
    wandb.init(project=params['wandb_project'], config=params, reinit=True, name=params['wandb_name'], tags=eval(params['wandb_tags']))

    # Same path scheme run_bertaa uses when it saves its best checkpoint.
    save_path = params['output_dir'] + params['wandb_name'].split('_')[0] + '/best_model'
    model = BertForSequenceClassification.from_pretrained(save_path)
    model.to(device)

  if params['evaluate_chunk']:
    test_metrics = test_chunk(model, test_loader, tokenizer, device)
  else:
    test_metrics = test(model, test_loader, device)
  wandb.log({'test_' + k: v for k, v in test_metrics.items()})
  print(test_metrics)
  wandb.finish()

# run

## IMDB62

In [None]:
if __name__ == "__main__":
  # IMDB62 baseline run with evaluate_chunk disabled.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so every 'imdb62_*' run writes to the same
  # '<output_dir>imdb62/best_model'.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'imdb62_1',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_val.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/label_encoder.pkl',
      'evaluate_chunk': False,
  }

  # The three splits are loaded identically in both modes.
  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])

  train_df.columns = ['user_name', 'prompt']
  val_df.columns = ['user_name', 'prompt']
  test_df.columns = ['user_name', 'prompt']

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # IMDB62 baseline run with evaluate_chunk enabled.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so every 'imdb62_*' run writes to the same
  # '<output_dir>imdb62/best_model'.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'imdb62_2',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_train.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_val.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  # The three splits are loaded identically in both modes.
  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])

  train_df.columns = ['user_name', 'prompt']
  val_df.columns = ['user_name', 'prompt']
  test_df.columns = ['user_name', 'prompt']

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


## blogs50

In [None]:
if __name__ == "__main__":
  # blogs50 baseline. With final_run=False this cell only evaluates an existing
  # checkpoint through test_bertaa; with final_run=True it trains via run_bertaa.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'blogs50_1',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_train.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_AA_val.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_AA_test.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/blogs50/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  # The three splits are loaded identically in both modes.
  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])

  train_df.columns = ['user_name', 'prompt']
  val_df.columns = ['user_name', 'prompt']
  test_df.columns = ['user_name', 'prompt']

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

    training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

    # run_bertaa takes (train_df, val_df, test_df, params).
    run_bertaa(train_df, val_df, test_df, training_args)

  else:
    training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

    test_bertaa(train_df, val_df, test_df, training_args)


## diffusiondb

In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline. No validation file exists for this dataset
  # ('val_dataset' is empty), so validation data is carved out of the training file.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so every 'diffusiondb*' run writes to the same
  # '<output_dir>diffusiondb/best_model'.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_1',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/processed/train_random100_1.csv',
      'val_dataset': '',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_data = pd.read_csv(training_args['train_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  train_data.columns = ['user_name', 'prompt']
  test_df.columns = ['user_name', 'prompt']

  if training_args['final_run']:
    # Final run: train on everything available. The validation file is read
    # only when a path is configured (pd.read_csv('') would fail).
    train_parts = [train_data]
    if training_args['val_dataset']:
      val_data = pd.read_csv(training_args['val_dataset'])
      val_data.columns = ['user_name', 'prompt']
      train_parts.append(val_data)
    train_df = pd.concat(train_parts, ignore_index=True)
    val_df = pd.DataFrame()

  else:
    # Hold out 25% of each author's prompts for validation. The parts are
    # collected and concatenated once, and the split is pinned to the
    # notebook-level `seed` so it does not depend on which cells ran before.
    train_parts, val_parts = [], []
    for author in train_data['user_name'].unique():
      author_df = train_data[train_data['user_name'] == author]
      author_train, author_val = train_test_split(author_df, test_size=0.25, random_state=seed)
      train_parts.append(author_train)
      val_parts.append(author_val)

    train_df = pd.concat(train_parts, ignore_index=True)
    val_df = pd.concat(val_parts, ignore_index=True)

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline. With final_run=False this cell only evaluates the
  # existing checkpoint through test_bertaa; with final_run=True it trains via
  # run_bertaa. 'val_dataset' is empty for this dataset.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_1',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/processed/train_random100_1.csv',
      'val_dataset': '',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_data = pd.read_csv(training_args['train_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  train_data.columns = ['user_name', 'prompt']
  test_df.columns = ['user_name', 'prompt']

  if training_args['final_run']:
    # Final run: train on everything available. The validation file is read
    # only when a path is configured (pd.read_csv('') would fail).
    train_parts = [train_data]
    if training_args['val_dataset']:
      val_data = pd.read_csv(training_args['val_dataset'])
      val_data.columns = ['user_name', 'prompt']
      train_parts.append(val_data)
    train_df = pd.concat(train_parts, ignore_index=True)
    val_df = pd.DataFrame()

    training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

    # run_bertaa takes (train_df, val_df, test_df, params).
    run_bertaa(train_df, val_df, test_df, training_args)

  else:
    # Hold out 25% of each author's prompts for validation. The parts are
    # collected and concatenated once, and the split is pinned to the
    # notebook-level `seed` so it does not depend on which cells ran before.
    train_parts, val_parts = [], []
    for author in train_data['user_name'].unique():
      author_df = train_data[train_data['user_name'] == author]
      author_train, author_val = train_test_split(author_df, test_size=0.25, random_state=seed)
      train_parts.append(author_train)
      val_parts.append(author_val)

    train_df = pd.concat(train_parts, ignore_index=True)
    val_df = pd.concat(val_parts, ignore_index=True)

    training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

    test_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline on the 'paraphrased' CSVs.
  # NOTE(review): wandb_name is still 'diffusiondb_1', and run_bertaa derives its
  # checkpoint directory from wandb_name.split('_')[0], so this run writes to the
  # same '<output_dir>diffusiondb/best_model' as the other 'diffusiondb*' runs.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_1',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/val_random100_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  print(train_df.columns)

  # Select the two needed columns by name in both modes, rather than renaming
  # positionally, which only works when a file has exactly two columns.
  train_df = train_df[['user_name', 'prompt']]
  test_df = test_df[['user_name', 'prompt']]
  val_df = val_df[['user_name', 'prompt']]

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


## diffusiondb_clean

In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline on the 'clean' CSVs.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so this run writes to the same
  # '<output_dir>diffusiondb/best_model' as the other 'diffusiondb*' runs.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_clean',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/clean/train_random100_label_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/clean/val_random100_label_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/clean/test_random100_label_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/clean/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  print(train_df.columns)

  # Select the two needed columns by name in both modes, rather than renaming
  # positionally, which only works when a file has exactly two columns.
  train_df = train_df[['user_name', 'prompt']]
  test_df = test_df[['user_name', 'prompt']]
  val_df = val_df[['user_name', 'prompt']]

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


## diffusiondb_vary

In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline on the 'vary' random60 CSVs.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so this run writes to the same
  # '<output_dir>diffusiondb/best_model' as the other 'diffusiondb*' runs.
  # NOTE(review): encoder_path points at the 'clean' label encoder; confirm it
  # matches this author set before setting final_run=True.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_vary_60',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random60_label_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/val_random60_label_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random60_label_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/clean/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  print(train_df.columns)

  # Select the two needed columns by name in both modes, rather than renaming
  # positionally, which only works when a file has exactly two columns.
  train_df = train_df[['user_name', 'prompt']]
  test_df = test_df[['user_name', 'prompt']]
  val_df = val_df[['user_name', 'prompt']]

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline on the 'vary' random80 CSVs.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so this run writes to the same
  # '<output_dir>diffusiondb/best_model' as the other 'diffusiondb*' runs.
  # NOTE(review): encoder_path points at the 'clean' label encoder; confirm it
  # matches this author set before setting final_run=True.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_vary_80',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random80_label_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/val_random80_label_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random80_label_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/clean/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  print(train_df.columns)

  # Select the two needed columns by name in both modes, rather than renaming
  # positionally, which only works when a file has exactly two columns.
  train_df = train_df[['user_name', 'prompt']]
  test_df = test_df[['user_name', 'prompt']]
  val_df = val_df[['user_name', 'prompt']]

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline on the 'vary' random120 CSVs.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so this run writes to the same
  # '<output_dir>diffusiondb/best_model' as the other 'diffusiondb*' runs.
  # NOTE(review): encoder_path points at the 'clean' label encoder; confirm it
  # matches this author set before setting final_run=True.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_vary_120',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random120_label_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/val_random120_label_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random120_label_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/clean/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  print(train_df.columns)

  # Select the two needed columns by name in both modes, rather than renaming
  # positionally, which only works when a file has exactly two columns.
  train_df = train_df[['user_name', 'prompt']]
  test_df = test_df[['user_name', 'prompt']]
  val_df = val_df[['user_name', 'prompt']]

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # DiffusionDB baseline on the 'vary' random100_150 CSVs.
  # NOTE(review): run_bertaa derives its checkpoint directory from
  # wandb_name.split('_')[0], so this run writes to the same
  # '<output_dir>diffusiondb/best_model' as the other 'diffusiondb*' runs.
  # NOTE(review): encoder_path points at the 'clean' label encoder; confirm it
  # matches this author set before setting final_run=True.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_vary_100_150',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random100_150_label_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/val_random100_150_label_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random100_150_label_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/clean/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  print(train_df.columns)

  # Select the two needed columns by name in both modes, rather than renaming
  # positionally, which only works when a file has exactly two columns.
  train_df = train_df[['user_name', 'prompt']]
  test_df = test_df[['user_name', 'prompt']]
  val_df = val_df[['user_name', 'prompt']]

  if training_args['final_run']:
    # Final run: fold the validation split into training and pass an empty
    # validation frame (pd.DataFrame, not pd.Dataframe).
    train_df = pd.concat([train_df, val_df], ignore_index=True)
    val_df = pd.DataFrame()

  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # run_bertaa takes (train_df, val_df, test_df, params) in both modes.
  run_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # BertAA baseline on the DiffusionDB "vary" files named *_random100_200_label_1.csv.
  # Every setting handed to run_bertaa lives in this dict; 'num_labels' is
  # added further down once the training data has been loaded.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'diffusiondb_vary_100_200',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/train_random100_200_label_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/val_random100_200_label_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random100_200_label_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/diffusiondb/clean/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  # Both modes read the same three CSV files, so load them once up front.
  train_df = pd.read_csv(training_args['train_dataset'])
  val_df = pd.read_csv(training_args['val_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])

  if training_args['final_run']:
    # Final run: columns are renamed positionally (each file must therefore
    # have exactly two columns) and the validation rows join the training set.
    train_df.columns = ['user_name', 'prompt']
    val_df.columns = ['user_name', 'prompt']
    test_df.columns = ['user_name', 'prompt']

    train_df = pd.concat([train_df, val_df], ignore_index=True)
    # Fixed: this was `pd.Dataframe()`, which raises AttributeError.
    val_df = pd.DataFrame()
    # NOTE(review): val_df is intentionally empty here; confirm run_bertaa
    # copes with an empty validation frame when 'final_run' is True.
  else:
    print(train_df.columns)
    # Regular run: keep the two columns of interest, selected by name.
    train_df = train_df[['user_name', 'prompt']]
    test_df = test_df[['user_name', 'prompt']]
    val_df = val_df[['user_name', 'prompt']]

  # One output class per distinct author present in the training data.
  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # Fixed: the final-run branch used to call run_bertaa with three arguments,
  # dropping val_df; both modes now use the same four-argument call.
  run_bertaa(train_df, val_df, test_df, training_args)


## twitter_micro

In [None]:
if __name__ == "__main__":
  # BertAA baseline on the twitter_micro data. No validation file is given
  # ('val_dataset' is empty), so a regular run carves a validation set out of
  # the training file, author by author.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'twitter_micro_1',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_1.csv',
      'val_dataset': '',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/twitter_micro/processed/test_random100_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/twitter_micro/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  # Columns are renamed positionally, so each CSV must have exactly two columns.
  train_data = pd.read_csv(training_args['train_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  train_data.columns = ['user_name', 'prompt']
  test_df.columns = ['user_name', 'prompt']

  if training_args['final_run']:
    # Final run: train on everything available. A separate validation file is
    # merged in only when a path is configured; previously an empty path was
    # passed straight to pd.read_csv, which fails.
    train_df = train_data
    if training_args['val_dataset']:
      extra_df = pd.read_csv(training_args['val_dataset'])
      extra_df.columns = ['user_name', 'prompt']
      train_df = pd.concat([train_df, extra_df], ignore_index=True)
    # Fixed: this was `pd.Dataframe()`, which raises AttributeError.
    val_df = pd.DataFrame()
    # NOTE(review): val_df is intentionally empty here; confirm run_bertaa
    # copes with an empty validation frame when 'final_run' is True.
  else:
    # Hold out 25% of every author's rows for validation. train_test_split is
    # called without random_state, so it draws from the global NumPy RNG.
    train_parts = []
    val_parts = []
    for author in train_data['user_name'].unique():
      author_df = train_data[train_data['user_name'] == author]
      train1, val1 = train_test_split(author_df, test_size=0.25)
      train_parts.append(train1)
      val_parts.append(val1)

    # Concatenate once instead of growing the frames inside the loop.
    train_df = pd.concat(train_parts, ignore_index=True)
    val_df = pd.concat(val_parts, ignore_index=True)

  # One output class per distinct author present in the training data.
  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # Fixed: the final-run branch used to call run_bertaa with three arguments,
  # dropping val_df; both modes now use the same four-argument call.
  run_bertaa(train_df, val_df, test_df, training_args)


In [None]:
if __name__ == "__main__":
  # BertAA baseline on the twitter_micro data.
  # NOTE(review): 'val_dataset' is read only when 'final_run' is True; a
  # regular run ignores it and carves validation rows out of the training
  # file instead. Confirm that this is intended.
  training_args = {
      'wandb_project': 'bertaa',
      'wandb_name': 'twitter_micro_1',
      'wandb_tags': '["baseline"]',
      'train_dataset': '/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random100_1.csv',
      'val_dataset': '/content/drive/MyDrive/msc_project/data/twitter_micro/processed/val_random100_1.csv',
      'test_dataset': '/content/drive/MyDrive/msc_project/data/twitter_micro/processed/test_random100_1.csv',
      'model_path': 'bert-base-cased',
      'epochs': 20,
      'batch_size': 16,
      'lr': 3e-5,
      'warmup_ratio': 0.15,
      'warmup_steps': 0,
      'weight_decay': 1e-5,
      'max_seq_len': 512,
      'doc_stride': 0.8,
      'output_dir': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/',
      'early_stopping_delta': 0.01,
      'early_stopping_patience': 5,
      'final_run': False,
      'only_train_classifier': False,
      'encoder_path': '/content/drive/MyDrive/msc_project/model/baseline/BertAA/twitter_micro/label_encoder.pkl',
      'evaluate_chunk': True,
  }

  # Columns are renamed positionally, so each CSV must have exactly two columns.
  train_data = pd.read_csv(training_args['train_dataset'])
  test_df = pd.read_csv(training_args['test_dataset'])
  train_data.columns = ['user_name', 'prompt']
  test_df.columns = ['user_name', 'prompt']

  if training_args['final_run']:
    # Final run: the validation file's rows are folded into the training set.
    # An empty 'val_dataset' path means there is no separate validation file.
    train_df = train_data
    if training_args['val_dataset']:
      extra_df = pd.read_csv(training_args['val_dataset'])
      extra_df.columns = ['user_name', 'prompt']
      train_df = pd.concat([train_df, extra_df], ignore_index=True)
    # Fixed: this was `pd.Dataframe()`, which raises AttributeError.
    val_df = pd.DataFrame()
    # NOTE(review): val_df is intentionally empty here; confirm run_bertaa
    # copes with an empty validation frame when 'final_run' is True.
  else:
    # Hold out 25% of every author's rows for validation. train_test_split is
    # called without random_state, so it draws from the global NumPy RNG.
    train_parts = []
    val_parts = []
    for author in train_data['user_name'].unique():
      author_df = train_data[train_data['user_name'] == author]
      train1, val1 = train_test_split(author_df, test_size=0.25)
      train_parts.append(train1)
      val_parts.append(val1)

    # Concatenate once instead of growing the frames inside the loop.
    train_df = pd.concat(train_parts, ignore_index=True)
    val_df = pd.concat(val_parts, ignore_index=True)

  # One output class per distinct author present in the training data.
  training_args['num_labels'] = len(train_df['user_name'].drop_duplicates())

  # Fixed: the final-run branch used to call run_bertaa with three arguments,
  # dropping val_df; both modes now use the same four-argument call.
  run_bertaa(train_df, val_df, test_df, training_args)


## test

In [None]:
def training_BERT_varying(df_100_random_users, sample, epochs, number_of_users):
  """Split the data, train a BertClassifier on it and return the top-5 test score.

  The frame is split so that ``sample * 100`` rows go to the training side
  (of which 20% are then held out for validation) and all remaining rows
  form the test set. The three splits are written to ``train_df.csv``,
  ``val_df.csv`` and ``test_df.csv`` and the trained model is saved to
  ``model.pth``, all in the current working directory.

  Args:
    df_100_random_users: DataFrame holding the rows to split.
    sample: Multiplier for the training size; ``sample * 100`` rows are
      requested. NOTE(review): the factor 100 is hard-coded and is not tied
      to ``number_of_users`` -- confirm this is intended.
    epochs: Forwarded to ``train``.
    number_of_users: Second argument to ``BertClassifier`` (presumably the
      number of output classes -- confirm against its definition).

  Returns:
    The second value returned by ``test(model, test_loader)``.

  Raises:
    ValueError: If ``sample * 100`` is not strictly between 0 and the number
      of rows, i.e. no non-empty train/test split exists.
  """
  pretrained_model = 'bert-base-cased'
  batch_size = 128
  learning_rate = 2e-5

  total_rows = len(df_100_random_users)
  desired_train_size = sample * 100

  print('Size of DF :', total_rows)
  print('Training Samples :', sample)

  # Fail early with a clear message, before the tokenizer is loaded. Without
  # this check an infeasible request only surfaced as a ZeroDivisionError
  # (empty frame) or as an error inside train_test_split.
  if not 0 < desired_train_size < total_rows:
    raise ValueError(
        f'Cannot take {desired_train_size} training rows out of {total_rows}; '
        'sample * 100 must be > 0 and < len(df_100_random_users).')

  # load tokenizer
  tokenizer = BertTokenizer.from_pretrained(pretrained_model)

  # Fraction of the frame that goes to the training side of the first split.
  train_size_percent = desired_train_size / total_rows

  # First split: train side vs. test; second split: 20% of the train side
  # becomes the validation set.
  train_df, test_df = train_test_split(df_100_random_users, test_size=1-train_size_percent, random_state=42)
  train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
  train_df.to_csv('train_df.csv', index=False)
  test_df.to_csv('test_df.csv', index=False)
  val_df.to_csv('val_df.csv', index=False)

  remaining_rows = total_rows - len(train_df)
  print('Train DF Size :', len(train_df))
  print('Remaining DF Size :', remaining_rows)
  print('Test DF Size :', len(test_df))
  # NOTE(review): this "30%" check appears to be left over from an earlier
  # version that sampled 30% of the remainder for testing; the test set is now
  # the whole test side of the first split. The backslash is escaped
  # explicitly ('\\%' prints the same text as the former invalid escape '\%').
  print('Check (proof) :', '30 \\%', 'of', remaining_rows, '=', remaining_rows * 0.3)

  train_dataset, val_dataset, test_dataset = preprocess_data(train_df, val_df, test_df, tokenizer)
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  model = BertClassifier(pretrained_model, number_of_users)
  model = model.to(device)

  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  criterion = nn.CrossEntropyLoss()

  patience = 3
  train(model, optimizer, criterion, epochs, patience, train_loader, val_loader)
  # Saves the whole model object, not just its state_dict.
  torch.save(model, 'model.pth')
  accuracy, top5_accuracy = test(model, test_loader)

  return top5_accuracy

In [None]:
# Fine-tune BERT + linear head: for every user count, repeat the experiment
# once per entry in `times`, and within each repetition once per training
# size in `samples`. Each result row is [sample, top5_accuracy, number_of_users].
results_BERT = []
# Full sweep: samples = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 170, 175, 180, 200]
samples = [70]
times = [1, 2, 3, 4, 5]
epochs = 30

number_of_users_all = [196]

for number_of_users in number_of_users_all:
  # The repetition counter was previously named `time`, which rebound the
  # imported `time` module for the rest of the kernel session.
  for run in times:
    # A fresh set of users is fetched for every repetition.
    df_100_random_users = get_random_100_users(number_of_users)

    for i, sample in enumerate(samples, start=1):
      print('Subrun :', i, '/', len(samples))

      #### BERT ####
      print(' - BERT - ')
      # NOTE(review): the function defined in the preceding cell is
      # `training_BERT_varying` (same four parameters); confirm that
      # `training_BERT` is defined elsewhere and is the intended callee.
      top5_accuracy = training_BERT(df_100_random_users, sample, epochs, number_of_users)
      results_BERT.append([sample, top5_accuracy, number_of_users])
      print(results_BERT)
      ##############

      print('Run', run, 'finished!')
      print('\nInterim Results :\n')
      print('BERT_Results_left =', results_BERT, '\n')

  print('BERT_Results_left =', results_BERT, '\n')
