In [None]:
import argparse
import os
import random
import warnings
import tarfile
import gdown

import numpy as np
import pandas as pd
from tqdm import tqdm
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Sampler
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
from matplotlib.collections import QuadMesh
import seaborn as sn

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

from transformers import BertModel, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration: storage locations, corpus download, RNG seeds.
# ---------------------------------------------------------------------------

# Checkpoint directory and dataset directory; both are absolute paths under a
# Google Drive mount ("/content/drive/..."), so they only resolve where that
# mount exists.
ckpt_dir = '/content/drive/MyDrive/msc_project/model/contrastive/lcl'
dataset_path = "/content/drive/MyDrive/msc_project/datasets"
# NOTE(review): load_dataset_dataframe() looks up `dataset_file_name[source]`,
# but in this part of the notebook the mapping exists only as the commented-out
# code below. Confirm it is defined in a later cell before that function runs.
# dataset_file_name = {
#     "imdb62": 'full_imdb62.csv',
#     "blog": 'full_blog.csv',
#     "turing": "turing_ori_0208.csv"
# }
# Archive file name -> Google Drive URL. Nothing in this cell downloads it;
# presumably a later cell does (gdown/tarfile are imported) -- TODO confirm.
datasets = {
    'contrax_datasets.tar': 'https://drive.google.com/uc?id=1T3VgMe-dCy5QVI7b1K2KdfL-2e2gq2Rn'
}
# Create the dataset directory up front; exist_ok makes re-running the cell safe.
os.makedirs(dataset_path, exist_ok=True)

# NLTK resources needed by fil_sent() (stopwords) and process() (word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

# Seed the three random number generators used in this notebook.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# utils

In [None]:
def get_new_fig(fn, figsize=[9, 9]):
    """
    Fetch (or create) the matplotlib figure identified by ``fn`` and hand back
    a blank axes on it.

    :param fn: figure identifier forwarded to ``plt.figure``
    :param figsize: figure size forwarded to ``plt.figure``
    :return: tuple ``(figure, axes)``; the axes has been cleared with ``cla()``
    """
    figure = plt.figure(fn, figsize=figsize)
    axes = figure.gca()
    # Wipe whatever a previous call drew on this axes.
    axes.cla()
    return figure, axes


def configcell_text_and_colors(array_df, lin, col, oText, facecolors, posi, fz, fmt, show_null_values=0):
    """
    Configure the annotation text and background colour of one confusion-matrix cell.

    Cells in the last row/column are "total" cells: their original annotation is
    scheduled for removal and three replacement texts are produced (count,
    percentage on the diagonal, percentage off the diagonal). Every other cell
    keeps its annotation object, which is updated in place.

    :param array_df: 2-D numpy array of the matrix including the totals row/column
    :param lin: row index of the cell
    :param col: column index of the cell
    :param oText: annotation object of the cell; must provide ``get_position()``,
                  ``set_text()`` and ``set_color()`` (e.g. a matplotlib ``Text``)
    :param facecolors: indexable colour container; entry ``posi`` is overwritten
                       for total cells and for diagonal cells
    :param posi: position of the cell inside ``facecolors``
    :param fz: font size for the replacement texts of total cells
    :param fmt: accepted for interface compatibility; not used (original TODO)
    :param show_null_values: how an empty regular cell is shown:
                             0 -> '', 1 -> '0', anything else -> '0\\n0.0%'
    :return: ``(text_add, text_del)`` where ``text_add`` is a list of dicts with
             keys ``x``, ``y``, ``text``, ``kw`` and ``text_del`` is a list of
             annotation objects to remove
    """
    text_add = []
    text_del = []
    cell_val = array_df[lin][col]
    tot_all = array_df[-1][-1]
    per = (float(cell_val) / tot_all) * 100
    last = len(array_df[:, col]) - 1
    in_last_col = col == last
    in_last_lin = lin == last

    if in_last_col or in_last_lin:
        # Total cell: share of the diagonal ("right") count within this total.
        if cell_val != 0:
            if in_last_col and in_last_lin:
                tot_rig = sum(array_df[i][i] for i in range(array_df.shape[0] - 1))
            elif in_last_col:
                tot_rig = array_df[lin][lin]
            else:
                tot_rig = array_df[col][col]
            per_ok = (float(tot_rig) / cell_val) * 100
            per_err = 100 - per_ok
        else:
            per_ok = per_err = 0

        # A conditional expression instead of indexing a list with the result of
        # a comparison, which is a numpy boolean when the matrix is a numpy array.
        per_ok_s = '100%' if per_ok == 100 else '%.2f%%' % per_ok

        # The original annotation is replaced by the three texts below.
        text_del.append(oText)

        font_prop = fm.FontProperties(weight='bold', size=fz)
        base_kwargs = dict(color='w', ha="center", va="center", gid='sum', fontproperties=font_prop)
        # Public accessor rather than the private ``_x`` / ``_y`` attributes.
        x, y = oText.get_position()
        replacement_rows = (
            ('%d' % (cell_val), 'w', -0.3),
            (per_ok_s, 'g', 0),
            ('%.2f%%' % (per_err), 'r', 0.3),
        )
        for text, color, dy in replacement_rows:
            text_add.append(dict(x=x, y=y + dy, text=text, kw=dict(base_kwargs, color=color)))

        # Background of total cells; the bottom-right grand total is darker.
        carr = [0.27, 0.30, 0.27, 1.0]
        if in_last_col and in_last_lin:
            carr = [0.17, 0.20, 0.17, 1.0]
        facecolors[posi] = carr

    else:
        if per > 0:
            txt = '%s\n%.2f%%' % (cell_val, per)
        elif show_null_values == 0:
            txt = ''
        elif show_null_values == 1:
            txt = '0'
        else:
            txt = '0\n0.0%'
        oText.set_text(txt)

        if col == lin:
            # Main diagonal: white text on a highlighted background.
            oText.set_color('w')
            facecolors[posi] = [0.35, 0.8, 0.55, 1.0]
        else:
            oText.set_color('r')

    return text_add, text_del


def insert_totals(df_cm):
    """
    Add a totals column and a totals row to ``df_cm`` (modified in place).

    The new column is labelled 'sum_lin' and holds the sum of every row; the new
    row is labelled 'sum_col' and holds the sum of every original column plus,
    in the last position, the grand total.
    """
    column_totals = [df_cm[name].sum() for name in df_cm.columns]
    row_totals = [row.sum() for _, row in df_cm.iterrows()]
    df_cm['sum_lin'] = row_totals
    # The grand total fills the corner created by the new column.
    df_cm.loc['sum_col'] = column_totals + [np.sum(row_totals)]


def pretty_plot_confusion_matrix(df_cm, annot=True, cmap="Oranges", fmt='.2f', fz=11,
                                 lw=0.5, cbar=False, figsize=[8, 8], show_null_values=0, pred_val_axis='y'):
    """
      Draw a confusion matrix as an annotated heatmap (MATLAB-like layout) and show it.

      A totals row and column are added to a private copy of the matrix, so the
      DataFrame passed by the caller is never modified.

      params:
        df_cm            dataframe (pandas) without totals
        annot            print text in each cell
        cmap             Oranges,Oranges_r,YlGnBu,Blues,RdBu, ...
        fmt              annotation format forwarded to seaborn's heatmap
        fz               fontsize
        lw               linewidth
        cbar             whether to draw the colour bar
        figsize          figure size
        show_null_values how empty cells are rendered (see configcell_text_and_colors)
        pred_val_axis    where to show the prediction values (x or y axis)
                          'col' or 'x': show predicted values in columns (x axis) instead lines
                          'lin' or 'y': show predicted values in lines   (y axis)
    """
    if pred_val_axis in ('col', 'x'):
        xlbl = 'Predicted'
        ylbl = 'Actual'
        # Work on a copy: insert_totals() mutates its argument.
        df_cm = df_cm.copy()
    else:
        xlbl = 'Actual'
        ylbl = 'Predicted'
        df_cm = df_cm.T.copy()

    # create "Total" row and column
    insert_totals(df_cm)

    # always draw into the same named figure
    fig, ax1 = get_new_fig('Conf matrix default', figsize)

    ax = sn.heatmap(df_cm, annot=annot, annot_kws={"size": fz}, linewidths=lw, ax=ax1,
                    cbar=cbar, cmap=cmap, linecolor='w', fmt=fmt)

    # set ticklabels rotation
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=10)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=25, fontsize=10)

    # Hide the tick marks through the tick line artists. Assigning the
    # ``tick1On`` / ``tick2On`` attributes has no effect on Matplotlib
    # versions that removed them.
    for axis in (ax.xaxis, ax.yaxis):
        for tick in axis.get_major_ticks():
            tick.tick1line.set_visible(False)
            tick.tick2line.set_visible(False)

    # face colours of the heatmap cells, edited cell by cell below
    quadmesh = ax.findobj(QuadMesh)[0]
    facecolors = quadmesh.get_facecolors()

    # walk the annotation texts; ``posi`` indexes the matching face colour
    array_df = np.array(df_cm.to_records(index=False).tolist())
    text_add = []
    text_del = []
    for posi, t in enumerate(ax.texts):
        # annotations sit at cell centres, so subtracting 0.5 gives (col, row)
        pos = np.array(t.get_position()) - [0.5, 0.5]
        lin = int(pos[1])
        col = int(pos[0])

        txt_res = configcell_text_and_colors(array_df, lin, col, t, facecolors, posi, fz, fmt, show_null_values)

        text_add.extend(txt_res[0])
        text_del.extend(txt_res[1])

    # remove the annotations of the total cells ...
    for item in text_del:
        item.remove()
    # ... and draw their replacements
    for item in text_add:
        ax.text(item['x'], item['y'], item['text'], **item['kw'])

    # titles and legends
    ax.set_title('Confusion matrix')
    ax.set_xlabel(xlbl)
    ax.set_ylabel(ylbl)
    plt.tight_layout()  # set layout slim
    plt.show()


def plot_confusion_matrix_from_data(y_test, predictions, columns=None, annot=True, cmap="Oranges",
                                    fmt='.2f', fz=11, lw=0.5, cbar=False, figsize=[9, 9], show_null_values=2,
                                    pred_val_axis='lin'):
    """
        Build a confusion matrix from actual values and predictions, then plot it
        with pretty_plot_confusion_matrix.

        All styling arguments are now forwarded to the plotting function.
        Previously ``cmap``, ``fz``, ``figsize`` and ``show_null_values`` were
        overwritten with fixed values ('Oranges', 11, [9, 9], 2) and ``annot``,
        ``fmt``, ``lw`` and ``cbar`` were not forwarded at all. The defaults of
        ``figsize`` and ``show_null_values`` are set to those formerly forced
        values so that a call relying on defaults renders exactly as before.

        :param y_test: actual class labels
        :param predictions: predicted class labels
        :param columns: optional class names used as row/column labels; when not
                        given, 'class A', 'class B', ... are generated (numeric
                        names are used when there are more than 26 classes)
        :param pred_val_axis: see pretty_plot_confusion_matrix
    """
    from sklearn.metrics import confusion_matrix
    from string import ascii_uppercase

    confm = confusion_matrix(y_test, predictions)
    # The matrix covers every label present in y_test OR predictions, so its
    # size (not the number of distinct y_test values) decides how many names
    # are required.
    n_classes = confm.shape[0]

    # ``columns is None`` / ``len`` instead of ``not columns``: the truth value
    # of an array-like argument is ambiguous.
    if columns is None or len(columns) == 0:
        if n_classes <= len(ascii_uppercase):
            columns = ['class %s' % (letter) for letter in ascii_uppercase[:n_classes]]
        else:
            columns = ['class %d' % (i) for i in range(n_classes)]

    df_cm = pd.DataFrame(confm, index=columns, columns=columns)
    pretty_plot_confusion_matrix(df_cm, annot=annot, cmap=cmap, fmt=fmt, fz=fz, lw=lw, cbar=cbar,
                                 figsize=figsize, show_null_values=show_null_values,
                                 pred_val_axis=pred_val_axis)


def fil_sent(sent):
    """
    Drop English stopwords from a whitespace-separated sentence.

    The comparison is exact (case-sensitive) against NLTK's English stopword
    list; the surviving tokens are re-joined with single spaces.
    """
    stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in sent.split():
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


def process(sent):
    """
    Normalise a text: tokenize, lowercase and Porter-stem every token, then
    remove English stopwords (via fil_sent) from the stemmed result.

    ``sent`` is converted with ``str()`` first, so non-string input is accepted.
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(sent))
    stemmed_tokens = [stemmer.stem(str(token).lower()) for token in tokens]
    return fil_sent(' '.join(stemmed_tokens))


def extract_style(text):
    """
    Extract the stylometric features of a text.

    Returns a ``pd.Series`` of 55 values in this fixed order:
      avg_len, len_text, len_words, num_short_w, per_digit, per_cap,
      f_a .. f_z, f_0 .. f_9, f_e_0 .. f_e_11, richness

    where
      * avg_len      mean length of the whitespace-separated words
      * len_text     number of characters
      * len_words    number of words
      * num_short_w  number of words shorter than 3 characters
      * per_digit    share of characters that are digits
      * per_cap      share of characters that are upper case
      * f_<c>        share of characters whose lower-cased form equals <c>
      * f_e_0..11    the same share for ! - : ? . , ; ' / ( ) &
      * richness     distinct words / words

    Empty or whitespace-only input, which used to raise ZeroDivisionError,
    now yields 0 for every ratio that would divide by zero.
    """
    from collections import Counter

    text = str(text)
    len_text = len(text)
    words = text.split()
    len_words = len(words)

    avg_len = np.mean([len(w) for w in words]) if words else 0.0
    num_short_w = len([w for w in words if len(w) < 3])

    def share(count):
        # Fraction of the text's characters; 0.0 for an empty text.
        return count / len_text if len_text else 0.0

    per_digit = share(sum(ch.isdigit() for ch in text))
    per_cap = share(sum(1 for ch in text if ch.isupper()))

    # One pass over the text. Lower-casing is done per character so that a
    # character counts for <c> exactly when ``ch.lower() == c``.
    char_counts = Counter(ch.lower() for ch in text)
    tracked_chars = "abcdefghijklmnopqrstuvwxyz" + "0123456789" + "!-:?.,;'/()&"
    char_shares = [share(char_counts[c]) for c in tracked_chars]

    richness = len(set(words)) / len_words if words else 0.0

    return pd.Series(
        [avg_len, len_text, len_words, num_short_w, per_digit, per_cap] + char_shares + [richness])


# Columns kept from the input frame: author, raw text, tf-idf text and the
# stylometric features (in the order produced by extract_style).
_STYLE_FEATURE_COLUMNS = (
    ['From', 'content', 'content_tfidf',
     "avg_len", "len_text", "len_words", "num_short_w", "per_digit", "per_cap"]
    + ["f_" + letter for letter in "abcdefghijklmnopqrstuvwxyz"]
    + ["f_%d" % digit for digit in range(10)]
    + ["f_e_%d" % k for k in range(12)]
    + ["richness"]
)


def _shrink_per_class(frame, num_classes, perc):
    """Keep the first ``perc`` share of each class's rows, concatenated in class order."""
    parts = []
    for label in range(num_classes):
        rows = frame[frame["Target"] == label]
        parts.append(rows[:int(len(rows) * perc)])
    if not parts:
        return frame[:0]
    return pd.concat(parts, axis=0)


def build_train_test(df, source, limit, per_author=None, seed=None):
    """
    Keep the ``limit`` most frequent authors of ``df`` and split their texts.

    Authors (column 'From') are mapped to integer labels 0..N-1 in sorted
    order; the label is stored in a new 'Target' column. Rows with missing
    values in any retained column are dropped.

    :param df: DataFrame with 'From', 'content', 'content_tfidf' and the
               stylometric feature columns (plus 'train' for source 'turing')
    :param source: dataset name; selects the splitting strategy
        * 'turing': uses the existing 'train' column (1 = train, 0 = test,
          2 = validation) and keeps the first 50% of every class in each split
        * contains 'blog': stratified 80/10/10 split, then keeps the first 75%
          of every class in each split
        * contains 'imdb62': stratified 80/10/10 split
        * anything else: stratified 80/20 split
    :param limit: number of most frequent authors to keep
    :param per_author: must be None; any other value raises NotImplementedError
    :param seed: random state for the stratified splits (None -> 0 for the
                 blog/imdb62 strategies)
    :return:
        * 'turing' and 'blog': (train, test, validation)
        * 'imdb62': (train, validation, test)
        * otherwise: (train, test)

    NOTE(review): the imdb62 branch returns validation before test while the
    other three-way branches return test before validation. The order is kept
    as it was because callers are not visible here -- confirm it is intended.

    The per-class shrinking now covers every class that is present instead of
    a fixed 20 ('turing') / 50 ('blog'), so authors beyond those counts are no
    longer silently dropped.
    """
    # Select top N senders
    top_authors = list(df['From'].value_counts().index[:limit])
    sub_df = df[df['From'].isin(top_authors)]

    if per_author is not None:
        raise NotImplementedError()

    kept_columns = list(_STYLE_FEATURE_COLUMNS)
    if source == 'turing':
        kept_columns.append("train")
    sub_df = sub_df[kept_columns].dropna()

    print("Number of texts : ", len(sub_df))

    # author -> integer label, in sorted author order
    label_map = {author: k for k, author in enumerate(np.unique(sub_df.From))}
    num_classes = len(label_map)

    # ``assign`` returns a new frame, so nothing is written into a filtered
    # view of the caller's DataFrame.
    sub_df = sub_df.assign(Target=sub_df['From'].apply(lambda x: label_map[x]))

    if source == 'turing':
        perc = 0.5
        print("Percentage: " + str(perc))
        nlp_train = sub_df[sub_df["train"] == 1][['content', 'Target']]
        test_dict = sub_df[sub_df["train"] == 0][['content', 'Target']]
        val_dict = sub_df[sub_df["train"] == 2][['content', 'Target']]

        return (_shrink_per_class(nlp_train, num_classes, perc),
                _shrink_per_class(test_dict, num_classes, perc),
                _shrink_per_class(val_dict, num_classes, perc))

    if 'blog' in source or 'imdb62' in source:
        perc = 0.75
        print("seed: " + str(seed))

        if seed is None:
            seed = 0

        # 80% train, then the remaining 20% halved into validation and test
        ind = train_test_split(sub_df[['content', 'Target']], test_size=0.2, stratify=sub_df['Target'],
                               random_state=seed)
        nlp_train = sub_df.loc[list(ind[0].index)]

        val_test_sub_df = ind[1]
        ind2 = train_test_split(val_test_sub_df[['content', 'Target']], test_size=0.5,
                                stratify=val_test_sub_df['Target'], random_state=seed)
        val_dict = val_test_sub_df.loc[list(ind2[0].index)]
        test_dict = val_test_sub_df.loc[list(ind2[1].index)]

        if 'blog' in source:
            return (_shrink_per_class(nlp_train, num_classes, perc),
                    _shrink_per_class(test_dict, num_classes, perc),
                    _shrink_per_class(val_dict, num_classes, perc))

        return nlp_train, val_dict, test_dict

    ind = train_test_split(sub_df[['content', 'Target']], test_size=0.2, stratify=sub_df['Target'], random_state=seed)
    nlp_train = sub_df.loc[list(ind[0].index)]
    test_dict = sub_df.loc[list(ind[1].index)]

    return nlp_train, test_dict


def is_name_in_email(name, email):
    """
    Tell whether ``name`` occurs inside ``email``, ignoring case.

    Both arguments are converted with ``str()`` before the comparison.

    :return: 1 if the name is a substring of the email, otherwise 0
    """
    needle = str(name).lower()
    haystack = str(email).lower()
    return 1 if needle in haystack else 0


def load_dataset_dataframe(source):
    """
    Read the CSV file of the dataset called ``source`` into a DataFrame.

    The file is ``dataset_file_name[source]`` inside ``dataset_path``; both are
    module-level names that must exist when the function is called.

    Per-source handling:
      * "imdb62":      the first CSV column is used as the index
      * "blog":        read as is
      * 'diffusiondb': only 'prompt' and 'user_name' are kept and renamed to
                       'content' and 'Target'
      * anything else: rows are sorted by 'train' (descending) and then 'From'
                       (ascending)
    """
    print("Loading and processing dataframe")

    csv_path = os.path.join(dataset_path, dataset_file_name[source])

    if source == "imdb62":
        return pd.read_csv(csv_path, index_col=0)

    frame = pd.read_csv(csv_path)
    if source == "blog":
        return frame

    if source == 'diffusiondb':
        frame = frame[['prompt', 'user_name']]
        frame.columns = ['content', 'Target']
        return frame

    frame.sort_values(by=['train', 'From'], inplace=True, ascending=[False, True])
    return frame


class AverageMeter(object):
    """
    Running-average tracker.

    Attributes:
      val    the value passed to the most recent ``update``
      sum    weighted sum of all values seen since the last ``reset``
      count  total weight seen since the last ``reset``
      avg    ``sum / count``
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def save_model(ckpt_dir, cp_name, model):
    """
    Write the ``state_dict`` of ``model`` to ``ckpt_dir/cp_name``.

    ``ckpt_dir`` is created when missing. A model wrapped in
    ``torch.nn.DataParallel`` is unwrapped first, so the saved keys are those of
    the wrapped module.
    """
    os.makedirs(ckpt_dir, exist_ok=True)
    # unwrap the data-parallel container, if any
    target = model.module if isinstance(model, torch.nn.DataParallel) else model
    saving_model_path = os.path.join(ckpt_dir, cp_name)
    torch.save(target.state_dict(), saving_model_path)
    print(f'Model saved: {saving_model_path}')


def load_model_dic(model, ckpt_path, verbose=True, strict=True):
    """
    Load the weights stored at ``ckpt_path`` into ``model``.

    The checkpoint is first loaded as is. If ``load_state_dict`` rejects it, a
    leading 'module.' (the prefix added by ``torch.nn.DataParallel``) is
    stripped from the keys that carry it and loading is retried.

    :param model: module receiving the weights
    :param ckpt_path: path of a file written with ``torch.save(state_dict)``
    :param verbose: print a confirmation line when True
    :param strict: forwarded to ``load_state_dict``
    :return: ``model``
    :raises AssertionError: when ``ckpt_path`` does not exist
    :raises RuntimeError: when the weights do not fit even after stripping the prefix
    """
    assert os.path.exists(ckpt_path), f"trained model {ckpt_path} does not exist"

    # Read the file once. Tensors are loaded onto the CPU so a checkpoint saved
    # from a GPU can be read on any machine; load_state_dict copies the values
    # into the model's own parameters.
    state_dict = torch.load(ckpt_path, map_location="cpu")

    try:
        model.load_state_dict(state_dict, strict=strict)
    except RuntimeError:
        # Strip only a LEADING 'module.'; keys without it are kept unchanged
        # (str.partition turned such keys into the empty string).
        prefix = 'module.'
        state_dict = {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in state_dict.items()
        }
        model.load_state_dict(state_dict, strict=strict)
    if verbose:
        print(f'Model loaded: {ckpt_path}')

    return model

# dataset

In [None]:
class BertDataset(Dataset):
    """
    Map-style dataset that tokenizes texts lazily and caches the result.

    :param x: indexable collection of texts (``x[idx]`` must return a text)
    :param y: labels, converted with ``torch.tensor``
    :param tokenizer: object exposing ``batch_encode_plus`` whose result maps
                      'input_ids', 'token_type_ids' and 'attention_mask' to
                      tensors with a leading batch dimension
    :param length: every text is padded/truncated to this many tokens
    :param return_idx: when True, items additionally carry the index and the raw text
    """

    # Order in which the tokenizer outputs are returned by ``tokenize``.
    _TOKEN_FIELDS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, x, y, tokenizer, length=128, return_idx=False):
        super(BertDataset, self).__init__()
        self.tokenizer = tokenizer
        self.length = length
        self.x = x
        self.return_idx = return_idx
        self.y = torch.tensor(y)
        # idx -> [input_ids, token_type_ids, attention_mask]
        self.tokens_cache = {}

    def tokenize(self, x):
        """Tokenize one text; returns ``[input_ids, token_type_ids, attention_mask]``."""
        encoded = self.tokenizer.batch_encode_plus(
            [x],  # input must be a list
            max_length=self.length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_tensors="pt"
        )
        # Pick the fields by name rather than by their position in the returned
        # mapping, and drop the batch dimension of size one.
        return [encoded[field][0] for field in self._TOKEN_FIELDS]

    def __getitem__(self, idx):
        # Accept integral values of other types (e.g. 3.0) but reject 3.5.
        int_idx = int(idx)
        assert idx == int_idx
        idx = int_idx
        if idx not in self.tokens_cache:
            self.tokens_cache[idx] = self.tokenize(self.x[idx])
        input_ids, token_type_ids, attention_mask = self.tokens_cache[idx]
        if self.return_idx:
            return input_ids, token_type_ids, attention_mask, self.y[idx], idx, self.x[idx]
        return input_ids, token_type_ids, attention_mask, self.y[idx]

    def __len__(self):
        return len(self.y)


class TrainSampler(Sampler):
    """
    Sampler that orders indices so that consecutive groups of ``batch_size``
    contain ``int(batch_size * sim_ratio)`` samples of one class (positives)
    and fill the rest with samples drawn from other classes (negatives).

    Every index is yielded at most once per pass. The final group can be
    shorter than ``batch_size`` when the negatives run out, and because the
    groups are shuffled it is not necessarily the last one.

    :param dataset: object with attributes ``x`` and ``y``; ``y`` is a 1-D
                    tensor of integer class labels
    :param batch_size: size of the groups described above
    :param sim_ratio: share of a group taken from the positive class
    """

    def __init__(self, dataset, batch_size, sim_ratio=0.5):
        super().__init__(None)
        self.dataset = dataset
        self.batch_size = batch_size
        self.x = dataset.x
        self.y = dataset.y
        self.sim_ratio = sim_ratio
        self.num_pos_samples = int(batch_size * sim_ratio)
        print(f'train sampler with batch size = {batch_size} and postive sample ratio = {sim_ratio}')

        # The number of yielded indices is measured once with a trial pass
        # (this consumes random numbers from the global ``random`` state).
        self.length = len(list(self.__iter__()))

    def __iter__(self):
        # class label -> shuffled list of sample indices
        label_cluster = {}
        for i in range(len(self.y)):
            label_cluster.setdefault(self.y[i].item(), []).append(i)
        for members in label_cluster.values():
            random.shuffle(members)

        # At least one class must be able to provide the positives. (The check
        # used to look at class 0 only and failed with KeyError without it.)
        largest = max((len(members) for members in label_cluster.values()), default=0)
        assert largest > self.num_pos_samples, \
            f"only {largest} samples in each class, but {self.num_pos_samples} pos samples needed"

        # Computed once: the candidate classes, in ascending label order. For
        # labels 0..K-1 this equals range(max(y) + 1), which was rebuilt (with
        # a full scan of ``y``) for every single drawn sample before.
        classes = sorted(label_cluster)
        n_pos = self.num_pos_samples
        n_neg = self.batch_size - n_pos

        batch_indices = []
        negatives_exhausted = False
        while not negatives_exhausted:
            # pick a class that can still provide a full set of positives
            candidates = [c for c in classes if len(label_cluster[c]) >= n_pos]
            if len(candidates) == 0:
                break
            pos_class = random.choice(candidates)

            # take the positives from the end of that class's list; an explicit
            # start index keeps ``n_pos == 0`` from selecting the whole list
            pool = label_cluster[pos_class]
            cut = len(pool) - n_pos
            batch = pool[cut:]
            del pool[cut:]

            # fill up with negatives, one at a time, from any other non-empty class
            for _ in range(n_neg):
                others = [c for c in classes if c != pos_class and len(label_cluster[c]) > 0]
                if len(others) == 0:
                    negatives_exhausted = True
                    break
                batch.append(label_cluster[random.choice(others)].pop())

            random.shuffle(batch)
            batch_indices.append(batch)

        random.shuffle(batch_indices)
        return iter([idx for batch in batch_indices for idx in batch])

    def __len__(self):
        return self.length


class TrainSamplerMultiClass(Sampler):
    """
    Sampler that yields indices in same-class groups of
    ``batch_size // num_classes`` samples, the groups being in random order.

    From every class the same number of samples is used per pass:
    ``samples_per_author`` rounded down to a multiple of the group size.

    :param dataset: object with attributes ``x`` and ``y``; ``y`` is a 1-D
                    tensor of integer class labels
    :param batch_size: must be a multiple of ``num_classes``
    :param num_classes: number of same-class groups that make up one batch
    :param samples_per_author: upper bound of samples used from each class
    """

    def __init__(self, dataset, batch_size, num_classes, samples_per_author):
        super().__init__(None)
        self.dataset = dataset
        self.batch_size = batch_size
        self.x = dataset.x
        self.y = dataset.y
        self.num_classes = num_classes
        self.samples_per_author = samples_per_author
        assert batch_size // num_classes * num_classes == batch_size, \
            f'batch size {batch_size} is not a multiple of num of classes {num_classes}'
        print(f'train sampler with batch size = {batch_size} and {num_classes} classes in a batch')
        # measured with a trial pass (consumes random numbers)
        self.length = len(list(self.__iter__()))

    def __iter__(self):
        # class label -> list of sample indices
        label_cluster = {}
        for i in range(len(self.y)):
            label_cluster.setdefault(self.y[i].item(), []).append(i)

        # The message reports the NUMBER of classes; it used to interpolate the
        # whole label -> indices dictionary.
        # NOTE(review): the comparison is strict, so a dataset with exactly
        # ``num_classes`` classes is rejected -- confirm that is intended.
        assert len(label_cluster) > self.num_classes, \
            f'number of available classes {len(label_cluster)} < required classes {self.num_classes}'

        num_samples_per_class_batch = self.batch_size // self.num_classes
        min_class_samples = min([len(x) for x in label_cluster.values()])
        assert min_class_samples > self.samples_per_author, \
            f"expected {self.samples_per_author} per author, but got {min_class_samples} in the dataset"
        class_samples_needed = self.samples_per_author // num_samples_per_class_batch * num_samples_per_class_batch

        # One matrix per class, shape (group size, groups per class); every
        # COLUMN is one same-class group.
        dataset_matrix = []
        for key, value in label_cluster.items():
            random.shuffle(value)
            dataset_matrix.append(torch.tensor(value[:class_samples_needed]).view(num_samples_per_class_batch, -1))

        # Put all columns side by side, turn them into rows and shuffle the rows.
        tuples = torch.cat(dataset_matrix, dim=1).transpose(1, 0).split(1, dim=0)
        tuples = [x.flatten().tolist() for x in tuples]
        random.shuffle(tuples)
        # linear-time flatten (``sum(tuples, [])`` copies the list at every step)
        flat_indices = [idx for group in tuples for idx in group]

        print(f'from dataset sampler: batch size {self.batch_size}, num of classes in a batch {self.num_classes}, '
              f'num of samples per author in total {self.samples_per_author} (specified) / {class_samples_needed} (true).'
              f'dataset size {len(flat_indices)}')

        return iter(flat_indices)

    def __len__(self):
        return self.length


class TrainSamplerMultiClassUnit(Sampler):
    """
    Sampler that yields indices in same-class groups ("units") of
    ``sample_unit_size`` samples, the units being in random order.

    From every class as many complete units as possible are formed; the
    remaining samples of the class are left out of that pass. A class with fewer
    samples than one unit contributes nothing (it used to make the reshape fail).

    :param dataset: object with attributes ``x`` and ``y``; ``y`` is a 1-D
                    tensor of integer class labels
    :param sample_unit_size: number of consecutive same-class samples
    """

    def __init__(self, dataset, sample_unit_size):
        super().__init__(None)
        self.x = dataset.x
        self.y = dataset.y
        self.sample_unit_size = sample_unit_size
        print(f'train sampler with sample unit size {sample_unit_size}')
        # measured with a trial pass (consumes random numbers)
        self.length = len(list(self.__iter__()))

    def __iter__(self):
        # class label -> list of sample indices
        label_cluster = {}
        for i in range(len(self.y)):
            label_cluster.setdefault(self.y[i].item(), []).append(i)

        unit = self.sample_unit_size
        # One matrix per class, shape (unit size, units in class); every COLUMN
        # is one unit.
        dataset_matrix = []
        for key, value in label_cluster.items():
            random.shuffle(value)
            num_units = len(value) // unit
            num_valid_samples = num_units * unit
            # An explicit integer dtype and an explicit shape keep a class
            # without a complete unit as a well-formed (unit, 0) matrix; with
            # ``view(unit, -1)`` an empty tensor cannot be reshaped.
            dataset_matrix.append(
                torch.tensor(value[:num_valid_samples], dtype=torch.long).view(unit, num_units))

        # Put all columns side by side, turn them into rows and shuffle the rows.
        tuples = torch.cat(dataset_matrix, dim=1).transpose(1, 0).split(1, dim=0)
        tuples = [x.flatten().tolist() for x in tuples]
        random.shuffle(tuples)
        # linear-time flatten (``sum(tuples, [])`` copies the list at every step)
        flat_indices = [idx for group in tuples for idx in group]

        print(f'from dataset sampler: original dataset size {len(self.y)}, resampled dataset size {len(flat_indices)}. '
              f'sample unit size {self.sample_unit_size}')

        return iter(flat_indices)

    def __len__(self):
        return self.length


# loss

In [None]:
from math import log

class lcl_contrastiveAA(nn.Module):
    """
    Class-weighted supervised contrastive loss.

    Based on the loss of "Supervised Contrastive Learning"
    (https://arxiv.org/abs/2004.11362), with two differences visible in
    ``forward``: similarities are cosine similarities, and every exponentiated
    similarity between anchor i and sample k is multiplied by the softmax
    weight ``W[i, targets[k]]``.

    :param temperature: float, divisor applied to the cosine similarities
    :param margin: stored on the module; not used by ``forward``
    """

    def __init__(self, temperature=0.1, margin=0.2):
        super(lcl_contrastiveAA, self).__init__()
        self.temperature = temperature
        self.cos = nn.CosineSimilarity(dim=-1)
        self.margin = margin

    def forward(self, projections, targets, W):
        """
        :param projections: torch.Tensor, shape [batch_size, projection_dim]
        :param targets: torch.Tensor, shape [batch_size], integer class indices
                        (they are used as gather indices into ``W``)
        :param W: torch.Tensor, shape [batch_size, num_classes]; a softmax over
                  dim 1 is applied before use
        :return: torch.Tensor, scalar
        """
        # Build the masks on the device of the input itself;
        # ``torch.device("cuda")`` would always mean the default GPU.
        device = projections.device
        batch_size = projections.size(0)

        W = F.softmax(W, dim=1)

        # Cosine similarity between all pairs of projections, scaled by the temperature
        dot_product_tempered = self.cos(projections.unsqueeze(1), projections.unsqueeze(0)) / self.temperature

        # weights_i_yk[i, k] = W[i, targets[k]]
        index = targets.view(1, -1).repeat(batch_size, 1)
        weights_i_yk = W.gather(1, index)
        # Weighted exponential of the tempered similarities. (A max-subtracted
        # variant used to be computed first and was then overwritten by this
        # expression without being used; that dead computation is removed.)
        exp_dot_tempered = weights_i_yk * torch.exp(dot_product_tempered)

        # same_class[i, k] is True when samples i and k share a label
        same_class = (targets.unsqueeze(1) == targets.unsqueeze(0)).to(device)
        diff_class = (targets.unsqueeze(1) != targets.unsqueeze(0)).to(device)
        # zero on the diagonal: a sample is neither its own positive nor negative
        mask_anchor_out = 1 - torch.eye(batch_size, device=device)
        mask_combined_pos = same_class * mask_anchor_out
        mask_combined_neg = diff_class * mask_anchor_out

        # Number of positives per anchor; an anchor without positives gets 1 so
        # that the division below is defined (its numerator is 0 anyway).
        cardinality_pos = torch.sum(mask_combined_pos, dim=1).clamp(min=1)

        # Sum of the weighted exponentials over the negatives of each anchor
        exp_sum_neg = torch.sum(exp_dot_tempered * mask_combined_neg, dim=1)
        prob = exp_dot_tempered / (exp_dot_tempered + exp_sum_neg.view(-1, 1) + 1e-5)

        # Negative log-probability, kept for the positive pairs only
        log_prob = -torch.log(prob) * mask_combined_pos

        total_loss = torch.mean(torch.sum(log_prob, dim=1) / cardinality_pos)

        return total_loss


class LCL(nn.Module):
    """
    Class-weighted supervised contrastive loss over cosine similarities.

    Every (anchor i, sample j) pair is scaled by ``softmax(weights)[i, labels[j]]``,
    i.e. by the probability that anchor i's weighting scores assign to the class of
    sample j. Self-pairs are always excluded.
    """

    def __init__(self, temperature=0.07):
        """
        :param temperature: float, divisor applied to the cosine similarities
        """
        super(LCL, self).__init__()
        self.temperature = temperature
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, features, labels=None, weights=None, mask=None):
        """
        :param features: torch.Tensor, one embedding per row (first dim is the batch size)
        :param labels: torch.Tensor holding one integer class id per sample; required,
            because the weighted mask is looked up by class id
        :param weights: torch.Tensor, shape [batch_size, num_classes], un-normalised class
            scores per sample (softmax is applied here); required
        :param mask: kept for signature compatibility only; it cannot be combined with
            `labels`, and `labels` is mandatory, so it must stay None
        Returns:
            A loss scalar.
        """
        if labels is not None and mask is not None:
            raise ValueError('Cannot define both `labels` and `mask`')
        if labels is None:
            raise ValueError('`labels` is required: the weighted mask is indexed by class id')
        if weights is None:
            raise ValueError('`weights` is required: pass the [batch_size, num_classes] class scores')

        # Follow the features' own device instead of assuming the current CUDA device.
        device = features.device
        batch_size = features.shape[0]

        labels = labels.contiguous().view(-1).to(device)
        if labels.shape[0] != batch_size:
            raise ValueError('Num of labels does not match num of features')

        weights = F.softmax(weights, dim=1)

        # compute logits: temperature-scaled pairwise cosine similarity
        anchor_dot_contrast = self.cos(features.unsqueeze(1), features.unsqueeze(0)) / self.temperature

        # 1 everywhere except on the diagonal (a sample is never its own contrast)
        logits_mask = 1.0 - torch.eye(batch_size, dtype=torch.float32, device=device)

        # 1 for same-class pairs other than the anchor itself, 0 elsewhere
        mask = torch.eq(labels.unsqueeze(1), labels.unsqueeze(0)).float() * logits_mask

        # weighted_mask[i, j] = weights[i, labels[j]], built with one gather instead of a
        # Python double loop; the diagonal is zeroed afterwards
        weighted_mask = weights[:, labels.long()].float() * logits_mask
        pos_weighted_mask = weighted_mask * mask

        # subtract the row maximum before exponentiating, for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        exp_logits = torch.exp(logits) * weighted_mask
        # A row sum of exactly 0 (e.g. a batch of one sample) would give log(0) = -inf and
        # then 0 * inf = NaN below; clamping to the smallest normal float only touches that case.
        denominator = exp_logits.sum(1, keepdim=True).clamp(min=torch.finfo(exp_logits.dtype).tiny)
        log_prob = logits - torch.log(denominator)

        # compute mean of log-likelihood over positives; anchors without any positive
        # contribute 0 instead of 0 / 0 = NaN
        num_positives = mask.sum(1).clamp(min=1)
        mean_log_prob_pos = (pos_weighted_mask * log_prob).sum(1) / num_positives

        loss = -1 * mean_log_prob_pos
        return loss.mean()


class SupConLoss_contrastiveAA(nn.Module):
    def __init__(self, temperature=0.1, margin=0.2):
        """
        Variant of the loss described in the paper Supervised Contrastive Learning :
        https://arxiv.org/abs/2004.11362
        Here the denominator of each positive pair holds only that pair plus the anchor's
        negatives (see `forward`).
        :param temperature: float, divisor applied to the cosine similarities
        :param margin: float, stored on the module; `forward` does not read it
        """
        super(SupConLoss_contrastiveAA, self).__init__()
        self.temperature = temperature
        self.cos = nn.CosineSimilarity(dim=-1)
        self.margin = margin

    def forward(self, projections, targets):
        """
        :param projections: torch.Tensor, shape [batch_size, projection_dim]
        :param targets: torch.Tensor, shape [batch_size]
        :return: torch.Tensor, scalar
        """
        # Follow the projections' own device instead of assuming the current CUDA device.
        device = projections.device

        # The cosine similarity between all pairs of projection vectors, scaled by temperature
        dot_product_tempered = self.cos(projections.unsqueeze(1), projections.unsqueeze(0)) / self.temperature

        # Minus max for numerical stability with exponential. Same done in cross entropy. Epsilon added to avoid log(0)
        exp_dot_tempered = (
            torch.exp(dot_product_tempered - torch.max(dot_product_tempered, dim=1, keepdim=True)[0]) + 1e-5
        )

        # same_class[i, j] is True when samples i and j share a label
        same_class = (targets.unsqueeze(1) == targets.unsqueeze(0)).to(device)
        # This mask removes the self-similarity (diagonal elements)
        mask_anchor_out = 1 - torch.eye(exp_dot_tempered.shape[0], device=device)
        # Positive pairs: same class, but not the sample itself
        mask_combined_pos = same_class * mask_anchor_out
        # Negative pairs: different class
        mask_combined_neg = (~same_class) * mask_anchor_out

        # Number of positives per anchor. Counts are whole numbers, so clamping to a minimum
        # of 1 only rewrites the zeros and avoids 0 / 0 when a class appears once in the batch.
        cardinality_pos = torch.sum(mask_combined_pos, dim=1).clamp(min=1)

        # Sum of the exponentiated similarities for the negative pairs
        exp_sum_neg = torch.sum(exp_dot_tempered * mask_combined_neg, dim=1, keepdim=True)
        prob = exp_dot_tempered / (exp_dot_tempered + exp_sum_neg + 1e-5)

        log_prob = -torch.log(prob) * mask_combined_pos
        total_loss = torch.mean(torch.sum(log_prob, dim=1) / cardinality_pos)

        return total_loss

# model

In [None]:
class LogisticRegression(nn.Module):
    """Classifier head: dropout -> linear -> LeakyReLU(0.2) -> dropout -> linear."""

    def __init__(self, in_dim, hid_dim, out_dim, dropout=0):
        """
        :param in_dim: size of the input feature vector
        :param hid_dim: width of the hidden linear layer
        :param out_dim: number of output logits
        :param dropout: drop probability used by both dropout layers
        """
        super().__init__()
        print(f'Logistic Regression classifier of dim ({in_dim} {hid_dim} {out_dim})')

        # Layer order (and therefore the `nn.<index>` state-dict keys) must stay as is.
        stack = [
            nn.Dropout(p=dropout),
            nn.Linear(in_dim, hid_dim, bias=True),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(hid_dim, out_dim, bias=True),
        ]
        self.nn = nn.Sequential(*stack)

    def forward(self, x, return_feat=False):
        """Return the logits, or (logits, x) when `return_feat` is truthy."""
        logits = self.nn(x)
        return (logits, x) if return_feat else logits


class BertClassifier(nn.Module):
    """Encoder plus classification head applied to the flattened per-token hidden states."""

    FEAT_LEN = 768

    def __init__(self, raw_bert, classifier):
        """
        :param raw_bert: encoder called with `input_ids` / `attention_mask` keywords; its
            result must expose `last_hidden_state`
        :param classifier: module applied to the flattened hidden states
        """
        super().__init__()
        self.bert = raw_bert
        self.fc = classifier

    def forward(self, x, return_feat=False):
        """
        :param x: tokenized input; x[0] is passed as input_ids and x[2] as attention_mask
            (x[1] is not used)
        :param return_feat: when truthy, also return the flattened hidden states
        :return: logits, or (logits, flattened hidden states)
        """
        token_ids, attn = x[0], x[2]
        encoded = self.bert(input_ids=token_ids, attention_mask=attn)
        # every dim after the batch dim is merged into one feature axis
        flat_hidden = encoded.last_hidden_state.flatten(1)
        logits = self.fc(flat_hidden)
        if not return_feat:
            return logits
        return logits, flat_hidden


@dataclass
class BertClassiferHyperparams:
    """Container for three integer size settings of a BERT classifier."""
    # NOTE(review): field meanings below are inferred from the names only - confirm against callers.
    mlp_size: int  # presumably the hidden width of the classifier head
    token_len: int  # presumably the number of tokens per input sequence
    embed_len: int  # presumably the encoder's per-token embedding size


class weighting_network(nn.Module):
    """Wraps a pretrained sequence-classification model and returns its logits."""

    def __init__(self, batch_size, hidden_size, emotion_size, encoder_type="electra"):
        """
        :param batch_size: accepted for signature compatibility; not used
        :param hidden_size: accepted for signature compatibility; not used
        :param emotion_size: number of output labels of the classification head
        :param encoder_type: "electra", or a checkpoint name containing 'bert'
        :raises NotImplementedError: for any other `encoder_type`
        """
        super(weighting_network, self).__init__()

        if encoder_type == "electra":
            # Imported locally: the notebook's top import cell does not bring this class in.
            from transformers import ElectraForSequenceClassification

            options_name = "google/electra-base-discriminator"
            self.encoder_supcon_2 = ElectraForSequenceClassification.from_pretrained(options_name, num_labels=emotion_size)

            # Gradient checkpointing trades extra compute for lower activation memory.
            # NOTE(review): newer transformers releases expect model.gradient_checkpointing_enable();
            # confirm this config flag is still honoured by the installed version.
            self.encoder_supcon_2.electra.encoder.config.gradient_checkpointing = True
        elif 'bert' in encoder_type:
            # NOTE(review): this substring test also matches names such as 'roberta-base' or
            # 'albert-base-v2', which are then loaded through the BERT class - confirm intended.
            options_name = encoder_type
            self.encoder_supcon_2 = BertForSequenceClassification.from_pretrained(options_name, num_labels=emotion_size)

            # Gradient checkpointing trades extra compute for lower activation memory (see note above).
            self.encoder_supcon_2.bert.encoder.config.gradient_checkpointing = True
        else:
            # Fail at construction time instead of leaving `encoder_supcon_2` undefined.
            raise NotImplementedError(f"encoder {encoder_type} not implemented")

    def forward(self, x):
        """
        :param x: tokenized input; x[0] is used as input_ids and x[2] as attention_mask
        :return: the classification logits of the wrapped model
        """
        supcon_fea_2 = self.encoder_supcon_2(input_ids=x[0], attention_mask=x[2],
                                             output_hidden_states=True, return_dict=True)

        return supcon_fea_2.logits





# train

In [None]:
def _evaluate_bert(model, model_helper, loader, criterion, lcl, supcon, coefficient, coefficient1,
                   tqdm_on, epoch, tag):
    """
    Run one gradient-free pass of both networks over `loader`.

    :param model: main classifier, called as model(x, return_feat=True) -> (logits, features)
    :param model_helper: helper classifier whose logits are handed to `lcl` as weights
    :param loader: iterable yielding (x1, x2, x3, y) batches
    :param criterion: classification loss applied to the logits of both networks
    :param lcl: loss called as lcl(features, labels, helper_logits)
    :param supcon: loss called as supcon(features, labels)
    :param coefficient: multiplier of the `lcl` term in the total loss
    :param coefficient1: multiplier of the `supcon` term in the total loss
    :param tqdm_on: show the progress bar when truthy
    :param epoch: current epoch index (only displayed in the progress bar)
    :param tag: prefix of the progress-bar keys, e.g. 'train_val' or 'test'
    :return: dict of AverageMeter keyed by 'acc', 'acc_helper', 'L1', 'L2', 'L3', 'L'
    """
    model.eval()
    model_helper.eval()
    meters = {key: AverageMeter() for key in ('acc', 'acc_helper', 'L1', 'L2', 'L3', 'L')}
    pg = tqdm(loader, leave=False, total=len(loader), disable=not tqdm_on)
    with torch.no_grad():
        for x1, x2, x3, y in pg:
            x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
            pred, feats = model(x, return_feat=True)
            pred1 = model_helper(x)

            loss_1 = criterion(pred, y.long())
            loss_2 = lcl(feats, y.long(), pred1)
            loss_3 = supcon(feats, y.long())
            loss_4 = criterion(pred1, y.long())
            loss = loss_1 + coefficient * loss_2 + coefficient1 * loss_3 + loss_4

            meters['acc'].update((pred.argmax(1) == y).sum().item() / len(y))
            meters['acc_helper'].update((pred1.argmax(1) == y).sum().item() / len(y))
            meters['L'].update(loss.item())
            meters['L1'].update(loss_1.item())
            meters['L2'].update(loss_2.item())
            meters['L3'].update(loss_3.item())

            pg.set_postfix({
                f'{tag} acc': '{:.6f}'.format(meters['acc'].avg),
                f'{tag} acc helper': '{:.6f}'.format(meters['acc_helper'].avg),
                'epoch': '{:03d}'.format(epoch)
            })
    return meters


def train_bert(train_dict, test_dic, tqdm_on, model_name, embed_len, id, num_epochs, base_bs, base_lr,
               mask_classes, coefficient, coefficient1, num_authors, val_dic=None):
    """
    Fine-tune a transformer classifier together with a helper classifier using
    cross-entropy plus two contrastive terms, logging to TensorBoard and saving checkpoints.

    Total loss per step: CE(model) + coefficient * LCL + coefficient1 * SupCon + CE(helper).

    :param train_dict: table with 'content' (texts) and 'Target' (labels) columns for training
    :param test_dic: table with the same columns, evaluated after every epoch
    :param tqdm_on: show progress bars when truthy
    :param model_name: checkpoint name; must contain 'albert', 'bert-base', 'deberta' or 'roberta'
    :param embed_len: per-token embedding size of the encoder (classifier input is embed_len * 256)
    :param id: experiment id, used in the experiment directory and checkpoint names
    :param num_epochs: number of training epochs
    :param base_bs: per-GPU batch size (multiplied by the number of visible GPUs)
    :param base_lr: per-GPU learning rate (multiplied by the number of visible GPUs)
    :param mask_classes: labels whose samples are excluded from every loss except CE(model)
    :param coefficient: multiplier of the LCL term
    :param coefficient1: multiplier of the SupCon term
    :param num_authors: number of output classes
    :param val_dic: optional validation table; when given, the best checkpoint is selected on
        validation accuracy, otherwise on test accuracy
    :return: (final test accuracy, [], []) - the two prediction lists are never filled
    :raises NotImplementedError: when `model_name` matches no supported family
    """
    print(f'mask classes = {mask_classes}')

    # tokenizer and pretrained model
    if 'albert' in model_name:
        from transformers import AlbertTokenizer, AlbertModel
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        extractor = AlbertModel.from_pretrained(model_name)
    elif 'bert-base' in model_name:
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        extractor = BertModel.from_pretrained(model_name)
    elif 'deberta' in model_name:
        from transformers import DebertaTokenizer, DebertaModel
        tokenizer = DebertaTokenizer.from_pretrained(model_name)
        extractor = DebertaModel.from_pretrained(model_name)
    elif 'roberta' in model_name:
        from transformers import RobertaTokenizer, RobertaModel
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        extractor = RobertaModel.from_pretrained(model_name)
    else:
        raise NotImplementedError(f"model {model_name} not implemented")

    # the encoder is fine-tuned, not frozen
    for param in extractor.parameters():
        param.requires_grad = True

    # get dataset
    train_x, train_y = train_dict['content'].tolist(), train_dict['Target'].tolist()
    test_x, test_y = test_dic['content'].tolist(), test_dic['Target'].tolist()

    if val_dic is not None:
        val_x, val_y = val_dic['content'].tolist(), val_dic['Target'].tolist()

    # training config
    ngpus, dropout = torch.cuda.device_count(), 0.35
    num_tokens, hidden_dim, out_dim = 256, 512, num_authors
    model = BertClassifier(extractor, LogisticRegression(embed_len * num_tokens, hidden_dim, out_dim, dropout=dropout))
    model = nn.DataParallel(model).cuda()

    # NOTE(review): the helper wraps the SAME `extractor` object as `model`, so the encoder's
    # parameters appear twice in `total_params` below (PyTorch warns about duplicate parameters
    # in a param group). Confirm whether a shared encoder is intended; if so, de-duplicate the list.
    model_helper = BertClassifier(extractor, LogisticRegression(embed_len * num_tokens, hidden_dim, out_dim, dropout=dropout))
    model_helper = nn.DataParallel(model_helper).cuda()

    total_params = list(model.named_parameters()) + list(model_helper.named_parameters())
    # an empty `no_decay` list puts every parameter in the weight-decay group
    no_decay = []
    optimizer_grouped_parameters = [
    {'params': [p for n, p in total_params if not any(nd in n for nd in no_decay)], 'weight_decay': 3e-4},
    {'params': [p for n, p in total_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr * ngpus)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    train_set = BertDataset(train_x, train_y, tokenizer, num_tokens)
    test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)

    if val_dic is not None:
        val_set = BertDataset(val_x, val_y, tokenizer, num_tokens)

    temperature, sample_unit_size = 0.1, 2
    print(f'coefficient, temperature, sample_unit_size = {coefficient, temperature, sample_unit_size}')

    # logger
    exp_dir = os.path.join(ckpt_dir,
                           f'{id}_{model_name.split("/")[-1]}_coe{coefficient}_temp{temperature}_unit{sample_unit_size}_epoch{num_epochs}')
    writer = SummaryWriter(os.path.join(exp_dir, 'board'))

    # load data
    train_sampler = TrainSamplerMultiClassUnit(train_set, sample_unit_size=sample_unit_size)
    train_loader = DataLoader(train_set, batch_size=base_bs * ngpus, sampler=train_sampler, shuffle=False,
                              num_workers=4 * ngpus, pin_memory=True, drop_last=False)
    test_loader = DataLoader(test_set, batch_size=base_bs * ngpus, shuffle=False, num_workers=4 * ngpus,
                             pin_memory=True, drop_last=False)

    if val_dic is not None:
        val_loader = DataLoader(val_set, batch_size=base_bs * ngpus, shuffle=False, num_workers=4 * ngpus,
                                pin_memory=True, drop_last=False)

    final_test_acc = None
    final_tv_acc = None  # stays None when no validation set is given
    final_train_preds, final_test_preds = [], []
    best_acc = -1  # best value of the checkpoint-selection metric
    best_tv_acc = -1
    best_test_acc = -1
    supcon = SupConLoss_contrastiveAA()
    lcl = LCL()

    # training loop
    for epoch in range(num_epochs):
        train_acc = AverageMeter()
        train_loss = AverageMeter()
        train_loss_1 = AverageMeter()
        train_loss_2 = AverageMeter()
        train_loss_3 = AverageMeter()
        train_loss_4 = AverageMeter()
        train_acc_helper = AverageMeter()

        # training
        model.train()
        model_helper.train()

        pg = tqdm(train_loader, leave=False, total=len(train_loader), disable=not tqdm_on)
        for i, (x1, x2, x3, y) in enumerate(pg):
            x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
            pred, feats = model(x, return_feat=True)
            pred1 = model_helper(x)

            # classification loss of the main model, on the full (unmasked) batch
            loss_1 = criterion(pred, y.long())

            # drop samples of masked classes; the helper logits must be filtered with the
            # same mask so they stay row-aligned with `feats` and `y`
            mask = y.clone().cpu().apply_(lambda label: label not in mask_classes).type(torch.bool).cuda()
            feats, pred, pred1, y = feats[mask], pred[mask], pred1[mask], y[mask]
            if len(y) == 0:
                continue

            # contrastive learning
            loss_2 = lcl(feats, y.long(), pred1)
            loss_3 = supcon(feats, y.long())
            loss_4 = criterion(pred1, y.long())

            # total loss
            loss = loss_1 + coefficient * loss_2 + coefficient1 * loss_3 + loss_4

            acc = (pred.argmax(1) == y).sum().item() / len(y)
            train_acc.update(acc)
            train_loss.update(loss.item())
            train_loss_1.update(loss_1.item())
            train_loss_2.update(loss_2.item())

            acc_helper = (pred1.argmax(1) == y).sum().item() / len(y)
            train_acc_helper.update(acc_helper)
            train_loss_3.update(loss_3.item())
            train_loss_4.update(loss_4.item())

            loss.backward()
            optimizer.step()
            # the optimizer holds every parameter of both networks
            optimizer.zero_grad()

            pg.set_postfix({
                'train acc': '{:.6f}'.format(train_acc.avg),
                'train acc helper': '{:.6f}'.format(train_acc_helper.avg),
                'train L1': '{:.6f}'.format(train_loss_1.avg),
                'train L2': '{:.6f}'.format(train_loss_2.avg),
                'train L3': '{:.6f}'.format(train_loss_3.avg),
                'train L4': '{:.6f}'.format(train_loss_4.avg),
                'train L': '{:.6f}'.format(train_loss.avg),
                'epoch': '{:03d}'.format(epoch)
            })

            # iteration logger
            step = i + epoch * len(pg)
            writer.add_scalar("train-iteration/L1", loss_1.item(), step)
            writer.add_scalar("train-iteration/L2", loss_2.item(), step)
            writer.add_scalar("train-iteration/L3", loss_3.item(), step)
            writer.add_scalar("train-iteration/L4", loss_4.item(), step)
            writer.add_scalar("train-iteration/L", loss.item(), step)
            writer.add_scalar("train-iteration/acc", acc, step)
            writer.add_scalar("train-iteration/acc_helper", acc_helper, step)

        print('train acc: {:.6f}'.format(train_acc.avg), 'train acc helper: {:.6f}'.format(train_acc_helper.avg),
              'train L1 {:.6f}'.format(train_loss_1.avg), 'train L2 {:.6f}'.format(train_loss_2.avg),
              'train L3 {:.6f}'.format(train_loss_3.avg), 'train L4 {:.6f}'.format(train_loss_4.avg),
              'train L {:.6f}'.format(train_loss.avg), f'epoch {epoch}')

        # epoch logger
        writer.add_scalar("train/L1", train_loss_1.avg, epoch)
        writer.add_scalar("train/L2", train_loss_2.avg, epoch)
        writer.add_scalar("train/L3", train_loss_3.avg, epoch)
        writer.add_scalar("train/L4", train_loss_4.avg, epoch)
        writer.add_scalar("train/L", train_loss.avg, epoch)
        writer.add_scalar("train/acc", train_acc.avg, epoch)
        writer.add_scalar("train/acc_helper", train_acc_helper.avg, epoch)

        # validation (tv stands for train_val), only when a validation set was given
        tv = None
        if val_dic is not None:
            tv = _evaluate_bert(model, model_helper, val_loader, criterion, lcl, supcon,
                                coefficient, coefficient1, tqdm_on, epoch, 'train_val')

        # testing
        test = _evaluate_bert(model, model_helper, test_loader, criterion, lcl, supcon,
                              coefficient, coefficient1, tqdm_on, epoch, 'test')

        # logging
        if tv is not None:
            writer.add_scalar("tv/L1", tv['L1'].avg, epoch)
            writer.add_scalar("tv/L2", tv['L2'].avg, epoch)
            writer.add_scalar("tv/L3", tv['L3'].avg, epoch)
            writer.add_scalar("tv/L", tv['L'].avg, epoch)
            writer.add_scalar("tv/acc", tv['acc'].avg, epoch)
            writer.add_scalar("tv/acc_helper", tv['acc_helper'].avg, epoch)

        writer.add_scalar("test/L1", test['L1'].avg, epoch)
        writer.add_scalar("test/L2", test['L2'].avg, epoch)
        writer.add_scalar("test/L3", test['L3'].avg, epoch)
        writer.add_scalar("test/L", test['L'].avg, epoch)
        writer.add_scalar("test/acc", test['acc'].avg, epoch)
        writer.add_scalar("test/acc_helper", test['acc_helper'].avg, epoch)

        scheduler.step()

        final_test_acc = test['acc'].avg
        best_test_acc = max(best_test_acc, final_test_acc)
        print(f'epoch {epoch}, train acc {train_acc.avg}, test acc {final_test_acc}')

        # save model: keep a single "best" checkpoint, selected on validation accuracy when
        # a validation set exists and on test accuracy otherwise
        select_acc = tv['acc'].avg if tv is not None else final_test_acc
        if select_acc and select_acc >= best_acc:
            for cur_model in os.listdir(exp_dir):
                if cur_model.endswith(".pt"):
                    os.remove(os.path.join(exp_dir, cur_model))
            save_model(exp_dir, f'{id}_val{final_test_acc:.5f}_e{epoch}.pt', model)
        best_acc = max(best_acc, select_acc)

        if tv is not None:
            final_tv_acc = tv['acc'].avg
            print(f'epoch {epoch}, train val acc {final_tv_acc}')
            best_tv_acc = max(best_tv_acc, final_tv_acc)

    # save checkpoint
    save_model(exp_dir, f'{id}_val{final_test_acc:.5f}_finale{epoch}.pt', model)

    # flush pending TensorBoard events and release the event file
    writer.close()

    print(
        f'Training complete after {num_epochs} epochs. Final val acc = {final_tv_acc}, '
        f'best val acc = {best_tv_acc}, best test acc = {best_test_acc}.'
        f'Final test acc {final_test_acc}')

    return final_test_acc, final_train_preds, final_test_preds

# main

In [None]:
if __name__ == '__main__':
    def _str2bool(value):
        """Parse boolean-like CLI text; argparse's type=bool would turn any non-empty string into True."""
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean, got {value!r}')

    # Named `dataset_choices` so the module-level `datasets` download map is not overwritten.
    dataset_choices = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {dataset_choices}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=dataset_choices)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_str2bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # arguments of this run (the notebook has no real command line)
    training_args = [
      '--dataset', 'diffusiondb',
      '--id', 'diffusiondb100_lcl_para_40',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '100',
      '--epochs', '30',
      '--model', 'bert-base-cased',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # load the training split and shuffle it reproducibly
    df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_label_1.csv')
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # keep at most this many (shuffled) training rows per author
    samples_per_author = 40
    grouped = df_shuffled.groupby('user_name')
    nlp_train = pd.concat([group.iloc[:samples_per_author] for name, group in grouped])

    # NOTE(review): the subset is written under 'twitter_micro/processed' although it is read
    # from 'diffusiondb/paraphrased' - confirm the destination is intended.
    nlp_train.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random40_label_1.csv', index=False)

    nlp_val = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/val_random100_label_1.csv')
    nlp_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')

    # train_bert expects the text in 'content' and the label in 'Target'
    nlp_train = nlp_train[['prompt', 'user_name']]
    nlp_train.columns = ['content', 'Target']
    nlp_val = nlp_val[['prompt', 'user_name']]
    nlp_val.columns = ['content', 'Target']
    nlp_test = nlp_test[['prompt', 'user_name']]
    nlp_test.columns = ['content', 'Target']

    # compare by value (`is not` tests object identity), and stay silent when --authors is omitted
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # train; every branch passes coefficient1, which train_bert requires
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=8, base_lr=1e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, coefficient1=args.coe1,
                   num_authors=num_authors, val_dic=nlp_val)
    elif 'turing' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=7, base_lr=5e-6,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, coefficient1=args.coe1,
                   num_authors=num_authors, val_dic=nlp_val)
    else:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=24, base_lr=2e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, coefficient1=args.coe1,
                   num_authors=num_authors, val_dic=nlp_val)

In [None]:
if __name__ == '__main__':
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    training_args = [
      '--dataset', 'diffusiondb',
      '--id', 'diffusiondb100_lcl_para_20',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '100',
      '--epochs', '30',
      '--model', 'bert-base-cased',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # load data and remove emails containing the sender's name
    # df = load_dataset_dataframe(source)
    df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_label_1.csv')
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    def split_author_data(author_data):
        # Split into two parts
        df_20 = author_data.iloc[:20]
        return df_20

    grouped = df_shuffled.groupby('user_name')

    # Create two new DataFrames by concatenating the splits
    nlp_train = pd.concat([split_author_data(group) for name, group in grouped])

    nlp_train.to_csv('/content/drive/MyDrive/msc_project/data/twitter_micro/processed/train_random20_label_1.csv', index=False)

    nlp_val = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/val_random100_label_1.csv')
    nlp_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    nlp_train = nlp_train[['prompt', 'user_name']]
    nlp_train.columns = ['content', 'Target']
    nlp_val = nlp_val[['prompt', 'user_name']]
    nlp_val.columns = ['content', 'Target']
    nlp_test = nlp_test[['prompt', 'user_name']]
    nlp_test.columns = ['content', 'Target']

    if args.authors is not default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    # nlp_train, nlp_val, nlp_test = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # train
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=8, base_lr=1e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
                   val_dic=nlp_val)
    elif 'turing' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=7, base_lr=5e-6,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
                   val_dic=nlp_val)
    else:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=24, base_lr=2e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, coefficient1=args.coe1, num_authors=num_authors,
                   val_dic=nlp_val)

In [None]:
if __name__ == '__main__':
    # Experiment cell: train `albert-base-v2` on the paraphrased DiffusionDB "random100"
    # split (100 authors) under experiment id `diffusiondb100_cls_para`.
    # NOTE(review): another cell in this notebook (the `roberta-base` run) uses the same
    # --id; if train_bert keys checkpoints/logs by id the two runs will collide -- confirm.

    # NOTE: this rebinds the notebook-level name `datasets` (a dict of download URLs in the
    # configuration cell) to the list of dataset names accepted by --dataset.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']

    def _parse_bool(text):
        """Convert a command-line string to a bool.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
        value -- including 'False' -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if ``text`` is not a recognised boolean word.
        """
        word = text.strip().lower()
        if word in ('true', '1', 'yes', 'y', 'on'):
            return True
        if word in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')

    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_parse_bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are supplied in-notebook instead of being read from sys.argv.
    training_args = [
      '--dataset', 'diffusiondb',
      '--id', 'diffusiondb100_cls_para',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '100',
      '--epochs', '30',
      '--model', 'albert-base-v2',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    # Presumably has to be set before CUDA is first initialised to take effect -- TODO confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes (none for any dataset in this run)
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split CSVs directly and keep only the text/label columns, renamed to
    # 'content' and 'Target'.
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased'
    column_map = {'prompt': 'content', 'user_name': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_random100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_val = pd.read_csv(f'{data_dir}/val_random100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_test = pd.read_csv(f'{data_dir}/test_random100_label_1.csv')[list(column_map)].rename(columns=column_map)

    # Compare by value (`!=`), not identity (`is not`): identity on ints only works by
    # accident of CPython's small-int cache. Using the resolved `num_authors` also avoids
    # a spurious warning when --authors is omitted.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    # --samples-per-auth is only reported here; nothing else in this cell consumes it.
    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # Pick batch size / learning rate by dataset family. Only the fallback branch forwards
    # `coefficient1`; the other branches leave it at train_bert's own default.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        hparams = dict(base_bs=8, base_lr=1e-5)
    elif 'turing' in source:
        hparams = dict(base_bs=7, base_lr=5e-6)
    else:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)

    # train (768 is presumably the encoder hidden size -- TODO confirm against train_bert)
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs,
               mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
               val_dic=nlp_val, **hparams)

In [None]:
if __name__ == '__main__':
    # Experiment cell: train `roberta-base` on the paraphrased DiffusionDB "random100"
    # split (100 authors) under experiment id `diffusiondb100_cls_para`.
    # NOTE(review): another cell in this notebook (the `albert-base-v2` run) uses the same
    # --id; if train_bert keys checkpoints/logs by id the two runs will collide -- confirm.

    # NOTE: this rebinds the notebook-level name `datasets` (a dict of download URLs in the
    # configuration cell) to the list of dataset names accepted by --dataset.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']

    def _parse_bool(text):
        """Convert a command-line string to a bool.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
        value -- including 'False' -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if ``text`` is not a recognised boolean word.
        """
        word = text.strip().lower()
        if word in ('true', '1', 'yes', 'y', 'on'):
            return True
        if word in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')

    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_parse_bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are supplied in-notebook instead of being read from sys.argv.
    training_args = [
      '--dataset', 'diffusiondb',
      '--id', 'diffusiondb100_cls_para',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '100',
      '--epochs', '30',
      '--model', 'roberta-base',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    # Presumably has to be set before CUDA is first initialised to take effect -- TODO confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes (none for any dataset in this run)
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split CSVs directly and keep only the text/label columns, renamed to
    # 'content' and 'Target'.
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased'
    column_map = {'prompt': 'content', 'user_name': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_random100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_val = pd.read_csv(f'{data_dir}/val_random100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_test = pd.read_csv(f'{data_dir}/test_random100_label_1.csv')[list(column_map)].rename(columns=column_map)

    # Compare by value (`!=`), not identity (`is not`): identity on ints only works by
    # accident of CPython's small-int cache. Using the resolved `num_authors` also avoids
    # a spurious warning when --authors is omitted.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    # --samples-per-auth is only reported here; nothing else in this cell consumes it.
    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # Pick batch size / learning rate by dataset family. Only the fallback branch forwards
    # `coefficient1`; the other branches leave it at train_bert's own default.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        hparams = dict(base_bs=8, base_lr=1e-5)
    elif 'turing' in source:
        hparams = dict(base_bs=7, base_lr=5e-6)
    else:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)

    # train (768 is presumably the encoder hidden size -- TODO confirm against train_bert)
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs,
               mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
               val_dic=nlp_val, **hparams)

In [None]:
if __name__ == '__main__':
    # Experiment cell: train `bert-base-cased` on the processed IMDb62 split (62 authors)
    # under experiment id `imdb62_cls`.

    # NOTE: this rebinds the notebook-level name `datasets` (a dict of download URLs in the
    # configuration cell) to the list of dataset names accepted by --dataset.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb', 'blogs50']

    def _parse_bool(text):
        """Convert a command-line string to a bool.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
        value -- including 'False' -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if ``text`` is not a recognised boolean word.
        """
        word = text.strip().lower()
        if word in ('true', '1', 'yes', 'y', 'on'):
            return True
        if word in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')

    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_parse_bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
        'blogs50': 50
    }

    # Arguments are supplied in-notebook instead of being read from sys.argv.
    training_args = [
      '--dataset', 'imdb62',
      '--id', 'imdb62_cls',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '62',
      '--epochs', '30',
      '--model', 'bert-base-cased',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    # Presumably has to be set before CUDA is first initialised to take effect -- TODO confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes (none for any dataset in this run)
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
        'blogs50': [],
    }

    # Load the pre-split CSVs directly.
    data_dir = '/content/drive/MyDrive/msc_project/data/imdb62/processed'
    nlp_train = pd.read_csv(f'{data_dir}/imdb62_train.csv')
    nlp_val = pd.read_csv(f'{data_dir}/imdb62_AA_val.csv')
    nlp_test = pd.read_csv(f'{data_dir}/imdb62_AA_test.csv')
    print(nlp_train.columns)  # columns of the raw training CSV, before selection

    # Keep only the text/label columns, renamed to 'content' and 'Target'.
    column_map = {'text': 'content', 'author_id': 'Target'}
    nlp_train = nlp_train[list(column_map)].rename(columns=column_map)
    nlp_val = nlp_val[list(column_map)].rename(columns=column_map)
    nlp_test = nlp_test[list(column_map)].rename(columns=column_map)

    # Compare by value (`!=`), not identity (`is not`): identity on ints only works by
    # accident of CPython's small-int cache. Using the resolved `num_authors` also avoids
    # a spurious warning when --authors is omitted.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    # --samples-per-auth is only reported here; nothing else in this cell consumes it.
    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # Pick batch size / learning rate by dataset family. The 'turing' branch does not
    # forward `coefficient1` and so leaves it at train_bert's own default.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)
    elif 'turing' in source:
        hparams = dict(base_bs=7, base_lr=5e-6)
    else:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)

    # train (768 is presumably the encoder hidden size -- TODO confirm against train_bert)
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs,
               mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
               val_dic=nlp_val, **hparams)

In [None]:
if __name__ == '__main__':
    # Experiment cell: train `bert-base-cased` on the processed Blogs50 split (50 authors)
    # under experiment id `blogs50_cls`.

    # NOTE: this rebinds the notebook-level name `datasets` (a dict of download URLs in the
    # configuration cell) to the list of dataset names accepted by --dataset.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb', 'blogs50']

    def _parse_bool(text):
        """Convert a command-line string to a bool.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
        value -- including 'False' -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if ``text`` is not a recognised boolean word.
        """
        word = text.strip().lower()
        if word in ('true', '1', 'yes', 'y', 'on'):
            return True
        if word in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')

    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_parse_bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
        'blogs50': 50
    }

    # Arguments are supplied in-notebook instead of being read from sys.argv.
    training_args = [
      '--dataset', 'blogs50',
      '--id', 'blogs50_cls',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '50',
      '--epochs', '30',
      '--model', 'bert-base-cased',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    # Presumably has to be set before CUDA is first initialised to take effect -- TODO confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes (none for any dataset in this run)
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
        'blogs50': [],
    }

    # Load the pre-split CSVs directly and keep only the text/label columns, renamed to
    # 'content' and 'Target'.
    data_dir = '/content/drive/MyDrive/msc_project/data/blogs/processed'
    column_map = {'text': 'content', 'author_id': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/blogs50_train.csv')[list(column_map)].rename(columns=column_map)
    nlp_val = pd.read_csv(f'{data_dir}/blogs50_AA_val.csv')[list(column_map)].rename(columns=column_map)
    nlp_test = pd.read_csv(f'{data_dir}/blogs50_AA_test.csv')[list(column_map)].rename(columns=column_map)

    # Compare by value (`!=`), not identity (`is not`): identity on ints only works by
    # accident of CPython's small-int cache. Using the resolved `num_authors` also avoids
    # a spurious warning when --authors is omitted.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    # --samples-per-auth is only reported here; nothing else in this cell consumes it.
    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # Pick batch size / learning rate by dataset family. 'blogs50' contains the substring
    # 'blog', so it is handled by the first branch. The 'turing' branch does not forward
    # `coefficient1` and so leaves it at train_bert's own default.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)
    elif 'turing' in source:
        hparams = dict(base_bs=7, base_lr=5e-6)
    else:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)

    # train (768 is presumably the encoder hidden size -- TODO confirm against train_bert)
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs,
               mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
               val_dic=nlp_val, **hparams)

In [None]:
if __name__ == '__main__':
    # Experiment cell: train `bert-base-cased` on the paraphrased DiffusionDB
    # "topicseparate100" split (100 authors) under experiment id
    # `diffusiondb100_supcon_coe1_para_topic`.

    # NOTE: this rebinds the notebook-level name `datasets` (a dict of download URLs in the
    # configuration cell) to the list of dataset names accepted by --dataset.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']

    def _parse_bool(text):
        """Convert a command-line string to a bool.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
        value -- including 'False' -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if ``text`` is not a recognised boolean word.
        """
        word = text.strip().lower()
        if word in ('true', '1', 'yes', 'y', 'on'):
            return True
        if word in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')

    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_parse_bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are supplied in-notebook instead of being read from sys.argv.
    training_args = [
      '--dataset', 'diffusiondb',
      '--id', 'diffusiondb100_supcon_coe1_para_topic',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '100',
      '--epochs', '30',
      '--model', 'bert-base-cased',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    # Presumably has to be set before CUDA is first initialised to take effect -- TODO confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes (none for any dataset in this run)
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split CSVs directly and keep only the text/label columns, renamed to
    # 'content' and 'Target'. This split takes its labels from `user_label`.
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased'
    column_map = {'prompt': 'content', 'user_label': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_topicseparate100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_val = pd.read_csv(f'{data_dir}/val_topicseparate100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_test = pd.read_csv(f'{data_dir}/test_topicseparate100_label_1.csv')[list(column_map)].rename(columns=column_map)

    # Compare by value (`!=`), not identity (`is not`): identity on ints only works by
    # accident of CPython's small-int cache. Using the resolved `num_authors` also avoids
    # a spurious warning when --authors is omitted.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    # --samples-per-auth is only reported here; nothing else in this cell consumes it.
    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # Pick batch size / learning rate by dataset family. Only the fallback branch forwards
    # `coefficient1`; the other branches leave it at train_bert's own default.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        hparams = dict(base_bs=8, base_lr=1e-5)
    elif 'turing' in source:
        hparams = dict(base_bs=7, base_lr=5e-6)
    else:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)

    # train (768 is presumably the encoder hidden size -- TODO confirm against train_bert)
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs,
               mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
               val_dic=nlp_val, **hparams)

In [None]:
if __name__ == '__main__':
    # Experiment cell: train `bert-base-cased` on the paraphrased DiffusionDB
    # "topicseparate100" split (100 authors) under experiment id
    # `diffusiondb100_lcl_coe1_para_topic`.

    # NOTE: this rebinds the notebook-level name `datasets` (a dict of download URLs in the
    # configuration cell) to the list of dataset names accepted by --dataset.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']

    def _parse_bool(text):
        """Convert a command-line string to a bool.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
        value -- including 'False' -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if ``text`` is not a recognised boolean word.
        """
        word = text.strip().lower()
        if word in ('true', '1', 'yes', 'y', 'on'):
            return True
        if word in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')

    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_parse_bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are supplied in-notebook instead of being read from sys.argv.
    training_args = [
      '--dataset', 'diffusiondb',
      '--id', 'diffusiondb100_lcl_coe1_para_topic',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '100',
      '--epochs', '30',
      '--model', 'bert-base-cased',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    # Presumably has to be set before CUDA is first initialised to take effect -- TODO confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes (none for any dataset in this run)
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split CSVs directly and keep only the text/label columns, renamed to
    # 'content' and 'Target'. This split takes its labels from `user_label`.
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased'
    column_map = {'prompt': 'content', 'user_label': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_topicseparate100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_val = pd.read_csv(f'{data_dir}/val_topicseparate100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_test = pd.read_csv(f'{data_dir}/test_topicseparate100_label_1.csv')[list(column_map)].rename(columns=column_map)

    # Compare by value (`!=`), not identity (`is not`): identity on ints only works by
    # accident of CPython's small-int cache. Using the resolved `num_authors` also avoids
    # a spurious warning when --authors is omitted.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    # --samples-per-auth is only reported here; nothing else in this cell consumes it.
    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # Pick batch size / learning rate by dataset family. Only the fallback branch forwards
    # `coefficient1`; the other branches leave it at train_bert's own default.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        hparams = dict(base_bs=8, base_lr=1e-5)
    elif 'turing' in source:
        hparams = dict(base_bs=7, base_lr=5e-6)
    else:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)

    # train (768 is presumably the encoder hidden size -- TODO confirm against train_bert)
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs,
               mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
               val_dic=nlp_val, **hparams)

In [None]:
if __name__ == '__main__':
    # Experiment cell: train `bert-base-cased` on the paraphrased DiffusionDB "random100"
    # split (100 authors) under experiment id `diffusiondb100_lcl_coe1_para`.

    # NOTE: this rebinds the notebook-level name `datasets` (a dict of download URLs in the
    # configuration cell) to the list of dataset names accepted by --dataset.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']

    def _parse_bool(text):
        """Convert a command-line string to a bool.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every non-empty
        value -- including 'False' -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if ``text`` is not a recognised boolean word.
        """
        word = text.strip().lower()
        if word in ('true', '1', 'yes', 'y', 'on'):
            return True
        if word in ('false', '0', 'no', 'n', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {text!r}')

    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_parse_bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are supplied in-notebook instead of being read from sys.argv.
    training_args = [
      '--dataset', 'diffusiondb',
      '--id', 'diffusiondb100_lcl_coe1_para',
      '--gpu', '0',
      '--tqdm', 'True',
      '--authors', '100',
      '--epochs', '30',
      '--model', 'bert-base-cased',
      '--coe', '1',
      '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    # Presumably has to be set before CUDA is first initialised to take effect -- TODO confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes (none for any dataset in this run)
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split CSVs directly and keep only the text/label columns, renamed to
    # 'content' and 'Target'.
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased'
    column_map = {'prompt': 'content', 'user_name': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_random100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_val = pd.read_csv(f'{data_dir}/val_random100_label_1.csv')[list(column_map)].rename(columns=column_map)
    nlp_test = pd.read_csv(f'{data_dir}/test_random100_label_1.csv')[list(column_map)].rename(columns=column_map)

    # Compare by value (`!=`), not identity (`is not`): identity on ints only works by
    # accident of CPython's small-int cache. Using the resolved `num_authors` also avoids
    # a spurious warning when --authors is omitted.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    # --samples-per-auth is only reported here; nothing else in this cell consumes it.
    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # Pick batch size / learning rate by dataset family. Only the fallback branch forwards
    # `coefficient1`; the other branches leave it at train_bert's own default.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        hparams = dict(base_bs=8, base_lr=1e-5)
    elif 'turing' in source:
        hparams = dict(base_bs=7, base_lr=5e-6)
    else:
        hparams = dict(base_bs=24, base_lr=2e-5, coefficient1=args.coe1)

    # train (768 is presumably the encoder hidden size -- TODO confirm against train_bert)
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs,
               mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
               val_dic=nlp_val, **hparams)

In [None]:
if __name__ == '__main__':
    def _str2bool(value):
        """Convert a command-line token into a real boolean.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every
        non-empty value -- including ``'False'`` -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if the token is not a recognised boolean.
        """
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ('true', 't', 'yes', 'y', 'on', '1'):
            return True
        if token in ('false', 'f', 'no', 'n', 'off', '0'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    # NOTE: this rebinds the notebook-level `datasets` name (the download-URL
    # dict defined near the top of the notebook) to a list of dataset keys.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    # --dataset indexes several dicts below, so a missing value must fail at parse time.
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets,
                        required=True)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_str2bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments for this run; they stand in for a real command line.
    training_args = [
        '--dataset', 'diffusiondb',
        '--id', 'diffusiondb100_lcl_para',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '100',
        '--epochs', '30',
        '--model', 'bert-base-cased',
        '--coe', '1',
        '--coe1', '0.5'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-built paraphrased DiffusionDB splits and rename the columns
    # to the 'content' / 'Target' names passed on to train_bert.
    # df = load_dataset_dataframe(source)
    nlp_train = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_label_1.csv')
    nlp_val = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/val_random100_label_1.csv')
    nlp_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    nlp_train = nlp_train[['prompt', 'user_name']]
    nlp_train.columns = ['content', 'Target']
    nlp_val = nlp_val[['prompt', 'user_name']]
    nlp_val.columns = ['content', 'Target']
    nlp_test = nlp_test[['prompt', 'user_name']]
    nlp_test.columns = ['content', 'Target']

    # Warn only when an explicit --authors value differs from the dataset default.
    # Value comparison (!=) is required: `is not` tests object identity, which is
    # not guaranteed for equal ints.
    if args.authors is not None and args.authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    # nlp_train, nlp_val, nlp_test = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # train: batch size and learning rate depend on the dataset; 'diffusiondb'
    # matches neither of the first two conditions and takes the final branch.
    # NOTE(review): only the final branch forwards coefficient1 -- confirm that
    # is intended for the other datasets.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=8, base_lr=1e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
                   val_dic=nlp_val)
    elif 'turing' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=7, base_lr=5e-6,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
                   val_dic=nlp_val)
    else:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=24, base_lr=2e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, coefficient1=args.coe1, num_authors=num_authors,
                   val_dic=nlp_val)

In [None]:
# change coefficients

if __name__ == '__main__':
    def _str2bool(value):
        """Convert a command-line token into a real boolean.

        argparse's ``type=bool`` applies ``bool()`` to the raw string, so every
        non-empty value -- including ``'False'`` -- would be parsed as True.

        Raises:
            argparse.ArgumentTypeError: if the token is not a recognised boolean.
        """
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ('true', 't', 'yes', 'y', 'on', '1'):
            return True
        if token in ('false', 'f', 'no', 'n', 'off', '0'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    # NOTE: this rebinds the notebook-level `datasets` name (the download-URL
    # dict defined near the top of the notebook) to a list of dataset keys.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    # --dataset indexes several dicts below, so a missing value must fail at parse time.
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets,
                        required=True)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    parser.add_argument('--tqdm', type=_str2bool, help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)
    parser.add_argument('--coe1', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments for this run (coe raised to 2); they stand in for a real command line.
    training_args = [
        '--dataset', 'diffusiondb',
        '--id', 'diffusiondb100_lcl_coe2_para',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '100',
        '--epochs', '30',
        '--model', 'bert-base-cased',
        '--coe', '2',
        '--coe1', '1'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-built paraphrased DiffusionDB splits and rename the columns
    # to the 'content' / 'Target' names passed on to train_bert.
    # df = load_dataset_dataframe(source)
    nlp_train = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_label_1.csv')
    nlp_val = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/val_random100_label_1.csv')
    nlp_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    nlp_train = nlp_train[['prompt', 'user_name']]
    nlp_train.columns = ['content', 'Target']
    nlp_val = nlp_val[['prompt', 'user_name']]
    nlp_val.columns = ['content', 'Target']
    nlp_test = nlp_test[['prompt', 'user_name']]
    nlp_test.columns = ['content', 'Target']

    # Warn only when an explicit --authors value differs from the dataset default.
    # Value comparison (!=) is required: `is not` tests object identity, which is
    # not guaranteed for equal ints.
    if args.authors is not None and args.authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    # nlp_train, nlp_val, nlp_test = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # train: batch size and learning rate depend on the dataset; 'diffusiondb'
    # matches neither of the first two conditions and takes the final branch.
    # NOTE(review): only the final branch forwards coefficient1 -- confirm that
    # is intended for the other datasets.
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=8, base_lr=1e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
                   val_dic=nlp_val)
    elif 'turing' in source:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=7, base_lr=5e-6,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, num_authors=num_authors,
                   val_dic=nlp_val)
    else:
        train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=24, base_lr=2e-5,
                   mask_classes=mask_classes[args.dataset], coefficient=args.coe, coefficient1=args.coe1, num_authors=num_authors,
                   val_dic=nlp_val)

# crucial analysis

In [None]:
!pip install lda

In [None]:
def clean_str(string):
    """Normalise a raw prompt into space-separated tokens.

    The rules run in order: drop every character outside letters, digits and
    ``( ) , ! ? ' ` ``; blank out digit runs; split English contractions
    (``it's`` -> ``it 's``, ``don't`` -> ``do n't``); pad ``, ! ( ) ?`` with
    spaces; collapse repeated whitespace; strip both ends.

    Args:
        string: the text to clean.

    Returns:
        The cleaned text. Punctuation is emitted as plain characters; the
        previous replacement strings (" \\( ", " \\) ", " \\? ") were invalid
        escape sequences and leaked literal backslashes into the output.
    """
    # (pattern, replacement) pairs; order matters because later rules rely on
    # the character filtering done by the first one.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\d+", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip()

def load_diffusiondb(data):
    """Return the cleaned text of every entry in ``data['prompt']``.

    ``data['prompt']`` must expose ``tolist()``; each resulting item is passed
    through ``clean_str`` and the outputs are returned as a list in the same
    order. Works on a frame that has already been split into train/test.
    """
    return [clean_str(prompt) for prompt in data['prompt'].tolist()]

import pandas as pd
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

# Read the paraphrased DiffusionDB prompts and show which columns are available.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/final_llama3.csv')
print(df.columns)

# Bag-of-words counts over the cleaned prompts: English stop words are removed
# and terms occurring in more than 99% of the prompts are dropped.
X = load_diffusiondb(df)
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=1, stop_words='english')
X_tf = tf_vectorizer.fit_transform(X)
print('vocabulary', len(tf_vectorizer.vocabulary_))

# Fit a 50-topic LDA model with a fixed random_state.
lda_model = lda.LDA(n_topics=50, n_iter=1000, random_state=1000)
lda_model.fit(X_tf)
doc_topic = lda_model.doc_topic_
topic_word = lda_model.topic_word_
vocab = tf_vectorizer.get_feature_names_out()

# Tag every prompt with the topic that has the highest weight for it.
df['topic'] = np.argmax(doc_topic, axis=1)

# Print the ten highest-weighted words of each topic, strongest first.
for i, topic_dist in enumerate(topic_word):
    strongest_first = np.argsort(topic_dist)[::-1][:10]
    topic_words = np.array(vocab)[strongest_first]
    print(f"Topic {i}: {' '.join(topic_words)}")

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# `df` must already hold the 'user_name', 'prompt' and 'topic' columns.

# Count prompts per (topic, author) pair: topics as rows, authors as columns,
# zero where an author has no prompt in a topic.
pivot_table = df.pivot_table(index='topic', columns='user_name', aggfunc='size', fill_value=0)

# Draw the counts as an annotated heatmap on a wide canvas and label the
# axes returned by seaborn.
plt.figure(figsize=(20, 10))  # Adjust the size as necessary
heatmap_ax = sns.heatmap(pivot_table, cmap="YlGnBu", annot=True, fmt="d", linewidths=.5)
heatmap_ax.set_title('Number of Prompts per Author for Each Topic')
heatmap_ax.set_xlabel('Authors')
heatmap_ax.set_ylabel('Topics')

plt.show()


In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import random

# Split each author's prompts into train / validation / test so that, as far
# as possible, an author's test prompts come from topics that author has no
# training prompts in. `df` must hold the 'user_name', 'prompt' and 'topic'
# columns. The topic order is shuffled with the global `random` state, so the
# split is only repeatable if that state is the same when this cell runs.

# Target split sizes per author. They are loop-invariant and are written for
# 100 prompts per author (60 / 20 / 20); whatever is left over after train and
# test are full lands in the validation split.
num_train = int(0.6 * 100)
num_val = int(0.2 * 100)
num_test = int(0.2 * 100)  # This is 20

# Step 1: Initialize lists for train, validation, and test sets
train_data = []
val_data = []
test_data = []

# Step 2: Process each author
for user_name, group in df.groupby('user_name'):
    # Group the user's prompts by topic
    topics = defaultdict(list)
    for _, row in group.iterrows():
        topics[row['topic']].append(row)

    # Convert the grouped topics to a list of (topic, prompts) tuples
    topics_list = list(topics.items())
    random.shuffle(topics_list)  # Shuffle to introduce randomness

    train_prompts = []
    test_prompts = []
    val_prompts = []

    # Hand out whole topics: first to train until it is full, then to test,
    # then to validation. Rows that overshoot a target spill into validation,
    # so validation may share a topic with train or test.
    for topic, prompts in topics_list:
        if len(train_prompts) < num_train:
            train_prompts.extend(prompts)
            if len(train_prompts) > num_train:
                excess = len(train_prompts) - num_train
                val_prompts.extend(train_prompts[-excess:])
                train_prompts = train_prompts[:-excess]
            continue

        if len(test_prompts) < num_test:
            test_prompts.extend(prompts)
            if len(test_prompts) > num_test:
                excess = len(test_prompts) - num_test
                val_prompts.extend(test_prompts[-excess:])
                test_prompts = test_prompts[:-excess]
            continue

        val_prompts.extend(prompts)

    # If test is short, top it up from validation rows whose topic does not
    # occur in this author's training rows.
    if len(test_prompts) < num_test:
        required = num_test - len(test_prompts)
        train_topics = {tp['topic'] for tp in train_prompts}  # built once, not per row
        additional_test_prompts = [p for p in val_prompts if p['topic'] not in train_topics]
        moved_prompts = additional_test_prompts[:required]

        # The rows are pandas Series, so `p not in some_list` would fall back
        # to `==` between Series and raise "truth value of a Series is
        # ambiguous". Select the moved rows by object identity instead.
        moved_ids = {id(p) for p in moved_prompts}
        test_prompts.extend(moved_prompts)
        val_prompts = [p for p in val_prompts if id(p) not in moved_ids]

    # If still underfilled, move prompts from val to test without checking
    # topic overlap (this can reintroduce topics shared with train).
    if len(test_prompts) < num_test:
        required = num_test - len(test_prompts)
        test_prompts.extend(val_prompts[:required])
        val_prompts = val_prompts[required:]

    # Step 3: Add the prompts to the respective datasets
    train_data.extend(train_prompts)
    val_data.extend(val_prompts)
    test_data.extend(test_prompts)

# Step 4: Convert lists back to DataFrames
train_df = pd.DataFrame(train_data, columns=df.columns)
val_df = pd.DataFrame(val_data, columns=df.columns)
test_df = pd.DataFrame(test_data, columns=df.columns)

# Verify the sizes
print(f"Train set: {len(train_df)} prompts")
print(f"Validation set: {len(val_df)} prompts")
print(f"Test set: {len(test_df)} prompts")

# Now, train_df, val_df, and test_df contain the split data with the desired properties, including the 'user_name' column


In [None]:
# Write the three topic-separated splits to Drive as CSV, without the index column.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df.to_csv(
        f'/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/{split_name}_topicseparate100_1.csv',
        index=False,
    )

In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Map author names to integer ids. The encoder is fitted on the training
# authors only; the validation and test frames are then transformed with
# that same fitted encoder.
label_encoder = LabelEncoder()
train_df['user_label'] = label_encoder.fit_transform(train_df['user_name'])
for held_out_df in (val_df, test_df):
    held_out_df['user_label'] = label_encoder.transform(held_out_df['user_name'])

# Persist the fitted encoder, then the labelled splits, next to the other
# paraphrased DiffusionDB files.
topicsplit_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased'
joblib.dump(label_encoder, f'{topicsplit_dir}/label_encoder_topicseparate100_1.pkl')
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df.to_csv(f'{topicsplit_dir}/{split_name}_topicseparate100_label_1.csv', index=False)

In [None]:
val_df

In [None]:
test_df

In [None]:
df