In [None]:
import argparse
import os
import random
import warnings
import tarfile
import gdown

import numpy as np
import pandas as pd
from tqdm import tqdm
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Sampler
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
from matplotlib.collections import QuadMesh
import seaborn as sn

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split


In [None]:
# Directory that save_model() callers are expected to write checkpoints into
# (a Google Drive location as mounted inside Colab).
ckpt_dir = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data'
# Directory holding the dataset files; it is created below if it does not exist yet.
dataset_path = "/content/drive/MyDrive/msc_project/datasets"
# NOTE(review): load_dataset_dataframe() reads a global called `dataset_file_name`, but the
# only definition visible in this cell is the commented-out mapping below (which also has no
# 'diffusiondb' entry). Confirm the name is defined elsewhere before calling that function.
# dataset_file_name = {
#     "imdb62": 'full_imdb62.csv',
#     "blog": 'full_blog.csv',
#     "turing": "turing_ori_0208.csv"
# }
# Archive file name -> Google Drive download URL.
# Presumably fetched with gdown and unpacked with tarfile; the download code is not in this cell.
datasets = {
    'contrax_datasets.tar': 'https://drive.google.com/uc?id=1T3VgMe-dCy5QVI7b1K2KdfL-2e2gq2Rn'
}
os.makedirs(dataset_path, exist_ok=True)

# NLTK resources used by the text helpers below: fil_sent() reads the English stopword
# list and process() calls word_tokenize (which relies on the 'punkt' tokenizer data).
nltk.download('stopwords')
nltk.download('punkt')

# Seed the Python, NumPy and PyTorch (CPU) random number generators for reproducibility.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Utils

In [None]:
def get_new_fig(fn, figsize=[9, 9]):
    """
    Open (or re-select) the matplotlib figure identified by `fn` and hand back
    the figure together with its current axes, wiped clean.
    """
    figure = plt.figure(fn, figsize)
    axes = figure.gca()
    # Drop whatever was previously drawn so the same window can be reused.
    axes.cla()
    return figure, axes


def configcell_text_and_colors(array_df, lin, col, oText, facecolors, posi, fz, fmt, show_null_values=0):
    """
      Style one heatmap cell of a confusion matrix that already contains a
      totals row and a totals column (the last ones).

      Ordinary cells get their text rewritten in place on `oText`; cells of the
      totals row/column have `oText` scheduled for removal and three new text
      items (count, % correct, % wrong) scheduled for insertion.
      `facecolors[posi]` is overwritten for totals cells and diagonal cells.

      Returns (text_add, text_del): a list of dicts with keys x, y, text, kw
      and a list of text artists to delete.
      `fmt` is accepted but currently unused.
    """
    additions = []
    removals = []

    value = array_df[lin][col]
    grand_total = array_df[-1][-1]
    share = (float(value) / grand_total) * 100
    last = len(array_df[:, col]) - 1
    on_last_col = col == last
    on_last_row = lin == last

    if not (on_last_col or on_last_row):
        # ----- ordinary cell: count plus share of all samples -----
        if share > 0:
            label = '%s\n%.2f%%' % (value, share)
        elif show_null_values == 0:
            label = ''
        elif show_null_values == 1:
            label = '0'
        else:
            label = '0\n0.0%'
        oText.set_text(label)

        if col == lin:
            # diagonal: white text on a highlighted background
            oText.set_color('w')
            facecolors[posi] = [0.35, 0.8, 0.55, 1.0]
        else:
            oText.set_color('r')
        return additions, removals

    # ----- totals cell (last row and/or last column) -----
    if value != 0:
        if on_last_col and on_last_row:
            # bottom-right corner: correct predictions are the whole diagonal
            correct = 0
            for k in range(array_df.shape[0] - 1):
                correct += array_df[k][k]
        elif on_last_col:
            correct = array_df[lin][lin]
        else:
            correct = array_df[col][col]
        per_ok = (float(correct) / value) * 100
        per_err = 100 - per_ok
    else:
        per_ok = per_err = 0

    per_ok_s = '100%' if per_ok == 100 else '%.2f%%' % (per_ok)

    # the annotation seaborn drew is replaced by three stacked lines
    removals.append(oText)

    font_prop = fm.FontProperties(weight='bold', size=fz)
    base_kwargs = dict(color='w', ha="center", va="center", gid='sum', fontproperties=font_prop)
    lines = [
        ('%d' % (value), (oText._x, oText._y - 0.3), base_kwargs),
        (per_ok_s, (oText._x, oText._y), dict(base_kwargs, color='g')),
        ('%.2f%%' % (per_err), (oText._x, oText._y + 0.3), dict(base_kwargs, color='r')),
    ]
    for caption, (x_pos, y_pos), kwargs in lines:
        additions.append(dict(x=x_pos, y=y_pos, text=caption, kw=kwargs))

    # darker background for totals, darkest for the bottom-right corner
    if on_last_col and on_last_row:
        facecolors[posi] = [0.17, 0.20, 0.17, 1.0]
    else:
        facecolors[posi] = [0.27, 0.30, 0.27, 1.0]

    return additions, removals


def insert_totals(df_cm):
    """
    Append a 'sum_lin' column (row sums) and a 'sum_col' row (column sums) to
    `df_cm`. The frame is modified in place; nothing is returned.
    """
    column_totals = [df_cm[name].sum() for name in df_cm.columns]
    row_totals = [row.sum() for _, row in df_cm.iterrows()]

    df_cm['sum_lin'] = row_totals
    # bottom-right corner: grand total of every cell
    column_totals.append(np.sum(row_totals))
    df_cm.loc['sum_col'] = column_totals


def pretty_plot_confusion_matrix(df_cm, annot=True, cmap="Oranges", fmt='.2f', fz=11,
                                 lw=0.5, cbar=False, figsize=[8, 8], show_null_values=0, pred_val_axis='y'):
    """
      print conf matrix with default layout (like matlab)
      params:
        df_cm          dataframe (pandas) without totals; it is NOT modified
        annot          print text in each cell
        cmap           Oranges,Oranges_r,YlGnBu,Blues,RdBu, ... see:
        fmt            number format passed to seaborn for the initial annotations
        fz             fontsize
        lw             linewidth
        cbar           whether seaborn draws a colour bar
        figsize        figure size handed to get_new_fig
        show_null_values  0: empty text for zero cells, 1: '0', otherwise '0' plus '0.0%'
        pred_val_axis  where to show the prediction values (x or y axis)
                        'col' or 'x': show predicted values in columns (x axis) instead lines
                        'lin' or 'y': show predicted values in lines   (y axis)
      The figure is shown with plt.show(); nothing is returned.
    """
    if pred_val_axis in ('col', 'x'):
        xlbl = 'Predicted'
        ylbl = 'Actual'
        # insert_totals() works in place: copy so the caller's frame does not
        # silently gain a totals row/column (and grow again on a second call).
        df_cm = df_cm.copy()
    else:
        xlbl = 'Actual'
        ylbl = 'Predicted'
        df_cm = df_cm.T.copy()

    # create "Total" column and row on the private copy
    insert_totals(df_cm)

    # always draw into the same named figure window
    _, ax1 = get_new_fig('Conf matrix default', figsize)

    ax = sn.heatmap(df_cm, annot=annot, annot_kws={"size": fz}, linewidths=lw, ax=ax1,
                    cbar=cbar, cmap=cmap, linecolor='w', fmt=fmt)

    # set ticklabels rotation
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=10)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=25, fontsize=10)

    # Hide all tick marks. Tick.tick1On / tick2On were deprecated in Matplotlib 3.1;
    # on current releases assigning them has no effect, so toggle the tick line
    # artists directly instead.
    for axis in (ax.xaxis, ax.yaxis):
        for tick in axis.get_major_ticks():
            tick.tick1line.set_visible(False)
            tick.tick2line.set_visible(False)

    # face colors list of the heatmap cells
    quadmesh = ax.findobj(QuadMesh)[0]
    facecolors = quadmesh.get_facecolors()

    # iterate over the annotation text elements seaborn created
    array_df = np.array(df_cm.to_records(index=False).tolist())
    text_add = []
    text_del = []
    posi = -1  # running cell index, in the order the texts were created
    for t in ax.collections[0].axes.texts:
        # annotations sit at cell centres (col + 0.5, lin + 0.5)
        pos = np.array(t.get_position()) - [0.5, 0.5]
        lin = int(pos[1])
        col = int(pos[0])
        posi += 1

        txt_res = configcell_text_and_colors(array_df, lin, col, t, facecolors, posi, fz, fmt, show_null_values)

        text_add.extend(txt_res[0])
        text_del.extend(txt_res[1])

    # remove the old ones
    for item in text_del:
        item.remove()
    # append the new ones
    for item in text_add:
        ax.text(item['x'], item['y'], item['text'], **item['kw'])

    # titles and legends
    ax.set_title('Confusion matrix')
    ax.set_xlabel(xlbl)
    ax.set_ylabel(ylbl)
    plt.tight_layout()  # set layout slim
    plt.show()


def plot_confusion_matrix_from_data(y_test, predictions, columns=None, annot=True, cmap="Oranges",
                                    fmt='.2f', fz=11, lw=0.5, cbar=False, figsize=[9, 9], show_null_values=2,
                                    pred_val_axis='lin'):
    """
        Build a confusion matrix from actual values (`y_test`) and predicted
        values (`predictions`) and draw it with pretty_plot_confusion_matrix.

        columns   optional class labels, one per confusion-matrix row; when
                  omitted, 'class A', 'class B', ... are generated (numeric
                  'class 0', 'class 1', ... when there are more than 26 classes)
        All remaining keyword arguments are forwarded unchanged to
        pretty_plot_confusion_matrix.

        The defaults of `figsize` and `show_null_values` equal the values this
        function used to force internally, so calls relying on defaults render
        exactly as before while explicit arguments are now honoured.
    """
    from sklearn.metrics import confusion_matrix
    from pandas import DataFrame

    confm = confusion_matrix(y_test, predictions)

    # `columns is None` / len() instead of `not columns`, so arrays are accepted too
    if columns is None or len(columns) == 0:
        from string import ascii_uppercase
        # size the labels from the matrix itself: it also covers labels that only
        # occur in `predictions`, and may exceed the 26 letters available
        n_classes = confm.shape[0]
        if n_classes <= len(ascii_uppercase):
            columns = ['class %s' % (letter) for letter in ascii_uppercase[:n_classes]]
        else:
            columns = ['class %d' % (i) for i in range(n_classes)]

    df_cm = DataFrame(confm, index=columns, columns=columns)
    pretty_plot_confusion_matrix(df_cm, annot=annot, cmap=cmap, fmt=fmt, fz=fz, lw=lw, cbar=cbar,
                                 figsize=figsize, show_null_values=show_null_values,
                                 pred_val_axis=pred_val_axis)


def fil_sent(sent):
    """
    Drop every whitespace-separated token of `sent` that is an NLTK English
    stopword and return the remaining tokens joined by single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept = [token for token in sent.split() if token not in english_stops]
    return ' '.join(kept)


def process(sent):
    """
    Tokenize `sent` (coerced to str), lower-case and Porter-stem each token,
    then remove stopwords from the stemmed text via fil_sent.
    """
    stemmer = PorterStemmer()
    stems = [stemmer.stem(str(token).lower()) for token in word_tokenize(str(sent))]
    return fil_sent(' '.join(stems))


def extract_style(text):
    """
    Extract stylometric features of a text.

    `text` is coerced with str(). Returns a pd.Series of 55 values, in order:
      avg_len      mean length of the whitespace-separated words
      len_text     number of characters
      len_words    number of words
      num_short_w  number of words shorter than 3 characters
      per_digit    fraction of characters that are digits
      per_cap      fraction of characters that are upper case
      f_a .. f_z   per-letter character frequency (case-insensitive)
      f_0 .. f_9   per-digit character frequency
      f_e_0 .. f_e_11  frequency of ! - : ? . , ; ' / ( ) &
      richness     distinct words / words

    All frequencies are relative to the character count. For an empty text
    (or one without any word) the affected ratios are 0.0 instead of raising
    ZeroDivisionError.
    """
    from collections import Counter

    text = str(text)
    words = text.split()
    len_text = len(text)
    len_words = len(words)

    def ratio(count, total):
        # guarded division: empty input yields 0.0 rather than an exception
        return count / total if total else 0.0

    avg_len = np.mean([len(w) for w in words]) if words else 0.0
    num_short_w = sum(1 for w in words if len(w) < 3)
    per_digit = ratio(sum(ch.isdigit() for ch in text), len_text)
    per_cap = ratio(sum(1 for ch in text if ch.isupper()), len_text)

    # One pass over the text instead of one pass per tracked character.
    # Characters are lower-cased first, exactly as each original comparison did.
    lowered_counts = Counter(ch.lower() for ch in text)
    tracked_chars = "abcdefghijklmnopqrstuvwxyz" + "0123456789" + "!-:?.,;'/()&"
    char_freqs = [ratio(lowered_counts[ch], len_text) for ch in tracked_chars]

    richness = ratio(len(set(words)), len_words)

    return pd.Series([avg_len, len_text, len_words, num_short_w, per_digit, per_cap]
                     + char_freqs
                     + [richness])


def build_train_test(df, source, limit, per_author=None, seed=None):
    """
    Keep the `limit` most frequent authors (column 'From'), attach an integer
    'Target' label per author and split the rows into train / test (/ val).

    params:
      df          dataframe with 'From', 'content', 'content_tfidf' and the
                  stylometric feature columns (plus 'train' for source 'turing')
      source      dataset name; selects the splitting strategy (see returns)
      limit       number of most frequent authors to keep
      per_author  must be None (anything else raises NotImplementedError)
      seed        random_state for the stratified splits; None becomes 0 for
                  'blog' / 'imdb62' sources
    returns:
      'turing'           (train, test, val), split by the 'train' column
                         (1 / 0 / 2), columns ['content', 'Target'], first 50%
                         of the rows of every author in every split
      contains 'blog'    (train, test, val), stratified 80/10/10, first 75% of
                         the rows of every author in every split
      contains 'imdb62'  (train, val, test), stratified 80/10/10
      anything else      (train, test), stratified 80/20

    The per-author sub-sampling covers every author present in the split; it
    is no longer capped at 20 ('turing') or 50 ('blog') authors.
    """
    feature_columns = (
        ["avg_len", "len_text", "len_words", "num_short_w", "per_digit", "per_cap"]
        + ["f_" + ch for ch in "abcdefghijklmnopqrstuvwxyz0123456789"]
        + ["f_e_%d" % i for i in range(12)]
        + ["richness"]
    )

    def _leading_fraction_per_class(frame, fraction):
        # Keep the first `fraction` of each Target's rows, classes in ascending order.
        pieces = [rows.iloc[:int(len(rows) * fraction)]
                  for _, rows in frame.groupby('Target', sort=True)]
        if not pieces:
            return frame.iloc[:0]
        return pd.concat(pieces, axis=0)

    # Select top N senders
    list_spk = df['From'].value_counts().iloc[:limit].index.tolist()
    sub_df = df[df['From'].isin(list_spk)]

    if per_author is not None:
        raise NotImplementedError()

    wanted_columns = ['From', 'content', 'content_tfidf'] + feature_columns
    if source == 'turing':
        # the turing data carries its own split assignment
        wanted_columns.append('train')
    sub_df = sub_df[wanted_columns].dropna()

    print("Number of texts : ", len(sub_df))

    # author name -> consecutive integer label, in sorted author order
    author_to_label = {author: label for label, author in enumerate(np.unique(sub_df.From))}
    sub_df['Target'] = sub_df['From'].apply(lambda x: author_to_label[x])

    if source == 'turing':
        perc = 0.5
        print("Percentage: " + str(perc))
        # 'train' column: 1 = train, 0 = test, 2 = validation
        splits = []
        for split_flag in (1, 0, 2):
            part = sub_df[sub_df["train"] == split_flag][['content', 'Target']]
            splits.append(_leading_fraction_per_class(part, perc))
        return splits[0], splits[1], splits[2]

    if 'blog' in source or 'imdb62' in source:
        perc = 0.75
        print("seed: " + str(seed))

        if seed is None:
            seed = 0

        # 80% train, remaining 20% halved into validation and test
        ind = train_test_split(sub_df, test_size=0.2, stratify=sub_df['Target'],
                               random_state=seed)
        nlp_train = sub_df.loc[list(ind[0].index)]

        val_test_sub_df = ind[1]
        ind2 = train_test_split(val_test_sub_df, test_size=0.5,
                                stratify=val_test_sub_df['Target'], random_state=seed)
        val_dict = val_test_sub_df.loc[list(ind2[0].index)]
        test_dict = val_test_sub_df.loc[list(ind2[1].index)]

        if 'blog' in source:
            return (_leading_fraction_per_class(nlp_train, perc),
                    _leading_fraction_per_class(test_dict, perc),
                    _leading_fraction_per_class(val_dict, perc))

        # NOTE(review): this branch returns (train, val, test) while the 'turing' and
        # 'blog' branches return (train, test, val). Kept as is because callers may
        # depend on it -- confirm which order is intended.
        return nlp_train, val_dict, test_dict

    ind = train_test_split(sub_df[['content', 'Target']], test_size=0.2, stratify=sub_df['Target'], random_state=seed)
    ind_train = list(ind[0].index)
    ind_test = list(ind[1].index)
    nlp_train = sub_df.loc[ind_train]
    test_dict = sub_df.loc[ind_test]

    return nlp_train, test_dict


def is_name_in_email(name, email):
    """
    Return 1 when `name` occurs inside `email` (both coerced to str and
    compared case-insensitively), otherwise 0.
    """
    needle = str(name).lower()
    haystack = str(email).lower()
    return int(needle in haystack)


def load_dataset_dataframe(source):
    """
    Read the CSV registered for `source` from the module-level `dataset_path`
    directory and return it as a dataframe.

      'imdb62'       first CSV column is used as the index
      'blog'         read as is
      'diffusiondb'  reduced to columns prompt / user_name, renamed to
                     content / Target
      anything else  read as is, then sorted by 'train' (descending) and
                     'From' (ascending)

    NOTE(review): relies on a global mapping `dataset_file_name` (source name
    -> CSV file name) that is not defined in the visible part of the notebook.
    """
    print("Loading and processing dataframe")

    csv_path = os.path.join(dataset_path, dataset_file_name[source])

    if source == "imdb62":
        return pd.read_csv(csv_path, index_col=0)

    df = pd.read_csv(csv_path)
    if source == "blog":
        return df

    if source == 'diffusiondb':
        df = df[['prompt', 'user_name']]
        df.columns = ['content', 'Target']
        return df

    return df.sort_values(by=['train', 'From'], ascending=[False, True])


class AverageMeter(object):
    """
    Running average tracker.

    Attributes: `val` (last value passed to update), `sum` (weighted sum of
    all values), `count` (total weight) and `avg` (sum / count).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all four statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def save_model(ckpt_dir, cp_name, model):
    """
    Write the state dict of `model` to `ckpt_dir`/`cp_name`, creating
    `ckpt_dir` when needed. A DataParallel wrapper is unwrapped first so the
    saved keys carry no 'module.' prefix.
    """
    os.makedirs(ckpt_dir, exist_ok=True)
    # save the wrapped network itself, not the parallel container
    plain_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    saving_model_path = os.path.join(ckpt_dir, cp_name)
    torch.save(plain_model.state_dict(), saving_model_path)
    print(f'Model saved: {saving_model_path}')


def load_model_dic(model, ckpt_path, verbose=True, strict=True):
    """
    Load the state dict stored at `ckpt_path` into `model` and return `model`.

    The checkpoint is first loaded as is. If that raises RuntimeError (what
    load_state_dict raises on key or shape mismatches), a leading 'module.'
    prefix -- as written by a DataParallel-wrapped model -- is stripped from
    the keys and loading is retried once; a second failure propagates.

    params:
      model      module to receive the weights
      ckpt_path  path of a file written with torch.save(state_dict, ...)
      verbose    print a confirmation line after loading
      strict     forwarded to load_state_dict
    raises:
      AssertionError  if `ckpt_path` does not exist
      RuntimeError    if the weights fit neither with nor without the prefix
    """
    assert os.path.exists(ckpt_path), f"trained model {ckpt_path} does not exist"

    # read the file once and reuse it for the retry
    state_dict = torch.load(ckpt_path)
    try:
        model.load_state_dict(state_dict, strict=strict)
    except RuntimeError:
        prefix = 'module.'
        # Strip the prefix only where it really is a prefix; other keys are left
        # untouched (str.partition would have mapped them to '' and would also
        # have cut at a 'module.' in the middle of a key).
        state_dict = {(k[len(prefix):] if k.startswith(prefix) else k): v
                      for k, v in state_dict.items()}
        model.load_state_dict(state_dict, strict=strict)
    if verbose:
        print(f'Model loaded: {ckpt_path}')

    return model

# Dataset

In [None]:
class NumpyDataset(Dataset):
    """
    Dataset over two numpy arrays, both converted to float32 tensors once at
    construction time. Item `idx` is the pair (x[idx], y[idx]).
    """

    def __init__(self, x, y):
        super().__init__()
        self.x = torch.from_numpy(x).to(torch.float32)
        self.y = torch.from_numpy(y).to(torch.float32)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        target = self.y[idx]
        return features, target


class BertDataset(Dataset):
    """
    Text dataset that tokenizes lazily and memoizes the result per index.

    params:
      x           indexable collection of raw texts
      y           labels, converted with torch.tensor
      tokenizer   object exposing batch_encode_plus (presumably a HuggingFace
                  tokenizer -- confirm against the caller)
      length      padding / truncation length handed to the tokenizer
      return_idx  when True, items additionally carry the index and raw text

    Items are (input_ids, token_type_ids, attention_mask, label), optionally
    followed by (idx, text). The three encodings are unpacked positionally from
    the tokenizer output, so they rely on that key order.
    """

    def __init__(self, x, y, tokenizer, length=128, return_idx=False):
        super().__init__()
        self.x = x
        self.y = torch.tensor(y)
        self.tokenizer = tokenizer
        self.length = length
        self.return_idx = return_idx
        self.tokens_cache = {}

    def tokenize(self, x):
        """Encode one text and return its encodings without the batch dimension."""
        encoded = self.tokenizer.batch_encode_plus(
            [x],  # the tokenizer API works on batches
            max_length=self.length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_tensors="pt"
        )
        return [field[0] for field in encoded.values()]

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        position = int(idx)
        assert idx == position
        cached = self.tokens_cache.get(position)
        if cached is None:
            cached = self.tokenize(self.x[position])
            self.tokens_cache[position] = cached
        input_ids, token_type_ids, attention_mask = cached
        sample = (input_ids, token_type_ids, attention_mask, self.y[position])
        if self.return_idx:
            sample = sample + (position, self.x[position])
        return sample


class TrainSampler(Sampler):
    """
    Sampler that emits indices batch by batch so that every batch holds
    `int(batch_size * sim_ratio)` samples of one randomly chosen class
    (positives) and fills the rest with samples drawn from other classes.

    params:
      dataset     object exposing `.x` and `.y`; `.y` must be an integer
                  label tensor
      batch_size  number of indices per batch
      sim_ratio   fraction of a batch taken from the positive class

    Every index is emitted at most once per pass. `len()` is measured from one
    trial pass made in __init__ (which also consumes random numbers); the
    length of a later pass can differ because batch composition is random.
    """

    def __init__(self, dataset, batch_size, sim_ratio=0.5):
        super().__init__(None)
        self.dataset = dataset
        self.batch_size = batch_size
        self.x = dataset.x
        self.y = dataset.y
        self.sim_ratio = sim_ratio
        self.num_pos_samples = int(batch_size * sim_ratio)
        print(f'train sampler with batch size = {batch_size} and postive sample ratio = {sim_ratio}')

        self.length = len(list(self.__iter__()))

    def __iter__(self):
        # label -> shuffled list of the sample indices carrying that label
        label_cluster = {}
        for i in range(len(self.y)):
            label_cluster.setdefault(self.y[i].item(), []).append(i)
        for members in label_cluster.values():
            random.shuffle(members)

        # At least one class must be able to supply the positives. (Checking the
        # largest class rather than label 0 also works when label 0 is absent.)
        largest_class = max((len(members) for members in label_cluster.values()), default=0)
        assert largest_class > self.num_pos_samples, \
            f"only {largest_class} samples in the largest class, but {self.num_pos_samples} pos samples needed"

        # Candidate class ids 0..max label, computed once instead of scanning the
        # label tensor on every loop iteration. Ids without samples are treated as
        # empty classes instead of raising KeyError.
        all_classes = list(range(int(max(label_cluster)) + 1))

        def remaining(class_id):
            return len(label_cluster.get(class_id, ()))

        batch_indices = []
        flag = True
        while flag:
            # find a valid positive sample class
            available_classes = [c for c in all_classes if remaining(c) >= self.num_pos_samples]
            if len(available_classes) == 0:
                break
            class_count = random.choice(available_classes)

            # fill in positive samples
            batch_indices.append(label_cluster[class_count][-self.num_pos_samples:])
            del label_cluster[class_count][-self.num_pos_samples:]

            # fill in negative samples, one at a time, from any other non-empty class
            for _ in range(self.batch_size - self.num_pos_samples):
                available_classes = [c for c in all_classes if c != class_count and remaining(c) > 0]
                if len(available_classes) == 0:
                    # negatives exhausted: keep the partial batch and stop
                    flag = False
                    break
                rand_class = random.choice(available_classes)
                batch_indices[-1].append(label_cluster[rand_class].pop())

            random.shuffle(batch_indices[-1])

        random.shuffle(batch_indices)
        flat_indices = [index for batch in batch_indices for index in batch]

        return iter(flat_indices)

    def __len__(self):
        return self.length


class TrainSamplerMultiClass(Sampler):
    """
    Sampler that emits indices in groups of `batch_size // num_classes`
    samples of the same class, the groups being shuffled across classes.

    params:
      dataset             object exposing `.x` and `.y` (label tensor)
      batch_size          must be a multiple of `num_classes`
      num_classes         number of same-class groups that make up one batch
      samples_per_author  number of samples used per class, rounded down to a
                          multiple of the group size

    `len()` is measured from one trial pass made in __init__ (which also
    consumes random numbers and prints the summary line once).
    """

    def __init__(self, dataset, batch_size, num_classes, samples_per_author):
        super().__init__(None)
        self.dataset = dataset
        self.batch_size = batch_size
        self.x = dataset.x
        self.y = dataset.y
        self.num_classes = num_classes
        self.samples_per_author = samples_per_author
        assert batch_size // num_classes * num_classes == batch_size, \
            f'batch size {batch_size} is not a multiple of num of classes {num_classes}'
        print(f'train sampler with batch size = {batch_size} and {num_classes} classes in a batch')
        self.length = len(list(self.__iter__()))

    def __iter__(self):
        # label -> list of the sample indices carrying that label
        label_cluster = {}
        for i in range(len(self.y)):
            label_cluster.setdefault(self.y[i].item(), []).append(i)

        # ">=": having exactly the required number of classes is sufficient. The
        # message reports the class count (it used to dump every index).
        assert len(label_cluster) >= self.num_classes, \
            f'number of available classes {len(label_cluster)} < required classes {self.num_classes}'

        num_samples_per_class_batch = self.batch_size // self.num_classes
        min_class_samples = min(len(members) for members in label_cluster.values())
        # ">=": a class holding exactly samples_per_author samples is sufficient
        assert min_class_samples >= self.samples_per_author, \
            f"expected {self.samples_per_author} per author, but got {min_class_samples} in the dataset"
        class_samples_needed = self.samples_per_author // num_samples_per_class_batch * num_samples_per_class_batch

        dataset_matrix = []
        for members in label_cluster.values():
            random.shuffle(members)
            # one column per same-class group
            dataset_matrix.append(
                torch.tensor(members[:class_samples_needed]).view(num_samples_per_class_batch, -1))

        # after the transpose every row is one group of same-class indices
        groups = torch.cat(dataset_matrix, dim=1).transpose(1, 0).tolist()
        random.shuffle(groups)
        flat_indices = [index for group in groups for index in group]

        print(f'from dataset sampler: batch size {self.batch_size}, num of classes in a batch {self.num_classes}, '
              f'num of samples per author in total {self.samples_per_author} (specified) / {class_samples_needed} (true).'
              f'dataset size {len(flat_indices)}')

        return iter(flat_indices)

    def __len__(self):
        return self.length


class TrainSamplerMultiClassUnit(Sampler):
    """
    Sampler that emits indices in consecutive units of `sample_unit_size`
    samples sharing one label; units of all classes are shuffled together.
    Samples of a class that do not fill a whole unit are left out of the pass.

    `len()` is measured from one trial pass made in __init__ (which also
    consumes random numbers and prints the summary line once).
    """

    def __init__(self, dataset, sample_unit_size):
        super().__init__(None)
        self.x = dataset.x
        self.y = dataset.y
        self.sample_unit_size = sample_unit_size
        print(f'train sampler with sample unit size {sample_unit_size}')
        self.length = len(list(self.__iter__()))

    def __iter__(self):
        unit = self.sample_unit_size

        # label -> indices of the samples carrying that label
        by_label = {}
        for position in range(len(self.y)):
            by_label.setdefault(self.y[position].item(), []).append(position)

        per_class_blocks = []
        for members in by_label.values():
            random.shuffle(members)
            usable = len(members) // unit * unit
            # shape (unit, n_units): every column is one unit of this class
            per_class_blocks.append(torch.tensor(members[:usable]).view(unit, -1))

        # rows of the transposed matrix are the units
        units = torch.cat(per_class_blocks, dim=1).transpose(1, 0).tolist()
        random.shuffle(units)
        ordered = [index for group in units for index in group]

        print(f'from dataset sampler: original dataset size {len(self.y)}, resampled dataset size {len(ordered)}. '
              f'sample unit size {self.sample_unit_size}')

        return iter(ordered)

    def __len__(self):
        return self.length


class EnsembleDataset(Dataset):
    """
    Dataset bundling three aligned views of every sample with its label.

    Item `idx` is (x_style[idx], x_char[idx], input_ids, attention_mask, y[idx])
    where the two encodings are x_bert['input_ids'][idx] and
    x_bert['attention_mask'][idx], each wrapped with torch.tensor.
    """

    def __init__(self, x_style, x_char, x_bert, y):
        super().__init__()
        self.x_style, self.x_char, self.x_bert = x_style, x_char, x_bert
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        style_feats = self.x_style[idx]
        char_feats = self.x_char[idx]
        input_ids = torch.tensor(self.x_bert['input_ids'][idx])
        attention_mask = torch.tensor(self.x_bert['attention_mask'][idx])
        return style_feats, char_feats, input_ids, attention_mask, self.y[idx]


class TransformerEnsembleDataset(Dataset):
    """
    Text dataset that encodes every sample with several tokenizers.

    params:
      x           indexable collection of raw texts
      y           labels, converted with torch.tensor
      tokenizers  list of objects exposing batch_encode_plus
      lengths     padding / truncation length for each tokenizer

    Item `idx` is ([encodings of tokenizer 0, encodings of tokenizer 1, ...],
    y[idx]). Encodings are computed on first access and memoized per index.
    """

    def __init__(self, x, y, tokenizers, lengths):
        super().__init__()
        self.x = x
        self.y = torch.tensor(y)
        self.tokenizers = tokenizers
        self.lengths = lengths
        # one memo dict per tokenizer
        self.caches = [{} for _ in tokenizers]

    def tokenize(self, x, i):
        """Encode text `x` with tokenizer `i`, dropping the batch dimension."""
        encoded = self.tokenizers[i].batch_encode_plus(
            batch_text_or_text_pairs=[x],  # the tokenizer API works on batches
            max_length=self.lengths[i],
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_tensors="pt"
        )
        return [field[0] for field in encoded.values()]

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        tokenizer_ids = range(len(self.tokenizers))
        # all caches are filled together, so checking the first one is enough
        if idx not in self.caches[0]:
            for which in tokenizer_ids:
                self.caches[which][idx] = self.tokenize(self.x[idx], which)

        encodings = [self.caches[which][idx] for which in tokenizer_ids]
        return encodings, self.y[idx]

# Loss

In [None]:
def compute_sim_matrix(feats):
    """
    Pairwise cosine similarity of a feature batch.

    `feats` has size (bs, feat_len); the result has size (bs, bs) with entry
    [i, j] being the cosine similarity between rows i and j.
    """
    batch_size = feats.size(0)
    # tiled[i, :, j] == feats[i]; swapping dims 0 and 2 gives feats[j] instead
    tiled = feats.unsqueeze(2).expand(-1, -1, batch_size)
    return F.cosine_similarity(tiled, tiled.transpose(0, 2), dim=1)


def compute_target_matrix(labels):
    """
    Same-label indicator matrix for a label vector of size (bs).

    Returns a float tensor of size (bs, bs) holding 1.0 where the two samples
    share a label and 0.0 elsewhere.
    """
    batch_size = labels.shape[0]
    # by_row[i, j] == labels[i]; its transpose holds labels[j]
    by_row = labels.unsqueeze(-1).expand((batch_size, batch_size))
    by_col = by_row.transpose(0, 1)
    return torch.eq(by_row, by_col).type(torch.float)


def contrastive_loss(pred_sim_matrix, target_matrix, temperature, labels):
    """
    KL divergence between the row-wise softmax of the target matrix and the
    row-wise softmax of the predicted similarity matrix, both divided by
    `temperature` first, averaged over the batch ('batchmean').

    params:
      pred_sim_matrix  predicted similarities, size (bs, bs)
      target_matrix    target similarities, same size
      temperature      softmax temperature applied to both matrices
      labels           unused; kept so existing callers keep working
    """
    # dim=1 is what the implicit-dim softmax resolved to for 2-D input; naming it
    # avoids the deprecation warning. log_softmax replaces softmax(...).log(),
    # which can underflow to log(0) for strongly negative logits.
    log_pred = F.log_softmax(pred_sim_matrix / temperature, dim=1)
    target = F.softmax(target_matrix / temperature, dim=1)
    return F.kl_div(log_pred, target, reduction="batchmean", log_target=False)

# Model

In [None]:
class LogisticRegression(nn.Module):
    """Classification head: dropout -> linear -> LeakyReLU -> dropout -> linear."""

    def __init__(self, in_dim, hid_dim, out_dim, dropout=0):
        super().__init__()
        print(f'Logistic Regression classifier of dim ({in_dim} {hid_dim} {out_dim})')

        layers = [
            nn.Dropout(p=dropout),
            nn.Linear(in_dim, hid_dim, bias=True),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(hid_dim, out_dim, bias=True),
        ]
        # The attribute name `nn` determines the state-dict keys, so it is kept as is.
        self.nn = nn.Sequential(*layers)

    def forward(self, x, return_feat=False):
        """Return the head output for ``x``; with ``return_feat`` also return ``x`` itself."""
        logits = self.nn(x)
        return (logits, x) if return_feat else logits


class BertClassifier(nn.Module):
    """Encoder plus classification head fed with the flattened per-token hidden states."""
    FEAT_LEN = 768

    def __init__(self, raw_bert, classifier):
        super().__init__()
        self.bert, self.fc = raw_bert, classifier

    def forward(self, x, return_feat=False):
        """Classify a tokenized input.

        ``x[0]`` is passed to the encoder as ``input_ids`` and ``x[2]`` as
        ``attention_mask``; ``x[1]`` is not used. With ``return_feat`` the flattened
        ``last_hidden_state`` is returned alongside the head output.
        """
        encoded = self.bert(input_ids=x[0], attention_mask=x[2])
        hidden = encoded.last_hidden_state  # (BS, T, E)
        # The head consumes every token embedding; the pooled output was tried and
        # judged not good for this task.
        logits = self.fc(hidden.flatten(1))
        if not return_feat:
            return logits
        return logits, hidden.flatten(1)


@dataclass
class BertClassiferHyperparams:
    """Size hyperparameters for a BERT classifier.

    NOTE(review): this class is not referenced in the visible part of the notebook, so the
    per-field notes below are inferred from the field names only and need confirming.
    """
    mlp_size: int  # presumably the hidden width of the MLP head -- TODO confirm
    token_len: int  # presumably the number of tokens per sample -- TODO confirm
    embed_len: int  # presumably the encoder embedding width -- TODO confirm


class SimpleEnsemble(nn.Module):
    """
    The simplest ensemble model, ie, averaging the predictions of all components.
    """

    def __init__(self, components):  # components is a list of models
        super(SimpleEnsemble, self).__init__()
        # Register the components (as AggregateFeatEnsemble does). With a plain Python
        # list their parameters are invisible to .parameters(), .cuda()/.to(),
        # .train()/.eval() and state_dict().
        self.components = nn.ModuleList(components)

    def forward(self, inputs):
        """Return the element-wise mean of the component predictions.

        Args:
            inputs: one input per component, in the same order as ``components``.
        """
        assert len(self.components) == len(inputs)
        preds = [model(model_input) for model, model_input in zip(self.components, inputs)]
        return sum(preds) / len(preds)


class FixedWeightEnsemble(nn.Module):
    """
    Learn a fixed set of weights, one scalar per component, shared by all samples.
    """

    def __init__(self, components):
        super(FixedWeightEnsemble, self).__init__()
        # Registered so the components follow .parameters(), .cuda()/.to(), .train()/.eval().
        self.components = nn.ModuleList(components)
        # A bias-free linear layer applied to the constant 1 yields one learnable weight
        # per component.
        self.weights = nn.Linear(1, len(components), bias=False)
        # Kept as a buffer instead of a tensor created with .cuda(): the module can then be
        # built on CPU-only machines and the constant moves together with the module.
        # Non-persistent, so it does not appear in the state dict.
        self.register_buffer('weightsInput', torch.tensor([1], dtype=torch.float), persistent=False)

    def forward(self, inputs):
        """Return the weighted sum of the component predictions.

        Args:
            inputs: one input per component, in the same order as ``components``.
        """
        assert len(self.components) == len(inputs)

        preds = [model(model_input) for model, model_input in zip(self.components, inputs)]
        weights = self.weights(self.weightsInput)  # shape: (number of components,)

        return sum(pred * weight for pred, weight in zip(preds, weights))


class DynamicWeightEnsemble(nn.Module):
    """
    Learn the dynamic weights for different components.

    A small attention MLP maps the concatenated component features of each sample to one
    weight per component (softmax-normalised), and the component predictions are combined
    with those per-sample weights.
    """

    def __init__(self, components, total_feat_len, dropout=0.2, hidden_len=256):
        super(DynamicWeightEnsemble, self).__init__()
        # Registered so the components follow .parameters(), .cuda()/.to(), .train()/.eval().
        self.components = nn.ModuleList(components)
        self.attention = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(total_feat_len, hidden_len, bias=True),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_len, len(components), bias=True),
            # Normalise over the component axis; the implicit dim is deprecated.
            nn.Softmax(dim=1)
        )

    def forward(self, inputs):
        """Return the per-sample weighted sum of the component predictions.

        Each component is called as ``model(input, return_feat=True)`` and must return
        ``(prediction, feature)``; features are concatenated along dim 1.
        """
        assert len(self.components) == len(inputs)

        preds, feats = [], []
        for model, model_input in zip(self.components, inputs):
            pred, feat = model(model_input, return_feat=True)
            preds.append(pred)
            feats.append(feat)

        weights = self.attention(torch.cat(feats, dim=1))  # (bs, number of components)

        # Scale every sample of every prediction by its weight with one broadcast multiply
        # per component, instead of an element-wise in-place Python double loop.
        weighted = []
        for k, pred in enumerate(preds):
            sample_weights = weights[:, k].reshape(-1, *([1] * (pred.dim() - 1)))
            weighted.append(pred * sample_weights)

        return sum(weighted)


class AggregateFeatEnsemble(nn.Module):
    """
    Classify from the concatenated features of all components with one shared MLP head.
    """

    def __init__(self, components, total_feat_len, num_classes, dropout=0.2, hidden_len=256):
        super(AggregateFeatEnsemble, self).__init__()
        self.components = nn.ModuleList(components)
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(total_feat_len, hidden_len, bias=True),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_len, num_classes, bias=True),
        ]
        # The attribute name `nn` determines the state-dict keys, so it is kept as is.
        self.nn = nn.Sequential(*head_layers)
        print(f'aggregate feat ensemble, input feat len {total_feat_len}, hidden size {hidden_len}')

    def forward(self, inputs, return_feats=False, return_preds=False):
        """Run every component and classify the concatenation of their features.

        Returns the head prediction alone when neither flag is set. Otherwise returns a
        list starting with the prediction, followed by the list of component features
        (``return_feats``) and then the list of component predictions (``return_preds``).
        """
        assert len(self.components) == len(inputs)

        # Each component returns a (prediction, feature) pair.
        outputs = [model(model_input, return_feat=True)
                   for model, model_input in zip(self.components, inputs)]
        preds = [pred for pred, _ in outputs]
        feats = [feat for _, feat in outputs]

        pred = self.nn(torch.cat(feats, dim=1))

        if not (return_feats or return_preds):
            return pred

        out = [pred]
        if return_feats:
            out.append(feats)
        if return_preds:
            out.append(preds)
        return out


class EnsembleClassifier(nn.Module):
    """Stack a style, a character and a BERT classifier under one final classifier.

    The final classifier receives the three predictions concatenated with the raw style
    input, the raw character input and the flattened BERT hidden states.
    """
    FEAT_LEN = 768

    def __init__(self, raw_bert, styleClassifier, charClassifier, bertClassifier, finalClassifier):
        super().__init__()
        self.bert = raw_bert
        self.styleClassifier = styleClassifier
        self.charClassifier = charClassifier
        self.bertClassifier = bertClassifier
        self.finalClassifier = finalClassifier

    def forward(self, x, return_feat=False):
        """Classify ``x``, an indexable holding the style input, the character input and
        the two positional arguments for the encoder (``x[2]``, ``x[3]``).

        With ``return_feat`` the flattened BERT hidden states are returned as well.
        """
        style_in, char_in = x[0], x[1]

        style_pred = self.styleClassifier(style_in)
        char_pred = self.charClassifier(char_in)

        bert_feat = self.bert(x[2], x[3]).last_hidden_state.flatten(1)
        bert_pred = self.bertClassifier(bert_feat)

        stacked = torch.cat((style_pred, char_pred, bert_pred, style_in, char_in, bert_feat), dim=1)
        out = self.finalClassifier(stacked)

        return (out, bert_feat) if return_feat else out

# Train

In [None]:
def _evaluate_epoch(model, loader, criterion, temperature, tqdm_on, epoch, acc_label):
    """Run one no-grad pass of ``model`` over ``loader`` and return its meters.

    Returns the ``(accuracy, L1, L2, total loss)`` AverageMeter objects, where L1 is the
    classification loss, L2 the contrastive loss and the total loss equals L1.
    ``acc_label`` is the progress-bar key under which the running accuracy is shown.
    """
    model.eval()
    pg = tqdm(loader, leave=False, total=len(loader), disable=not tqdm_on)
    with torch.no_grad():
        acc_meter = AverageMeter()
        loss_1_meter = AverageMeter()
        loss_2_meter = AverageMeter()
        loss_meter = AverageMeter()
        for x1, x2, x3, y in pg:
            x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
            pred, feats = model(x, return_feat=True)

            # classification
            loss_1 = criterion(pred, y.long())

            # contrastive learning (tracked for logging only)
            sim_matrix = compute_sim_matrix(feats)
            target_matrix = compute_target_matrix(y)
            loss_2 = contrastive_loss(sim_matrix, target_matrix, temperature, y)

            # total loss
            # loss = loss_1 + coefficient * loss_2
            loss = loss_1

            # logger
            acc_meter.update((pred.argmax(1) == y).sum().item() / len(y))
            loss_meter.update(loss.item())
            loss_1_meter.update(loss_1.item())
            loss_2_meter.update(loss_2.item())

            pg.set_postfix({
                acc_label: '{:.6f}'.format(acc_meter.avg),
                'epoch': '{:03d}'.format(epoch)
            })

    return acc_meter, loss_1_meter, loss_2_meter, loss_meter


def train_bert(train_dict, test_dic, tqdm_on, model_name, embed_len, id, num_epochs, base_bs, base_lr,
               mask_classes, coefficient, num_authors, val_dic=None):
    """Fine-tune a BERT/DeBERTa encoder with an MLP head for author classification.

    Only the classification loss is optimised; the contrastive loss is computed and
    logged but not added to the total loss, so ``coefficient`` only affects the
    experiment directory name and the printed configuration.

    Args:
        train_dict, test_dic: tables with a 'content' (text) and a 'Target' (label) column.
        tqdm_on: whether progress bars are shown.
        model_name: pretrained model name; must contain 'bert-base' or 'deberta'.
        embed_len: encoder embedding width; the head input size is ``embed_len * 256``.
        id: experiment id used in the experiment directory and checkpoint names.
        num_epochs: number of training epochs.
        base_bs, base_lr: per-GPU batch size and learning rate; both are multiplied by
            the number of visible GPUs.
        mask_classes: labels excluded from the contrastive term and the training accuracy.
        coefficient: weight of the contrastive term (currently not applied, see above).
        num_authors: number of output classes.
        val_dic: optional validation table with the same columns as ``train_dict``.

    Returns:
        ``(final_test_acc, final_train_preds, final_test_preds)``; the two prediction
        lists are never filled and are returned empty.

    Raises:
        NotImplementedError: if ``model_name`` matches no supported model family.
    """
    print(f'mask classes = {mask_classes}')

    # tokenizer and pretrained model
    if 'bert-base' in model_name:
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        extractor = BertModel.from_pretrained(model_name)
    elif 'deberta' in model_name:
        from transformers import DebertaTokenizer, DebertaModel
        tokenizer = DebertaTokenizer.from_pretrained(model_name)
        extractor = DebertaModel.from_pretrained(model_name)
    else:
        raise NotImplementedError(f"model {model_name} not implemented")

    # update extractor
    for param in extractor.parameters():
        param.requires_grad = True

    has_val = val_dic is not None

    # get dataset
    train_x, train_y = train_dict['content'].tolist(), train_dict['Target'].tolist()
    test_x, test_y = test_dic['content'].tolist(), test_dic['Target'].tolist()

    if has_val:
        val_x, val_y = val_dic['content'].tolist(), val_dic['Target'].tolist()

    # training config
    ngpus, dropout = torch.cuda.device_count(), 0.35
    num_tokens, hidden_dim, out_dim = 256, 512, num_authors
    model = BertClassifier(extractor, LogisticRegression(embed_len * num_tokens, hidden_dim, out_dim, dropout=dropout))
    model = nn.DataParallel(model).cuda()

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=base_lr * ngpus, weight_decay=3e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    train_set = BertDataset(train_x, train_y, tokenizer, num_tokens)
    test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)

    if has_val:
        val_set = BertDataset(val_x, val_y, tokenizer, num_tokens)

    temperature, sample_unit_size = 0.1, 6
    print(f'coefficient, temperature, sample_unit_size = {coefficient, temperature, sample_unit_size}')

    # logger
    exp_dir = os.path.join(ckpt_dir,
                           f'{id}_{model_name.split("/")[-1]}_coe{coefficient}_temp{temperature}_unit{sample_unit_size}_epoch{num_epochs}')
    writer = SummaryWriter(os.path.join(exp_dir, 'board'))

    # load data
    # NOTE: drop_last=True on the evaluation loaders means a trailing partial batch is
    # not evaluated.
    train_sampler = TrainSamplerMultiClassUnit(train_set, sample_unit_size=sample_unit_size)
    train_loader = DataLoader(train_set, batch_size=base_bs * ngpus, sampler=train_sampler, shuffle=False,
                              num_workers=4 * ngpus, pin_memory=True, drop_last=True)
    test_loader = DataLoader(test_set, batch_size=base_bs * ngpus, shuffle=False, num_workers=4 * ngpus,
                             pin_memory=True, drop_last=True)

    if has_val:
        val_loader = DataLoader(val_set, batch_size=base_bs * ngpus, shuffle=False, num_workers=4 * ngpus,
                                pin_memory=True, drop_last=True)

    final_test_acc = None
    # Initialised up front so the final summary also works when no validation set is given.
    final_tv_acc = None
    final_train_preds, final_test_preds = [], []
    best_acc = -1
    best_tv_acc = -1

    # training loop
    for epoch in range(num_epochs):
        train_acc = AverageMeter()
        train_loss = AverageMeter()
        train_loss_1 = AverageMeter()
        train_loss_2 = AverageMeter()

        # decay coefficient
        # coefficient = coefficient - 1 / num_epochs

        # training
        model.train()
        pg = tqdm(train_loader, leave=False, total=len(train_loader), disable=not tqdm_on)
        for i, (x1, x2, x3, y) in enumerate(pg):
            x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
            pred, feats = model(x, return_feat=True)

            # classification loss (computed on the full, unmasked batch)
            loss_1 = criterion(pred, y.long())

            # generate the mask: keep only samples whose label is not in mask_classes
            mask = y.clone().cpu().apply_(lambda x: x not in mask_classes).type(torch.bool).cuda()
            feats, pred, y = feats[mask], pred[mask], y[mask]
            if len(y) == 0:
                # every sample was masked out: the whole step is skipped
                continue

            # contrastive learning
            sim_matrix = compute_sim_matrix(feats)
            target_matrix = compute_target_matrix(y)
            loss_2 = contrastive_loss(sim_matrix, target_matrix, temperature, y)

            # total loss
            # loss = loss_1 + coefficient * loss_2
            loss = loss_1

            acc = (pred.argmax(1) == y).sum().item() / len(y)
            train_acc.update(acc)
            train_loss.update(loss.item())
            train_loss_1.update(loss_1.item())
            train_loss_2.update(loss_2.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            pg.set_postfix({
                'train acc': '{:.6f}'.format(train_acc.avg),
                'train L1': '{:.6f}'.format(train_loss_1.avg),
                'train L2': '{:.6f}'.format(train_loss_2.avg),
                'train L': '{:.6f}'.format(train_loss.avg),
                'epoch': '{:03d}'.format(epoch)
            })

            # iteration logger
            step = i + epoch * len(pg)
            writer.add_scalar("train-iteration/L1", loss_1.item(), step)
            writer.add_scalar("train-iteration/L2", loss_2.item(), step)
            writer.add_scalar("train-iteration/L", loss.item(), step)
            writer.add_scalar("train-iteration/acc", acc, step)

        print('train acc: {:.6f}'.format(train_acc.avg), 'train L1 {:.6f}'.format(train_loss_1.avg),
              'train L2 {:.6f}'.format(train_loss_2.avg), 'train L {:.6f}'.format(train_loss.avg), f'epoch {epoch}')

        # epoch logger
        writer.add_scalar("train/L1", train_loss_1.avg, epoch)
        writer.add_scalar("train/L2", train_loss_2.avg, epoch)
        writer.add_scalar("train/L", train_loss.avg, epoch)
        writer.add_scalar("train/acc", train_acc.avg, epoch)

        # validation (tv stands for train_val)
        if has_val:
            tv_acc, tv_loss_1, tv_loss_2, tv_loss = _evaluate_epoch(
                model, val_loader, criterion, temperature, tqdm_on, epoch, 'train_val acc')

        # testing
        test_acc, test_loss_1, test_loss_2, test_loss = _evaluate_epoch(
            model, test_loader, criterion, temperature, tqdm_on, epoch, 'test acc')

        # logging
        if has_val:
            writer.add_scalar("tv/L1", tv_loss_1.avg, epoch)
            writer.add_scalar("tv/L2", tv_loss_2.avg, epoch)
            writer.add_scalar("tv/L", tv_loss.avg, epoch)
            writer.add_scalar("tv/acc", tv_acc.avg, epoch)

        writer.add_scalar("test/L1", test_loss_1.avg, epoch)
        writer.add_scalar("test/L2", test_loss_2.avg, epoch)
        writer.add_scalar("test/L", test_loss.avg, epoch)
        writer.add_scalar("test/acc", test_acc.avg, epoch)

        # scheduler.step(test_loss.avg)
        scheduler.step()

        print(f'epoch {epoch}, train acc {train_acc.avg}, test acc {test_acc.avg}')

        final_test_acc = test_acc.avg

        # save model: keep only the checkpoint of the best (non-zero) test accuracy so far
        if test_acc.avg and test_acc.avg >= best_acc:
            for cur_model in os.listdir(exp_dir):
                if cur_model.endswith(".pt"):
                    os.remove(os.path.join(exp_dir, cur_model))
            save_model(exp_dir, f'{id}_val{final_test_acc:.5f}_e{epoch}.pt', model)
        best_acc = max(best_acc, test_acc.avg)

        if has_val:
            print(f'epoch {epoch}, train val acc {tv_acc.avg}')
            final_tv_acc = tv_acc.avg
            best_tv_acc = max(best_tv_acc, tv_acc.avg)

    # save checkpoint
    save_model(exp_dir, f'{id}_val{final_test_acc:.5f}_finale{epoch}.pt', model)

    # Flush pending events and release the event file.
    writer.close()

    print(
        f'Training complete after {num_epochs} epochs. Final val acc = {final_tv_acc}, '
        f'best val acc = {best_tv_acc}, best test acc = {best_acc}.'
        f'Final test acc {final_test_acc}')

    return final_test_acc, final_train_preds, final_test_preds

# main

In [None]:
# Experiment: bert-base-cased on the DiffusionDB "label_2" splits, 10 epochs, lr 2e-5.
if __name__ == '__main__':
    # Named `dataset_names` so the module-level `datasets` download mapping is not overwritten.
    dataset_names = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {dataset_names}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=dataset_names)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    # `type=bool` would turn every non-empty string, including 'False', into True.
    parser.add_argument('--tqdm', type=lambda v: str(v).strip().lower() in ('1', 'true', 'yes', 'y'),
                        help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    training_args = [
        '--dataset', 'diffusiondb',
        '--id', 'diffusiondb1000_cls',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '100',
        '--epochs', '10',
        '--model', 'bert-base-cased'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split DiffusionDB data. NOTE: these files are read regardless of --dataset.
    # df = load_dataset_dataframe(source)
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/processed'
    column_names = {'prompt': 'content', 'user_name': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_random100_label_2.csv')[['prompt', 'user_name']].rename(columns=column_names)
    nlp_val = pd.read_csv(f'{data_dir}/val_random100_label_2.csv')[['prompt', 'user_name']].rename(columns=column_names)
    nlp_test = pd.read_csv(f'{data_dir}/test_random100_label_2.csv')[['prompt', 'user_name']].rename(columns=column_names)

    # Compare by value (not identity), and only warn when a non-default count is in effect.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    # nlp_train, nlp_val, nlp_test = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # train: batch size and learning rate depend on the dataset
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        base_bs, base_lr = 8, 1e-5
    elif 'turing' in source:
        base_bs, base_lr = 7, 5e-6
    else:
        base_bs, base_lr = 24, 2e-5

    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=base_bs,
               base_lr=base_lr, mask_classes=mask_classes[args.dataset], coefficient=args.coe,
               num_authors=num_authors, val_dic=nlp_val)

In [None]:
# Experiment: bert-base-cased on the DiffusionDB "label_2" splits, 20 epochs, lr 8e-5.
if __name__ == '__main__':
    # Named `dataset_names` so the module-level `datasets` download mapping is not overwritten.
    dataset_names = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {dataset_names}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=dataset_names)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    # `type=bool` would turn every non-empty string, including 'False', into True.
    parser.add_argument('--tqdm', type=lambda v: str(v).strip().lower() in ('1', 'true', 'yes', 'y'),
                        help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    training_args = [
        '--dataset', 'diffusiondb',
        '--id', 'diffusiondb1000_cls',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '100',
        '--epochs', '20',
        '--model', 'bert-base-cased'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split DiffusionDB data. NOTE: these files are read regardless of --dataset.
    # df = load_dataset_dataframe(source)
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/processed'
    column_names = {'prompt': 'content', 'user_name': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_random100_label_2.csv')[['prompt', 'user_name']].rename(columns=column_names)
    nlp_val = pd.read_csv(f'{data_dir}/val_random100_label_2.csv')[['prompt', 'user_name']].rename(columns=column_names)
    nlp_test = pd.read_csv(f'{data_dir}/test_random100_label_2.csv')[['prompt', 'user_name']].rename(columns=column_names)

    # Compare by value (not identity), and only warn when a non-default count is in effect.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    # nlp_train, nlp_val, nlp_test = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # train: batch size and learning rate depend on the dataset
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        base_bs, base_lr = 8, 1e-5
    elif 'turing' in source:
        base_bs, base_lr = 7, 5e-6
    else:
        base_bs, base_lr = 24, 8e-5

    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=base_bs,
               base_lr=base_lr, mask_classes=mask_classes[args.dataset], coefficient=args.coe,
               num_authors=num_authors, val_dic=nlp_val)

In [None]:
# Experiment: bert-base-cased on the DiffusionDB "label_1" splits, 20 epochs, lr 2e-5,
# trained without a validation split.
if __name__ == '__main__':
    # Named `dataset_names` so the module-level `datasets` download mapping is not overwritten.
    dataset_names = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {dataset_names}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=dataset_names)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    # `type=bool` would turn every non-empty string, including 'False', into True.
    parser.add_argument('--tqdm', type=lambda v: str(v).strip().lower() in ('1', 'true', 'yes', 'y'),
                        help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    training_args = [
        '--dataset', 'diffusiondb',
        '--id', 'diffusiondb100',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '100',
        '--epochs', '20',
        '--model', 'bert-base-cased'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # Load the pre-split DiffusionDB data. NOTE: these files are read regardless of --dataset.
    # df = load_dataset_dataframe(source)
    data_dir = '/content/drive/MyDrive/msc_project/data/diffusiondb/processed'
    column_names = {'prompt': 'content', 'user_name': 'Target'}
    nlp_train = pd.read_csv(f'{data_dir}/train_random100_label_1.csv')[['prompt', 'user_name']].rename(columns=column_names)
    nlp_test = pd.read_csv(f'{data_dir}/test_random100_label_1.csv')[['prompt', 'user_name']].rename(columns=column_names)

    # Compare by value (not identity), and only warn when a non-default count is in effect.
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {args.authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    # nlp_train, nlp_val, nlp_test = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # train: batch size and learning rate depend on the dataset
    if 'enron' in source or 'imdb62' in source or 'blog' in source:
        base_bs, base_lr = 8, 1e-5
    elif 'turing' in source:
        base_bs, base_lr = 7, 5e-6
    else:
        base_bs, base_lr = 24, 2e-5

    # This cell loads no validation split, so none is passed for any dataset. Passing
    # `nlp_val` here would silently reuse a frame left over from an earlier cell.
    train_bert(nlp_train, nlp_test, args.tqdm, args.model, 768, args.id, args.epochs, base_bs=base_bs,
               base_lr=base_lr, mask_classes=mask_classes[args.dataset], coefficient=args.coe,
               num_authors=num_authors)

In [None]:
# Smoke test: push a single training batch through an untrained model and print the
# tensors that feed the contrastive loss. Relies on `nlp_train`, `num_authors`,
# `mask_classes` and `source` left behind by the experiment cell above.
train_x, train_y = nlp_train['content'].tolist(), nlp_train['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, num_authors
train_set = BertDataset(train_x, train_y, tokenizer, num_tokens)
train_sampler = TrainSamplerMultiClassUnit(train_set, sample_unit_size=2)
train_loader = DataLoader(train_set, batch_size=8, sampler=train_sampler, shuffle=False,
                          num_workers=4, pin_memory=True, drop_last=True)
pg = tqdm(train_loader, leave=False, total=len(train_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
model = nn.DataParallel(model).cuda()

debug_temperature = 0.1
# `mask_classes` maps dataset name -> list of masked labels; the mask must be built from
# the list for the current dataset, not from the mapping itself.
masked_labels = mask_classes[source]

for i, (x1, x2, x3, y) in enumerate(pg):
    print(x1.shape)
    x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
    print(y.shape)
    pred, feats = model(x, return_feat=True)
    print(pred.shape)
    print(feats.shape)
    # generate the mask
    mask = y.clone().cpu().apply_(lambda label: label not in masked_labels).type(torch.bool).cuda()
    feats, y = feats[mask], y[mask]
    if len(y) == 0:
        continue

    # contrastive learning
    sim_matrix = compute_sim_matrix(feats)
    print(sim_matrix)
    target_matrix = compute_target_matrix(y)
    print(target_matrix)
    loss_2 = contrastive_loss(sim_matrix, target_matrix, debug_temperature, y)
    # Row-wise distributions that enter the loss (dim given explicitly to avoid the
    # deprecated implicit choice).
    print(F.softmax(sim_matrix / debug_temperature, dim=1).log())
    print(F.softmax(target_matrix / debug_temperature, dim=1))

    break

In [None]:
# Preview the blog dataset; the path is built from the configured `dataset_path`
# instead of repeating the literal directory.
df = pd.read_csv(os.path.join(dataset_path, 'full_blog.csv'))
df.head()

In [None]:
# Load the processed DiffusionDB training split for inspection.
diffusion_train_csv = '/content/drive/MyDrive/msc_project/data/diffusiondb/processed/train_random100_1.csv'
df = pd.read_csv(diffusion_train_csv)
df

In [None]:
# Frequency table of the `limit` most common values in the 'From' column.
# NOTE(review): confirm that the frame currently bound to `df` has a 'From' column.
sender_counts = df['From'].value_counts()
list_spk = pd.DataFrame(sender_counts.iloc[:limit]).reset_index()
list_spk

# Download dataset

In [None]:
# Download the dataset archives into `dataset_path` and unpack the first one.
# `datasets` must be the {archive name: url} mapping from the configuration cell.
if __name__ == "__main__":
    for name, link in datasets.items():
        archive_path = os.path.join(dataset_path, name)
        # Download to the same directory the existence check looks in, so an archive that
        # is already there is neither fetched again nor looked for somewhere else.
        if os.path.exists(archive_path):
            continue
        gdown.download(link, archive_path, quiet=False)

    first_archive = os.path.join(dataset_path, list(datasets.keys())[0])
    # NOTE(review): extractall trusts the member paths stored in the archive; only use it
    # on archives from a trusted source.
    # NOTE(review): extraction still goes to the relative 'datasets' directory, not to
    # `dataset_path` -- confirm which location the loaders expect.
    with tarfile.open(first_archive) as tar:  # closed even if extraction fails
        tar.extractall(path='datasets')

# Analyze the model

## TSNE plot

In [None]:
# tfidf - perplexity 30, iteration 10

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# Keep only the text and the author column and give them the generic names
# used by the rest of this cell.
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# Randomly select 10 authors
selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# .copy() makes the filtered frame independent of the full one, so adding the
# t-SNE columns later does not raise SettingWithCopyWarning.
df = df[df['Target'].isin(selected_authors)].copy()

# Parallel lists of texts and author labels for the dataset wrapper.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

# Tokenizer and pretrained backbone.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Hyper-parameters shared by the dataset wrapper and the classifier head.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_bert-base-cased_coe1_temp0.1_unit2_epoch20/diffusiondb100_val0.77108_e17.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches dropout off so the extracted features are
# deterministic instead of being perturbed by the training-time dropout.
model.eval()

all_feats = []
all_labels = []
# no_grad() stops autograd from recording the forward passes, so no graph is
# kept alive on the GPU while features are collected.
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis.
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: embed the extracted features in 2-D with t-SNE.
# The fit is repeated n_iter times with random_state 0..n_iter-1 and the
# embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        # Not an improvement over the best run so far.
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the winning embedding next to the samples.
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# # Create a color map for authors
# unique_authors = df['user_name'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# # Plot t-SNE results
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     indices = df['user_name'] == author
#     plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# plt.title("t-SNE Visualization of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# # Step 4: Calculate Cluster Centers
# cluster_centers = df.groupby('user_name')[['tsne_1', 'tsne_2']].mean()

# # Plot cluster centers with the same color coding
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# plt.title("Cluster Centers of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own colour.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# NOTE(review): the left x-limit is a fixed -70 here, unlike the data-driven
# tsne_1_min used for the scatter plot above — confirm this is intentional.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to the author's
    # cluster center. pairwise_distances returns an (n, 1) array here; it is
    # flattened so the collected values form a 1-D array, the same shape as
    # the out-cluster distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all
    # points that belong to other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as two overlaid, semi-transparent histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise an array of distances.

    Returns a dict with the keys 'mean', 'median', 'std', 'min', 'max' and
    '25th_percentile', '50th_percentile', '75th_percentile',
    '90th_percentile', in that order, each computed with the matching numpy
    reduction over `distances`.
    """
    summary = {}
    # Plain reductions first, in the order they should appear in the table.
    for label, reducer in (('mean', np.mean), ('median', np.median), ('std', np.std),
                           ('min', np.min), ('max', np.max)):
        summary[label] = reducer(distances)
    # Then the selected percentiles.
    for q in (25, 50, 75, 90):
        summary[f'{q}th_percentile'] = np.percentile(distances, q)
    return summary

# Summarise both distance distributions with the same helper.
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Show the table.
print(stats_df)


In [None]:
# tfidf - perplexity 30, iteration 10

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# Parallel lists of texts and author labels, taken from the `df` that is
# already present in the kernel.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

# Tokenizer and pretrained backbone.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Hyper-parameters shared by the dataset wrapper and the classifier head.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_cls_bert-base-cased_coe1_temp0.1_unit2_epoch20/diffusiondb100_cls_val0.77510_e7.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches dropout off so the extracted features are
# deterministic instead of being perturbed by the training-time dropout.
model.eval()

all_feats = []
all_labels = []
# no_grad() stops autograd from recording the forward passes, so no graph is
# kept alive on the GPU while features are collected.
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis.
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: embed the extracted features in 2-D with t-SNE.
# The fit is repeated n_iter times with random_state 0..n_iter-1 and the
# embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        # Not an improvement over the best run so far.
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the winning embedding next to the samples.
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# # Create a color map for authors
# unique_authors = df['user_name'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# # Plot t-SNE results
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     indices = df['user_name'] == author
#     plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# plt.title("t-SNE Visualization of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# # Step 4: Calculate Cluster Centers
# cluster_centers = df.groupby('user_name')[['tsne_1', 'tsne_2']].mean()

# # Plot cluster centers with the same color coding
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# plt.title("Cluster Centers of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own colour.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# NOTE(review): the left x-limit is a fixed -70 here, unlike the data-driven
# tsne_1_min used for the scatter plot above — confirm this is intentional.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to the author's
    # cluster center. pairwise_distances returns an (n, 1) array here; it is
    # flattened so the collected values form a 1-D array, the same shape as
    # the out-cluster distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all
    # points that belong to other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as two overlaid, semi-transparent histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise an array of distances.

    Returns a dict with the keys 'mean', 'median', 'std', 'min', 'max' and
    '25th_percentile', '50th_percentile', '75th_percentile',
    '90th_percentile', in that order, each computed with the matching numpy
    reduction over `distances`.
    """
    summary = {}
    # Plain reductions first, in the order they should appear in the table.
    for label, reducer in (('mean', np.mean), ('median', np.median), ('std', np.std),
                           ('min', np.min), ('max', np.max)):
        summary[label] = reducer(distances)
    # Then the selected percentiles.
    for q in (25, 50, 75, 90):
        summary[f'{q}th_percentile'] = np.percentile(distances, q)
    return summary

# Summarise both distance distributions with the same helper.
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Show the table.
print(stats_df)


In [None]:
# tfidf - perplexity 30, iteration 10

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# Parallel lists of texts and author labels, taken from the `df` that is
# already present in the kernel.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

# Tokenizer and pretrained backbone.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Hyper-parameters shared by the dataset wrapper and the classifier head.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_val0.78125_e26.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches dropout off so the extracted features are
# deterministic instead of being perturbed by the training-time dropout.
model.eval()

all_feats = []
all_labels = []
# no_grad() stops autograd from recording the forward passes, so no graph is
# kept alive on the GPU while features are collected.
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis.
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: embed the extracted features in 2-D with t-SNE.
# The fit is repeated n_iter times with random_state 0..n_iter-1 and the
# embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        # Not an improvement over the best run so far.
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the winning embedding next to the samples.
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own colour.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# NOTE(review): the left x-limit is a fixed -70 here, unlike the data-driven
# tsne_1_min used for the scatter plot above — confirm this is intentional.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to the author's
    # cluster center. pairwise_distances returns an (n, 1) array here; it is
    # flattened so the collected values form a 1-D array, the same shape as
    # the out-cluster distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all
    # points that belong to other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as two overlaid, semi-transparent histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise an array of distances.

    Returns a dict with the keys 'mean', 'median', 'std', 'min', 'max' and
    '25th_percentile', '50th_percentile', '75th_percentile',
    '90th_percentile', in that order, each computed with the matching numpy
    reduction over `distances`.
    """
    summary = {}
    # Plain reductions first, in the order they should appear in the table.
    for label, reducer in (('mean', np.mean), ('median', np.median), ('std', np.std),
                           ('min', np.min), ('max', np.max)):
        summary[label] = reducer(distances)
    # Then the selected percentiles.
    for q in (25, 50, 75, 90):
        summary[f'{q}th_percentile'] = np.percentile(distances, q)
    return summary

# Summarise both distance distributions with the same helper.
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Show the table.
print(stats_df)


In [None]:
# tfidf - perplexity 30, iteration 10

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# Parallel lists of texts and author labels, taken from the `df` that is
# already present in the kernel.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

# Tokenizer and pretrained backbone.
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Hyper-parameters shared by the dataset wrapper and the classifier head.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_lib_mining_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_lib_mining_val0.78472_e22.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches dropout off so the extracted features are
# deterministic instead of being perturbed by the training-time dropout.
model.eval()

all_feats = []
all_labels = []
# no_grad() stops autograd from recording the forward passes, so no graph is
# kept alive on the GPU while features are collected.
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis.
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: embed the extracted features in 2-D with t-SNE.
# The fit is repeated n_iter times with random_state 0..n_iter-1 and the
# embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        # Not an improvement over the best run so far.
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the winning embedding next to the samples.
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own colour.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# NOTE(review): the left x-limit is a fixed -70 here, unlike the data-driven
# tsne_1_min used for the scatter plot above — confirm this is intentional.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to the author's
    # cluster center. pairwise_distances returns an (n, 1) array here; it is
    # flattened so the collected values form a 1-D array, the same shape as
    # the out-cluster distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all
    # points that belong to other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as two overlaid, semi-transparent histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise an array of distances.

    Returns a dict with the keys 'mean', 'median', 'std', 'min', 'max' and
    '25th_percentile', '50th_percentile', '75th_percentile',
    '90th_percentile', in that order, each computed with the matching numpy
    reduction over `distances`.
    """
    summary = {}
    # Plain reductions first, in the order they should appear in the table.
    for label, reducer in (('mean', np.mean), ('median', np.median), ('std', np.std),
                           ('min', np.min), ('max', np.max)):
        summary[label] = reducer(distances)
    # Then the selected percentiles.
    for q in (25, 50, 75, 90):
        summary[f'{q}th_percentile'] = np.percentile(distances, q)
    return summary

# Summarise both distance distributions with the same helper.
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Show the table.
print(stats_df)


## TSNE plot 1

In [None]:
# tfidf - perplexity 30, iteration 10

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_topicseparate100_label_1.csv')
df = df[['prompt', 'user_label']]
df.columns = ['content', 'Target']

# Randomly select 20 authors
selected_authors = np.random.choice(df['Target'].unique(), size=20, replace=False)
df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Tokens per sample, classifier hidden width, number of output classes.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the feature rows in the same order as the rows of `df`,
# which the positional `df['tsne_*'] = ...` assignment later relies on.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_topic_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_topic_val0.48859_e29.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches off any dropout layers in the wrapped model,
# and no_grad() below stops autograd from recording the forward pass.
model.eval()

all_feats = []
all_labels = []
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        # Labels are never fed to the model, so they stay on the CPU.
        all_labels.append(y.cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the features with t-SNE. One run per seed in range(n_iter);
# the embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not strictly improve on the best divergence so far.
    if not (kl_divergence < best_kl_divergence):
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the winning 2-D coordinates to the frame, one column per t-SNE axis.
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]

# # Create a color map for authors
# unique_authors = df['user_name'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# # Plot t-SNE results
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     indices = df['user_name'] == author
#     plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# plt.title("t-SNE Visualization of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# # Step 4: Calculate Cluster Centers
# cluster_centers = df.groupby('user_name')[['tsne_1', 'tsne_2']].mean()

# # Plot cluster centers with the same color coding
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# plt.title("Cluster Centers of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which Matplotlib deprecated in 3.7 and
# removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Axis limits come from the embedding itself and are shared by both figures below.
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE(review): every analysis cell saves to this same file name, so a later
# cell overwrites the figure written here.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (per-author mean of the 2-D t-SNE coordinates)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Same limits as the scatter plot above. The lower x bound used to be a
# hard-coded -70, which is unrelated to the range of the current embedding.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (measured in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: each point to its own author's center.
    # pairwise_distances returns an (n_points, 1) matrix here; ravel() keeps the
    # accumulated list flat so the final array is 1-D like the out-cluster one.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point to every point of the other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances (two overlaid, semi-transparent histograms)
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers; passed unchanged to the NumPy reductions.

    Returns:
        dict of statistic name -> value, in this order: mean, median, std,
        min, max, then the 25th/50th/75th/90th percentiles.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {label: reduce_fn(distances) for label, reduce_fn in reducers}
    summary.update((label, np.percentile(distances, q)) for label, q in percentiles)
    return summary

# Run the same summary over both distance populations.
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per population, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Plain-text dump of the table.
print(stats_df)


In [None]:
# LCL checkpoint (diffusiondb100, para_topic_1) features - t-SNE with perplexity 30, single run (n_iter = 1)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_topicseparate100_label_1.csv')
# df = df[['prompt', 'user_label']]
# df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=20, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# The data load above is commented out, so `df` must already exist from an
# earlier cell when this one runs.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Tokens per sample, classifier hidden width, number of output classes.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the feature rows in the same order as the rows of `df`,
# which the positional `df['tsne_*'] = ...` assignment later relies on.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_topic_1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_topic_1_val0.48462_e26.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches off any dropout layers in the wrapped model,
# and no_grad() below stops autograd from recording the forward pass.
model.eval()

all_feats = []
all_labels = []
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        # Labels are never fed to the model, so they stay on the CPU.
        all_labels.append(y.cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the features with t-SNE. One run per seed in range(n_iter);
# the embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not strictly improve on the best divergence so far.
    if not (kl_divergence < best_kl_divergence):
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the winning 2-D coordinates to the frame, one column per t-SNE axis.
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]

# # Create a color map for authors
# unique_authors = df['user_name'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# # Plot t-SNE results
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     indices = df['user_name'] == author
#     plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# plt.title("t-SNE Visualization of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# # Step 4: Calculate Cluster Centers
# cluster_centers = df.groupby('user_name')[['tsne_1', 'tsne_2']].mean()

# # Plot cluster centers with the same color coding
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# plt.title("Cluster Centers of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which Matplotlib deprecated in 3.7 and
# removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Axis limits come from the embedding itself and are shared by both figures below.
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE(review): every analysis cell saves to this same file name, so this call
# overwrites the figure written by the previous cell.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (per-author mean of the 2-D t-SNE coordinates)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Same limits as the scatter plot above. The lower x bound used to be a
# hard-coded -70, which is unrelated to the range of the current embedding.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (measured in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: each point to its own author's center.
    # pairwise_distances returns an (n_points, 1) matrix here; ravel() keeps the
    # accumulated list flat so the final array is 1-D like the out-cluster one.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point to every point of the other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances (two overlaid, semi-transparent histograms)
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers; passed unchanged to the NumPy reductions.

    Returns:
        dict of statistic name -> value, in this order: mean, median, std,
        min, max, then the 25th/50th/75th/90th percentiles.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {label: reduce_fn(distances) for label, reduce_fn in reducers}
    summary.update((label, np.percentile(distances, q)) for label, q in percentiles)
    return summary

# Run the same summary over both distance populations.
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per population, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Plain-text dump of the table.
print(stats_df)


In [None]:
# SupCon checkpoint (diffusiondb100, para_topic) features - t-SNE with perplexity 30, single run (n_iter = 1)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_topicseparate100_label_1.csv')
# df = df[['prompt', 'user_label']]
# df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=20, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# The data load above is commented out, so `df` must already exist from an
# earlier cell when this one runs.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Tokens per sample, classifier hidden width, number of output classes.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the feature rows in the same order as the rows of `df`,
# which the positional `df['tsne_*'] = ...` assignment later relies on.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_supcon_coe1_para_topic_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_supcon_coe1_para_topic_val0.48413_e26.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches off any dropout layers in the wrapped model,
# and no_grad() below stops autograd from recording the forward pass.
model.eval()

all_feats = []
all_labels = []
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        # Labels are never fed to the model, so they stay on the CPU.
        all_labels.append(y.cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the features with t-SNE. One run per seed in range(n_iter);
# the embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not strictly improve on the best divergence so far.
    if not (kl_divergence < best_kl_divergence):
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the winning 2-D coordinates to the frame, one column per t-SNE axis.
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]

# # Create a color map for authors
# unique_authors = df['user_name'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# # Plot t-SNE results
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     indices = df['user_name'] == author
#     plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# plt.title("t-SNE Visualization of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# # Step 4: Calculate Cluster Centers
# cluster_centers = df.groupby('user_name')[['tsne_1', 'tsne_2']].mean()

# # Plot cluster centers with the same color coding
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# plt.title("Cluster Centers of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which Matplotlib deprecated in 3.7 and
# removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Axis limits come from the embedding itself and are shared by both figures below.
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE(review): every analysis cell saves to this same file name, so this call
# overwrites the figure written by the previous cell.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (per-author mean of the 2-D t-SNE coordinates)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Same limits as the scatter plot above. The lower x bound used to be a
# hard-coded -70, which is unrelated to the range of the current embedding.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (measured in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: each point to its own author's center.
    # pairwise_distances returns an (n_points, 1) matrix here; ravel() keeps the
    # accumulated list flat so the final array is 1-D like the out-cluster one.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point to every point of the other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances (two overlaid, semi-transparent histograms)
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers; passed unchanged to the NumPy reductions.

    Returns:
        dict of statistic name -> value, in this order: mean, median, std,
        min, max, then the 25th/50th/75th/90th percentiles.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {label: reduce_fn(distances) for label, reduce_fn in reducers}
    summary.update((label, np.percentile(distances, q)) for label, q in percentiles)
    return summary

# Run the same summary over both distance populations.
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per population, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Plain-text dump of the table.
print(stats_df)


In [None]:
# BertGenerationEncoder content encoder (content_encoder_supcon_18.pt) features - t-SNE with perplexity 30, single run (n_iter = 1)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np
from transformers import BertGenerationEncoder

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# The data load above is commented out, so `df` must already exist from an
# earlier cell when this one runs.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# NOTE(review): `extractor` is not used by the model built in this cell; it is
# kept only so the notebook-level name stays defined.
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the feature rows in the same order as the rows of `df`,
# which the positional `df['tsne_*'] = ...` assignment later relies on.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
# model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
model = BertGenerationEncoder.from_pretrained('bert-base-cased')
ckpt_path = "/content/drive/MyDrive/msc_project/model/contrastive/club/content_encoder_supcon_18.pt"
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: eval() switches off any dropout layers in the wrapped model,
# and no_grad() below stops autograd from recording the forward pass.
model.eval()

all_feats = []
all_labels = []
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        # pred, feats = model(x, return_feat=True)
        # The encoder is fed the first and third batch tensors as
        # input_ids / attention_mask; the second is not used here.
        outputs = model(input_ids=x[0], attention_mask=x[2])
        # Flatten everything after the batch dimension into one vector per sample.
        feats = outputs.last_hidden_state.flatten(1)
        all_feats.append(feats.cpu().numpy())
        # Labels are never fed to the model, so they stay on the CPU.
        all_labels.append(y.cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the features with t-SNE. One run per seed in range(n_iter);
# the embedding with the lowest KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not strictly improve on the best divergence so far.
    if not (kl_divergence < best_kl_divergence):
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the winning 2-D coordinates to the frame, one column per t-SNE axis.
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]

# Create a color map for authors.
# plt.get_cmap replaces plt.cm.get_cmap, which Matplotlib deprecated in 3.7 and
# removed in 3.9; it takes the same (name, lut) arguments.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Axis limits come from the embedding itself and are shared by both figures below.
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE(review): every analysis cell saves to this same file name, so this call
# overwrites the figure written by the previous cell.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (per-author mean of the 2-D t-SNE coordinates)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Same limits as the scatter plot above. The lower x bound used to be a
# hard-coded -70, which is unrelated to the range of the current embedding.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (measured in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: each point to its own author's center.
    # pairwise_distances returns an (n_points, 1) matrix here; ravel() keeps the
    # accumulated list flat so the final array is 1-D like the out-cluster one.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point to every point of the other authors.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances (two overlaid, semi-transparent histograms)
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers; passed unchanged to the NumPy reductions.

    Returns:
        dict of statistic name -> value, in this order: mean, median, std,
        min, max, then the 25th/50th/75th/90th percentiles.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {label: reduce_fn(distances) for label, reduce_fn in reducers}
    summary.update((label, np.percentile(distances, q)) for label, q in percentiles)
    return summary

# Run the same summary over both distance populations.
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per population, one row per statistic.
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Plain-text dump of the table.
print(stats_df)


In [None]:
!pip install sentence-transformers

In [None]:
# SentenceTransformer 'bert-base-nli-mean-tokens' embeddings - t-SNE with perplexity 30, single run (n_iter = 1)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np
from sentence_transformers import SentenceTransformer

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# Randomly select 20 authors (drawn from NumPy's global RNG, which the notebook
# seeds once at the top). .copy() makes the filtered frame independent of the
# full one, so the later `df['tsne_1'] = ...` assignments do not trigger
# SettingWithCopyWarning.
selected_authors = np.random.choice(df['Target'].unique(), size=20, replace=False)
df = df[df['Target'].isin(selected_authors)].copy()

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
num_tokens, hidden_dim, out_dim = 256, 512, 100
model = SentenceTransformer('bert-base-nli-mean-tokens')
model = model.cuda()
all_feats = model.encode(test_x, convert_to_tensor=False)

# Step 3: Run t-SNE on the features and keep the run with the lowest KL divergence
n_iter = 1  # Number of t-SNE runs; run i uses random_state=i
best_kl_divergence = float('inf')
best_tsne = None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence = kl_divergence
        best_tsne = X_tsne

# Add the best t-SNE coordinates to the DataFrame (all_feats was encoded from
# df['content'] in row order, so rows line up)
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# Create a color map for authors.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results, one scatter series per author
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE: the other t-SNE cells in this notebook save to the same filename, so each
# run overwrites the previous image.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Use the data-derived x-range (same axes as the scatter plot above) instead of a
# hard-coded lower bound of -70, which would clip centers lying further left.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to that author's center.
    # pairwise_distances against a single center yields an (n_points, 1) array;
    # ravel() flattens it so the list holds scalars and the final array is 1-D,
    # matching the shape of out_cluster_distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all points
    # belonging to other authors
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as overlaid histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric distance values.

    Returns:
        dict mapping statistic name to value, in this order: mean, median,
        std, min, max, then the 25th/50th/75th/90th percentiles.
    """
    # (name, reducer) pairs applied directly to the whole sample
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    # (name, q) pairs evaluated through np.percentile
    quantiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {name: reduce_fn(distances) for name, reduce_fn in reducers}
    for name, q in quantiles:
        summary[name] = np.percentile(distances, q)
    return summary

# Summarise each distance sample with the same set of descriptive statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(sample)
    for sample in (in_cluster_distances, out_cluster_distances)
)

# Tabulate the summaries: one column per distance type, one row per statistic
stats_df = pd.DataFrame(
    dict(zip(('In-Cluster Distances', 'Out-Cluster Distances'),
             (in_cluster_stats, out_cluster_stats)))
)

# Print the summary table
print(stats_df)


## current! - tsne plot

In [None]:
# BertClassifier features from the contrax supcon checkpoint - t-SNE perplexity 30, 1 run

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# Randomly select 20 authors (sampled without replacement from the unique author names)
selected_authors = np.random.choice(df['Target'].unique(), size=20, replace=False)
# .copy() makes the filtered frame independent, so adding the t-SNE columns later
# does not raise pandas' SettingWithCopyWarning.
df = df[df['Target'].isin(selected_authors)].copy()

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False so batches are produced in dataset order
# (presumably the order of test_x / df rows; confirm against BertDataset)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_val0.72321_e29.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Feature extraction is inference: switch to eval mode so dropout is disabled and
# the extracted features are deterministic.
model.eval()

all_feats = []
all_labels = []
# no_grad() avoids building autograd graphs for every batch (less GPU memory)
with torch.no_grad():
    for i, (x1, x2, x3, y) in enumerate(pg):  # for x1, x2, x3, y in train_set:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        all_labels.append(y.cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Run t-SNE on the features and keep the run with the lowest KL divergence
n_iter = 1  # Number of t-SNE runs; run i uses random_state=i
best_kl_divergence = float('inf')
best_tsne = None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence = kl_divergence
        best_tsne = X_tsne

# Add the best t-SNE coordinates to the DataFrame
# (assumes all_feats rows are in the same order as df rows; TODO confirm)
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# Create a color map for authors.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results, one scatter series per author
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE: the other t-SNE cells in this notebook save to the same filename, so each
# run overwrites the previous image.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Use the data-derived x-range (same axes as the scatter plot above) instead of a
# hard-coded lower bound of -70, which would clip centers lying further left.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to that author's center.
    # pairwise_distances against a single center yields an (n_points, 1) array;
    # ravel() flattens it so the list holds scalars and the final array is 1-D,
    # matching the shape of out_cluster_distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all points
    # belonging to other authors
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as overlaid histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric distance values.

    Returns:
        dict mapping statistic name to value, in this order: mean, median,
        std, min, max, then the 25th/50th/75th/90th percentiles.
    """
    # (name, reducer) pairs applied directly to the whole sample
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    # (name, q) pairs evaluated through np.percentile
    quantiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {name: reduce_fn(distances) for name, reduce_fn in reducers}
    for name, q in quantiles:
        summary[name] = np.percentile(distances, q)
    return summary

# Summarise each distance sample with the same set of descriptive statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(sample)
    for sample in (in_cluster_distances, out_cluster_distances)
)

# Tabulate the summaries: one column per distance type, one row per statistic
stats_df = pd.DataFrame(
    dict(zip(('In-Cluster Distances', 'Out-Cluster Distances'),
             (in_cluster_stats, out_cluster_stats)))
)

# Print the summary table
print(stats_df)


In [None]:
# BertClassifier features from the lcl checkpoint - t-SNE perplexity 30, 1 run

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# NOTE: the loading/sampling lines below are commented out, so this cell reuses the
# `df` (columns 'content' and 'Target') left in the kernel by an earlier cell; that
# cell must have been run first.
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=20, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False so batches are produced in dataset order
# (presumably the order of test_x / df rows; confirm against BertDataset)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_val0.73264_e16.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Feature extraction is inference: switch to eval mode so dropout is disabled and
# the extracted features are deterministic.
model.eval()

all_feats = []
all_labels = []
# no_grad() avoids building autograd graphs for every batch (less GPU memory)
with torch.no_grad():
    for i, (x1, x2, x3, y) in enumerate(pg):  # for x1, x2, x3, y in train_set:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        all_labels.append(y.cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Run t-SNE on the features and keep the run with the lowest KL divergence
n_iter = 1  # Number of t-SNE runs; run i uses random_state=i
best_kl_divergence = float('inf')
best_tsne = None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence = kl_divergence
        best_tsne = X_tsne

# Add the best t-SNE coordinates to the DataFrame
# (assumes all_feats rows are in the same order as df rows; TODO confirm)
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# Create a color map for authors. Computed here, from this cell's df, rather than
# relying on `unique_authors` / `color_map` left behind by another cell.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results, one scatter series per author
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE: the other t-SNE cells in this notebook save to the same filename, so each
# run overwrites the previous image.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Use the data-derived x-range (same axes as the scatter plot above) instead of a
# hard-coded lower bound of -70, which would clip centers lying further left.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to that author's center.
    # pairwise_distances against a single center yields an (n_points, 1) array;
    # ravel() flattens it so the list holds scalars and the final array is 1-D,
    # matching the shape of out_cluster_distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all points
    # belonging to other authors
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as overlaid histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric distance values.

    Returns:
        dict mapping statistic name to value, in this order: mean, median,
        std, min, max, then the 25th/50th/75th/90th percentiles.
    """
    # (name, reducer) pairs applied directly to the whole sample
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    # (name, q) pairs evaluated through np.percentile
    quantiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {name: reduce_fn(distances) for name, reduce_fn in reducers}
    for name, q in quantiles:
        summary[name] = np.percentile(distances, q)
    return summary

# Summarise each distance sample with the same set of descriptive statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(sample)
    for sample in (in_cluster_distances, out_cluster_distances)
)

# Tabulate the summaries: one column per distance type, one row per statistic
stats_df = pd.DataFrame(
    dict(zip(('In-Cluster Distances', 'Out-Cluster Distances'),
             (in_cluster_stats, out_cluster_stats)))
)

# Print the summary table
print(stats_df)



In [None]:
# BertClassifier features from the club_try2 style-encoder checkpoint - t-SNE perplexity 30, 1 run

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# NOTE: the loading/sampling lines below are commented out, so this cell reuses the
# `df` (columns 'content' and 'Target') left in the kernel by an earlier cell; that
# cell must have been run first.
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False so batches are produced in dataset order
# (presumably the order of test_x / df rows; confirm against BertDataset)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_try2/style_encoder_supcon_9.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Feature extraction is inference: switch to eval mode so dropout is disabled and
# the extracted features are deterministic.
model.eval()

all_feats = []
all_labels = []
# no_grad() avoids building autograd graphs for every batch (less GPU memory)
with torch.no_grad():
    for i, (x1, x2, x3, y) in enumerate(pg):  # for x1, x2, x3, y in train_set:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        all_labels.append(y.cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Run t-SNE on the features and keep the run with the lowest KL divergence
n_iter = 1  # Number of t-SNE runs; run i uses random_state=i
best_kl_divergence = float('inf')
best_tsne = None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence = kl_divergence
        best_tsne = X_tsne

# Add the best t-SNE coordinates to the DataFrame
# (assumes all_feats rows are in the same order as df rows; TODO confirm)
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]

# Create a color map for authors. Computed here, from this cell's df, rather than
# relying on `unique_authors` / `color_map` left behind by another cell.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results, one scatter series per author
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# NOTE: the other t-SNE cells in this notebook save to the same filename, so each
# run overwrites the previous image.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Use the data-derived x-range (same axes as the scatter plot above) instead of a
# hard-coded lower bound of -70, which would clip centers lying further left.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to that author's center.
    # pairwise_distances against a single center yields an (n_points, 1) array;
    # ravel() flattens it so the list holds scalars and the final array is 1-D,
    # matching the shape of out_cluster_distances.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to all points
    # belonging to other authors
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances as overlaid histograms
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric distance values.

    Returns:
        dict mapping statistic name to value, in this order: mean, median,
        std, min, max, then the 25th/50th/75th/90th percentiles.
    """
    # (name, reducer) pairs applied directly to the whole sample
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    # (name, q) pairs evaluated through np.percentile
    quantiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {name: reduce_fn(distances) for name, reduce_fn in reducers}
    for name, q in quantiles:
        summary[name] = np.percentile(distances, q)
    return summary

# Summarise each distance sample with the same set of descriptive statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(sample)
    for sample in (in_cluster_distances, out_cluster_distances)
)

# Tabulate the summaries: one column per distance type, one row per statistic
stats_df = pd.DataFrame(
    dict(zip(('In-Cluster Distances', 'Out-Cluster Distances'),
             (in_cluster_stats, out_cluster_stats)))
)

# Print the summary table
print(stats_df)


In [None]:
# BertGenerationEncoder features from the club_try2 content-encoder checkpoint - t-SNE perplexity 30, 1 run

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModel,
    EncoderDecoderModel,
    BertGenerationDecoder,
    BertGenerationEncoder,
    BertTokenizer,
    BertModel)

# Load data
# NOTE: the loading/sampling lines below are commented out, so this cell reuses the
# `df` (columns 'content' and 'Target') left in the kernel by an earlier cell; that
# cell must have been run first.
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()
# Show a short preview instead of dumping every prompt / label into the cell output
print(len(test_x), test_x[:5])
print(len(test_y), test_y[:5])

# Color map for authors.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
unique_authors = df['Target'].unique()
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}
print(unique_authors)
print(color_map)


# from transformers import BertTokenizer, BertModel
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# extractor = BertModel.from_pretrained('bert-base-cased')

# num_tokens, hidden_dim, out_dim = 256, 512, 100
# test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
#                              pin_memory=True)

# pg = tqdm(test_loader, leave=False, total=len(test_loader))
# ngpus, dropout = torch.cuda.device_count(), 0.35
# num_tokens, hidden_dim, out_dim = 256, 512, 100
# model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))



ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_try2/content_encoder_supcon_9.pt'

import torch
from transformers import BertGenerationEncoder, BertConfig

# Load the saved state_dict from the .pt file.
# map_location='cpu' makes the load work even when the checkpoint was written
# from a GPU and no GPU is available now; the inference loop below never moves
# the inputs or the model to a GPU, so CPU tensors are what it needs.
state_dict = torch.load(ckpt_path, map_location='cpu')

# Rebuild the encoder with the bert-base-cased configuration.
# NOTE(review): this assumes the checkpoint was trained with that same
# configuration — load_state_dict below will raise on a mismatch.
config = BertConfig.from_pretrained('bert-base-cased')  # or use your specific configuration
model = BertGenerationEncoder(config)

# Copy the checkpoint weights into the freshly initialised model
model.load_state_dict(state_dict)

# The model is now loaded and ready for inference or further training


# model = BertGenerationEncoder.from_pretrained(ckpt_path)
# model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
# model = nn.DataParallel(model).cuda()

# Run the encoder over every batch and collect one flattened feature vector per sample.
# NOTE(review): `pg` is not defined in this cell (its definition is commented out
# above); it is presumably a tqdm-wrapped DataLoader yielding (x1, x2, x3, y)
# batches — confirm it exists in the kernel before running.
model.eval()  # turn dropout off so the extracted features are deterministic

all_feats = []
all_labels = []
with torch.no_grad():  # inference only: no autograd graph is built
    for x1, x2, x3, y in pg:
        # x1 is passed as the model input and x3 as its attention mask; x2 is unused here.
        outputs = model(x1, attention_mask=x3)
        # Flatten everything after the batch dimension into a single vector per sample.
        # .detach() is required before .numpy(): calling .numpy() on a tensor that
        # still requires grad raises a RuntimeError.
        all_feats.append(outputs.last_hidden_state.flatten(1).detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Embed the features in 2-D with t-SNE.
# The embedding is fitted `n_iter` times (run i uses random_state=i) and the run
# with the lowest reported KL divergence is kept.
n_iter = 1  # Number of t-SNE runs
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(perplexity=30, random_state=i, n_components=2)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Only a strictly lower divergence replaces the current best run
    if not (kl_divergence < best_kl_divergence):
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the best embedding as columns of df
df['tsne_1'] = best_tsne[:, 0]
df['tsne_2'] = best_tsne[:, 1]


# Create a color map for authors
# unique_authors = df['Target'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Extent of the embedding along each t-SNE dimension, used as axis limits
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# Scatter every row in t-SNE space, one colour per author
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    author_rows = df.loc[indices]
    plt.scatter(
        author_rows['tsne_1'],
        author_rows['tsne_2'],
        label=author,
        s=10,
        color=color_map[author],
    )

plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
# Clamp the axes to the data range before styling the tick labels
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# Each author's centre is the mean of that author's rows in t-SNE space.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding as the scatter plot
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# Use the data-driven lower bound rather than a hard-coded -70: t-SNE coordinates
# change from run to run, so a fixed limit can clip centres or leave empty space.
# The centres are means of the points, so they always fall inside the data range.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances (all measured in the 2-D t-SNE space)
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster: distance from each of the author's points to the author's centre.
    # pairwise_distances returns an (n, 1) array here; flatten it so scalars are
    # collected and the final array is 1-D, matching out_cluster_distances.
    center = cluster_centers.loc[author].values.reshape(1, -1)
    distances_to_center = pairwise_distances(author_points, center)
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster: for each of the author's points, the mean distance to every
    # point that belongs to a different author.
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Overlay the two distance distributions as semi-transparent histograms
plt.figure(figsize=(10, 6))
for distances, legend_label in (
    (in_cluster_distances, 'In-Cluster Distances'),
    (out_cluster_distances, 'Out-Cluster Distances'),
):
    plt.hist(distances, bins=30, alpha=0.5, label=legend_label)
plt.legend(fontsize=20)

plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Write the figure to disk before displaying it
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with basic descriptive statistics.

    Args:
        distances: array-like of numeric values accepted by numpy reductions.

    Returns:
        dict mapping statistic name to value, in this order: 'mean', 'median',
        'std', 'min', 'max', then the 25th, 50th, 75th and 90th percentiles.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentiles = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {}
    for stat_name, reduce_fn in reducers:
        summary[stat_name] = reduce_fn(distances)
    for stat_name, q in percentiles:
        summary[stat_name] = np.percentile(distances, q)
    return summary

# Summarise both distance samples with the same set of statistics
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# Tabulate the summaries: one column per distance type, one row per statistic
stats_df = pd.DataFrame(dict(zip(
    ('In-Cluster Distances', 'Out-Cluster Distances'),
    (in_cluster_stats, out_cluster_stats),
)))

# Show the table
print(stats_df)


In [None]:
# tfidf - perplexity 30, iteration 10

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModel,
    EncoderDecoderModel,
    BertGenerationDecoder,
    BertGenerationEncoder,
    BertTokenizer,
    BertModel)

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# test_x, test_y = df['content'].tolist(), df['Target'].tolist()
# print(test_x)
# print(test_y)
# unique_authors = df['Target'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}
# print(unique_authors)
# print(color_map)

test_x = ['walking around obi castle town, miyazaki, japan. volumetric lighting, spring early morning, late afternoon, cherry blossom trees, nice weather, few clouds, realistic illustration, perfectly shaded, soft painting, art by krenz cushart and wenjun lin', 'real life Pokemon, cute!!!, fluffy!!!, ultra realistic!!!, golden hour, ultra detailed, sharp focus', 'parked 2 0 2 2 dodge charger srt hellcat, fog, rain, volumetric lighting, beautiful, golden hour, golden ratio, sharp focus, highly detailed, cgsociety', 'Donald Duck as a Star Trek Voyager cast member. highly detailed, 4k, CGI, Photoreal, frame from the tv show.', 'photo taken of an epic intricate, ultra detailed, super realistic gritty, wet, slimy, lifelike sculpture of a nightmarish hellish alien creature with tentacle dreadlocks created by weta workshop for james cameron, zoomed in shots, photorealistic, sharp focus, white wall coloured workshop, cold blueish colour temperature, f 0. 4', 'Tifa Lockheart, intricate, seductive, erotic, tempting, portrait, character photography,', 'regal aristocratic a young!!!, long blond haired tom cruise as the vampire lestat de lioncourt portrait, luxurious indoor setting, atmospheric lighting, painted, menacing, intricate, volumetric lighting, rich deep colours masterpiece, sharp focus, ultra detailed, by leesha hannigan, ross tran, thierry doizon, kai carpenter, ignacio fernandez rios', 'retro futuristic vintage cars in showroom, atmospheric lighting, painted, intricate, volumetric lighting, beautiful, daytime, sunny weather, slight overcast, sharp focus, deep colours, ultra detailed, by leesha hannigan, ross tran, thierry doizon, kai carpenter, ignacio fernandez rios', 'wide angle shot of dilapidated fallout 5 tropical coastal city in real life, desolate, dilapidated, empty streets, nightmarish, some rusted retro futuristic fallout vintage style parked vehicles like cars, buses, trucks, trams, sunny weather, few clouds, volumetric lighting, photorealistic, daytime, 
autumn, sharp focus, ultra detailed, cgsociety', 'majestic gracious regal seductive isis priestess portrait, ancient egyptian, atmospheric lighting, curvy, painted, intricate, volumetric lighting, beautiful, rich deep colours masterpiece, golden hour, sharp focus, ultra detailed, by leesha hannigan, ross tran, thierry doizon, kai carpenter, ignacio fernandez rios', 'fallout 5, charismatic beautiful rugged brunette female protagonist, portrait, outdoors ruined cityscape, atmospheric lighting, painted, intricate, volumetric lighting, beautiful, foggy, daytime, slight overcast weather, sharp focus, deep colours, ultra detailed, by leesha hannigan, ross tran, thierry doizon, kai carpenter, ignacio fernandez rios', 'majestic gracious regal aristocratic blond female vampire portrait, atmospheric lighting, painted, voluptuous, menacing, intricate, volumetric lighting, beautiful, rich deep colours masterpiece, sharp focus, ultra detailed, by leesha hannigan, ross tran, thierry doizon, kai carpenter, ignacio fernandez rios', 'aerith gainsborough in red cottagecore dress, portrait, illustration, rim light, top light, overcast cloudy weather, perfectly shaded, soft painting, art by krenz cushart and wenjun lin', 'close - up photo of a real life smurf, f 1. 
4, garden, golden ratio, rim light, top light, overcast day', 'real life pokemon, cute!!!, happy mood!!!, adorable!!!, fluffy!!!, ultra realistic!!!, golden hour, sharp focus', 'real life Pokemon, creepy!!!, scaly!!!, menacing, evil, ultra realistic!!!, daytime, slight overcast, sharp focus', 'Dream Theater, images and words new cover art', 'battle hardened, overpowering, pragmatic, charismatic character from the animatrix 2, face centered portrait, confident, ruined cityscape, sterile minimalistic room, architecture, fog, volumetric lighting, illustration, perfectly shaded, greenish tinge, cold lights soft painting, art by krenz cushart and wenjun lin', 'a photo of a shiny retro futuristic vintage car parked at deserted scenic viewpoint in alaska, volumetric lighting, serene, epic, beautiful, summer morning dew, sharp focus, ultra detailed, cgsociety', 'silent hill in real life, streets, sombre, parked cars, overcast, blankets of fog pockets, rain, volumetric lighting, beautiful, night time, autumn, sharp focus, 7 0 s visuals, ultra detailed, cgsociety', 'os homens da perna de pau e seus peixes. by marcel caram', 'blade runner. woodchucks in blade runner', 'the scariest image ever seen, by yves tanguy', 'axolotl themed final boss, rendered on a playstation 1', 'meteorite themed cosmic horror, re - imagined as the pepsi logo. city sized clay sculpture in a huge room. billowing clouds. brightest sun barely visible through the light fog', 'deep sea horror by ricardo bofill', 'image in the style of Hugh Ferriss. Black and dark grey. Tall, wide, imposing building in a dramatically lit metropolis. eerie. incomprehensible size.', 'keanu reeves dog hybrid rendered in n 6 4. nintendo 6 4 graphics keanu reeves mixed with a dog', 'a painting by ricardo bofill', 'Photorealistic image of a Rayquaza Elephant hybrid. realism, high definition', 'painterly brush strokes. brutalism hybrid. 
hugh ferriss and john singer sargent', 'planet sized deep sea trench', 'lithograph of a mans face that is made of wires, veins, soil and roots', 'large red bear on a street in new york city in the style of picasso', 'clay sculpture. portrait of a man with a hollow head. instead of a skull, there is a loose wire mesh, with gold liquid spilling out. wire mesh skull with gaps. painted clay sculpture. neon green or pink or yellow colored background, intense lighting and shadows. astonishing detail', 'a picture of a large building in the sky, a matte painting by mike winkelmann, cgsociety, deconstructivism, matte painting, matte drawing, cryengine', 'subtle mist. gargantuan creature off in the distance', 'afrofuturism hologram by magritte', 'an axolotl made of broken glass, astonishing detail. hyper realistic', 'green spiders. abstract collage made of paper, clay, and twine', 'ink illustration detailed portrait of DC black adam, artwork by mike mignola', 'a lomographic photo of old pacific rim ( 2 0 1 3 ) jaeger, standing in typical japanese yard in small town, hikone on background, cinestill, bokeh', 'jeff goldblum anime style, by Hiroyuki Imaishi, studio trigger', 'Metal Gear, Soild Snake, by ashley wood, character design, concept art', "v for vendetta by ashley wood, yoji shinkawa, jamie hewlett, 6 0's french movie poster, french impressionism, black red white colors, palette knife and brush strokes, dutch tilt", "anime key visual concept art of marvel ghost rider, riding a red akira motorcycle, by ashley wood, yoji shinkawa, jamie hewlett, 6 0's french movie poster, french impressionism, vivid colors, palette knife and brush strokes, style of kawacy and makoto shinkai and greg rutkowski", 'key anime visuals of maria de medeiros in a still from the anime your name ( 2 0 1 6 ) directed by makoto shinkai', 'a street skateboarding visual anime, in downtown tampa florida, by studio gainax, studio trigger, detailed, sharp, asymmetrical face, slice of life', 'batman cyber 
ninja animated movie still by kamikaze douga', "tv head anime robot, wearing red hoodie, holding katana, medium portrait by Ashley Wood, Yoji Shinkawa, Jamie Hewlett, 60's French movie poster, French Impressionism, vivid colors, palette knife and brush strokes, paint drips, Dutch tilt, 8k, hd, high resolution print", 'award winning color photo of,  tony hawk, skateboarding, doing 540 in the 1986 vert contest, fisheye lens, detailed faces, detailed skateboard, 8k, balanced composition', 'audrey hepburn, fashion portrait, illustrated by david downtown, color ink', 'attack of the 5 0 foot woman ( 1 9 5 8 ) film as a giant japanese cosplay girl towering over buildings', 'idris elba in a still from the anime your name ( 2 0 1 6 ) directed by makoto shinkai', "flik ( a bug's life 1 9 9 8 ) is a stormtrooper, holding a blaster, in the death star corridor, in a still of star wars episode iv a new hope ( 1 9 7 7 )", 'in the style of frank frazetta, a highly detailed matte portrait painting of conan the barbarian, standing on a mountain holding a sword, by ashley wood, triumph pose, eerie magazine cover, red orange brown colors, impressionism, palette knife and brush strokes, photorealistic, detailed, intricate, 4 k, focused, extreme details, masterpiece', "highly detailed orange blue canti standing triumph after a battle from flcl ( 2 0 0 1 ), hoody, style of ashley wood, yoji shinkawa, jamie hewlett, 6 0's french movie poster, french impressionism, vivid colors, palette knife and brush strokes, grunge aesthetic, dynamic composition", "close up of flik ( a bug's life ), holding a blaster, in a still of star wars episode iv a new hope ( 1 9 7 7 ), by george lucas, death star scifi corridor", 'a close up portrait of Astro boy in the style of Megaman, weapon on a ready looking determined overlooking a cyberpunk city in the background, full face portrait composition, 2D drawing by Mike Mignola, Yoji Shinkawa, flat colors, chiaroscuro lighting', '3 5 mm photo of 1 9 4 0 
batmoblie car designed by norman bel geddes, driving fast on the streets of gotham', 'photograph of a sunset in the beach, windy weather, reflections, white birds flying on sky', 'a mughal era painting of a modern cruise ship in arctic ocean', 'Kpop girl riding a motorcycle, 4k photography', 'hyper realistic photograph of wonder woman chopping trees in a rainforest at night', 'i wanted the fame but not the cover of news week', 'Amy Jackson blowing up blue balloon for ASMR', 'walter white holding the wwe champion belt, art by dean ellis', 'photograph of selena gomez riding a blue sports bike on a crowded street', 'Saul Goodman 3d meme', 'Memes of 2022', 'a flying DeLorean chasing a train, 1980s photorealistic style', 'guard of the kingdom in neptune, matte painting, behance hd', 'portrait of an ancient female assassin in white dress riding a motorcycle, photorealistic digital art', 'photo of marilyn monroe riding a motorcycle at night, cosmic horror, vibrant artwork by ellis dean', 'Ahri from League of Legends, photorealistic style', 'dove camwron blowing white balloon, mountains in background, stock photo', 'D.va from overwatch driving a racecar', 'A police lamborghini chasing a flying motorcycle', 'Luigi in GTA San Andreas', 'portrait of a woman holding a chainsaw at night wearing a jason mask, matte painting, volumetric lighting, post apocalyptic scenario, destroyed mirror world, dense fog in background, daytime', 'a cartoon face of a girl smiling with her eyes closed. she has red glasses, ears with piercings, red lips, eyelashes, pink cheeks, and black hair on a yellow background. the cartoony face is 2 d.', 'A two sided girl. The left side shows her with no changes. The right side however, shows her in a robotic form. 
She has a smiling expression!', 'a photorealistic picture of an evil man with a mask resembling jason voorhees using his own axe to kill a pig in the slaughterhouse', 'a neon sign depicting a rainbow brain with yellow electric bolts coming out of it', "mario as a macy's parade balloon", 'an extremely creepy picture of a bone in the road with blood and other bones which are part of a skeleton', 'an emoji of a yellow smiley face with the eyes becoming green circles with dollar signs, and the smiley face is sticking its tongue out with the tongue being green too and has a dollar sign too, apple emoji', 'a carbon - made peppa pig in real life', 'cave drawings of people eating oreos', 'a comic book featuring peppa pig', 'a giant long haired black - yellow dog and a giant long haired black - red dog', 'boobs', 'an abandoned city hall with the door being covered by some maersk shipping containers', 'A place full of land dinosaurs, along with flying dinosaurs and sea dinosaurs, 2D animation, 2D art, kids artwork', 'a giant, long - haired dog which has raised ears, white - black fur, and a tongue sticking out of it, smiling next to a gas station', 'peppa pig using ak - 4 7', 'a photorealistic image of a man without eyes and with his limbs sliced off, dead, with blood all over him, and some people doing a crime scene', 'a billboard advertisement for an insurance service with an eggplant', 'a flickr screenshot of an abandoned internet cafe with a blue wall mural, and a japanese sign, with artworks of people playing on computers on the mural', 'pixel man nft', 'tracer from overwatch drawn by hajime sorayama accurate, highly detailed', 'zbrush sculpt of female rogue world of warcraft stylized, artstation, character concept art, octane render, unreal engine 5', 'zbrush sculpture of ryu from street fighter trending on artstation S- 3814655743 n-5', 'portrait of strong female chaos angel, beautiful! coherent! 
by frank frazetta, by brom, strong line, deep color, spiked metal armor, maximalist', 'grass seamless texture', 'rafael grassetti figure sculpt zbrush', 'wrath from full metal alchemist as god of war amazing details 8 k beautiful ultra realistic by adam hughes sharp focus cinematic lightning', '3 x 3 array of tileable grass textures, grunge, stylized', 'jinx from league of legends, model, intricate, elegant, highly detailed, ray tracing, digital painting, artstation, concept art, smooth, sharp focus, illustration, art by artgerm and greg rutkowski and alphonse mucha, 8 k', 'warrior', 'digital painting of jaina proudmoore amazing details 8 k beautiful ultra realistic sharp focus cinematic lightning highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration artgerm, tomasz alen kopera, peter mohrbacher, donato giancola, joseph christian leyendecker, wlop, frank frazetta', 'tracer from overwatch as ciri from witche r 5', 'goku in fortnite', 'tracer from overwatch portrait, close up, zbrush artstation concept art, intricate details, highly detailed portrait cinematic lightning, octane render, 8 k hd by artgerm', 'he black and gold geometric mother of death', 'portrait of jaina proudmoore amazing details 8 k beautiful ultra realistic sharp focus cinematic lightning highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration sozomaika', 'team fortress 2 in the style of yoji shinkawa', 'tracer megan fox fine _ detail _ anime _ realistic _ shaded _ lighting _ dramatic _ poster _ by _ ilya _ kuvshinov', 'medieval bikes in the style of h. 
r giger', 'portrait of ciri the witcher 5 in the style of jeehyung lee mirco cabbia amazing details 4 k beautiful ultra realistic sharp focus cinematic lightning highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration, concept art', 'detailed, well-lit studio photo of an elegently dressed young girl who resembles Anya Taylor Joy looking at an elegant, detailed, complex mechanical steampunk brass orrery with a glowing sectioned glass sun, her face illuminated by the glass, elegant highly detailed digital painting artstation smooth sharp focus illustration, by Michael Whelan, James Gurney, John Williams Waterhouse, and Donato Giancola', 'realistic detailed 14-year old girl wearing future cybernetic battle armor by Alphonse Mucha, Ayami Kojima, Amano, Charlie Bowater, Karol Bak, Greg Hildebrandt, Jean Delville, and Donato Giancola, Art Nouveau, Neo-Gothic, gothic, rich deep colors', 'Detailed Interior of a flooded cathedral, light of god, light shafts, candles, stunning atmosphere, in Style of Peter Mohrbacher, cinematic lighting', 'a teenage girl lying on the floor, wearing a nightgown, by Frederic Leighton', 'a realistic face portrait of a teenage girl who looks like Uma Thurmond and Anya Taylor Joy with an anxious expression and parted lips, wearing mechanical robotic battle armor, by John William Waterhouse, Frederic Leighton, Alphonse Mucha, Edward Burne Jones', 'a full body art nouveau portrait of a 16-year old girl who resembles Audrey Hepburn and Saoirse Ronan with a worried, intense gaze and slightly opened mouth, wearing sheer silks and ornate intricate iridescent mother-of-pearl jewelry, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration, art by John William Waterhouse and Bouguereau and Donato Giancola and alphonse mucha', 'a gorfal corfunzel by Hanz Freighly,', 'a realistic portrait of a teenage girl lying on the floor, wearing a nightgown like Flaming 
June, by Frederic Leighton, Alphonse Mucha, Edward Burne Jones', 'realistic detailed 14-year old girl wearing future cybernetic battle armor by Donato Giancola, Art Nouveau, Neo-Gothic, gothic, rich deep colors', 'an early photograph of a steampunk cathedral with god rays from the 19th century', 'an armored lich king screaming and getting up from his throne, by Mike Mignola', 'a full body art nouveau portrait of a fully armored samurai astronaut, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration, art by John William Waterhouse and William Adolphe Bouguereau and Donato Giancola and Alphonse Mucha', 'dramtically lit, high quality studio photo of a girl who looks like 16-year old Audrey Hepburn and Scarlett Johansson, with parted lips and stunning, anxious eyes, wearing a silver satin gown, by Steve McCurry', 'Detailed Interior of a cathedral made of fruit and vegetables, light of god, light shafts, candles, stunning atmosphere, in Style of Peter Mohrbacher, cinematic lighting', 'a full body art nouveau portrait of a 16-year old girl who resembles Emma Watson, Saoirse Ronan and Anya Taylor Joy, ornate intricate golden battle armor, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, illustration, art by John William Waterhouse and greg rutkowski and Donato Giancola and alphonse mucha', 'realistic detailed face portrait of Joan of Arc wearing iridescent armor by Alphonse Mucha, art nouveau', 'realistic detailed stained glass of 16-year old girl who looks like Scarlett Johansson and Audrey Hepburn, as Anna from Frozen by Alphonse Mucha, Louis Comfort Tiffany, Ayami Kojima, Amano, Charlie Bowater, Karol Bak, Greg Hildebrandt, Jean Delville, and Mark Brooks, Art Nouveau, Neo-Gothic, gothic, rich deep colors', 'realistic detailed face portrait of 16-year old Scarlett Johansson as Joan of Arc wearing iridescent armor by Alphonse Mucha, art nouveau', 'a robotic 
tortoise from Horizon Zero Dawn, trending on Artstation, 4k HD', 'well-lit studio photograph of cutaway of the interior of a human cell, showing the nucleus and organelles, made of fruit floating in jello', 'a highly detailed Mark Zuckerberg latex mask by Rick Baker, high quality, Hollywood special effects, makeup', 'movie still of sean connery as gandalf in the lord of the rings, 4 k, high quality', 'painting of rivendell by maxfield parrish, stunning, beautiful, very detailed, waterfalls, elven architecture!!!!!!!!!!!!, 4 k, high quality', 'photo of Richard Stallman using a smartphone', 'detailed, intrincate painting of a biomechanic being painted by Oscar Chichoni, HD, high quality', 'steampunk Batman illustration, high quality, very detailed, dramatic, artstation', 'first-person footage of a skinwalker running towards the camera', 'ryan gosling eating cereal, wearing a black t - shirt, looking at the camera, 8 k, high quality', 'still of Betty White as Agent Smith in The Matrix', 'Anguish by Zdzislaw Beksinski, intrincate, hightly detailed, 4k, high quality', 'Zack de la Rocha jumping on a trampoline', 'a nuclear mushroom cloud inside of a bottle, studio photography, 4 k, high quality', 'a fat orange cat peacefuly floating in outer space', "a train stopped in front of a dead whale that's blocking the train tracks, highly detailed, 4k, high quality", 'a perfect pug - turtle hybrid, new reptile species, 8 k, high quality', 'Donald Trump as a goblin painted by Alan Lee, muted colors, folklore, goblincore, high quality', 'movie still of timothee chalamet as frodo in the lord of the rings, 4 k, high quality', 'a fire tornado destroying everything in its path, 8 k, high quality', 'an AI that creates other AIs, artstation, intrincate, highly detailed, epic', 'Brad Pitt failing to use the microwave, looking frustrated and angry', 'Ancient Greek statue of Donald Trump looking smug', 'Joe Biden wrestling Vladimir Putin, golden hour, in a garden, artstation, by J. C. 
Leyendecker and Peter Paul Rubens,', 'a beautiful woman in a swimsuit lays by the pacific sea, turquoise waters and palm trees, extremely detailed oil painting, sargent and leyendecker, savrasov levitan polenov, bruce pennington, tim hildebrandt, digital art, landscape painting, trending on artstation, masterpiece', 'Fidel Castro as a Capybara', 'actual photograph of UFO hovering over LA, award winning, golden hour,', 'two freaky floating twin nuns', 'ben stiller laughing wearing a yarmulke, award winning portrait, 5 0 mm photo', 'ferrari f 4 0 sports car on a racetrack during an overcast day, realistic 4 k octane beautifully detailed render, 4 k post - processing, highly detailed, intricate complexity, epic composition, magical atmosphere, cinematic lighting, masterpiece, ultra hd', 'Jesus and the Devil playing cards in a garden, photorealistic, award winning, 8k, trending on major art outlets,', 'lisa ann movie star wearing nun robes and habit', 'an old man smoking a pipe, sitting in a wooden rocking chair on a front porch, Whittling a piece of wood, by Norman Rockwell', 'a beautiful day at a tropical pool,colorised,photograph', 'Painting of muscular Elon Musk. Art by william adolphe bouguereau. During golden hour. Extremely detailed. Beautiful. 4K. 
Award winning.', 'tom cruise as bob ross, cinematic still, actively painting, amazing photo', 'a rabbit listening to the radio', 'dolce & gabbana campaign featuring sofia vergara as a cowgirl, long eyeslashes, huge juicy lips, big seductive eyes, unprocessed colors, # nofilter, shot by annie leibovitz, realistic vfx simulation', 'Aphex Twin sitting by a grand piano, background made of large folding curtains, dark, hyper detailed, hyper realistic, 8K phot realistic, black and white color, dimly lit, dark,', 'a clearing in a forest with a cabin, Disney cartoon, animation, high detail, colorful', 'Jesus Christ rising out from a tomb in a cliff side, cinematic perspective, movie shot, 8k, full hd', 'cinematic still, blade runner, roger rabbit in a flying delorean, high quality, futuristic', 'teletubbies dressed in adidas in a rave party', 'a painting of hip hop dancers by edgar degas', 'a guy with an orange cap and black shirt sketching graffiti on a blackbook on a green sofa closed to his friend drinking a beer', 'a car made of donuts', 'picture of darth vader by vivian maier', 'a burning dollar bill', 'mcdonalds commercial with the joker', 'rza playing chess with marcel duchamps in cadaques', 'mugshot of the pope', 'a man saying a secret to a computer', 'type made of cows', 'a nightmare', 'a robbery in a toy store', 'a spray paint can burning', 'couche de soleil sur montagne des alpes a l aquarelle', 'a robot whispering to a man', 'picture of a slave robot drawing smileys', 'daft punk pic by vivian maier', 'peace', 'picture by a microscope of a virus with a smiley', 'fractal horse by giger, partially skeleton, partially robot, deep focus, d & d, dark fantasy, intricate glow accents, elegant, highly detailed, digital painting, artstation, concept art, matte, sharp focus, 8 k 3 d, hearthstone, art by artgerm and greg rutkowski and alphonse mucha', 'chinese princess in a long silk dress, pale, beautiful symmetric face, kissing a black giant dragon, fantasy art, highly 
detailed art, cinematic atmosphere, volumetric lighting, glow, trending on artstation, by wlop, by le vuong, by tom bagshaw', 'fractal lovers by giger, golden ratio, deep focus, d & d, dark fantasy, intricate glow accents, elegant, highly detailed, digital painting, artstation, concept art, matte, sharp focus, octane render, hearthstone, art by artgerm and greg rutkowski and alphonse mucha', 'beautiful sun goddess, in silver armour, full body, porcelain highlighted skin, detailed face with manga style traits, detailed golden hair accessories, glowing pattern on skin, iridescent fractal whirls in flowy hair, passionate pose, intricate, elegant, sharp focus, highly detailed linework, fantasy, concept art, trending on artstation, 3 d 8 k, by artgerm and greg rutkowski, mucha, giger, beksinski, ross tran', 'beautiful sun goddess, in silver flowy dress, porcelain highlighted skin, detailed face with big sad eyes, detailed golden hair accessories, glowing pattern on skin, iridescent fractal whirls in flowy hair, passionate pose, intricate, elegant, sharp focus, highly detailed linework, fantasy, concept art, popart elements, trending on artstation, 3 d 8 k, by artgerm and greg rutkowski, mucha, giger, beksinski, ross tran', 'portrait of 3 women with flowy hair, wings, confident pose, pixie, genshin impact, intricate, elegant, sharp focus, soft bokeh, illustration, highly detailed, concept art, matte, trending on artstation, bright colors, art by wlop and artgerm and greg rutkowski, mucha, giger, marvel comics', 'two ethereal hummingbird goddesses dressed in fractal feathers, beautiful porcelain faces, passionate poses, by kinkade, by giger, shepard fairey, botticelli, john singer sargent, pre - raphaelites, shoujo manga, harajuku fashion, iridescent colors, detailed lineart, delicate glow accents, 8 k 3 d, arnold render', 'beautiful night goddess, in long flowy dress, closeup, porcelain highlighted skin, detailed face with anime style traits, detailed golden hair 
accessories, glowing pattern on skin, with beautiful horse, iridescent fractal whirls, passionate pose, intricate, elegant, sharp focus, highly detailed linework, trending on artstation, 3 d 8 k, by artgerm and greg rutkowski, mucha, giger, beksinski, ross tran', 'death is swallowed up in victory, very detailed and beautiful womans face, screaming with fear, artwork by artgerm, centered shot, wide angle, full body, elfpunk, artwork by naoto hattori, giger, landscape art by john howe', 'beautiful goddess, in silver flowy dress, porcelain highlighted skin, detailed face with big sad eyes, detailed golden hair accessories, glowing pattern on skin, iridescent fractal whirls in flowy hair, passionate pose, intricate, elegant, sharp focus, highly detailed linework, fantasy, concept art, cyberpunk elements, trending on artstation, 3 d 8 k, by artgerm and greg rutkowski, mucha, giger, beksinski, ross tran', 'ultra realistic mermaid princess closeup, gorgeous symmetric face and body, dramatic pose, glowing eyes, blush skin with freckles, long flowy hair, in the middle of arctic desert, sci - fi, fantasy, intricate, elegant, highly detailed, trending on artstation, concept art, smooth, sharp focus, octane render, dramatic volumetric lighting, inner glow, art by tian zi and artgerm and xiaoguang sun and giger, by wlop', 'beautiful moon goddess in long flowy dress, closeup, porcelain highlighted skin, detailed face with anime style traits, iridescent fractal whirls, passionate pose, intricate, elegant, sharp focus, highly detailed linework, trending on artstation, purple glow, 3 d 8 k, by artgerm and greg rutkowski, mucha, giger, beksinski, ross tran', 'a romantic scene of a young angel girl in love with beautiful long flower hair, flowers, 3 d render, hyper realistic, digital painting, fantasy art, beeple, peter mohrbacher, thomas kinkade', 'fractal lovers by giger, ethereal, passionate pose, ethereal, golden ratio, deep focus, d & d, dark fantasy, intricate purple green glow 
accents, elegant, highly detailed, digital painting, artstation, devianart, concept art, matte, sharp focus, octane render, hearthstone, art by artgerm and greg rutkowski and alphonse mucha', '3 / 4 view of a portrait of woman with flowy hair, bird wings, confident pose, pixie, genshin impact, intricate, elegant, sharp focus, illustration, highly detailed, concept art, matte, trending on artstation, bright colors, art by wlop and artgerm and greg rutkowski, mucha, marvel comics', 'ultra realistic nymph princess closeup, gorgeous symmetric face and body, dramatic combat pose, glowing eyes, blush skin with freckles, long red flowy hair, in the middle of arctic desert, sci - fi, fantasy, intricate, elegant, highly detailed, trending on artstation, concept art, smooth, sharp focus, octane render, dramatic volumetric lighting, inner glow, art by tian zi and artgerm and xiaoguang sun and giger, by wlop', 'botticelli eve and giger style adam dressed in fractal smoke in eden techno garden, beautiful porcelain faces, passionate poses, by kinkade, by giger, shepard fairey, botticelli, john singer sargent, pre - raphaelites, harajuku fashion, iridescent colors, detailed lineart, delicate neon glow accents, 8 k 3 d, arnold render', 'redhead muse by botticelli & giger, inspired by pre - raphaelites, shoujo manga and harajuku fashion, made of painted carved high - relief, semi - transparent marble, opaque glass, filament, polished mahoganny wood, intricate detail, dark - blue, light - blue, black, gold, silver, black background, kintsugi, realistic, cinematic lighting', 'demonic man and angelic woman in armour, passionate pose, pixie, renaissance impact, intricate, elegant, golden glow, sharp focus, soft bokeh, illustration, highly detailed, concept art, matte, trending on artstation, pastel colors, 3 d 8 k, art by wlop and artgerm and greg rutkowski, mucha, giger, marvel comics, beksinski,', 'ultra realistic cyborg knight princess portrait, gorgeous symmetric face and body, 
dramatic combat pose, glowing eyes, blush skin with freckles, flowy hair, in the middle of arctic desert, sci - fi, fantasy, intricate, elegant, highly detailed, trending on artstation, concept art, smooth, sharp focus, octane render, dramatic volumetric lighting, inner glow, art by tian zi and artgerm and xiaoguang sun and giger, by wlop', 'beautiful full body portrait of a female cyberpunk gnome black, wearing a fancy velvet tunic, by wlop and artgerm, steampunk! fiction, detailed deep black eyes, starry background, trending, on artstation.', 'photographic portrait by helen levitt of the woman who inspired gioconda, studio lighting, sigma 8 5 mm lens', 'a fully dressed!!! portrait of beautiful ornated hanuman!!!! god with flowing medium hair, soft facial features, kind appearence, digital art by alphonse mucha, inspired by krishen khanna and madhvi parekh, symmetrical body, artgerm, portrait, muted color scheme, highly detailed, outrun art style', 'portrait of sergio aguero with almost no beard by greg rutkowski, young, attractive, highly detailed portrait, scifi, digital painting, artstation, concept art, smooth, sharp foccus ilustration, artstation hq', 'very detailed portrait of a rugged man in his early thirties, strong jaw, deep black eyes, latino features, wearing a black!! t - shirt, earthy color scheme, by wlop and krenz cushart and artgerm, 9 0 s style, detailed eyes, starry background, trending, on artstation.', "a sculpture of michelangelo's david, realistic charcoal portrait by frank auerbach", "a sculpture of a winged angel's torso, very detailed charcoal portrait by frank auerbach", '! 
dream dakini as a modern fairy wearing a pink outfit, flying in the style of superman alongside penguins.', 'portrait human - giraffe hybrid, scaley black onyx skin', 'highly detailed, semi - realistic contemporary digital illustration, colored contours, well shaded, portrait of king arthur leaning over a table', 'high fantasy portrait of a red - headed early thirties female fairy, pixie, fae, imp, sprite in vctorian double bun hairstyle and sparkly, gleam baby pink coral outfit, elegant, intricate, highly detailed, smooth, sharp focus, ethereal, misty, fireflies in the backdrop. octane render, pastel color scheme, by hayao miyazaki.', 'award winning full body portrait of a beautiful ornated hanuman god, leaping!!!!!, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, digital illustration, art by krenz cushart and artem demura and alphonse mucha', 'apteryx mantelli', 'portrait of bald sergio aguero in mid thirties with gray designer stubble!!!!!!! by greg rutkowski, attractive, highly detailed portrait, scifi, digital painting, artstation, concept art, smooth, sharp foccus ilustration, artstation hq', 'very detailed portrait of a rugged brazilian man in his early thirties, in profile, strong jaw, clean face, light stubble!!! 
( ( deep black eyes ) ), detailed, ( very slight asian features ), ( ( ( strong latino features ) ) ), sharp nose pointing down, pastel color scheme, by tyler oulton, starry background, trending, on artstation.', 'if paris was built on the moon', 'a lone indigenous man overlooking a ledge towards the jungle below in dense amazon | highly detailed | very intricate | cinematic lighting | by asher brown durand and eddie mendoza | featured on artstation', 'award winning watercolor of a 3 0 year old auburn - headed fairy in short pigtails wearing a sparkly baby pink swimsuit with blue translucent dragonfly wings, against a cloudy blue sky backdrop, by hayao miyazaki', 'hanumanasana yoga posture, stretching one leg forward and the other straight back, arms stretched above the head, and the palms joined together', 'beautiful profile painting, highly detailed of lord hanuman, the monkey god, doing a front split', 'dwayne johnson playing yao ming in a biopic film, basketball scene, wide shot', 'Coach Belichick studying a football playbook while on vacation at the Eiffel tower', 'Dwayne Johnson as a crusty sea captain', "film still from the new netflix fantasy adventure movie'chrono trigger'( 2 0 2 2 )", 'The President of the United States shaking hands with a sinister grey space alien, official portrait', 'a large sea bird covered in trash and filth', 'charlemagne as a new street fighter character, screenshot, character select', "comic book cover of'jfk meets the underground mole people ', art by alex ross", 'grainy photo of erin esurance as a creepy monster in a closet, harsh flash', 'dwayne johnson as john madden', 'a photo of a house burning down in the background and mr. 
bean with an eerie expression in the foreground, strong depth of field', 'Coach Belichick in Edo Period Japan trying to teach samurai how to play football', 'peyton manning as a new street fighter character, screenshot, character select', 'andy reid as doctor who, 1 9 7 0 s, wide shot', 'Optimus Prime as a football coach', 'patrick stewart as usain bolt', 'john madden in interstellar', "still image from the new studio ghibli animated film'coach tomlin wins the superbowl'", "still image from the new studio ghibli animated film'the crying of lot 4 9'", 'john madden as ian malcom from jurassic park', 'beautiful coherent award-winning manga OVA DVD cover art of a mysterious lonely cyborg anime woman wearing a plugsuit, serial experiments lain, neon genesis evangelion, anime, animated, painted by tsutomu nihei', 'beautiful coherent award-winning manga cover art of a mysterious lonely anime woman wearing a plugsuit and traversing an endless concrete hallway, by tsutomu nihei', 'beautiful portrait of anthony fantano, theneedledrop, standing in desolate empty brutalist ruins desert wasteland, close to the camera, painted by zdzislaw beksinski', 'highly detailed professional seinen manga cover art of goth woman with red hair, red eyes, leather clothes, black makeup. chunibyo. horror action cyberpunk action manga cover promotional art. detailed intricate environment. pencils by ilya kuvshinov, painted by zdzislaw beksinski, inks & layouts by tsutomu nihei. blame!', 'highly detailed professional portrait of 9 0 s seinen manga art of goth woman with red hair, black makeup, and red eyes. chunibyo. drawn by', 'a desolate landscape with a lonely looming brutalist tower in the center, drawn by tsutomu nihei', 'highly detailed professional late 2 0 0 0 s shonen manga cover art of goth woman with red hair, red eyes, leather clothes, black makeup. chunibyo. horror cyberpunk action manga cover promotional art. detailed and intricate environment. 
pencils by ilya kuvshinov and painted by zdzislaw beksinski, inked by tsutomu nihei', 'a creepy cell phone camera picture of an alleyway in west philadelphia at night, with a woman in the distance.', 'beautiful! coherent! detailed! expert! professional portrait art of a goth clowngirl, painted by ilya kuvshinov!!! and zdzislaw beksinski', 'beautiful! coherent! detailed! expert! professional manga seinen portrait art of an emo goth jester clowngirl, painted by ilya kuvshinov!!! and designed by tsutomu nihei and zdzislaw beksinski', 'a portrait of a depressed girl made in a magazine clipping collage style, made by a depressed art student, art project', 'professionally drawn 9 0 s seinen seinen seinen mature cyberpunk horror action manga comic cover, full color, beautifully drawn coherent professional, drawn by ilya kuvshinov, ilya kuvshinov, and hiromu arakawa and tsutomu nihei. japanese script on the cover. stern woman in foreground. award - winning manga.', 'a desolate landscape dotted with brutalist complexes, drawn by tsutomu nihei,', 'a beautiful professional portrait of mc ride, painted by tsutomu nihei', 'professionally drawn seinen mature cyberpunk detective horror action manga comic cover, full color, beautifully drawn coherent professional, drawn by ilya kuvshinov!, satoshi kon, kentaro miura, dave mckean, tsutomu nihei. japanese script kanji hiragana on the cover. minimalist stylized cover art. indigo blue cel shaded', 'a portrait of dril painted by zdzislaw beksinski', '9 0 s seinen horror manga cover art', 'professionally drawn shoujo mature horror mystery romance manga comic cover, beautifully drawn museum portrait coherent professional, drawn by ilya kuvshinov, gustav klimt, alphonse mucha and tsutomu nihei. japanese script kanji hiragana on the cover. simplistic minimalist stylized cover art. 
pink & green & blue full color.', 'adorable film still of a piece of garlic, produced by studio ghibli', 'professionally drawn shoujo mature cyberpunk horror romance manga comic cover full color, beautifully drawn coherent professional, drawn by ilya kuvshinov, ilya kuvshinov!, dave mckean, alphonse mucha and tsutomu nihei. japanese script kanji hiragana on the cover. simplistic minimalist stylized cover art. pink & purple & blue full color.', 'close up of nicolas cage staring deep into the camera, tears pouring down his face, big toothy smile, fisheye lens', 'Abraham Lincoln wearing a flat brim cap backwards, taking a hit from a Juul e-cigarette, portrait photography by Lee Jeffries', 'elderly gene wilder playing gandalf, still from lord of the rings ( 2 0 0 3 )', 'still of gollum from lord of the rings devouring a juicy delicious burger', 'cinestill of a large blonde labradoodle skiing down a snowy mountain', 'cinestill of a blonde labradoodle working in a cubicle, wearing glasses and a dress, photograph by annie leibovitz', 'cinematic shot of adorable black kitten walking through neon cyberpunk neo - tokyo, studio ghibli, hayao miyazaki, anime, detailed', 'still of a woman riding on a giant dachshund, sports photography, action photography', 'cinestill of jfk putting on a helmet', 'still of scarlett johansson staring angrily into the camera, close - up shot of eyes, by annie leibovitz, kodak portra 4 0 0, 5 0 mm f / 1. 
8', 'action shot of obese people swimming in gravy pool, hot brown thick gravy, sports photography, espn, summer olympics', 'colorful concept art of gandalf fighting the balrog, rodel gonzalez, marc davis, milt kahl, jim warren, don bluth, rob kaz, glen keane, jason deamer', 'movie still of evan rachel wood pointing gun at young harrison ford in blade runner ( 1 9 8 2 ), portrait, film grain, atmospheric lighting, action shot, low profile, wide angle', 'matte painting of epic fantasy landscape with golden hour lighting, colorful concept art by ted nasmith, john howe, alan lee, paul raymond gregory, inger edelfeldt, tim kirk, angus mcbride, jenny dolfen, high fantasy, trending on artstation', 'idyllic underwater scene with sunbeam shining through water, oil painting, Impressionism, in the style of Claude Monet, 4K, trending on ArtStation', 'faded daguerreotype portrait of disturbing haunted demonic abomination clown body horror', 'movie still of evan rachel wood talking to young harrison ford in blade runner ( 1 9 8 2 ) by ridley scott, portrait, film grain, atmospheric lighting, action shot', 'gandalf flying on a giant eagle over mount doom while it erupts lava, vivid concept art by ted nasmith, john howe, alan lee, paul raymond gregory, inger edelfeldt, tim kirk, angus mcbride, jenny dolfen, high fantasy, trending on artstation', 'still of jfk dodging bullets as neo in the matrix ( 1 9 9 9 )', 'faded daguerreotype of a creepy old doll, uncanny valley, disturbing', 'Evolve Squirtle into Charizard', 'Cat Women crouches on edge of building, in style of Mark Brooks and Artgerm', 'Hyperdetailed digital artwork concept art of Night island and the aurora borealis reflected in the dark sapphire water in style of Albert Bierstadt and Jim Burns, 4k resolution post-processing, Octane Render, Unreal Engine 5, Global Illumination, smooth, epic composition, cinematic shot', 'Poison Ivy entangled with Vines, in style of Mark Brooks and Artgerm', 'Beautiful Elven Princess 
Portrait, fantasy style, intricate', 'Cyberpunk Android design drawn by a manga artist, in style of Akira Toriyama and Gustave Dore, cinematic shot', 'a beautiful painting of a building in a serene landscape by Adonna Khare', 'a beautiful painting of a A paradisiacal landscape of a sea lagoon and city built on water, rays of light illuminating the water by John Howe, Trending on Artstation, Landscape vista', 'a beautiful painting of a building in a serene landscape by Alexander Milne Calder', 'a portrait of princess jasmine, in style of Bowater Charlie and Chausheva Katia', 'a portrait of princess jasmine, in style of Bowater Charlie and Krentz Cushart', 'plasma body, anime spectral female character, emerge from big old creepy tree, mist aura, black eyes melt, full body portrait, photorealistic, volumetric lighting, octane rendering, dark and mysterious, atmospheric, ominous, creepy, cinematic, real, concept art, Epic, 8k, 4k, ultra detail, ultra realistic, trading art station, rendered by awesomeness', 'Athene Roman Statue, Intricate Details, 8k resolution', 'A Beautiful digital artwork of the A bunch of goblins in the basket of a travel balloon, war and battle, in style by Dan Mumford, Cyril Rolando and M.W Kaluta, 8k resolution, Ultrafine details, Rendered in Unreal Engine 5, Cinematic Composition, Reimagined by industrial light and magic, smooth,4k, beautiful lighting, HDR, IMAX, Cinema 4D, shadow depth', 'a beautiful painting of a building in a serene landscape by Anton Pieck', 'multi dimension tesseract hybrids with impossible non Euclidian geometry', 'philippina woman tattoos on body dramatic beach cinematic photorealistic sunset', 'a beautiful painting of a building in a serene landscape by Anton Otto Fischer', "a painting of a woman's face with a sky background, a comic book panel by makoto shinkai, featured on pixiv, crystal cubism, stained glass, made of glass, official art", 'strange horror house by junji ito, hugh ferriss, lee madgwick, alex grey and 
gustave dore ; spiralled blood red and smoke black art nouveau architecture ; in the style of gothic art, elaborate horror house by wes benscoter, weird, beautiful, gorgeous, incredible depth, concept art, lifelike, photorealistic, imposing, evil, biblical hell, 8 k resolution, hyperrealism, detailed painting, deviantart, trending on artstation, unreal engine', 'jared leto in the style of junji ito', 'karkalicious', 'fingers in a blender', 'negative film portrait of a woman', 'goth girlfriend', 'short spear with a spike at the bottom of the handle', 'dave strider in the style of ancient egyptian artwork', 'photograph of a human heart laying in a forest full of dead trees', 'bloodstained sakura flowers', 'humanoid barn owl wearing a roman toga and holding a javelin', 'man covered in teeth', 'morbius in ancient rome', 'jared leto on the cover of the weezer blue album', 'eyes outside your window at night', 'lin manuel miranda in the style of junji ito', 'elder scrolls 6 new game leaked footage', 'jared leto in outer space', 'frog wedding', 'train wearing a suit', 'Biblically accurate muppet in stained glass at a church being worshipped', 'thief, dagger, leather armor, full body, hyper realistic, extremely detailed, dnd character art portrait, dark fantasy art, intricate fantasy painting, dramatic lighting, vivid colors, deviantart, artstation, by edgar maxence and caravaggio and michael whelan and delacroix.', 'fantasy illustration of a manticore at a tea party with a gnome wizard and and ogre knight of the realm.  Setting is a forest.  Table is a tree stump with a kettle on top', "enchanted forest. rocky opening gaping hole in the ground ( 1 5'across, 2 0'deep ). misty waterfall ( stream cascades underground ). rocky sides ( lush patches of moss and ferns ). rough stone stairway ( cut into the hole's rocky side, leading down ). 
edgar maxence and caravaggio and michael whelan and delacroix style, artistic, intricate painting, cinematic lighting, hyper realistic, extremely detailed, vivid colors, establishing shot, dramatic lighting", 'barbarian, full body, savage, realistic, dnd character art portrait, dark fantasy art, matte fantasy painting, deviantart artstation, by jason felix by steve argyle by tyler jacobson by edgar maxence and caravaggio and michael whelan and delacroix', 'pinhole black and white photo of a victorian living room with costumed people dancing', 'stone sarcophagus. dnd, dark fantasy art, intricate fantasy painting, dramatic lighting, vivid colors, deviantart, artstation, by edgar maxence and caravaggio and michael whelan and delacroix.', 'fantasy illustration of giant mutant frogs', 'a painting by edgar maxence and caravaggio and michael whelan and delacroix style, artistic, intricate painting, cinematic lighting, hyper realistic, extremely detailed, vivid colors, establishing shot, dramatic lighting', 'elf bard playing lute, full body, hyper realistic, extremely detailed, dnd character art portrait, dark fantasy art, intricate fantasy painting, dramatic lighting, vivid colors, deviantart, artstation, by edgar maxence and caravaggio and michael whelan and delacroix.', 'gelatinous cube, realistic, dnd monster, dark fantasy art, matte fantasy painting, deviantart artstation, by jason felix by steve argyle by tyler jacobson by peter mohrbacher by paul hedley, cinema,', 'wizard casting thunderwave, dnd character art portrait, dark fantasy art, deviantart, artstation, by edgar maxence and caravaggio and michael whelan and delacroix.', 'tiefling bard, full body, hyper realistic, extremely detailed, dnd character art portrait, dark fantasy art, intricate fantasy painting, dramatic lighting, vivid colors, deviantart, artstation, by edgar maxence and caravaggio and michael whelan.', "the small gnome's garb is a riot of color, embroidered in gold thread. 
atop his head, a triangular hat of green felt sits at a rakish angle. emerald eyes gleam with mirth, and his smiling face is framed by a tangle of black hair. his brass instruments - - hand - bells and a panpipe", "jim henson's labyrinth. hedge maze. orchardlush orchard, packed with strange fruit trees. fairies flit from tree to tree, fighting with hummingbirds for nectar. 4 night trolls are beneath the largest tree. edgar maxence and caravaggio and michael whelan and delacroix style, artistic, intricate painting, cinematic lighting, hyper realistic, extremely detailed, vivid colors, establishing shot, dramatic lighting", 'druid. young man with braided brown hair. wildflowers in his hair. amber eyes. leather armor, and a longbow. wooden staff carved with strange symbols. edgar maxence and caravaggio and michael whelan and delacroix style, artistic, intricate painting, cinematic lighting, hyper realistic, extremely detailed, vivid colors, establishing shot, dramatic lighting', 'pinhole camera photo of a witch flying on a broomstick', 'spelljammer galleon sailing the galaxy by clyde caldwell', 'gnome barbarian, full body, hyper realistic, extremely detailed, dnd character art portrait, dark fantasy art, intricate fantasy painting, dramatic lighting, vivid colors, deviantart, artstation, by clyde caldwell and krenz cushart and artem demura and john williams waterhouse', 'black and white illustration by erol otus the pack of kobolds is crouched in a circle.', 'harpy, dnd character art portrait, dramatic lighting, vivid colors by edgar maxence and caravaggio.', 'detailed hipster skinny man wearing big vr head set, long vibrant beard, dmt, by james gurney + intricate and vibrant work + portrait + trending on artstation + incredible gothic illustration + exquisite detail', 'green dragon detailed pixel art', 'arabian hassan as - sabbah on throne chair rising as a phenix + epic wide scene, cinematic lighting, artgerm, artstation, deviantart, 8 k, high detailed', 'rabbit 
groot as marble statue with sunglasses, blue sunglasses, in red background, soft blue texture, blue realistic 3 d render, high blue lights, 4 k, high detailed photography cape, 5 0 mm lens, rich blue colors, smooth gradients, depth of field, cinematic, hyper realism, high detail, octane render, unreal engine, 8 k very red colors, cape', 'detailed hipster skinny man wearing htc vive headset, long vibrant beard, dmt, by james gurney + intricate and vibrant work + portrait + trending on artstation + incredible gothic illustration + exquisite detail', 'detailed hipster skinny man with sunglasses, long beard with fires, dmt, by james gurney + intricate and vibrant work + portrait + trending on artstation + incredible gothic illustration + exquisite detail', 'detailed umm kulthum golden statue, by james gurney + intricate and vibrant gold line work + tarot card + mandelbulb fractal + full of black layers + portrait + trending on artstation + incredible gold and black gothic illustration + exquisite detail', 'detailed hipster skinny man with! vr headset!, long vibrant beard, dmt, by james gurney + intricate and vibrant work + portrait + trending on artstation + incredible gothic illustration + exquisite detail', 'rabbit groot as marble statue, red sunglasses, in red background, soft red texture, red realistic 3 d render, soft red lights, 4 k, high red photography red, 5 0 mm lens, rich red colors, smooth gradients, depth of field, cinematic, hyper realism, high detail, octane render, very red, 8 k, very red colors', 'detailed image of Anaximander by Ayami Kojima, Amano, Karol Bak, Greg Hildebrandt, and Mark Brooks, rich deep purple colors. Beksinski painting, part by Adrian Ghenie and Gerhard Richter. art by Takato Yamamoto. masterpiece . 
intricate artwork by Tooth Wu and wlop and beeple, greg rutkowski, very coherent symmetrical artwork, cinematic, hyper realism, high detail, octane render, unreal engine, 8k, Vibrant colors, Smooth gradients, High contrast, depth of field. by Katsuhiro Otomo, full body character drawing, inspired by Evangeleon, clean ink detailed line drawing, intricate detail, extremely detailed. painting by Arthur Rackham, Eugene de Blaas, Frederic Leighton', 'detailed mighty dmt goddess, by hokusai and james gurney + black paper with intricate and vibrant dmt line work + tarot card + mandelbulb fractal + full of silver layers + portrait + trending on artstation + incredible dmt and black gothic illustration + exquisite detail', 'unicorn wearing vr headset, vr headset in techno background, soft gradient texture, realistic 3 d render, high lights, 4 k, high detailed photography, 5 0 mm lens, rich vivid colors, smooth gradients, depth of field, cinematic, hyper realism, high detail, octane render, unreal engine, 8 k', 'cairo old streets + night life of 1 9 4 0, muizz street + egyptian muslim girl wearing egyptian hijab', 'a detailed fantasy character portrait of morgan freeman as saudi arab king by lauri blank, artgerm, evelyn de morgan, 8K, 50mm lens', 'detailed image of bots by Ayami Kojima, Amano, Karol Bak, Greg Hildebrandt, and Mark Brooks, rich deep universe colors. Beksinski painting, part by Adrian Ghenie and Gerhard Richter. art by Takato Yamamoto. masterpiece . intricate artwork by Tooth Wu and wlop and beeple, greg rutkowski, very coherent symmetrical artwork, cinematic, hyper realism, high detail, octane render, unreal engine, 8k, Vibrant colors, Smooth gradients, High contrast, depth of field. by Katsuhiro Otomo, full body character drawing, inspired by Evangeleon, clean ink detailed line drawing, intricate detail, extremely detailed. 
painting by Arthur Rackham, Eugene de Blaas, Frederic Leighton', 'detailed portrait of pirate cat as claw video game, hyper detailed, digital art, trending in artstation, cinematic lighting, studio quality, smooth render, unreal engine 5 rendered, octane rendered, art style by klimt and nixeu and ian sprigger and wlop and krenz cushart', 'rabbit groot as marble statue with sunglasses, blue sunglasses, in red background, soft blue texture, blue cape, blue realistic 3 d render, high blue lights, 4 k, high detailed photography, 5 0 mm lens, rich blue colors, smooth gradients, depth of field, cinematic, hyper realism, high detail, octane render, unreal engine, 8 k', 'very detailed portrait of skull with pearl spheres glowing eyes + melting face skin as candles + 4 k hyper details render + dramatic lighting + cinematography photography', 'face bleeding gold liquid, realistic 3 d render, gold tones, global illumination', 'under water bugs bunny, water light scattering, underwater photography, high details, 8 k, realistic shot, cinematic lighting', 'Movie still of Diane Kruger in The Expanse', 'Sheryl Sandberg in Got Milk? 
Ad', 'Mark Zuckerberg marble statue by Michelangelo, Metropolitan Museum of Fine Art, 24mm f/1.4', 'Film still of Sheryl Sandberg in SoulCycle (2017), directed by Steven Spielberg', 'Movie still of Mark Zuckerberg taking cover on Omaha Beach in Saving Private Ryan, establishing shot, smoke, gritty, action photo', 'CEO Tony Haile', 'Photo of Emma Watson in swimsuit, soft studio lighting, photo taken by Julia Margaret Cameron for Abercrombie and Fitch, award-winning photograph, 24mm f/1.4', 'Photo of Angelina Jolie wearing Warby Parker glasses, soft studio lighting, photo taken by Martin Schoeller for Abercrombie and Fitch, award-winning photo, 24mm f/1.4', 'Sheryl Sandberg at SoulCycle, Instagram photo', 'Portrait photo of dominatrix Sheryl Sandberg, 85mm f/1.4', 'taylor swift emoji', 'Sheryl Sandberg as Overly Attached Girlfriend', 'Photo of Joe Biden in swimsuit, soft studio lighting, photo taken by Anne Liebovitz for Abercrombie and Fitch, award-winning photograph, 24mm f/1.4', 'Photo of Sam Altman, OpenAI CEO, at Folsom Street Fair, by Anne Liebovitz, 85mm f/1.4', 'Portrait photo of Mark Zuckerberg, photographed by Anne Geddes, soft studio lighting, 85mm f/1.4', 'Portrait photo of a baby Mark Zuckerberg with flower, photographed by Anne Geddes', 'Photo of Ryan Gosling in swimsuit, soft studio lighting, photo taken by Anne Liebovitz for Abercrombie and Fitch, award-winning photograph, 24mm f/1.4', 'sheryl sandberg, medal of honor', 'Movie still of Sheryl Sandberg imprisoned in Supermax in Facebook The Movie (2017), directed by Steven Spielberg', 'Movie still of Emma Watson in The Matrix']

# Labels that are paired element-wise with test_x further down (BertDataset /
# DataFrame): 20 distinct ids, each repeated for 20 consecutive entries.
test_y = [
    author_id
    for author_id in (2, 10, 15, 18, 21, 25, 31, 38, 41, 45,
                      49, 51, 69, 72, 78, 84, 85, 88, 89, 91)
    for _ in range(20)
]

# Distinct labels present in test_y, in ascending order:
#   [2, 10, 15, 18, 21, 25, 31, 38, 41, 45, 49, 51, 69, 72, 78, 84, 85, 88, 89, 91]
# sorted() makes that order explicit; iterating a bare set gives no ordering
# guarantee, so list(set(...)) only matched the list above by implementation detail.
unique_authors = sorted(set(test_y))
# Fixed colour per label id: each value is a 4-tuple of floats in [0, 1] whose last
# component is always 1.0 (reads as RGBA). The keys are the 20 ids that occur in test_y.
# NOTE(review): the values look like matplotlib's 'tab20' palette -- confirm before
# regenerating this table from a colormap.
color_map = {2: (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0), 10: (0.6823529411764706, 0.7803921568627451, 0.9098039215686274, 1.0), 15: (1.0, 0.4980392156862745, 0.054901960784313725, 1.0), 18: (1.0, 0.7333333333333333, 0.47058823529411764, 1.0), 21: (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0), 25: (0.596078431372549, 0.8745098039215686, 0.5411764705882353, 1.0), 31: (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0), 38: (1.0, 0.596078431372549, 0.5882352941176471, 1.0), 41: (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0), 45: (0.7725490196078432, 0.6901960784313725, 0.8352941176470589, 1.0), 49: (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0), 51: (0.7686274509803922, 0.611764705882353, 0.5803921568627451, 1.0), 69: (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0), 72: (0.9686274509803922, 0.7137254901960784, 0.8235294117647058, 1.0), 78: (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0), 84: (0.7803921568627451, 0.7803921568627451, 0.7803921568627451, 1.0), 85: (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0), 88: (0.8588235294117647, 0.8588235294117647, 0.5529411764705883, 1.0), 89: (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0), 91: (0.6196078431372549, 0.8549019607843137, 0.8980392156862745, 1.0)}



from transformers import BertTokenizer, BertModel

# Tokenizer and pretrained backbone used to build the evaluation inputs.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Sequence length fed to BertDataset plus the classifier-head sizes.
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps batches in the same order as test_x / test_y.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

pg = tqdm(test_loader, leave=False, total=len(test_loader))
ngpus, dropout = torch.cuda.device_count(), 0.35
# No classifier is built here: `model` is assigned further down in this cell from the
# checkpointed encoder, so constructing a BertClassifier first would only waste memory.



ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_try2/content_encoder_supcon_9.pt'

import torch
from transformers import BertGenerationEncoder, BertConfig

# Load the saved state_dict. map_location='cpu' lets a checkpoint written on a GPU
# machine load on a CPU-only runtime, which matches the CPU inference loop below.
state_dict = torch.load(ckpt_path, map_location='cpu')

# Rebuild the encoder from the bert-base-cased configuration before loading the weights.
# NOTE(review): a plain BertConfig is passed to BertGenerationEncoder - confirm this
# matches the configuration used when the checkpoint was trained.
config = BertConfig.from_pretrained('bert-base-cased')
model = BertGenerationEncoder(config)

# Load the weights into the model
model.load_state_dict(state_dict)

# A module built through its constructor starts in training mode; switch to eval so
# dropout is disabled for inference.
model.eval()

# The model is now loaded and ready for inference or further training

# Keep labels and texts side by side; the t-SNE coordinates are attached to this frame later.
df = pd.DataFrame({'Target': test_y, 'content': test_x})

# Feature extraction on CPU (move the model and the batch tensors to the GPU first if needed).
# eval() disables dropout so the features are deterministic; no_grad() avoids building an
# autograd graph that is never back-propagated.
model.eval()
all_feats = []
all_labels = []
with torch.no_grad():
    for x1, x2, x3, y in pg:
        # x1 is passed as the first positional input and x3 as the attention mask; x2 is unused.
        outputs = model(x1, attention_mask=x3)
        # Flatten everything after the batch dimension into one vector per sample.
        all_feats.append(outputs.last_hidden_state.flatten(1).numpy())
        all_labels.append(y.numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Visualize using t-SNE
# One embedding is fitted per seed; the run with the lowest final KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        continue  # this seed did not improve on the best run so far
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the two coordinates of the winning embedding to the dataframe
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]


# Create a color map for authors
# unique_authors = df['Target'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Bounding box of the embedding; reused below as the axis limits
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# One scatter call per author so every author gets its own colour and label
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    author_xy = df.loc[indices, ['tsne_1', 'tsne_2']]
    plt.scatter(author_xy['tsne_1'], author_xy['tsne_2'],
                label=author, s=10, color=color_map[author])

plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# The legend is intentionally left off (20 authors); the figure is saved before being shown.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# A center is the mean t-SNE coordinate of all samples that share a Target label.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'],
                label=author, s=100, color=color_map[author], marker='x')

# Use the same data-driven limits as the scatter plot. A fixed lower bound of -70 would
# clip any center left of it and makes the two figures hard to compare.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 5: Calculate Distances
# in-cluster : distance from each sample to the center of its own author
# out-cluster: mean distance from each sample to every sample of the other authors
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: pairwise_distances returns an (n, 1) matrix here, so flatten it
    # to keep one scalar per sample (otherwise the final array ends up with shape (N, 1)).
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both are 1-D with one entry per sample)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
# Both distributions are drawn semi-transparently on the same axes for comparison.
plt.figure(figsize=(10, 6))
for dists, hist_label in (
    (in_cluster_distances, 'In-Cluster Distances'),
    (out_cluster_distances, 'Out-Cluster Distances'),
):
    plt.hist(dists, bins=30, alpha=0.5, label=hist_label)
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers accepted by the NumPy reducers.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' followed by the
        25th, 50th, 75th and 90th percentiles, each computed over all values.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_keys = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {}
    for key, reducer in reducers:
        summary[key] = reducer(distances)
    for key, q in percentile_keys:
        summary[key] = np.percentile(distances, q)
    return summary

# Summary statistics for both distance distributions
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic
stats_columns = {
    'In-Cluster Distances': in_cluster_stats,
    'Out-Cluster Distances': out_cluster_stats,
}
stats_df = pd.DataFrame(stats_columns)

# Display the statistics
print(stats_df)


In [None]:
# t-SNE on BERT encoder features (not TF-IDF) - perplexity 30, n_iter = 1 below

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModel,
    EncoderDecoderModel,
    BertGenerationDecoder,
    BertGenerationEncoder,
    BertTokenizer,
    BertModel)

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# Rebuild the text / label lists from the dataframe left by the previous cell.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()
# Report sizes only: printing the full lists floods the cell output with every raw text.
print(f"{len(test_x)} samples from {len(set(test_y))} authors")

# from transformers import BertTokenizer, BertModel
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# extractor = BertModel.from_pretrained('bert-base-cased')

# num_tokens, hidden_dim, out_dim = 256, 512, 100
# test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
#                              pin_memory=True)

# pg = tqdm(test_loader, leave=False, total=len(test_loader))
# ngpus, dropout = torch.cuda.device_count(), 0.35
# num_tokens, hidden_dim, out_dim = 256, 512, 100
# model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))



ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_try2/content_encoder_supcon_9.pt'

import torch
from transformers import BertGenerationEncoder, BertConfig

# Load the saved state_dict. map_location='cpu' lets a checkpoint written on a GPU
# machine load on a CPU-only runtime, which matches the CPU inference loop below.
state_dict = torch.load(ckpt_path, map_location='cpu')

# Rebuild the encoder from the bert-base-cased configuration before loading the weights.
# NOTE(review): a plain BertConfig is passed to BertGenerationEncoder - confirm this
# matches the configuration used when the checkpoint was trained.
config = BertConfig.from_pretrained('bert-base-cased')
model = BertGenerationEncoder(config)

# Load the weights into the model
model.load_state_dict(state_dict)

# A module built through its constructor starts in training mode; switch to eval so
# dropout is disabled for inference.
model.eval()

# The model is now loaded and ready for inference or further training


# model = BertGenerationEncoder.from_pretrained(ckpt_path)
# model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
# model = nn.DataParallel(model).cuda()

# Feature extraction on CPU. Without no_grad() the encoder output requires grad and
# Tensor.numpy() raises a RuntimeError; eval() additionally disables dropout so the
# features are deterministic.
model.eval()
all_feats = []
all_labels = []
# test_loader is reused from the earlier cell (its construction is commented out above);
# a fresh tqdm wrapper gives this pass its own progress bar.
pg = tqdm(test_loader, leave=False, total=len(test_loader))
with torch.no_grad():
    for x1, x2, x3, y in pg:
        # x1 is passed as the first positional input and x3 as the attention mask; x2 is unused.
        outputs = model(x1, attention_mask=x3)
        # Flatten everything after the batch dimension into one vector per sample.
        all_feats.append(outputs.last_hidden_state.flatten(1).numpy())
        all_labels.append(y.numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Visualize using t-SNE
# One embedding is fitted per seed; the run with the lowest final KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        continue  # this seed did not improve on the best run so far
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the two coordinates of the winning embedding to the dataframe
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]


# Create a color map for authors
# unique_authors = df['Target'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Bounding box of the embedding; reused below as the axis limits
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# One scatter call per author so every author gets its own colour and label
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    author_xy = df.loc[indices, ['tsne_1', 'tsne_2']]
    plt.scatter(author_xy['tsne_1'], author_xy['tsne_2'],
                label=author, s=10, color=color_map[author])

plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# The legend is intentionally left off (20 authors); the figure is saved before being shown.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# A center is the mean t-SNE coordinate of all samples that share a Target label.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'],
                label=author, s=100, color=color_map[author], marker='x')

# Use the same data-driven limits as the scatter plot. A fixed lower bound of -70 would
# clip any center left of it and makes the two figures hard to compare.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 5: Calculate Distances
# in-cluster : distance from each sample to the center of its own author
# out-cluster: mean distance from each sample to every sample of the other authors
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: pairwise_distances returns an (n, 1) matrix here, so flatten it
    # to keep one scalar per sample (otherwise the final array ends up with shape (N, 1)).
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both are 1-D with one entry per sample)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
# Both distributions are drawn semi-transparently on the same axes for comparison.
plt.figure(figsize=(10, 6))
for dists, hist_label in (
    (in_cluster_distances, 'In-Cluster Distances'),
    (out_cluster_distances, 'Out-Cluster Distances'),
):
    plt.hist(dists, bins=30, alpha=0.5, label=hist_label)
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers accepted by the NumPy reducers.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' followed by the
        25th, 50th, 75th and 90th percentiles, each computed over all values.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_keys = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {}
    for key, reducer in reducers:
        summary[key] = reducer(distances)
    for key, q in percentile_keys:
        summary[key] = np.percentile(distances, q)
    return summary

# Summary statistics for both distance distributions
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic
stats_columns = {
    'In-Cluster Distances': in_cluster_stats,
    'Out-Cluster Distances': out_cluster_stats,
}
stats_df = pd.DataFrame(stats_columns)

# Display the statistics
print(stats_df)


In [None]:
# t-SNE on pretrained bert-base-cased features (not TF-IDF) - perplexity 30, n_iter = 1 below

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=20, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# Texts and labels for this run come from the dataframe left by the previous cells.
test_x = df['content'].tolist()
test_y = df['Target'].tolist()

from transformers import BertTokenizer, BertModel

# Sequence length, head sizes and dropout shared by the model cells of this notebook
num_tokens, hidden_dim, out_dim = 256, 512, 100
dropout = 0.35
ngpus = torch.cuda.device_count()

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# shuffle=False keeps batches in the same order as the dataframe rows
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(
    test_set,
    batch_size=24,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
pg = tqdm(test_loader, leave=False, total=len(test_loader))

# model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_val0.72321_e29.pt'
# This cell embeds with the untouched pretrained BERT, wrapped for (multi-)GPU inference.
model = nn.DataParallel(BertModel.from_pretrained('bert-base-cased')).cuda()

# Collect one flattened feature vector and one label per test sample.
# no_grad() stops autograd from keeping every layer's activations for a backward pass
# that never happens; eval() makes sure dropout is disabled during extraction.
model.eval()
all_feats = []
all_labels = []
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        # x[0] is fed as input_ids and x[2] as the attention mask; x[1] is not used here.
        feature = model(input_ids=x[0], attention_mask=x[2])
        # Flatten everything after the batch dimension into one vector per sample.
        feats = feature.last_hidden_state.flatten(1)
        all_feats.append(feats.cpu().numpy())
        all_labels.append(y.cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Visualize using t-SNE
# One embedding is fitted per seed; the run with the lowest final KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        continue  # this seed did not improve on the best run so far
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the two coordinates of the winning embedding to the dataframe
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]

# # Create a color map for authors
# unique_authors = df['user_name'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# # Plot t-SNE results
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     indices = df['user_name'] == author
#     plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# plt.title("t-SNE Visualization of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# # Step 4: Calculate Cluster Centers
# cluster_centers = df.groupby('user_name')[['tsne_1', 'tsne_2']].mean()

# # Plot cluster centers with the same color coding
# plt.figure(figsize=(12, 8))
# for author in unique_authors:
#     plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# plt.title("Cluster Centers of Prompts")
# plt.xlabel("t-SNE Dimension 1")
# plt.ylabel("t-SNE Dimension 2")
# # plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# plt.show()

# Create a color map for authors
# unique_authors = df['Target'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Bounding box of the embedding; reused below as the axis limits
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# One scatter call per author so every author gets its own colour and label
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    author_xy = df.loc[indices, ['tsne_1', 'tsne_2']]
    plt.scatter(author_xy['tsne_1'], author_xy['tsne_2'],
                label=author, s=10, color=color_map[author])

plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# The legend is intentionally left off (20 authors); the figure is saved before being shown.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# A center is the mean t-SNE coordinate of all samples that share a Target label.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'],
                label=author, s=100, color=color_map[author], marker='x')

# Use the same data-driven limits as the scatter plot. A fixed lower bound of -70 would
# clip any center left of it and makes the two figures hard to compare.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 5: Calculate Distances
# in-cluster : distance from each sample to the center of its own author
# out-cluster: mean distance from each sample to every sample of the other authors
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: pairwise_distances returns an (n, 1) matrix here, so flatten it
    # to keep one scalar per sample (otherwise the final array ends up with shape (N, 1)).
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both are 1-D with one entry per sample)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
# Both distributions are drawn semi-transparently on the same axes for comparison.
plt.figure(figsize=(10, 6))
for dists, hist_label in (
    (in_cluster_distances, 'In-Cluster Distances'),
    (out_cluster_distances, 'Out-Cluster Distances'),
):
    plt.hist(dists, bins=30, alpha=0.5, label=hist_label)
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers accepted by the NumPy reducers.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' followed by the
        25th, 50th, 75th and 90th percentiles, each computed over all values.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_keys = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {}
    for key, reducer in reducers:
        summary[key] = reducer(distances)
    for key, q in percentile_keys:
        summary[key] = np.percentile(distances, q)
    return summary

# Summary statistics for both distance distributions
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic
stats_columns = {
    'In-Cluster Distances': in_cluster_stats,
    'Out-Cluster Distances': out_cluster_stats,
}
stats_df = pd.DataFrame(stats_columns)

# Display the statistics
print(stats_df)


In [None]:
# t-SNE on features of the "cls" classifier checkpoint (not TF-IDF) - perplexity 30, n_iter = 1 below

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# Texts and labels for this run come from the dataframe left by the previous cells.
test_x = df['content'].tolist()
test_y = df['Target'].tolist()

from transformers import BertTokenizer, BertModel

# Sequence length, head sizes and dropout used to rebuild the classifier
num_tokens, hidden_dim, out_dim = 256, 512, 100
dropout = 0.35
ngpus = torch.cuda.device_count()

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# shuffle=False keeps batches in the same order as the dataframe rows
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(
    test_set,
    batch_size=24,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
pg = tqdm(test_loader, leave=False, total=len(test_loader))

# Rebuild the classifier (BERT backbone + head), restore its weights, then wrap for GPU use.
classifier_head = LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout)
model = BertClassifier(extractor, classifier_head)
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_cls_para_bert-base-cased_coe0.0_temp0.1_unit2_epoch30/diffusiondb100_cls_para_val0.72073_e24.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# Collect one feature vector and one label per test sample.
# The classifier is freshly constructed, so it is still in training mode: eval() turns its
# dropout off to make the extracted features deterministic, and no_grad() avoids building
# an autograd graph that is never back-propagated.
model.eval()
all_feats = []
all_labels = []
with torch.no_grad():
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        # return_feat=True makes the model return (prediction, features); only features are kept.
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        all_labels.append(y.cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: Visualize using t-SNE
# One embedding is fitted per seed; the run with the lowest final KL divergence is kept.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    if not (kl_divergence < best_kl_divergence):
        continue  # this seed did not improve on the best run so far
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Attach the two coordinates of the winning embedding to the dataframe
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]


# Create a color map for authors
# unique_authors = df['Target'].unique()
# colors = plt.cm.get_cmap('tab20', len(unique_authors))
# color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Bounding box of the embedding; reused below as the axis limits
tsne_1_min, tsne_1_max = df['tsne_1'].min(), df['tsne_1'].max()
tsne_2_min, tsne_2_max = df['tsne_2'].min(), df['tsne_2'].max()

# One scatter call per author so every author gets its own colour and label
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    author_xy = df.loc[indices, ['tsne_1', 'tsne_2']]
    plt.scatter(author_xy['tsne_1'], author_xy['tsne_2'],
                label=author, s=10, color=color_map[author])

plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# The legend is intentionally left off (20 authors); the figure is saved before being shown.
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# A center is the mean t-SNE coordinate of all samples that share a Target label.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'],
                label=author, s=100, color=color_map[author], marker='x')

# Use the same data-driven limits as the scatter plot. A fixed lower bound of -70 would
# clip any center left of it and makes the two figures hard to compare.
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 5: Calculate Distances
# in-cluster : distance from each sample to the center of its own author
# out-cluster: mean distance from each sample to every sample of the other authors
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: pairwise_distances returns an (n, 1) matrix here, so flatten it
    # to keep one scalar per sample (otherwise the final array ends up with shape (N, 1)).
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both are 1-D with one entry per sample)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
# Both distributions are drawn semi-transparently on the same axes for comparison.
plt.figure(figsize=(10, 6))
for dists, hist_label in (
    (in_cluster_distances, 'In-Cluster Distances'),
    (out_cluster_distances, 'Out-Cluster Distances'),
):
    plt.hist(dists, bins=30, alpha=0.5, label=hist_label)
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances.

    Args:
        distances: array-like of numbers accepted by the NumPy reducers.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' followed by the
        25th, 50th, 75th and 90th percentiles, each computed over all values.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_keys = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )
    summary = {}
    for key, reducer in reducers:
        summary[key] = reducer(distances)
    for key, q in percentile_keys:
        summary[key] = np.percentile(distances, q)
    return summary

# Summary statistics for both distance distributions
in_cluster_stats, out_cluster_stats = map(
    calculate_statistics, (in_cluster_distances, out_cluster_distances)
)

# One column per distribution, one row per statistic
stats_columns = {
    'In-Cluster Distances': in_cluster_stats,
    'Out-Cluster Distances': out_cluster_stats,
}
stats_df = pd.DataFrame(stats_columns)

# Display the statistics
print(stats_df)


## Other t-SNE plots

In [None]:
# t-SNE on features of the "lcl" classifier checkpoint (not TF-IDF) - perplexity 30, n_iter = 1 below

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# Texts and labels for this run come from the dataframe left by the previous cells.
test_x = df['content'].tolist()
test_y = df['Target'].tolist()

from transformers import BertTokenizer, BertModel

# Sequence length, head sizes and dropout used to rebuild the classifier
num_tokens, hidden_dim, out_dim = 256, 512, 100
dropout = 0.35
ngpus = torch.cuda.device_count()

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# shuffle=False keeps batches in the same order as the dataframe rows
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(
    test_set,
    batch_size=24,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
pg = tqdm(test_loader, leave=False, total=len(test_loader))

# Rebuild the classifier (BERT backbone + head), restore its weights, then wrap for GPU use.
classifier_head = LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout)
model = BertClassifier(extractor, classifier_head)
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_para_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_para_val0.72073_e29.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

all_feats = []
all_labels = []
for i, (x1, x2, x3, y) in enumerate(pg):  # for x1, x2, x3, y in train_set:
  x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
  pred, feats = model(x, return_feat=True)
  all_feats.append(feats.cpu().detach().numpy())
  all_labels.append(y.cpu().detach().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the extracted features to 2-D with t-SNE.
# One embedding is fitted per seed and the one with the lowest KL divergence wins.
n_iter = 1  # Number of t-SNE restarts
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not improve on the best embedding seen so far
    if not kl_divergence < best_kl_divergence:
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the best embedding as columns of df
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]


# Create a color map for authors
unique_authors = df['Target'].unique()
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments.
# NOTE(review): 'tab20' has 20 base colors, so with more than 20 authors
# several authors will share a color - confirm this is acceptable.
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# Clamp the axes to the data range
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Save before show(), while the figure is still the current one
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# Each author's center is the mean of that author's t-SNE coordinates.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    center = cluster_centers.loc[author]
    plt.scatter(center['tsne_1'], center['tsne_2'], label=author, s=100,
                color=color_map[author], marker='x')

# NOTE(review): the left x-limit is hard-coded to -70 while the other limits
# follow the data range - confirm this is intended.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.xticks(fontsize=16)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: every point of this author to the author's center.
    # pairwise_distances returns shape (n_points, 1); flatten it so scalars are
    # collected, matching the 1-D out-cluster list below.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point of this author to
    # all points of every other author
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both 1-D)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric values. Every NumPy reduction is
            called with its default axis, so multi-dimensional input is
            reduced over all of its elements.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' and the
        25th / 50th / 75th / 90th percentile labels, in that order.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_labels = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {}
    for label, reducer in reducers:
        summary[label] = reducer(distances)
    for label, q in percentile_labels:
        summary[label] = np.percentile(distances, q)
    return summary

# Summarise both distance collections with the same statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distance type, one row per statistic
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Display the statistics
print(stats_df)


In [None]:
# t-SNE of BERT classifier features (checkpoint: diffusiondb100_lcl_coe2_para, coe2.0) - perplexity 30, 1 t-SNE run

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# NOTE: `df` is not created in this cell; it must already exist (from an
# earlier cell) with 'content' and 'Target' columns. The commented-out lines
# below show how it was loaded previously.
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the extracted features in the same row order as df,
# which the later `df['tsne_1'] = ...` assignment relies on.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe2_para_bert-base-cased_coe2.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe2_para_val0.73512_e24.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: put every layer (e.g. dropout) in eval mode so the extracted
# features are deterministic.
model.eval()

all_feats = []
all_labels = []
pg = tqdm(test_loader, leave=False, total=len(test_loader))
with torch.no_grad():  # no autograd graph is needed for feature extraction
    for x1, x2, x3, y in pg:
        # Each batch is three input tensors plus the label tensor
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the extracted features to 2-D with t-SNE.
# One embedding is fitted per seed and the one with the lowest KL divergence wins.
n_iter = 1  # Number of t-SNE restarts
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not improve on the best embedding seen so far
    if not kl_divergence < best_kl_divergence:
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the best embedding as columns of df
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]


# Create a color map for authors
unique_authors = df['Target'].unique()
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments.
# NOTE(review): 'tab20' has 20 base colors, so with more than 20 authors
# several authors will share a color - confirm this is acceptable.
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# Clamp the axes to the data range
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Save before show(), while the figure is still the current one
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# Each author's center is the mean of that author's t-SNE coordinates.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    center = cluster_centers.loc[author]
    plt.scatter(center['tsne_1'], center['tsne_2'], label=author, s=100,
                color=color_map[author], marker='x')

# NOTE(review): the left x-limit is hard-coded to -70 while the other limits
# follow the data range - confirm this is intended.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.xticks(fontsize=16)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: every point of this author to the author's center.
    # pairwise_distances returns shape (n_points, 1); flatten it so scalars are
    # collected, matching the 1-D out-cluster list below.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point of this author to
    # all points of every other author
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both 1-D)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric values. Every NumPy reduction is
            called with its default axis, so multi-dimensional input is
            reduced over all of its elements.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' and the
        25th / 50th / 75th / 90th percentile labels, in that order.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_labels = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {}
    for label, reducer in reducers:
        summary[label] = reducer(distances)
    for label, q in percentile_labels:
        summary[label] = np.percentile(distances, q)
    return summary

# Summarise both distance collections with the same statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distance type, one row per statistic
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Display the statistics
print(stats_df)


In [None]:
# t-SNE of BERT classifier features (checkpoint: diffusiondb100_supcon_para_maskn) - perplexity 30, 1 t-SNE run

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# NOTE: `df` is not created in this cell; it must already exist (from an
# earlier cell) with 'content' and 'Target' columns. The commented-out lines
# below show how it was loaded previously.
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the extracted features in the same row order as df,
# which the later `df['tsne_1'] = ...` assignment relies on.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_maskn_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_maskn_val0.67560_e26.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: put every layer (e.g. dropout) in eval mode so the extracted
# features are deterministic.
model.eval()

all_feats = []
all_labels = []
pg = tqdm(test_loader, leave=False, total=len(test_loader))
with torch.no_grad():  # no autograd graph is needed for feature extraction
    for x1, x2, x3, y in pg:
        # Each batch is three input tensors plus the label tensor
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the extracted features to 2-D with t-SNE.
# One embedding is fitted per seed and the one with the lowest KL divergence wins.
n_iter = 1  # Number of t-SNE restarts
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not improve on the best embedding seen so far
    if not kl_divergence < best_kl_divergence:
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the best embedding as columns of df
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]

# Create a color map for authors
unique_authors = df['Target'].unique()
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments.
# NOTE(review): 'tab20' has 20 base colors, so with more than 20 authors
# several authors will share a color - confirm this is acceptable.
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# Clamp the axes to the data range
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Save before show(), while the figure is still the current one
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# Each author's center is the mean of that author's t-SNE coordinates.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    center = cluster_centers.loc[author]
    plt.scatter(center['tsne_1'], center['tsne_2'], label=author, s=100,
                color=color_map[author], marker='x')

# NOTE(review): the left x-limit is hard-coded to -70 while the other limits
# follow the data range - confirm this is intended.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.xticks(fontsize=16)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: every point of this author to the author's center.
    # pairwise_distances returns shape (n_points, 1); flatten it so scalars are
    # collected, matching the 1-D out-cluster list below.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point of this author to
    # all points of every other author
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both 1-D)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric values. Every NumPy reduction is
            called with its default axis, so multi-dimensional input is
            reduced over all of its elements.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' and the
        25th / 50th / 75th / 90th percentile labels, in that order.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_labels = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {}
    for label, reducer in reducers:
        summary[label] = reducer(distances)
    for label, q in percentile_labels:
        summary[label] = np.percentile(distances, q)
    return summary

# Summarise both distance collections with the same statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distance type, one row per statistic
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Display the statistics
print(stats_df)


In [None]:
# t-SNE of BERT classifier features (checkpoint: diffusiondb100_lclonly_coe1_para, coe1.0) - perplexity 30, 1 t-SNE run

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import numpy as np

# Load data
# NOTE: `df` is not created in this cell; it must already exist (from an
# earlier cell) with 'content' and 'Target' columns. The commented-out lines
# below show how it was loaded previously.
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the extracted features in the same row order as df,
# which the later `df['tsne_1'] = ...` assignment relies on.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lclonly_coe1_para_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lclonly_coe1_para_val0.73859_e27.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Inference only: put every layer (e.g. dropout) in eval mode so the extracted
# features are deterministic.
model.eval()

all_feats = []
all_labels = []
pg = tqdm(test_loader, leave=False, total=len(test_loader))
with torch.no_grad():  # no autograd graph is needed for feature extraction
    for x1, x2, x3, y in pg:
        # Each batch is three input tensors plus the label tensor
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

# Stack the per-batch arrays along the sample axis
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: project the extracted features to 2-D with t-SNE.
# One embedding is fitted per seed and the one with the lowest KL divergence wins.
n_iter = 1  # Number of t-SNE restarts
best_kl_divergence, best_tsne = float('inf'), None

for i in range(n_iter):
    tsne = TSNE(n_components=2, random_state=i, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {i + 1}: KL-Divergence = {kl_divergence}")

    # Skip runs that do not improve on the best embedding seen so far
    if not kl_divergence < best_kl_divergence:
        continue
    best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Store the two coordinates of the best embedding as columns of df
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]


# Create a color map for authors
unique_authors = df['Target'].unique()
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments.
# NOTE(review): 'tab20' has 20 base colors, so with more than 20 authors
# several authors will share a color - confirm this is acceptable.
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter call per author so each gets its own color
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

# Clamp the axes to the data range
plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Save before show(), while the figure is still the current one
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers
# Each author's center is the mean of that author's t-SNE coordinates.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    center = cluster_centers.loc[author]
    plt.scatter(center['tsne_1'], center['tsne_2'], label=author, s=100,
                color=color_map[author], marker='x')

# NOTE(review): the left x-limit is hard-coded to -70 while the other limits
# follow the data range - confirm this is intended.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.xticks(fontsize=16)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: Calculate Distances
in_cluster_distances = []
out_cluster_distances = []

for author in df['Target'].unique():
    author_indices = df['Target'] == author
    author_points = df.loc[author_indices, ['tsne_1', 'tsne_2']]
    other_points = df.loc[~author_indices, ['tsne_1', 'tsne_2']]

    # In-cluster distances: every point of this author to the author's center.
    # pairwise_distances returns shape (n_points, 1); flatten it so scalars are
    # collected, matching the 1-D out-cluster list below.
    distances_to_center = pairwise_distances(author_points, cluster_centers.loc[author].values.reshape(1, -1))
    in_cluster_distances.extend(distances_to_center.ravel())

    # Out-cluster distances: mean distance from each point of this author to
    # all points of every other author
    distances_to_others = pairwise_distances(author_points, other_points)
    out_cluster_distances.extend(distances_to_others.mean(axis=1))

# Convert to numpy arrays for visualization (both 1-D)
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: Visualize Distances
plt.figure(figsize=(10, 6))
plt.hist(in_cluster_distances, bins=30, alpha=0.5, label='In-Cluster Distances')
plt.hist(out_cluster_distances, bins=30, alpha=0.5, label='Out-Cluster Distances')
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: array-like of numeric values. Every NumPy reduction is
            called with its default axis, so multi-dimensional input is
            reduced over all of its elements.

    Returns:
        dict with the keys 'mean', 'median', 'std', 'min', 'max' and the
        25th / 50th / 75th / 90th percentile labels, in that order.
    """
    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    )
    percentile_labels = (
        ('25th_percentile', 25),
        ('50th_percentile', 50),
        ('75th_percentile', 75),
        ('90th_percentile', 90),
    )

    summary = {}
    for label, reducer in reducers:
        summary[label] = reducer(distances)
    for label, q in percentile_labels:
        summary[label] = np.percentile(distances, q)
    return summary

# Summarise both distance collections with the same statistics
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(values)
    for values in (in_cluster_distances, out_cluster_distances)
)

# One column per distance type, one row per statistic
stats_df = pd.DataFrame(
    {'In-Cluster Distances': in_cluster_stats, 'Out-Cluster Distances': out_cluster_stats}
)

# Display the statistics
print(stats_df)


## tsne 2

In [None]:
# Setup for the "tsne 2" cells:
#   * fix the color code of each author
#   * find cluster center of training data
#   * show the embeddings of text data - circle for correctly classified data,
#     triangle for wrongly classified data

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from transformers import BertTokenizer, BertModel

# Load the train split (df) and the test split (df1); keep only the prompt text
# and the author column, renamed to 'content' / 'Target'.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/train_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']
df1 = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df1 = df1[['prompt', 'user_name']]
df1.columns = ['content', 'Target']

# Randomly keep 50 of the training authors and restrict both splits to them.
# NOTE(review): the draw uses NumPy's global RNG, so the selection depends on
# the seed state at the time this cell runs.
selected_authors = np.random.choice(df['Target'].unique(), size=50, replace=False)
# .copy() makes the filtered frames independent objects; later cells add
# 'tsne_1' / 'tsne_2' columns to df, which would otherwise write into a slice.
df = df[df['Target'].isin(selected_authors)].copy()
df1 = df1[df1['Target'].isin(selected_authors)].copy()

train_x, train_y = df['content'].tolist(), df['Target'].tolist()
test_x, test_y = df1['content'].tolist(), df1['Target'].tolist()

# One fixed colour per author, shared by the plotting cells that follow.
unique_authors = df['Target'].unique()
# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts
# the same (name, lut) arguments and works on older releases too.
colors = plt.get_cmap('hsv', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}
print(color_map.keys())

In [None]:
# --- Data loaders -----------------------------------------------------------
# Tokenizer / backbone plus unshuffled loaders, so the feature order follows
# the order of train_x / test_x.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')
num_tokens, hidden_dim, out_dim = 256, 512, 100
ngpus, dropout = torch.cuda.device_count(), 0.35

test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4, pin_memory=True)
train_set = BertDataset(train_x, train_y, tokenizer, num_tokens)
train_loader = DataLoader(train_set, batch_size=24, shuffle=False, num_workers=4, pin_memory=True)

# --- Model ------------------------------------------------------------------
classifier_head = LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout)
model = BertClassifier(extractor, classifier_head)
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe2_para_bert-base-cased_coe2.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe2_para_val0.73512_e24.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
model.eval()

# --- Features of the training split -----------------------------------------
all_feats, all_labels = [], []
with torch.no_grad():
    pg = tqdm(train_loader, leave=False, total=len(train_loader))
    for x1, x2, x3, y in pg:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        y = y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# --- Features and predictions of the test split -----------------------------
all_feats1, all_labels1, all_preds1 = [], [], []
with torch.no_grad():
    pg1 = tqdm(test_loader, leave=False, total=len(test_loader))
    for x1, x2, x3, y in pg1:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        y = y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats1.append(feats.detach().cpu().numpy())
        all_labels1.append(y.detach().cpu().numpy())
        all_preds1.append(torch.argmax(pred, dim=1).cpu().numpy())

all_feats1 = np.concatenate(all_feats1, axis=0)
all_labels1 = np.concatenate(all_labels1, axis=0)
all_preds1 = np.concatenate(all_preds1, axis=0)
print(all_labels1)
print(all_preds1)

# Train rows first, then test rows: the split after t-SNE relies on this order.
all_feats_all = np.vstack((all_feats, all_feats1))

# --- t-SNE: keep the run with the lowest KL divergence ----------------------
n_iter = 1  # number of t-SNE restarts (each uses its index as random_state)
best_kl_divergence, best_tsne = float('inf'), None

for seed in range(n_iter):
    tsne = TSNE(n_components=2, random_state=seed, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats_all)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {seed + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence, best_tsne = kl_divergence, X_tsne

n_train = len(all_feats)
train_tsne, test_tsne = best_tsne[:n_train], best_tsne[n_train:]

# Attach the training coordinates to df (row order must match train_x).
df['tsne_1'], df['tsne_2'] = train_tsne[:, 0], train_tsne[:, 1]

# Axis limits: range of the training coordinates with a margin of 5.
pad = 5
tsne_1_min, tsne_1_max = df['tsne_1'].min() - pad, df['tsne_1'].max() + pad
tsne_2_min, tsne_2_max = df['tsne_2'].min() - pad, df['tsne_2'].max() + pad

# Per-author cluster centre = mean training coordinate.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# --- Plot -------------------------------------------------------------------
plt.figure(figsize=(10, 6))
for author in unique_authors:
    center = cluster_centers.loc[author]
    plt.scatter(center['tsne_1'], center['tsne_2'], label=author, s=30, color=color_map[author], marker='x')

# Test points: circle when the prediction is right, triangle when it is wrong.
# Colour is looked up by the true label (presumably the same values as the
# keys of color_map - TODO confirm).
count = 0
for point, y_true, y_pred in zip(test_tsne, all_labels1, all_preds1):
    hit = y_true == y_pred
    if hit:
        count += 1
    plt.scatter(point[0], point[1], color=color_map[y_true], s=30, marker='o' if hit else '^')

print('acc', count / len(all_labels1))

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The PNG is written before the legend is added, so the saved file has no legend.
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()


In [None]:
# --- Data loaders -----------------------------------------------------------
# Tokenizer / backbone plus unshuffled loaders, so the feature order follows
# the order of train_x / test_x.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')
num_tokens, hidden_dim, out_dim = 256, 512, 100
ngpus, dropout = torch.cuda.device_count(), 0.35

test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4, pin_memory=True)
train_set = BertDataset(train_x, train_y, tokenizer, num_tokens)
train_loader = DataLoader(train_set, batch_size=24, shuffle=False, num_workers=4, pin_memory=True)

# --- Model ------------------------------------------------------------------
classifier_head = LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout)
model = BertClassifier(extractor, classifier_head)
ckpt_path = "/content/drive/MyDrive/msc_project/model/contrastive/club/style_encoder_1.pt"
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
model.eval()

# --- Features of the training split -----------------------------------------
all_feats, all_labels = [], []
with torch.no_grad():
    pg = tqdm(train_loader, leave=False, total=len(train_loader))
    for x1, x2, x3, y in pg:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        y = y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# --- Features and predictions of the test split -----------------------------
all_feats1, all_labels1, all_preds1 = [], [], []
with torch.no_grad():
    pg1 = tqdm(test_loader, leave=False, total=len(test_loader))
    for x1, x2, x3, y in pg1:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        y = y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats1.append(feats.detach().cpu().numpy())
        all_labels1.append(y.detach().cpu().numpy())
        all_preds1.append(torch.argmax(pred, dim=1).cpu().numpy())

all_feats1 = np.concatenate(all_feats1, axis=0)
all_labels1 = np.concatenate(all_labels1, axis=0)
all_preds1 = np.concatenate(all_preds1, axis=0)
print(all_labels1)
print(all_preds1)

# Train rows first, then test rows: the split after t-SNE relies on this order.
all_feats_all = np.vstack((all_feats, all_feats1))

# --- t-SNE: keep the run with the lowest KL divergence ----------------------
n_iter = 1  # number of t-SNE restarts (each uses its index as random_state)
best_kl_divergence, best_tsne = float('inf'), None

for seed in range(n_iter):
    tsne = TSNE(n_components=2, random_state=seed, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats_all)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {seed + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence, best_tsne = kl_divergence, X_tsne

n_train = len(all_feats)
train_tsne, test_tsne = best_tsne[:n_train], best_tsne[n_train:]

# Attach the training coordinates to df (row order must match train_x).
df['tsne_1'], df['tsne_2'] = train_tsne[:, 0], train_tsne[:, 1]

# Axis limits: range of the training coordinates with a margin of 5.
pad = 5
tsne_1_min, tsne_1_max = df['tsne_1'].min() - pad, df['tsne_1'].max() + pad
tsne_2_min, tsne_2_max = df['tsne_2'].min() - pad, df['tsne_2'].max() + pad

# Per-author cluster centre = mean training coordinate.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# --- Plot -------------------------------------------------------------------
plt.figure(figsize=(10, 6))
for author in unique_authors:
    center = cluster_centers.loc[author]
    plt.scatter(center['tsne_1'], center['tsne_2'], label=author, s=30, color=color_map[author], marker='x')

# Test points: circle when the prediction is right, triangle when it is wrong.
# Colour is looked up by the true label (presumably the same values as the
# keys of color_map - TODO confirm).
count = 0
for point, y_true, y_pred in zip(test_tsne, all_labels1, all_preds1):
    hit = y_true == y_pred
    if hit:
        count += 1
    plt.scatter(point[0], point[1], color=color_map[y_true], s=30, marker='o' if hit else '^')

print('acc', count / len(all_labels1))

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The PNG is written before the legend is added, so the saved file has no legend.
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()


In [None]:
# --- Data loaders -----------------------------------------------------------
# Tokenizer / backbone plus unshuffled loaders, so the feature order follows
# the order of train_x / test_x.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')
num_tokens, hidden_dim, out_dim = 256, 512, 100
ngpus, dropout = torch.cuda.device_count(), 0.35

test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4, pin_memory=True)
train_set = BertDataset(train_x, train_y, tokenizer, num_tokens)
train_loader = DataLoader(train_set, batch_size=24, shuffle=False, num_workers=4, pin_memory=True)

# --- Model ------------------------------------------------------------------
classifier_head = LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout)
model = BertClassifier(extractor, classifier_head)
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_val0.72321_e29.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
model.eval()

# --- Features of the training split -----------------------------------------
all_feats, all_labels = [], []
with torch.no_grad():
    pg = tqdm(train_loader, leave=False, total=len(train_loader))
    for x1, x2, x3, y in pg:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        y = y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# --- Features and predictions of the test split -----------------------------
all_feats1, all_labels1, all_preds1 = [], [], []
with torch.no_grad():
    pg1 = tqdm(test_loader, leave=False, total=len(test_loader))
    for x1, x2, x3, y in pg1:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        y = y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats1.append(feats.detach().cpu().numpy())
        all_labels1.append(y.detach().cpu().numpy())
        all_preds1.append(torch.argmax(pred, dim=1).cpu().numpy())

all_feats1 = np.concatenate(all_feats1, axis=0)
all_labels1 = np.concatenate(all_labels1, axis=0)
all_preds1 = np.concatenate(all_preds1, axis=0)
print(all_labels1)
print(all_preds1)

# Train rows first, then test rows: the split after t-SNE relies on this order.
all_feats_all = np.vstack((all_feats, all_feats1))

# --- t-SNE: keep the run with the lowest KL divergence ----------------------
n_iter = 1  # number of t-SNE restarts (each uses its index as random_state)
best_kl_divergence, best_tsne = float('inf'), None

for seed in range(n_iter):
    tsne = TSNE(n_components=2, random_state=seed, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats_all)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {seed + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence, best_tsne = kl_divergence, X_tsne

n_train = len(all_feats)
train_tsne, test_tsne = best_tsne[:n_train], best_tsne[n_train:]

# Attach the training coordinates to df (row order must match train_x).
df['tsne_1'], df['tsne_2'] = train_tsne[:, 0], train_tsne[:, 1]

# Axis limits: range of the training coordinates with a margin of 5.
pad = 5
tsne_1_min, tsne_1_max = df['tsne_1'].min() - pad, df['tsne_1'].max() + pad
tsne_2_min, tsne_2_max = df['tsne_2'].min() - pad, df['tsne_2'].max() + pad

# Per-author cluster centre = mean training coordinate.
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# --- Plot -------------------------------------------------------------------
plt.figure(figsize=(10, 6))
for author in unique_authors:
    center = cluster_centers.loc[author]
    plt.scatter(center['tsne_1'], center['tsne_2'], label=author, s=30, color=color_map[author], marker='x')

# Test points: circle when the prediction is right, triangle when it is wrong.
# Colour is looked up by the true label (presumably the same values as the
# keys of color_map - TODO confirm).
count = 0
for point, y_true, y_pred in zip(test_tsne, all_labels1, all_preds1):
    hit = y_true == y_pred
    if hit:
        count += 1
    plt.scatter(point[0], point[1], color=color_map[y_true], s=30, marker='o' if hit else '^')

print('acc', count / len(all_labels1))

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The PNG is written before the legend is added, so the saved file has no legend.
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()


In [None]:
# t-SNE of features from the BERT-based classifier (perplexity 30).
# NOTE(review): this header used to read "tfidf - perplexit 30, iteration 10",
# but nothing in this cell uses TF-IDF; features come from BertClassifier.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
# df = df[['prompt', 'user_name']]
# df.columns = ['content', 'Target']

# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# NOTE(review): the loading code above is commented out, so `df` is whatever an
# earlier cell left in the kernel - confirm it is the intended frame.
test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
# shuffle=False keeps the feature rows aligned with the rows of df.
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_try/style_encoder_supcon_8.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# Inference only: eval() switches dropout off so the features are
# deterministic, and no_grad() avoids building autograd graphs.
model.eval()

all_feats = []
all_labels = []
pg = tqdm(test_loader, leave=False, total=len(test_loader))
with torch.no_grad():
    for i, (x1, x2, x3, y) in enumerate(pg):
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_feats.append(feats.cpu().numpy())
        all_labels.append(y.cpu().numpy())

# Stack the per-batch arrays into one array per quantity.
all_feats = np.concatenate(all_feats, axis=0)
all_labels = np.concatenate(all_labels, axis=0)


# Step 3: run t-SNE n_iter times (random_state = run index) and keep the
# embedding with the lowest KL divergence.
n_iter = 1  # Number of iterations
best_kl_divergence, best_tsne = float('inf'), None

for seed in range(n_iter):
    tsne = TSNE(n_components=2, random_state=seed, perplexity=30)
    X_tsne = tsne.fit_transform(all_feats)
    kl_divergence = tsne.kl_divergence_
    print(f"Iteration {seed + 1}: KL-Divergence = {kl_divergence}")

    if kl_divergence < best_kl_divergence:
        best_kl_divergence, best_tsne = kl_divergence, X_tsne

# Step 6: store the two t-SNE coordinates as columns of df (one row per sample).
df['tsne_1'], df['tsne_2'] = best_tsne[:, 0], best_tsne[:, 1]


# Create a color map for authors (this rebinds unique_authors / color_map for
# the rest of the notebook session).
unique_authors = df['Target'].unique()
# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts
# the same (name, lut) arguments and works on older releases too.
colors = plt.get_cmap('tab20', len(unique_authors))
color_map = {author: colors(i) for i, author in enumerate(unique_authors)}

# Calculate the range for the t-SNE plot
tsne_1_min = df['tsne_1'].min()
tsne_1_max = df['tsne_1'].max()
tsne_2_min = df['tsne_2'].min()
tsne_2_max = df['tsne_2'].max()

# Plot t-SNE results: one scatter series per author, in that author's colour.
plt.figure(figsize=(10, 6))
for author in unique_authors:
    indices = df['Target'] == author
    plt.scatter(df.loc[indices, 'tsne_1'], df.loc[indices, 'tsne_2'], label=author, s=10, color=color_map[author])

plt.xlim(tsne_1_min, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("t-SNE Visualization of Prompts", fontsize=16)
plt.xlabel("t-SNE Dimension 1", fontsize=14)
plt.ylabel("t-SNE Dimension 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.savefig('tsne_visualization_of_prompts.png', bbox_inches='tight')
plt.show()

# Step 4: Calculate Cluster Centers (mean t-SNE coordinate per author)
cluster_centers = df.groupby('Target')[['tsne_1', 'tsne_2']].mean()

# Plot cluster centers with the same color coding
plt.figure(figsize=(10, 6))
for author in unique_authors:
    plt.scatter(cluster_centers.loc[author, 'tsne_1'], cluster_centers.loc[author, 'tsne_2'], label=author, s=100, color=color_map[author], marker='x')

# NOTE(review): the lower x-limit is hard-coded to -70 here, while the scatter
# plot above uses tsne_1_min - confirm this is intentional.
plt.xlim(-70, tsne_1_max)
plt.ylim(tsne_2_min, tsne_2_max)
plt.title("Cluster Centers of Prompts", fontsize=20)
plt.xlabel("t-SNE Dimension 1", fontsize=18)
plt.ylabel("t-SNE Dimension 2", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('cluster_centers_of_prompts.png', bbox_inches='tight')
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.show()

# Step 5: per-sample distances in t-SNE space.
#   in-cluster  : distance from each sample to its own author's centre
#   out-cluster : mean distance from each sample to every other author's samples
in_cluster_distances, out_cluster_distances = [], []
tsne_cols = ['tsne_1', 'tsne_2']

for author in df['Target'].unique():
    is_author = df['Target'] == author
    author_points = df.loc[is_author, tsne_cols]
    other_points = df.loc[~is_author, tsne_cols]

    # The centre is reshaped to a single-row matrix as pairwise_distances expects 2-D input.
    center = cluster_centers.loc[author].values.reshape(1, -1)
    in_cluster_distances.extend(pairwise_distances(author_points, center))

    out_cluster_distances.extend(pairwise_distances(author_points, other_points).mean(axis=1))

# Convert the accumulated lists to numpy arrays for plotting and statistics.
in_cluster_distances = np.array(in_cluster_distances)
out_cluster_distances = np.array(out_cluster_distances)

# Step 6: overlay both distributions as semi-transparent histograms.
plt.figure(figsize=(10, 6))
for sample, legend_label in (
    (in_cluster_distances, 'In-Cluster Distances'),
    (out_cluster_distances, 'Out-Cluster Distances'),
):
    plt.hist(sample, bins=30, alpha=0.5, label=legend_label)
plt.legend(fontsize=20)
plt.title("Distribution of In-Cluster and Out-Cluster Distances", fontsize=20)
plt.xlabel("Distance", fontsize=18)
plt.ylabel("Frequency", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig('distribution_of_distances.png', bbox_inches='tight')
plt.show()

# Calculate and display statistics
def calculate_statistics(distances):
    """Summarise a collection of distances with descriptive statistics.

    Args:
        distances: Array-like of numeric values accepted by NumPy reductions.

    Returns:
        dict mapping statistic name to value, in this key order: mean,
        median, std (NumPy default, ddof=0), min, max, then the 25th, 50th,
        75th and 90th percentiles.
    """
    reducers = [
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('min', np.min),
        ('max', np.max),
    ]
    summary = {label: reduce_fn(distances) for label, reduce_fn in reducers}
    # Percentile keys follow the "<q>th_percentile" naming used by the callers.
    for q in (25, 50, 75, 90):
        summary[f'{q}th_percentile'] = np.percentile(distances, q)
    return summary

# Summarise both distance populations with calculate_statistics.
in_cluster_stats, out_cluster_stats = (
    calculate_statistics(sample)
    for sample in (in_cluster_distances, out_cluster_distances)
)

# One column per population, one row per statistic.
stats_columns = {}
stats_columns['In-Cluster Distances'] = in_cluster_stats
stats_columns['Out-Cluster Distances'] = out_cluster_stats
stats_df = pd.DataFrame(stats_columns)

# Show the table as plain text.
print(stats_df)


## accuracy (tested using paraphrase dataset)

In [None]:
# Accuracy of the classifier on the *paraphrased* test prompts.
# NOTE(review): this header used to read "tfidf - perplexit 30, iteration 10",
# but the cell uses neither TF-IDF nor t-SNE.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'paraphrase', 'user_name']]
df.columns = ['content', 'paraphrase', 'Target']

# Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# The model input is the paraphrase column, not the original prompt.
test_x, test_y = df['paraphrase'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_val0.72321_e29.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# Evaluate in inference mode: without eval() dropout stays active and the
# reported accuracy is noisy; no_grad() avoids building autograd graphs.
model.eval()

all_preds = []
all_labels = []
pg = tqdm(test_loader, leave=False, total=len(test_loader))
with torch.no_grad():
    for i, (x1, x2, x3, y) in enumerate(pg):
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_preds.append(pred.argmax(1).cpu().numpy())
        all_labels.append(y.cpu().numpy())

all_preds = np.concatenate(all_preds, axis=0).tolist()
all_labels = np.concatenate(all_labels, axis=0).tolist()
print(all_preds)
print(all_labels)
# Compare predictions to labels
correct_predictions = sum(p == l for p, l in zip(all_preds, all_labels))
# Calculate accuracy (fraction of test samples predicted correctly)
accuracy = correct_predictions / len(all_preds)
print(accuracy)

In [None]:
# Accuracy of the classifier on the *paraphrased* test prompts.
# NOTE(review): this header used to read "tfidf - perplexit 30, iteration 10",
# but the cell uses neither TF-IDF nor t-SNE.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# Load data
# df = pd.read_csv('/content/drive/MyDrive/dataset_topic_analysis/final_50_authors_50_prompts.csv', encoding='utf-8')
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'paraphrase', 'user_name']]
df.columns = ['content', 'paraphrase', 'Target']

# Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

# The model input is the paraphrase column, not the original prompt.
test_x, test_y = df['paraphrase'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_cls_para_bert-base-cased_coe0.0_temp0.1_unit2_epoch30/diffusiondb100_cls_para_val0.72073_e24.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# Evaluate in inference mode: without eval() dropout stays active and the
# reported accuracy is noisy; no_grad() avoids building autograd graphs.
model.eval()

all_preds = []
all_labels = []
pg = tqdm(test_loader, leave=False, total=len(test_loader))
with torch.no_grad():
    for i, (x1, x2, x3, y) in enumerate(pg):
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        all_preds.append(pred.argmax(1).cpu().numpy())
        all_labels.append(y.cpu().numpy())

all_preds = np.concatenate(all_preds, axis=0).tolist()
all_labels = np.concatenate(all_labels, axis=0).tolist()
print(all_preds)
print(all_labels)
# Compare predictions to labels
correct_predictions = sum(p == l for p, l in zip(all_preds, all_labels))
# Calculate accuracy (fraction of test samples predicted correctly)
accuracy = correct_predictions / len(all_preds)
print(accuracy)

## Prediction accuracy for each author - para

In [None]:
import torch
import numpy as np
import pandas as pd

# Per-author accuracy on the topic-separated test split.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_topicseparate100_label_1.csv')
df = df[['prompt', 'user_label']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_topic_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_topic_val0.48859_e29.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Lists to store the true labels and predicted labels
all_labels = []
all_preds = []

# testing
model.eval()
# A single progress bar: a second bar that is created but never iterated
# would be left hanging in the output.
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for i, (x1, x2, x3, y) in enumerate(pg):
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        pred = torch.argmax(pred, dim=1)
        # logger: running meter fed with the per-batch accuracy
        test_acc.update((pred == y).sum().item() / len(y))
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# Create a DataFrame to hold the results
results_df = pd.DataFrame({
    'author': all_labels,    # True labels (author)
    'pred_author': all_preds # Predicted labels (author)
})

# Per-author accuracy = mean of the "prediction is correct" flag within each
# true author. Computed without groupby.apply, whose access to the grouping
# column inside the applied function is deprecated in recent pandas.
is_correct = results_df['author'] == results_df['pred_author']
author_accuracy_cls_contra = is_correct.groupby(results_df['author']).mean()

# Display the accuracy for each author
# print(author_accuracy_cls_contra)


In [None]:
# Sum of the per-author accuracies held in author_accuracy_cls_contra
# (one accuracy value per author, so this is not an overall accuracy).
author_accuracy_cls_contra.sum()

In [None]:
import torch
import numpy as np
import pandas as pd

# Per-author accuracy on the random test split.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_val0.72321_e29.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Lists to store the true labels and predicted labels
all_labels = []
all_preds = []

# testing
model.eval()
# A single progress bar: a second bar that is created but never iterated
# would be left hanging in the output.
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for i, (x1, x2, x3, y) in enumerate(pg):
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        pred = torch.argmax(pred, dim=1)
        # logger: running meter fed with the per-batch accuracy
        test_acc.update((pred == y).sum().item() / len(y))
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# Create a DataFrame to hold the results
results_df = pd.DataFrame({
    'author': all_labels,    # True labels (author)
    'pred_author': all_preds # Predicted labels (author)
})

# Per-author accuracy = mean of the "prediction is correct" flag within each
# true author. Computed without groupby.apply, whose access to the grouping
# column inside the applied function is deprecated in recent pandas.
# NOTE(review): the variable name author_accuracy_cls_contra is also assigned
# by the cell that evaluates the topic-separated split, so whichever cell ran
# last determines its value.
is_correct = results_df['author'] == results_df['pred_author']
author_accuracy_cls_contra = is_correct.groupby(results_df['author']).mean()

# Display the accuracy for each author
# print(author_accuracy_cls_contra)


In [None]:
import torch
import numpy as np
import pandas as pd

# Per-author test accuracy of the "cls" checkpoint on the paraphrased DiffusionDB test split.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Sequence length, classifier hidden width, number of output classes
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_cls_para_bert-base-cased_coe0.0_temp0.1_unit2_epoch30/diffusiondb100_cls_para_val0.72073_e24.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# True and predicted labels, accumulated batch by batch
all_labels = []
all_preds = []

# testing
model.eval()
# Build the progress bar only here, right before it is consumed, so no idle bar is left on screen.
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        pred = pred.argmax(1)  # logits -> predicted class id per sample
        # per-batch accuracy feeds the running figure shown on the progress bar
        test_acc.update((pred == y).sum().item() / len(y))
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# One row per test sample: true author vs. predicted author
results_df = pd.DataFrame({
    'author': all_labels,
    'pred_author': all_preds,
})

# Per-author accuracy = mean of the per-sample "correct" flag within each true-author group.
# Vectorised instead of groupby.apply, which operates on the grouping column and is deprecated in recent pandas.
is_correct = results_df['author'] == results_df['pred_author']
author_accuracy_cls = is_correct.groupby(results_df['author']).mean()

# Display the accuracy for each author
# print(author_accuracy_cls)


In [None]:
import torch
import numpy as np
import pandas as pd

# Per-author test accuracy of the "lcl_coe1" checkpoint on the paraphrased DiffusionDB test split.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Sequence length, classifier hidden width, number of output classes
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe2_para_bert-base-cased_coe2.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe2_para_val0.73512_e24.pt'
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_val0.73264_e16.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# True and predicted labels, accumulated batch by batch
all_labels = []
all_preds = []

# testing
model.eval()
# Build the progress bar only here, right before it is consumed, so no idle bar is left on screen.
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        pred = pred.argmax(1)  # logits -> predicted class id per sample
        # per-batch accuracy feeds the running figure shown on the progress bar
        test_acc.update((pred == y).sum().item() / len(y))
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# One row per test sample: true author vs. predicted author
results_df = pd.DataFrame({
    'author': all_labels,
    'pred_author': all_preds,
})

# Per-author accuracy = mean of the per-sample "correct" flag within each true-author group.
# Vectorised instead of groupby.apply, which operates on the grouping column and is deprecated in recent pandas.
is_correct = results_df['author'] == results_df['pred_author']
author_accuracy_lcl = is_correct.groupby(results_df['author']).mean()

# Display the accuracy for each author
# print(author_accuracy_lcl)


In [None]:
import torch
import numpy as np
import pandas as pd

# Per-author test accuracy of the club_try2 style-encoder checkpoint on the paraphrased DiffusionDB test split.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Sequence length, classifier hidden width, number of output classes
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe2_para_bert-base-cased_coe2.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe2_para_val0.73512_e24.pt'
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_try2/style_encoder_supcon_9.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# True and predicted labels, accumulated batch by batch
all_labels = []
all_preds = []

# testing
model.eval()
# Build the progress bar only here, right before it is consumed, so no idle bar is left on screen.
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        pred = pred.argmax(1)  # logits -> predicted class id per sample
        # per-batch accuracy feeds the running figure shown on the progress bar
        test_acc.update((pred == y).sum().item() / len(y))
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# One row per test sample: true author vs. predicted author
results_df = pd.DataFrame({
    'author': all_labels,
    'pred_author': all_preds,
})

# Per-author accuracy = mean of the per-sample "correct" flag within each true-author group.
# Vectorised instead of groupby.apply, which operates on the grouping column and is deprecated in recent pandas.
is_correct = results_df['author'] == results_df['pred_author']
author_accuracy_mi = is_correct.groupby(results_df['author']).mean()

# Display the accuracy for each author
# print(author_accuracy_mi)


In [None]:
# Side-by-side table of per-author accuracy: one row per author, one column per model.
accuracy_result = pd.concat(
    [author_accuracy_cls_contra, author_accuracy_cls, author_accuracy_lcl, author_accuracy_mi],
    axis=1,
    keys=['cls_contra', 'cls', 'lcl', 'mi'],
)
accuracy_result

In [None]:
# Per-author accuracy gain of the 'mi' model over each of the other models.
for baseline in ('cls', 'cls_contra', 'lcl'):
    accuracy_result[f'mi_minus_{baseline}'] = accuracy_result['mi'] - accuracy_result[baseline]
accuracy_result

In [None]:
# For every "mi minus baseline" column, count the authors on which mi is better (> 0),
# tied (== 0) or worse (< 0). Each group of counts is preceded by the column it refers to,
# because the three count lines are otherwise indistinguishable in the output.
# Rows holding NaN (an author missing from one of the models) fall into none of the three counts.
for diff_column in ('mi_minus_cls', 'mi_minus_cls_contra', 'mi_minus_lcl'):
    diffs = accuracy_result[diff_column]

    num_positive = (diffs > 0).sum()
    num_zero = (diffs == 0).sum()
    num_negative = (diffs < 0).sum()

    print(f"[{diff_column}]")
    print(f"Number of positive values: {num_positive}")
    print(f"Number of zero values: {num_zero}")
    print(f"Number of negative values: {num_negative}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the rows where 'mi' scores below 'cls'
# df_negative = accuracy_result
df_negative = accuracy_result[accuracy_result['mi_minus_cls'] < 0]

fig, ax = plt.subplots(figsize=(12, 6))

# Bars are positioned by the frame's index values, so the index must be numeric.
# NOTE(review): that index comes from the per-author groupby, i.e. it holds author labels
# rather than row numbers, although the x-axis label below says 'Row Number'.
positions = df_negative.index
bar_width = 0.4

# Paired bars per row: 'mi' (blue) shifted left, 'cls' (red) shifted right
ax.bar(positions - 0.2, df_negative['mi'], width=bar_width, label='mi', color='blue')
ax.bar(positions + 0.2, df_negative['cls'], width=bar_width, label='cls', color='red')

ax.set_xlabel('Row Number')
ax.set_ylabel('Value')
ax.set_title('mi and cls Values for Rows with Negative mi_minus_cls')
ax.legend()

plt.show()


In [None]:

# Column-wise totals: per-author accuracies (and the difference columns) summed over all authors.
accuracy_result.sum(axis=0)

## quantify model confidence

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

def compute_topk_entropy(pred, k):
    """Mean Shannon entropy, in bits, of each row's renormalised top-k class distribution.

    Args:
        pred: 2-D tensor of raw logits, one row per sample; classes lie along dim 1.
        k: number of highest-scoring classes kept per row. Values larger than the
            number of classes are clamped, which gives the entropy of the full distribution.

    Returns:
        The entropy averaged over all rows, as a Python float.
    """
    k = min(k, pred.size(1))

    # Softmax is monotonic, so the top-k logits are the top-k probabilities, and a softmax
    # over just those logits equals the top-k probabilities renormalised to sum to 1.
    topk_logits, _ = torch.topk(pred, k=k, dim=1)
    log_probs = F.log_softmax(topk_logits, dim=1)
    probs = log_probs.exp()

    # Working in log space keeps the log finite when a probability underflows to 0,
    # where p * log2(p) on plain probabilities would evaluate 0 * -inf = NaN.
    # The where() additionally covers -inf logits, using the limit p * log(p) -> 0.
    plogp = torch.where(probs > 0, probs * log_probs, torch.zeros_like(probs))

    # natural log -> bits
    entropy_bits = -plogp.sum(dim=1) / float(np.log(2.0))

    return entropy_bits.mean().item()


def get_predictions(ckpt_path, test_loader):
    """Load one checkpoint into a fresh classifier and return its logits for the whole loader.

    Args:
        ckpt_path: path of the checkpoint handed to ``load_model_dic``.
        test_loader: iterable of ``(x1, x2, x3, y)`` batches; the labels are not used here.

    Returns:
        A single tensor holding the per-batch model outputs concatenated along dim 0.
        It stays on the device the model produced it on.

    Raises:
        RuntimeError: from ``torch.cat`` when the loader yields no batches.
    """
    dropout = 0.35
    num_tokens, hidden_dim, out_dim = 256, 512, 100

    # `extractor` is the notebook-level encoder object, not a parameter of this function.
    # NOTE(review): every call wraps that same object, so loading a checkpoint presumably
    # overwrites its weights in place - confirm against load_model_dic.
    model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
    model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
    model = nn.DataParallel(model).cuda()
    model.eval()

    batch_logits = []
    with torch.no_grad():
        for x1, x2, x3, _labels in tqdm(test_loader, leave=False, total=len(test_loader)):
            inputs = (x1.cuda(), x2.cuda(), x3.cuda())
            logits, _feats = model(inputs, return_feat=True)
            batch_logits.append(logits)

    return torch.cat(batch_logits, dim=0)


# Paraphrased DiffusionDB test split, reduced to the text column and the author column
df = (
    pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    [['prompt', 'user_name']]
    .rename(columns={'prompt': 'content', 'user_name': 'Target'})
)

test_x = df['content'].tolist()
test_y = df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Sequence length, classifier hidden width, number of output classes
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(
    test_set,
    batch_size=24,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

# The three checkpoints compared in the entropy plot
ckpt_path1 = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_val0.73264_e16.pt'
# ckpt_path1 = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe2_para_bert-base-cased_coe2.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe2_para_val0.73512_e24.pt'
ckpt_path2 = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_cls_para_bert-base-cased_coe0.0_temp0.1_unit2_epoch30/diffusiondb100_cls_para_val0.72073_e24.pt'
ckpt_path3 = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_val0.72321_e29.pt'

# Logits over the whole test set, one tensor per checkpoint (evaluated in order 1, 2, 3)
all_preds1, all_preds2, all_preds3 = [
    get_predictions(path, test_loader)
    for path in (ckpt_path1, ckpt_path2, ckpt_path3)
]


In [None]:
# Top-k cut-offs at which the averaged entropy is evaluated
k_values = [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# One entropy curve per checkpoint
avg_entropies1 = [compute_topk_entropy(all_preds1, k) for k in k_values]
avg_entropies2 = [compute_topk_entropy(all_preds2, k) for k in k_values]
avg_entropies3 = [compute_topk_entropy(all_preds3, k) for k in k_values]

for curve in (avg_entropies1, avg_entropies2, avg_entropies3):
    print(curve)

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
labelled_curves = (
    ("SUP+LCL+CE", avg_entropies1),
    ("CE", avg_entropies2),
    ("SUP+CE", avg_entropies3),
)
for curve_label, curve in labelled_curves:
    ax.plot(k_values, curve, label=curve_label)
ax.set_xlabel("k")
ax.set_ylabel("Entropy (bits)")
ax.set_title("Averaged Entropy of the Prediction Score Distributions")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Dump the raw logits of the three checkpoints, in order
for logits in (all_preds1, all_preds2, all_preds3):
    print(logits)

## accuracy

In [None]:
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_albert-base-v2_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_val0.68056_e17.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_cls_para_albert-base-v2_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_cls_para_val0.66964_e24.pt'

# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_roberta-base_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_val0.73065_e22.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_cls_para_roberta-base_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_cls_para_val0.71627_e26.pt'

# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lclonly_coe1_para_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lclonly_coe1_para_val0.73859_e27.pt'

# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_lcl_coe1_para_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_lcl_coe1_para_val0.73264_e16.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_cls_para_bert-base-cased_coe0.0_temp0.1_unit2_epoch30/diffusiondb100_cls_para_val0.72073_e24.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_para_bert-base-cased_coe1_temp0.1_unit2_epoch30/diffusiondb100_supcon_para_val0.72321_e29.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_try2/style_encoder_supcon_9.pt'

# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/blogs50_cls_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/blogs50_cls_val0.82588_e29.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/blogs50_lcl_coe1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/blogs50_lcl_coe1_val0.83203_e28.pt'

# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/imdb62_cls_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/imdb62_cls_val0.97917_e27.pt'

# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_clean_cls_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_clean_cls_val0.41518_e23.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_clean_lcl_coe1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_clean_lcl_coe1_val0.43056_e22.pt'

# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb120_lcl_coe1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb120_lcl_coe1_val0.79117_e26.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb80_lcl_coe1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb80_lcl_coe1_val0.68899_e22.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb60_lcl_coe1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb60_lcl_coe1_val0.61954_e28.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_200_lcl_coe1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_200_lcl_coe1_val0.68975_e27.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/lcl/diffusiondb100_150_lcl_coe1_bert-base-cased_coe1.0_temp0.1_unit2_epoch30/diffusiondb100_150_lcl_coe1_val0.73867_e29.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_80/style_encoder_supcon1_14.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_60/style_encoder_supcon1_17.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_clean/style_encoder_supcon1_8.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_imdb621/style_encoder_supcon1_6.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_blogs501/style_encoder_supcon1_10.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_120/style_encoder_supcon1_12.pt'
# ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_100_150/style_encoder_supcon1_6.pt'
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/club_100_2001/style_encoder_supcon1_7.pt'

# Test split matching the selected checkpoint (alternatives kept commented out)
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs50/processed/blogs50_AA_test.csv')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/test_random100_label_1.csv')
# df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/vary/test_random100_200_label_1.csv')
df = df[['prompt', 'user_name']]
# df = df[['text', 'author_id']]
df.columns = ['content', 'Target']

test_x = df['content'].tolist()
test_y = df['Target'].tolist()

from transformers import BertTokenizer, BertModel, AlbertTokenizer, AlbertModel, RobertaTokenizer, RobertaModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')
# tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# extractor = AlbertModel.from_pretrained('albert-base-v2')
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# extractor = RobertaModel.from_pretrained('roberta-base')

# Sequence length, classifier hidden width, number of output classes
num_tokens, hidden_dim, out_dim = 256, 512, 200
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(
    test_set,
    batch_size=24,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

ngpus, dropout = torch.cuda.device_count(), 0.35
# num_tokens, hidden_dim, out_dim = 256, 512, 100
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

model.eval()

# Per-batch arrays of predicted class ids and true labels
all_preds = []
all_labels = []

with torch.no_grad():
    for x1, x2, x3, y in test_loader:
        x = (x1.cuda(), x2.cuda(), x3.cuda())
        y = y.cuda()
        pred, feats = model(x, return_feat=True)
        all_preds.append(pred.argmax(1).cpu().detach().numpy())
        all_labels.append(y.cpu().detach().numpy())

# Flatten the per-batch arrays into plain Python lists
all_preds = np.concatenate(all_preds, axis=0).tolist()
all_labels = np.concatenate(all_labels, axis=0).tolist()
# print(all_preds)
# print(all_labels)

# Overall accuracy: matching (prediction, label) pairs over the number of samples
correct_predictions = sum(1 for p, l in zip(all_preds, all_labels) if p == l)
accuracy = correct_predictions / len(all_preds)
print('accuracy', accuracy)

from sklearn.metrics import precision_score, recall_score, f1_score

metric_fns = (precision_score, recall_score, f1_score)

# Macro average (unweighted mean, treats all classes equally)
macro_precision, macro_recall, macro_f1 = [
    metric(all_labels, all_preds, average='macro') for metric in metric_fns
]

# Weighted average (weighted by class frequencies)
weighted_precision, weighted_recall, weighted_f1 = [
    metric(all_labels, all_preds, average='weighted') for metric in metric_fns
]

# Print the results
print(f"Macro Precision: {macro_precision}")
print(f"Macro Recall: {macro_recall}")
print(f"Macro F1: {macro_f1}")

print(f"Weighted Precision: {weighted_precision}")
print(f"Weighted Recall: {weighted_recall}")
print(f"Weighted F1: {weighted_f1}")


## prediction accuracy for each author

In [None]:
import torch
import numpy as np
import pandas as pd

# Per-author test accuracy of the diffusiondb100_val0.77108_e17 checkpoint on the processed DiffusionDB test split.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Sequence length, classifier hidden width, number of output classes
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_bert-base-cased_coe1_temp0.1_unit2_epoch20/diffusiondb100_val0.77108_e17.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# True and predicted labels, accumulated batch by batch
all_labels = []
all_preds = []

# testing
model.eval()
# Build the progress bar only here, right before it is consumed, so no idle bar is left on screen.
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        pred = pred.argmax(1)  # logits -> predicted class id per sample
        # per-batch accuracy feeds the running figure shown on the progress bar
        test_acc.update((pred == y).sum().item() / len(y))
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# One row per test sample: true author vs. predicted author
results_df = pd.DataFrame({
    'author': all_labels,
    'pred_author': all_preds,
})

# Per-author accuracy = mean of the per-sample "correct" flag within each true-author group.
# Vectorised instead of groupby.apply, which operates on the grouping column and is deprecated in recent pandas.
# NOTE(review): author_accuracy_cls_contra is also assigned by the paraphrased-split evaluation
# earlier in the notebook; whichever cell ran last determines what downstream cells see.
is_correct = results_df['author'] == results_df['pred_author']
author_accuracy_cls_contra = is_correct.groupby(results_df['author']).mean()

# Display the accuracy for each author
# print(author_accuracy_cls_contra)


In [None]:
import torch
import numpy as np
import pandas as pd

# Per-author test accuracy of the diffusiondb100_cls_val0.77510_e7 checkpoint on the processed DiffusionDB test split.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Sequence length, classifier hidden width, number of output classes
num_tokens, hidden_dim, out_dim = 256, 512, 100
test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                         pin_memory=True)

ngpus, dropout = torch.cuda.device_count(), 0.35
model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_cls_bert-base-cased_coe1_temp0.1_unit2_epoch20/diffusiondb100_cls_val0.77510_e7.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()

# True and predicted labels, accumulated batch by batch
all_labels = []
all_preds = []

# testing
model.eval()
# Build the progress bar only here, right before it is consumed, so no idle bar is left on screen.
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for x1, x2, x3, y in pg:
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        pred = pred.argmax(1)  # logits -> predicted class id per sample
        # per-batch accuracy feeds the running figure shown on the progress bar
        test_acc.update((pred == y).sum().item() / len(y))
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# One row per test sample: true author vs. predicted author
results_df = pd.DataFrame({
    'author': all_labels,
    'pred_author': all_preds,
})

# Per-author accuracy = mean of the per-sample "correct" flag within each true-author group.
# Vectorised instead of groupby.apply, which operates on the grouping column and is deprecated in recent pandas.
# NOTE(review): author_accuracy_cls is also assigned by the paraphrased-split evaluation
# earlier in the notebook; whichever cell ran last determines what downstream cells see.
is_correct = results_df['author'] == results_df['pred_author']
author_accuracy_cls = is_correct.groupby(results_df['author']).mean()

# Display the accuracy for each author
# print(author_accuracy_cls)


In [None]:
import torch
import numpy as np
import pandas as pd

# Evaluate the checkpoint at `ckpt_path` on the DiffusionDB test CSV and compute the
# per-author accuracy, stored in `author_accuracy_cls_supcon`.
df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/processed/test_random100_label_1.csv')
df = df[['prompt', 'user_name']]
df.columns = ['content', 'Target']

# # Randomly select 10 authors
# selected_authors = np.random.choice(df['Target'].unique(), size=10, replace=False)
# df = df[df['Target'].isin(selected_authors)]

test_x, test_y = df['content'].tolist(), df['Target'].tolist()

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
extractor = BertModel.from_pretrained('bert-base-cased')

# Hyper-parameters are assigned once; they must match the ones the checkpoint was trained with
# because the weights are loaded with strict=True.
num_tokens, hidden_dim, out_dim = 256, 512, 100
ngpus, dropout = torch.cuda.device_count(), 0.35

test_set = BertDataset(test_x, test_y, tokenizer, num_tokens)
test_loader = DataLoader(test_set, batch_size=24, shuffle=False, num_workers=4,
                             pin_memory=True)

model = BertClassifier(extractor, LogisticRegression(768 * num_tokens, hidden_dim, out_dim, dropout=dropout))
ckpt_path = '/content/drive/MyDrive/msc_project/model/contrastive/contrax/exp_data/diffusiondb100_supcon_cls_bert-base-cased_coe1_temp0.1_unit6_epoch30/diffusiondb100_supcon_cls_val0.79216_e24.pt'
model = load_model_dic(model, ckpt_path, verbose=True, strict=True)
model = nn.DataParallel(model).cuda()
# Lists to store the true labels and predicted labels
all_labels = []
all_preds = []

# testing
model.eval()
# Single progress bar, created right before it is consumed (an earlier, never-iterated bar
# over the same loader was removed).
pg = tqdm(test_loader, leave=False, total=len(test_loader), disable=False)
with torch.no_grad():
    test_acc = AverageMeter()
    for i, (x1, x2, x3, y) in enumerate(pg):
        # Each batch unpacks into three input tensors plus labels; all are moved to the GPU.
        x, y = (x1.cuda(), x2.cuda(), x3.cuda()), y.cuda()
        pred, feats = model(x, return_feat=True)
        # logger
        # NOTE(review): one per-batch accuracy is pushed per step; whether a smaller final batch
        # is weighted correctly depends on AverageMeter.update — confirm.
        test_acc.update((pred.argmax(1) == y).sum().item() / len(y))
        pred = torch.argmax(pred, dim=1)
        all_labels.extend(y.cpu().numpy())
        all_preds.extend(pred.cpu().numpy())
        pg.set_postfix({
            'test acc': '{:.6f}'.format(test_acc.avg),
        })

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# Create a DataFrame to hold the results
results_df = pd.DataFrame({
    'author': all_labels,    # True labels (author)
    'pred_author': all_preds # Predicted labels (author)
})

# Group by author and calculate accuracy for each author
# (fraction of that author's samples whose prediction equals the true label).
author_accuracy_cls_supcon = results_df.groupby('author').apply(lambda x: (x['author'] == x['pred_author']).mean())

# Display the accuracy for each author
# print(author_accuracy_cls_supcon)


In [None]:
# Side-by-side table of per-author accuracy: one row per author, one column per model.
# The concat was previously executed twice with identical arguments; once is enough.
# NOTE(review): `author_accuracy_cls_contra` is not assigned in the cells visible here (the
# preceding cell stores its result as `author_accuracy_cls_supcon`) — confirm which series is meant.
accuracy_result = pd.concat([author_accuracy_cls_contra, author_accuracy_cls], axis=1)
accuracy_result.columns = ['cls_contra', 'cls']
accuracy_result

In [None]:
# Column-wise sum of the per-author accuracies (one total per model column).
accuracy_result.sum()

## Dataset-level

In [None]:
if __name__ == '__main__':
    # Builds train/val/test frames from the paraphrased DiffusionDB CSV via `build_train_test`
    # and keeps the test frame as `nlp_test_paraphrased`.
    # NOTE: this rebinds the notebook-level name `datasets` to a list of dataset keys.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    # `type=bool` would turn every non-empty string (including 'False') into True, so the
    # flag text is parsed explicitly instead.
    parser.add_argument('--tqdm', type=lambda s: str(s).strip().lower() in ('1', 'true', 't', 'yes', 'y'),
                        help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are hard-coded because the notebook has no command line.
    # NOTE(review): the arguments say dataset 'blog' with 10 authors, while the frame loaded
    # below is the paraphrased DiffusionDB CSV — confirm this combination is intended.
    training_args = [
        '--dataset', 'blog',
        '--id', 'blog10',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '10',
        '--epochs', '8',
        '--model', 'bert-base-cased'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # load the paraphrased prompts and rename the columns to the ones used downstream
    df = pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    df = df[['prompt', 'user_name']]
    df.columns = ['content', 'From']

    # Compare by value (not identity) and use the resolved author count, so omitting
    # --authors no longer warns about "None".
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {num_authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    nlp_train, nlp_val, nlp_test_paraphrased = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # print(df.head(5))
    print(nlp_test_paraphrased.head(5))

In [None]:
if __name__ == '__main__':
    # Builds train/val/test frames for the 'blog' dataset with 50 authors and keeps the
    # test frame as `nlp_test_50`.
    # NOTE: this rebinds the notebook-level name `datasets` to a list of dataset keys.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    # `type=bool` would turn every non-empty string (including 'False') into True, so the
    # flag text is parsed explicitly instead.
    parser.add_argument('--tqdm', type=lambda s: str(s).strip().lower() in ('1', 'true', 't', 'yes', 'y'),
                        help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are hard-coded because the notebook has no command line.
    training_args = [
        '--dataset', 'blog',
        '--id', 'blog50',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '50',
        '--epochs', '8',
        '--model', 'bert-base-cased'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # load the raw frame for the selected dataset
    df = load_dataset_dataframe(source)

    # Compare by value (not identity) and use the resolved author count, so omitting
    # --authors no longer warns about "None".
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {num_authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    nlp_train, nlp_val, nlp_test_50 = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # print(df.head(5))
    print(nlp_test_50.head(5))

In [None]:
if __name__ == '__main__':
    # Builds train/val/test frames for the 'turing' dataset with 20 authors and keeps the
    # test frame as `nlp_test_turing`.
    # NOTE: this rebinds the notebook-level name `datasets` to a list of dataset keys.
    datasets = ['imdb62', 'blog', 'turing', 'diffusiondb']
    parser = argparse.ArgumentParser(description=f'Training models for datasets {datasets}')
    parser.add_argument('--dataset', type=str, help='dataset used for training', choices=datasets)
    parser.add_argument('--id', type=str, default='0', help='experiment id')
    parser.add_argument('--gpu', type=str, help='the cuda devices used for training', default="0,1,2,3")
    # `type=bool` would turn every non-empty string (including 'False') into True, so the
    # flag text is parsed explicitly instead.
    parser.add_argument('--tqdm', type=lambda s: str(s).strip().lower() in ('1', 'true', 't', 'yes', 'y'),
                        help='whether tqdm is on', default=False)
    parser.add_argument('--authors', type=int, help='number of authors', default=None)
    parser.add_argument('--samples-per-auth', type=int, help='number of samples per author', default=None)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--model', type=str, default='microsoft/deberta-base')
    parser.add_argument('--coe', type=float, default=1)

    # dataset - num of authors mapping
    default_num_authors = {
        'imdb62': 62,
        'blog': 50,
        'turing': 20,
        'diffusiondb': 100,
    }

    # Arguments are hard-coded because the notebook has no command line.
    training_args = [
        '--dataset', 'turing',
        '--id', 'turing',
        '--gpu', '0',
        '--tqdm', 'True',
        '--authors', '20',
        '--epochs', '8',
        '--model', 'bert-base-cased'
    ]

    # parse args
    args = parser.parse_args(training_args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    source = args.dataset
    num_authors = args.authors if args.authors is not None else default_num_authors[args.dataset]
    print(' '.join(f'{k}={v}' for k, v in vars(args).items()))  # print all args

    # masked classes
    mask_classes = {
        'blog': [],
        'imdb62': [],
        'turing': [],
        'diffusiondb': [],
    }

    # load the raw frame for the selected dataset
    df = load_dataset_dataframe(source)

    # Compare by value (not identity) and use the resolved author count, so omitting
    # --authors no longer warns about "None".
    if num_authors != default_num_authors[args.dataset]:
        warnings.warn(f"Number of authors for dataset {args.dataset} is {default_num_authors[args.dataset]}, "
                      f"but got {num_authors} instead. ")

    if args.samples_per_auth is not None:
        warnings.warn(f"Number of samples per author specified as {args.samples_per_auth}, which is a "
                      f"dangerous argument. ")

    limit = num_authors
    print("Number of authors: ", limit)

    # select top N senders and build train and test
    nlp_train, nlp_val, nlp_test_turing = build_train_test(df, source, limit, per_author=args.samples_per_auth, seed=0)

    # print(df.head(5))
    print(nlp_test_turing.head(5))

In [None]:
# Show the column labels of `nlp_test_10`.
# NOTE(review): `nlp_test_10` is not assigned in the cells visible here — confirm an earlier
# cell defines it, otherwise this fails on a fresh kernel.
print(nlp_test_10.columns)

In [None]:
!pip install lda

In [None]:
# Split each test frame into a dict {author: rows of that author}, keyed by the 'Target' column.
# Iterating a GroupBy yields (key, sub-frame) pairs, which dict() consumes directly.
grouped_by_author = nlp_test_10.groupby('Target')
blog10_documents_by_author = dict(iter(grouped_by_author))

grouped_by_author = nlp_test_50.groupby('Target')
blog50_documents_by_author = dict(iter(grouped_by_author))

grouped_by_author = nlp_test_turing.groupby('Target')
turing_documents_by_author = dict(iter(grouped_by_author))

# print(blog10_documents_by_author)
# print(blog50_documents_by_author)
# print(turing_documents_by_author)

In [None]:
# Load the four test CSVs and bring each to the common two-column layout
# ['content', 'Target'] (text, author) used by the analysis cells below.
nlp_test_paraphased = (
    pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    [['prompt', 'user_name']]
    .rename(columns={'prompt': 'content', 'user_name': 'Target'})
)

nlp_test_clean = (
    pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/clean/test_random100_label_1.csv')
    [['prompt', 'user_name']]
    .rename(columns={'prompt': 'content', 'user_name': 'Target'})
)

nlp_test_blogs50 = (
    pd.read_csv('/content/drive/MyDrive/msc_project/data/blogs/processed/blogs50_AA_test.csv')
    [['text', 'author_id']]
    .rename(columns={'text': 'content', 'author_id': 'Target'})
)

nlp_test_imdb62 = (
    pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv')
    [['text', 'author_id']]
    .rename(columns={'text': 'content', 'author_id': 'Target'})
)

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# 1. Run `fil_sent` (defined elsewhere in the notebook) over every prompt and keep the result.
nlp_test_paraphased['cleaned_text'] = nlp_test_paraphased['content'].apply(fil_sent)

# 2. Apply the 'extract_style' function to extract features and add them to the DataFrame.
# The column labels must be strings: the previous list used bare identifiers (avg_len, f_a, ...),
# which raises NameError unless globals with those names happen to exist.
# NOTE(review): assumes `extract_style` returns one value per label below, in this order — confirm.
style_columns = ['avg_len', 'len_text', 'len_words', 'num_short_w', 'per_digit', 'per_cap',
                 'f_a', 'f_b', 'f_c', 'f_d', 'f_e', 'f_f', 'f_g', 'f_h', 'f_i', 'f_j', 'f_k', 'f_l', 'f_m',
                 'f_n', 'f_o', 'f_p', 'f_q', 'f_r', 'f_s', 'f_t', 'f_u', 'f_v', 'f_w', 'f_x', 'f_y', 'f_z',
                 'f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8', 'f_9',
                 'f_e_0', 'f_e_1', 'f_e_2', 'f_e_3', 'f_e_4', 'f_e_5', 'f_e_6', 'f_e_7', 'f_e_8', 'f_e_9',
                 'f_e_10', 'f_e_11', 'richness']
nlp_test_paraphased[style_columns] = nlp_test_paraphased['cleaned_text'].apply(extract_style)


In [None]:
# Split each test frame into a dict {author: rows of that author}, keyed by the 'Target' column.
# Iterating a GroupBy yields (key, sub-frame) pairs, which dict() consumes directly.
grouped_by_author = nlp_test_paraphased.groupby('Target')
paraphrased_documents_by_author = dict(iter(grouped_by_author))

grouped_by_author = nlp_test_clean.groupby('Target')
clean_documents_by_author = dict(iter(grouped_by_author))

grouped_by_author = nlp_test_blogs50.groupby('Target')
blogs50_documents_by_author = dict(iter(grouped_by_author))

grouped_by_author = nlp_test_imdb62.groupby('Target')
imdb62_documents_by_author = dict(iter(grouped_by_author))

In [None]:
from __future__ import division, print_function
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import map_tag
from collections import Counter
import nltk
import os
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from scipy.spatial.distance import jensenshannon

import lda
import re
import lda.datasets
from collections import defaultdict
from numpy import sum
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
from numpy.linalg import norm
from collections import Counter
import argparse
import warnings

# Re-seed NumPy's global RNG for this analysis cell (the notebook's setup cell seeds it with 0).
np.random.seed(1337)

def _ngram_counts(text, x_train, analyzer, n, stop_words=None):
    """Count the most frequent `n`-grams of `x_train` inside each document of `text`.

    A first vectorizer learns the vocabulary from `x_train` (at most 100 n-grams, dropping
    those in more than 95% of documents or in fewer than 2). A second vectorizer with that
    fixed vocabulary then counts the n-grams in `text`.

    The second vectorizer is not fitted on `x_train` again: with a fixed vocabulary, fitting
    learns nothing, and `max_features` is ignored, so the earlier extra pass was wasted work.

    NOTE(review): as before, `stop_words` is applied only while learning the vocabulary, not
    while counting. For word n-grams with n > 1 the counting pass therefore tokenises
    differently from the pass that built the vocabulary — confirm this is intended.

    Args:
        text: iterable of raw documents to vectorise.
        x_train: iterable of raw documents used to learn the vocabulary.
        analyzer: "char" or "word", forwarded to CountVectorizer.
        n: n-gram order; the range used is (n, n).
        stop_words: forwarded to the vocabulary-learning vectorizer only.

    Returns:
        Dense array of counts with one row per document in `text`.
    """
    vocab_fitter = CountVectorizer(analyzer=analyzer, ngram_range=(n, n), max_df=0.95, min_df=2,
                                   max_features=100, stop_words=stop_words)
    vocab_fitter.fit(x_train)
    counter = CountVectorizer(analyzer=analyzer, ngram_range=(n, n), vocabulary=vocab_fitter.vocabulary_)
    return counter.transform(text).toarray()


def char_bigram(text, x_train):
    """Counts of the top character 2-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="char", n=2)


def char_trigram(text, x_train):
    """Counts of the top character 3-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="char", n=3)


def word_unigram(text, x_train):
    """Counts of the top non-stop-word unigrams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="word", n=1, stop_words="english")


def word_bigram(text, x_train):
    """Counts of the top word 2-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="word", n=2, stop_words="english")


def word_trigram(text, x_train):
    """Counts of the top word 3-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="word", n=3, stop_words="english")


# Topic analysis
def clean_str(string):
    """Tokenise-friendly cleanup of a raw text string.

    Steps, in order: replace every character outside ``A-Za-z0-9(),!?'`` and the backtick
    with a space; replace digit runs with a space; detach the clitics ``'s 've n't 're 'd 'll``
    from the preceding word; surround ``, ! ( ) ?`` with spaces; collapse whitespace runs and
    strip the ends.

    The parentheses and question mark used to be replaced by ``" \\( "``, ``" \\) "`` and
    ``" \\? "``, which wrote a literal backslash into the output (and are invalid escape
    sequences in a normal string literal). They are now padded with plain spaces like the
    other punctuation.

    Args:
        string: text to clean.

    Returns:
        The cleaned string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\d+", " ", string)
    # Order matters: it is the same order the substitutions were applied in before.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def sample_prompts(group, n=100):
    """Return at most `n` rows of `group`.

    Groups with more than `n` rows are down-sampled with a fixed seed (random_state=42);
    smaller groups are returned unchanged (the same object, not a copy).
    """
    if len(group) > n:
        return group.sample(n=n, random_state=42)
    return group

def load_diffusiondb(df):
    """Turn a text/author frame into cleaned documents plus a document-to-author map.

    The author column may be called 'From' (the name this function originally required) or
    'Target' (the name carried by the test frames built in this notebook); 'From' wins when
    both are present.

    Args:
        df: DataFrame with a 'content' column and a 'From' or 'Target' column.

    Returns:
        (X, dict_author): `X` is the list of `clean_str`-cleaned texts in row order and
        `dict_author` maps each positional document index to its author.

    Raises:
        KeyError: if 'content' or both author columns are missing.
    """
    author_col = 'From' if 'From' in df.columns else 'Target'
    texts = df['content'].tolist()
    authors = df[author_col].tolist()

    X = [clean_str(text) for text in texts]
    dict_author = dict(enumerate(authors))    # id doc: author_name
    return X, dict_author


def JSD(P, Q):
    """Jensen-Shannon divergence between two non-negative vectors.

    Both inputs are scaled by their L1 norm, then the divergence is the mean of the two
    KL divergences (via `scipy.stats.entropy`) against the midpoint distribution.
    """
    p = P / norm(P, ord=1)
    q = Q / norm(Q, ord=1)
    midpoint = (p + q) * 0.5
    kl_p = entropy(p, midpoint)
    kl_q = entropy(q, midpoint)
    return (kl_p + kl_q) * 0.5


def topics_analysis(df, number_of_topics):
    """Fit an LDA topic model on `df` and return each author's mean topic distribution.

    The texts are cleaned by `load_diffusiondb`, vectorised into term counts (English stop
    words removed, terms in more than 95% of documents or in fewer than 5 dropped) and fed
    to `lda.LDA` (500 iterations, fixed random_state). Topics and per-author dominant-topic
    counts are printed along the way.

    Fixed relative to the previous version: the per-author list of dominant topics is now
    kept per author. Before, documents were appended to whichever list had been created last,
    so the printed counts were wrong whenever an author's documents were not contiguous.

    Args:
        df: frame accepted by `load_diffusiondb`.
        number_of_topics: number of LDA topics.

    Returns:
        dict mapping author -> one-element list holding that author's mean document-topic
        vector (length `number_of_topics`).

    Side effects:
        Installs a process-wide filter that ignores FutureWarning; prints progress.
    """
    warnings.simplefilter(action='ignore', category=FutureWarning)

    X, dict_author = load_diffusiondb(df)

    # create vocabulary
    print ("creating vocabulary..")
    print ("---------------------------")

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
    X_tf = tf_vectorizer.fit_transform(X)
    vocab = tf_vectorizer.get_feature_names_out()
    print("shape: {}\n".format(X_tf.shape))

    # building topic model using LDA
    print ("building model..")
    print ("---------------------------")
    model = lda.LDA(n_topics=number_of_topics, n_iter=500, random_state=1000)
    model.fit(X_tf)
    topic_word = model.topic_word_
    print("shape: {}".format(topic_word.shape))

    # show the highest-weighted words of every topic
    top_n = 10
    for topic_idx, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(top_n + 1):-1]
        print('*Topic {}\n- {}'.format(topic_idx, ' '.join(topic_words)))

    print ("document topic model..")
    print ("---------------------------")
    doc_topic = model.doc_topic_

    # author -> indices of that author's documents, in first-appearance order
    docs_by_author = defaultdict(list)
    for doc_id, author in dict_author.items():
        docs_by_author[author].append(doc_id)

    # dominant topic of each document, tallied per author
    for i, (author, doc_ids) in enumerate(docs_by_author.items()):
        print (i, author, Counter(doc_topic[doc_ids].argmax(axis=1)))

    # mean topic distribution per author; wrapped in a list because callers read element [0]
    author_topics = {}
    for author, doc_ids in docs_by_author.items():
        author_topics[author] = [doc_topic[doc_ids].mean(axis=0)]

    return author_topics

def extract_content_features(doc, df):
    """Word-level n-gram count features (unigram + bigram + trigram) for each document.

    The three count matrices are joined along the feature axis (axis=1), giving one row per
    document. The previous default (axis=0) stacked them along the document axis, so averaging
    over rows mixed unrelated feature spaces, and the call failed when the three vocabularies
    had different sizes.

    Args:
        doc: iterable of raw documents to vectorise.
        df: reference frame; its 'content' column is the corpus the vocabularies are learnt from.

    Returns:
        2-D array of shape (number of documents, total number of n-gram features).
    """
    reference_texts = df['content'].tolist()
    word_unigrams = word_unigram(doc, reference_texts)
    word_bigrams = word_bigram(doc, reference_texts)
    word_trigrams = word_trigram(doc, reference_texts)
    return np.concatenate([word_unigrams, word_bigrams, word_trigrams], axis=1)

def extract_style_features(row, df):
    """Pick the 53 stylometric columns out of one row and return them as an array.

    The selection is avg_len, num_short_w, per_digit, per_cap, the per-letter columns
    f_a..f_z, the per-digit columns f_0..f_9, f_e_0..f_e_11 and richness, in that order.
    `df` is accepted for signature parity with the other extractors and is not used.
    """
    letter_cols = ['f_' + ch for ch in 'abcdefghijklmnopqrstuvwxyz']
    digit_cols = ['f_' + str(d) for d in range(10)]
    extra_cols = ['f_e_' + str(k) for k in range(12)]
    selected = ['avg_len', 'num_short_w', 'per_digit', 'per_cap',
                *letter_cols, *digit_cols, *extra_cols, 'richness']
    return row[selected].to_numpy()

def extract_hybrid_features(row, df):
    """Character-level n-gram count features (bigram + trigram) for each document.

    The two count matrices are joined along the feature axis (axis=1), giving one row per
    document. The previous default (axis=0) stacked them along the document axis, so averaging
    over rows mixed bigram and trigram counts, and the call failed when the two vocabularies
    had different sizes.

    Args:
        row: iterable of raw documents to vectorise (despite the name, not a single row).
        df: reference frame; its 'content' column is the corpus the vocabularies are learnt from.

    Returns:
        2-D array of shape (number of documents, total number of n-gram features).
    """
    reference_texts = df['content'].tolist()
    char_bigrams = char_bigram(row, reference_texts)
    char_trigrams = char_trigram(row, reference_texts)
    return np.concatenate([char_bigrams, char_trigrams], axis=1)

def extract_topic_features(row, df):
    """Placeholder for the 'Topic' feature family: does nothing and returns None.

    It exists so 'Topic' can have an entry in the extractor mapping; the topic code path
    (`compute_inter_author_dissimilarity_topics`) never calls the extractor it is given.
    """
    pass

def compute_author_representation(documents, feature_extractor, df):
    """Average feature vector of one author's documents.

    The extractor is called once with all of the author's texts (the 'content' column as an
    array) and the reference frame `df`; the result is averaged over axis 0.
    """
    texts = documents['content'].to_numpy()
    per_document = feature_extractor(texts, df)
    return np.mean(per_document, axis=0)

def compute_author_representation_style(documents, feature_extractor, df):
    """Average feature vector of one author's documents, extracted row by row.

    Unlike `compute_author_representation`, the extractor is called once per row of
    `documents` (with the row and the reference frame `df`); the per-row vectors are
    stacked and averaged over axis 0.
    """
    per_row = []
    for _, record in documents.iterrows():
        per_row.append(feature_extractor(record, df))
    return np.mean(np.array(per_row), axis=0)

def compute_inter_author_dissimilarity(dfs, feature_extractor, df, feature_type, distance_metric='cosine'):
    """Mean pairwise dissimilarity between the authors' average feature vectors.

    Fixed relative to the previous version: `scipy.spatial.distance.cosine` already returns
    the cosine *distance* (1 - similarity). The old `1 - cosine(...)` therefore averaged
    similarities, while the 'jsd' branch averaged divergences. Both branches now return a
    dissimilarity (larger = more different).

    The metric is also validated before any feature extraction, so a bad name fails fast.

    Args:
        dfs: dict mapping author -> DataFrame of that author's documents.
        feature_extractor: callable producing feature vectors (see the two
            `compute_author_representation*` helpers for how it is invoked).
        df: reference frame forwarded to the extractor.
        feature_type: 'Style' selects row-wise extraction; any other value extracts from the
            'content' column in one call.
        distance_metric: 'cosine' (cosine distance) or 'jsd' (squared Jensen-Shannon distance).

    Returns:
        Sum of the dissimilarities over all ordered author pairs — including each author
        paired with itself, as before — divided by the squared number of authors.

    Raises:
        ValueError: if `distance_metric` is not 'cosine' or 'jsd'.
    """
    if distance_metric == 'cosine':
        pair_distance = cosine
    elif distance_metric == 'jsd':
        pair_distance = lambda u, v: jensenshannon(u, v) ** 2
    else:
        raise ValueError("Unknown distance metric")

    if feature_type == 'Style':
        represent = compute_author_representation_style
    else:
        represent = compute_author_representation

    author_representations = {
        author: represent(docs, feature_extractor, df) for author, docs in dfs.items()
    }

    vectors = list(author_representations.values())
    num_authors = len(vectors)

    # Calculate pairwise dissimilarity over every ordered pair (i, j)
    dissimilarities = [pair_distance(vec_i, vec_j) for vec_i in vectors for vec_j in vectors]

    # Return the average dissimilarity
    return np.sum(dissimilarities) / (num_authors ** 2)

def compute_inter_author_dissimilarity_topics(dfs, feature_extractor, df, distance_metric='cosine',
                                              number_of_topics=20):
    """Mean pairwise dissimilarity between the authors' mean LDA topic distributions.

    Fixed relative to the previous version: `scipy.spatial.distance.cosine` already returns
    the cosine *distance*, so the old `1 - cosine(...)` averaged similarities; it now averages
    distances. The metric is validated before the (slow) LDA fit, and the topic count is a
    keyword argument whose default is the former hard-coded 20.

    Args:
        dfs: unused; kept so the signature matches `compute_inter_author_dissimilarity`.
        feature_extractor: unused; kept for the same reason.
        df: frame handed to `topics_analysis`.
        distance_metric: 'cosine' (cosine distance) or 'jsd' (squared Jensen-Shannon distance).
        number_of_topics: number of LDA topics passed to `topics_analysis`.

    Returns:
        Sum of the dissimilarities over all ordered author pairs — including each author
        paired with itself, as before — divided by the squared number of authors.

    Raises:
        ValueError: if `distance_metric` is not 'cosine' or 'jsd'.
    """
    if distance_metric not in ('cosine', 'jsd'):
        raise ValueError("Unknown distance metric")

    author_representations = topics_analysis(df, number_of_topics)

    # topics_analysis stores each author's vector inside a one-element list
    vectors = [wrapped[0] for wrapped in author_representations.values()]
    num_authors = len(vectors)

    # Calculate pairwise dissimilarity over every ordered pair (i, j)
    dissimilarities = []
    for vec_i in vectors:
        for vec_j in vectors:
            if distance_metric == 'cosine':
                dissimilarity = cosine(vec_i, vec_j)
            else:
                # normalise explicitly to probability vectors before the divergence
                dissimilarity = jensenshannon(vec_i / np.sum(vec_i), vec_j / np.sum(vec_j)) ** 2
            dissimilarities.append(dissimilarity)

    # Return the average dissimilarity
    return np.sum(dissimilarities) / (num_authors ** 2)


# datasets = {
#     'Blog10': blog10_documents_by_author,  # Replace with actual data
#     'Blog50': blog50_documents_by_author,
#     # 'TuringBench': turing_documents_by_author
# }

# datasets1 = {
#     'Blog10': nlp_test_10,  # Replace with actual data
#     'Blog50': nlp_test_50,
#     # 'TuringBench': turing_documents_by_author
# }

# Per dataset: author -> frame of that author's test documents.
datasets = dict(
    paraphrased=paraphrased_documents_by_author,
    clean=clean_documents_by_author,
    blogs50=blogs50_documents_by_author,
    imdb62=imdb62_documents_by_author,
)

# Per dataset: the full (ungrouped) test frame, used as the reference corpus.
datasets1 = dict(
    paraphrased=nlp_test_paraphased,
    clean=nlp_test_clean,
    blogs50=nlp_test_blogs50,
    imdb62=nlp_test_imdb62,
)

# feature_extractors = {
#     'Topic': extract_topic_features,
#     'Hybrid': extract_hybrid_features,
#     #'Content': extract_content_features,
#     #'Style': extract_style_features,

# }

# Feature families evaluated by the loop below. Only 'Hybrid' is enabled here; 'Topic',
# 'Content' and 'Style' can be added with extract_topic_features, extract_content_features
# and extract_style_features respectively.
feature_extractors = dict(Hybrid=extract_hybrid_features)

# Distance used for each feature family.
distance_metrics = dict(Content='cosine', Style='cosine', Hybrid='cosine', Topic='jsd')

# results[dataset][feature family] -> mean inter-author dissimilarity
results = {}

for dataset_name, dfs in datasets.items():
    results[dataset_name] = {}
    # The loop variable is deliberately not called `extractor`: that notebook-level name holds
    # the BERT encoder used by the evaluation cells and must not be clobbered here.
    for feature_type, feature_fn in feature_extractors.items():
        print(feature_type)
        if feature_type != 'Topic':
            dissimilarity = compute_inter_author_dissimilarity(
                dfs, feature_fn, datasets1[dataset_name], feature_type,
                distance_metric=distance_metrics[feature_type]
            )
        else:
            dissimilarity = compute_inter_author_dissimilarity_topics(
                dfs, feature_fn, datasets1[dataset_name],
                distance_metric=distance_metrics[feature_type]
            )
        print(dissimilarity)
        results[dataset_name][feature_type] = dissimilarity

# Scale every feature family so that the largest value across datasets becomes 1.
for feature_type in feature_extractors.keys():
    max_value = max(results[dataset_name][feature_type] for dataset_name in datasets.keys())
    if max_value == 0:
        # every dataset scored 0: leave the zeros instead of dividing by zero
        continue

    for dataset_name in datasets.keys():
        results[dataset_name][feature_type] /= max_value

print(results)

In [None]:
from __future__ import division, print_function
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import map_tag
from collections import Counter
import nltk
import os
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from scipy.spatial.distance import jensenshannon

import lda
import re
import lda.datasets
from collections import defaultdict
from numpy import sum
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
from numpy.linalg import norm
from collections import Counter
import argparse
import warnings

# Re-seed NumPy's global RNG for this analysis cell (the notebook's setup cell seeds it with 0).
np.random.seed(1337)

def _ngram_counts(text, x_train, analyzer, n, stop_words=None):
    """Count the most frequent `n`-grams of `x_train` inside each document of `text`.

    A first vectorizer learns the vocabulary from `x_train` (at most 100 n-grams, dropping
    those in more than 95% of documents or in fewer than 2). A second vectorizer with that
    fixed vocabulary then counts the n-grams in `text`.

    The second vectorizer is not fitted on `x_train` again: with a fixed vocabulary, fitting
    learns nothing, and `max_features` is ignored, so the earlier extra pass was wasted work.

    NOTE(review): as before, `stop_words` is applied only while learning the vocabulary, not
    while counting. For word n-grams with n > 1 the counting pass therefore tokenises
    differently from the pass that built the vocabulary — confirm this is intended.

    Args:
        text: iterable of raw documents to vectorise.
        x_train: iterable of raw documents used to learn the vocabulary.
        analyzer: "char" or "word", forwarded to CountVectorizer.
        n: n-gram order; the range used is (n, n).
        stop_words: forwarded to the vocabulary-learning vectorizer only.

    Returns:
        Dense array of counts with one row per document in `text`.
    """
    vocab_fitter = CountVectorizer(analyzer=analyzer, ngram_range=(n, n), max_df=0.95, min_df=2,
                                   max_features=100, stop_words=stop_words)
    vocab_fitter.fit(x_train)
    counter = CountVectorizer(analyzer=analyzer, ngram_range=(n, n), vocabulary=vocab_fitter.vocabulary_)
    return counter.transform(text).toarray()


def char_bigram(text, x_train):
    """Counts of the top character 2-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="char", n=2)


def char_trigram(text, x_train):
    """Counts of the top character 3-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="char", n=3)


def word_unigram(text, x_train):
    """Counts of the top non-stop-word unigrams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="word", n=1, stop_words="english")


def word_bigram(text, x_train):
    """Counts of the top word 2-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="word", n=2, stop_words="english")


def word_trigram(text, x_train):
    """Counts of the top word 3-grams of `x_train` for each document in `text`."""
    return _ngram_counts(text, x_train, analyzer="word", n=3, stop_words="english")


# Topic analysis
def clean_str(string):
    """Tokenise-friendly cleanup of a raw text string.

    Steps, in order: replace every character outside ``A-Za-z0-9(),!?'`` and the backtick
    with a space; replace digit runs with a space; detach the clitics ``'s 've n't 're 'd 'll``
    from the preceding word; surround ``, ! ( ) ?`` with spaces; collapse whitespace runs and
    strip the ends.

    The parentheses and question mark used to be replaced by ``" \\( "``, ``" \\) "`` and
    ``" \\? "``, which wrote a literal backslash into the output (and are invalid escape
    sequences in a normal string literal). They are now padded with plain spaces like the
    other punctuation.

    Args:
        string: text to clean.

    Returns:
        The cleaned string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\d+", " ", string)
    # Order matters: it is the same order the substitutions were applied in before.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def sample_prompts(group, n=100):
    """Return at most `n` rows of `group`.

    Groups with more than `n` rows are down-sampled with a fixed seed (random_state=42);
    smaller groups are returned unchanged (the same object, not a copy).
    """
    if len(group) > n:
        return group.sample(n=n, random_state=42)
    return group

def load_diffusiondb(df):
    """Turn a text/author frame into cleaned documents plus a document-to-author map.

    Args:
        df: DataFrame with 'content' (text) and 'Target' (author) columns.

    Returns:
        (X, dict_author): the `clean_str`-cleaned texts in row order, and a dict mapping
        each positional document index to its author.
    """
    frame = df[['content', 'Target']]
    cleaned_texts = [clean_str(text) for text in frame['content'].tolist()]
    doc_to_author = dict(enumerate(frame['Target'].tolist()))    # id doc: author_name
    return cleaned_texts, doc_to_author


def JSD(P, Q):
    """Jensen-Shannon divergence between two non-negative vectors.

    Both inputs are scaled by their L1 norm, then the divergence is the mean of the two
    KL divergences (via `scipy.stats.entropy`) against the midpoint distribution.
    """
    p = P / norm(P, ord=1)
    q = Q / norm(Q, ord=1)
    midpoint = (p + q) * 0.5
    kl_p = entropy(p, midpoint)
    kl_q = entropy(q, midpoint)
    return (kl_p + kl_q) * 0.5


def topics_analysis(df, number_of_topics):
    """Fit an LDA topic model on `df` and return each author's mean topic distribution.

    The texts are cleaned by `load_diffusiondb`, vectorised into term counts (English stop
    words removed, terms in more than 95% of documents or in fewer than 5 dropped) and fed
    to `lda.LDA` (500 iterations, fixed random_state). Topics and per-author dominant-topic
    counts are printed along the way.

    Fixed relative to the previous version: the per-author list of dominant topics is now
    kept per author. Before, documents were appended to whichever list had been created last,
    so the printed counts were wrong whenever an author's documents were not contiguous.

    Args:
        df: frame accepted by `load_diffusiondb`.
        number_of_topics: number of LDA topics.

    Returns:
        dict mapping author -> one-element list holding that author's mean document-topic
        vector (length `number_of_topics`).

    Side effects:
        Installs a process-wide filter that ignores FutureWarning; prints progress.
    """
    warnings.simplefilter(action='ignore', category=FutureWarning)

    X, dict_author = load_diffusiondb(df)

    # create vocabulary
    print ("creating vocabulary..")
    print ("---------------------------")

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
    X_tf = tf_vectorizer.fit_transform(X)
    vocab = tf_vectorizer.get_feature_names_out()
    print("shape: {}\n".format(X_tf.shape))

    # building topic model using LDA
    print ("building model..")
    print ("---------------------------")
    model = lda.LDA(n_topics=number_of_topics, n_iter=500, random_state=1000)
    model.fit(X_tf)
    topic_word = model.topic_word_
    print("shape: {}".format(topic_word.shape))

    # show the highest-weighted words of every topic
    top_n = 10
    for topic_idx, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(top_n + 1):-1]
        print('*Topic {}\n- {}'.format(topic_idx, ' '.join(topic_words)))

    print ("document topic model..")
    print ("---------------------------")
    doc_topic = model.doc_topic_

    # author -> indices of that author's documents, in first-appearance order
    docs_by_author = defaultdict(list)
    for doc_id, author in dict_author.items():
        docs_by_author[author].append(doc_id)

    # dominant topic of each document, tallied per author
    for i, (author, doc_ids) in enumerate(docs_by_author.items()):
        print (i, author, Counter(doc_topic[doc_ids].argmax(axis=1)))

    # mean topic distribution per author; wrapped in a list because callers read element [0]
    author_topics = {}
    for author, doc_ids in docs_by_author.items():
        author_topics[author] = [doc_topic[doc_ids].mean(axis=0)]

    return author_topics

def extract_content_features(doc, df):
    """Build the content feature vector from word-level n-grams.

    Calls ``word_unigram``, ``word_bigram`` and ``word_trigram`` (in that
    order), each with ``doc`` and a freshly built list of ``df['content']``,
    and joins the three results with ``np.concatenate``.

    Args:
        doc: Forwarded unchanged as the first argument of every helper.
        df: Object indexable by ``'content'`` whose value offers ``tolist()``.

    Returns:
        The concatenation of the unigram, bigram and trigram outputs.
    """
    ngram_blocks = []
    for ngram_fn in (word_unigram, word_bigram, word_trigram):
        # One new list per helper call, exactly as many as there are helpers.
        reference_texts = df['content'].tolist()
        ngram_blocks.append(ngram_fn(doc, reference_texts))
    return np.concatenate(ngram_blocks)

def extract_style_features(row, df):
    """Pick the fixed set of style columns out of ``row`` as an array.

    Args:
        row: Object that supports list-based ``row[[...]]`` selection and
            whose selection offers ``to_numpy()``.
        df: Unused; accepted so all extractors share one call signature.

    Returns:
        ``row[<style columns>].to_numpy()`` with the columns in the order
        listed below.
    """
    summary_cols = ['avg_len', 'num_short_w', 'per_digit', 'per_cap']
    letter_cols = [
        'f_a', 'f_b', 'f_c', 'f_d', 'f_e', 'f_f', 'f_g', 'f_h', 'f_i',
        'f_j', 'f_k', 'f_l', 'f_m', 'f_n', 'f_o', 'f_p', 'f_q', 'f_r',
        'f_s', 'f_t', 'f_u', 'f_v', 'f_w', 'f_x', 'f_y', 'f_z',
    ]
    digit_cols = [
        'f_0', 'f_1', 'f_2', 'f_3', 'f_4',
        'f_5', 'f_6', 'f_7', 'f_8', 'f_9',
    ]
    f_e_cols = [
        'f_e_0', 'f_e_1', 'f_e_2', 'f_e_3', 'f_e_4', 'f_e_5',
        'f_e_6', 'f_e_7', 'f_e_8', 'f_e_9', 'f_e_10', 'f_e_11',
    ]
    # Order matters: it fixes the position of every feature in the output.
    ordered_cols = summary_cols + letter_cols + digit_cols + f_e_cols + ['richness']
    selected = row[ordered_cols]
    return selected.to_numpy()

def extract_hybrid_features(row, df):
    """Build the hybrid feature vector from character-level n-grams.

    Calls ``char_bigram`` and then ``char_trigram``, each with ``row`` and a
    freshly built list of ``df['content']``, and joins both results with
    ``np.concatenate``.

    Args:
        row: Forwarded unchanged as the first argument of both helpers.
        df: Object indexable by ``'content'`` whose value offers ``tolist()``.

    Returns:
        The concatenation of the bigram and trigram outputs.
    """
    char_blocks = []
    for char_fn in (char_bigram, char_trigram):
        reference_texts = df['content'].tolist()
        char_blocks.append(char_fn(row, reference_texts))
    return np.concatenate(char_blocks)

def extract_topic_features(row, df):
    """Placeholder extractor: accepts ``row`` and ``df``, does nothing.

    Returns:
        Always ``None``.
    """
    return None

def compute_author_representation(documents, feature_extractor, df):
    """Average the features extracted from all of an author's texts at once.

    Args:
        documents: Object where ``documents['content'].to_numpy()`` yields
            the author's texts.
        feature_extractor: Callable invoked once as
            ``feature_extractor(texts, df)``.
        df: Forwarded unchanged to ``feature_extractor``.

    Returns:
        ``np.mean`` of the extractor output along axis 0.
    """
    author_texts = documents['content'].to_numpy()
    per_document = feature_extractor(author_texts, df)
    return np.mean(per_document, axis=0)

def compute_author_representation_style(documents, feature_extractor, df):
    """Average row-wise features over all of an author's documents.

    Unlike ``compute_author_representation``, the extractor is called once
    per row yielded by ``documents.iterrows()``.

    Args:
        documents: Object offering ``iterrows()`` that yields
            ``(index, row)`` pairs.
        feature_extractor: Callable invoked as ``feature_extractor(row, df)``.
        df: Forwarded unchanged to ``feature_extractor``.

    Returns:
        ``np.mean`` along axis 0 of the stacked per-row feature vectors.
    """
    per_row = []
    for _, row in documents.iterrows():
        per_row.append(feature_extractor(row, df))
    stacked = np.array(per_row)
    return np.mean(stacked, axis=0)

def compute_inter_author_dissimilarity(dfs, feature_extractor, df, feature_type, distance_metric='cosine'):
    """Average pairwise dissimilarity between per-author mean feature vectors.

    One representation is built per author, using
    ``compute_author_representation_style`` when ``feature_type == 'Style'``
    and ``compute_author_representation`` otherwise. Every ordered pair of
    authors is then compared, self-pairs included, and the sum is divided by
    ``num_authors ** 2``.

    Args:
        dfs: Mapping of author -> that author's documents.
        feature_extractor: Forwarded to the representation helper.
        df: Forwarded to the representation helper.
        feature_type: ``'Style'`` selects the row-wise helper; any other
            value selects the whole-corpus helper.
        distance_metric: ``'cosine'`` (cosine distance, i.e. 1 - cosine
            similarity) or ``'jsd'`` (squared Jensen-Shannon distance).

    Returns:
        The mean dissimilarity over all ordered author pairs, or NaN when
        ``dfs`` is empty.

    Raises:
        ValueError: If ``distance_metric`` is not ``'cosine'`` or ``'jsd'``.
            Raised before any feature extraction is done.
    """
    # Imported locally so the metric is unambiguous: SciPy's ``cosine`` is
    # already a *distance* (0 for identical directions). Taking ``1 - cosine``
    # would turn it back into a similarity and report identical authors as
    # maximally "dissimilar", contradicting the JSD branch.
    from scipy.spatial.distance import cosine as cosine_distance
    from scipy.spatial.distance import jensenshannon as js_distance

    # Fail fast: reject a bad metric before the expensive feature extraction.
    if distance_metric not in ('cosine', 'jsd'):
        raise ValueError("Unknown distance metric")

    if feature_type == 'Style':
        represent = compute_author_representation_style
    else:
        represent = compute_author_representation

    vectors = [represent(docs, feature_extractor, df) for docs in dfs.values()]
    num_authors = len(vectors)
    if num_authors == 0:
        # No authors: keep the NaN result without the 0/0 runtime warning.
        return float('nan')

    # Calculate pairwise dissimilarity over all ordered pairs (i, j).
    dissimilarities = []
    for vec_i in vectors:
        for vec_j in vectors:
            if distance_metric == 'cosine':
                dissimilarity = cosine_distance(vec_i, vec_j)
            else:
                # jensenshannon returns the distance (a square root);
                # squaring it gives the Jensen-Shannon divergence.
                dissimilarity = js_distance(vec_i, vec_j) ** 2
            dissimilarities.append(dissimilarity)

    # Return the average dissimilarity
    return np.sum(dissimilarities) / (num_authors ** 2)

def compute_inter_author_dissimilarity_topics(dfs, feature_extractor, df, distance_metric='cosine', number_of_topics=20):
    """Average pairwise dissimilarity between per-author topic vectors.

    Author representations come from ``topics_analysis(df, number_of_topics)``;
    the first element of each author's entry is used as that author's vector.
    Every ordered pair of authors is compared, self-pairs included, and the
    sum is divided by ``num_authors ** 2``.

    Args:
        dfs: Unused; accepted so the call matches
            ``compute_inter_author_dissimilarity``.
        feature_extractor: Unused; accepted for the same reason.
        df: Forwarded to ``topics_analysis``.
        distance_metric: ``'cosine'`` (cosine distance, i.e. 1 - cosine
            similarity) or ``'jsd'`` (squared Jensen-Shannon distance).
        number_of_topics: Topic count forwarded to ``topics_analysis``.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        The mean dissimilarity over all ordered author pairs, or NaN when
        ``topics_analysis`` returns no authors.

    Raises:
        ValueError: If ``distance_metric`` is not ``'cosine'`` or ``'jsd'``.
            Raised before the topic model is fitted.
    """
    # Imported locally so the metric is unambiguous: SciPy's ``cosine`` is
    # already a *distance* (0 for identical directions). Taking ``1 - cosine``
    # would turn it back into a similarity and report identical authors as
    # maximally "dissimilar", contradicting the JSD branch.
    from scipy.spatial.distance import cosine as cosine_distance
    from scipy.spatial.distance import jensenshannon as js_distance

    # Fail fast: reject a bad metric before fitting the topic model.
    if distance_metric not in ('cosine', 'jsd'):
        raise ValueError("Unknown distance metric")

    author_representations = topics_analysis(df, number_of_topics)
    vectors = [entry[0] for entry in author_representations.values()]
    num_authors = len(vectors)
    if num_authors == 0:
        # No authors: keep the NaN result without the 0/0 runtime warning.
        return float('nan')

    # Calculate pairwise dissimilarity over all ordered pairs (i, j).
    dissimilarities = []
    for vec_i in vectors:
        for vec_j in vectors:
            if distance_metric == 'cosine':
                dissimilarity = cosine_distance(vec_i, vec_j)
            else:
                # jensenshannon rescales both inputs to sum to 1 itself, so
                # no manual normalisation is needed. Squaring the returned
                # distance gives the Jensen-Shannon divergence.
                dissimilarity = js_distance(vec_i, vec_j) ** 2
            dissimilarities.append(dissimilarity)

    # Return the average dissimilarity
    return np.sum(dissimilarities) / (num_authors ** 2)


# Driver script: computes the average inter-author dissimilarity for every
# (dataset, feature type) pair and then rescales each feature type by its
# maximum across datasets.

# Disabled earlier configuration (Blog10 / Blog50), kept for reference.
# datasets = {
#     'Blog10': blog10_documents_by_author,  # Replace with actual data
#     'Blog50': blog50_documents_by_author,
#     # 'TuringBench': turing_documents_by_author
# }

# datasets1 = {
#     'Blog10': nlp_test_10,  # Replace with actual data
#     'Blog50': nlp_test_50,
#     # 'TuringBench': turing_documents_by_author
# }

# Dataset name -> per-author documents; each value is passed as `dfs` to the
# compute_inter_author_dissimilarity* functions below.
# NOTE(review): this rebinds the global `datasets`, which the configuration
# cell at the top of the notebook uses for the download-URL mapping. Any cell
# that runs after this one and expects the URL mapping will see this dict.
datasets = {
    'paraphrased': paraphrased_documents_by_author,  # Replace with actual data
    'clean': clean_documents_by_author,
    'blogs50':blogs50_documents_by_author,
    'imdb62':imdb62_documents_by_author,
    # 'TuringBench': turing_documents_by_author
}

# Dataset name -> object passed as the `df` argument for that dataset.
# Must have the same keys as `datasets`, since it is indexed by dataset name.
datasets1 = {
    'paraphrased': nlp_test_paraphased,  # Replace with actual data
    'clean': nlp_test_clean,
    'blogs50':nlp_test_blogs50,
    'imdb62':nlp_test_imdb62,
   # 'TuringBench': turing_documents_by_author
}

# Disabled alternative selection of feature types, kept for reference.
# feature_extractors = {
#     'Topic': extract_topic_features,
#     'Hybrid': extract_hybrid_features,
#     #'Content': extract_content_features,
#     #'Style': extract_style_features,

# }

# Feature types to evaluate in this run. The 'Topic' extractor is only handed
# to compute_inter_author_dissimilarity_topics, which does not call it.
feature_extractors = {
    'Topic': extract_topic_features,
    # 'Hybrid': extract_hybrid_features,
    'Content': extract_content_features,
    # 'Style': extract_style_features,

}

# Metric per feature type. This also lists the feature types that are
# currently commented out of `feature_extractors`.
distance_metrics = {
    'Content': 'cosine',
    'Style': 'cosine',
    'Hybrid': 'cosine',
    'Topic': 'jsd'
}

# results[dataset_name][feature_type] -> average inter-author dissimilarity
results = {}

for dataset_name, dfs in datasets.items():
    results[dataset_name] = {}
    for feature_type, extractor in feature_extractors.items():
        print(feature_type)
        # 'Topic' goes through the topic-model function; every other feature
        # type goes through the generic function.
        if feature_type != 'Topic':
          dissimilarity = compute_inter_author_dissimilarity(
              dfs, extractor, datasets1[dataset_name], feature_type, distance_metric=distance_metrics[feature_type]
          )
        else:
          dissimilarity = compute_inter_author_dissimilarity_topics(
              dfs, extractor, datasets1[dataset_name], distance_metric=distance_metrics[feature_type]
          )
        print(dissimilarity)
        results[dataset_name][feature_type] = dissimilarity

# Rescale each feature type by its largest value across datasets, so the
# dataset with the highest score for that feature type becomes 1.
# `results` is modified in place; running this loop a second time without
# recomputing would divide the already-rescaled values again.
for feature_type in feature_extractors.keys():
    max_value = max(results[dataset_name][feature_type] for dataset_name in datasets.keys())

    for dataset_name in datasets.keys():
        results[dataset_name][feature_type] /= max_value

print(results)