# Clustering analysis

1. Cluster sentences of target words that appear frequently enough (at least 5 times)
2. Cluster substitutes obtained

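Embeddings are taken from `hidden_states[10]` — the **11th** entry of the hidden-states tuple returned by the model selected with `MODEL_USED` (`bert-base-uncased`, or the pretrained RoBERTa checkpoint).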

1. **Tokenize** sentences and term
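2. Identify the token(s) of the target term and get their embeddings; if the term is split into several tokens -- (gay, ##est) -- average them when `VERSION` contains `avg`, otherwise keep the first token only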
3. Create **k-means clusters** of the sentences by the embeddings of the specific term
4. Obtain clusters **metrics**

# Imports and setup

In [None]:
import pandas as pd
import numpy as np

from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import torch

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import plotly.graph_objects as go
from matplotlib.backends.backend_pdf import PdfPages

from scipy.spatial.distance import cosine

import pickle
from tqdm import tqdm
import re
import ast
import os

In [None]:
# Selects which tokenizer/model pair is loaded further down in this notebook.
MODEL_USED = "RoPretrained" #RoPretrained, #BERT

# Version of analysis to create or load
# VERSION is used as the directory/file-name component of every output path,
# and get_embeddings averages multi-token targets only when it contains "avg".
VERSION = "avg_pca2" # first_pca, avg_pca_vocab, first_pca_vocab
# Candidate numbers of clusters (k = 1..59) tried by the elbow method.
n_clusters = range(1, 60)

In [None]:
# Load files
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open('../base_dict.pkl', 'rb') as file:
    sub_sentences = pickle.load(file)

with open('../tokens_dict.pkl', 'rb') as file:
    tokens_sentences = pickle.load(file)

with open('../base_dict_noblack.pkl', 'rb') as file:
    sub_sentences_noblack = pickle.load(file)

with open('../tokens_dict_noblack.pkl', 'rb') as file:
    tokens_sentences_noblack = pickle.load(file)

# Terms with at least 5 sentences are the ones clustered in the repeatTerms experiment.
specific_words = [tok for tok, sents  in tokens_sentences.items() if len(sents["sentences"]) >= 5]

# Load tokenizer and model
if MODEL_USED == "RoPretrained" :
    tokenizer = AutoTokenizer.from_pretrained('pretrained_videogame_with_tokenizer')
    model = AutoModel.from_pretrained('pretrained_videogame_with_tokenizer_0.001/checkpoint-3600', output_hidden_states=True)

elif MODEL_USED == "BERT" :
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

else :
    # Fail here rather than later with a NameError on `tokenizer`/`model`
    # (or, worse, silently reuse objects left over from a previous run).
    raise ValueError(f"Unknown MODEL_USED: {MODEL_USED!r} (expected 'RoPretrained' or 'BERT')")

pd.set_option('display.max_colwidth', None)
# Two PCA projections are fitted per clustering run: 3 components for
# clustering / 3D plots, 2 components for the 2D plots.
n_components = 3
pca = PCA(n_components=n_components)
n_components_low = 2
pca_low = PCA(n_components=n_components_low)

# Every output below is written under the VERSION directory; create it up
# front so the first save does not fail on a missing folder.
os.makedirs(VERSION, exist_ok=True)

split_tokens_csvpath = f"{VERSION}/split_tokens{MODEL_USED}_{VERSION}.csv"
split_tokens_txtpath = f"{VERSION}/split_tokens{MODEL_USED}_{VERSION}.txt"

target_embeddings_pkl = f"{VERSION}/target_embeddings{MODEL_USED}_{VERSION}.pkl"

vocab_tokens_pkl = f"vocab_tokens{MODEL_USED}.pkl"
similarities_for_sub_csv = f"{VERSION}/simSub{MODEL_USED}_{VERSION}.csv"

elbow_method_plots_repeatTerms = f"{VERSION}/elbow_method_plots_repeatTerms{MODEL_USED}_{VERSION}.pdf"
elbow_method_plots_substitutes = f"{VERSION}/elbow_method_plots_substitutes{MODEL_USED}_{VERSION}.pdf"

final_results_repeatTerms = f"{VERSION}/final_results_repeatTerms{MODEL_USED}_{VERSION}.csv"
final_results_substitutes = f"{VERSION}/final_results_substitutes{MODEL_USED}_{VERSION}.csv"

clusters_repeatTerms_pkl = f"{VERSION}/clusters_repeatTerms{MODEL_USED}_{VERSION}.pkl"
clusters_substitutes_pkl = f"{VERSION}/clusters_substitutes{MODEL_USED}_{VERSION}.pkl"

## Functions

### Generic functions

In [None]:
def open_pickle(filename) :
    """Read `filename` in binary mode and return the unpickled object.

    Only use on trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, 'rb') as handle:
        payload = pickle.load(handle)
    return payload
    
def save_to_pickle(filename, item):
    """Pickle `item` into `filename`, overwriting any existing file."""
    with open(filename, "wb") as handle:
        pickle.dump(item, handle)

def assign_to_dict(dic, *args):
    """Append one value to each list of `dic`, positionally.

    The i-th positional argument is appended to the list stored under the
    i-th key (in the dict's insertion order). Pairing stops at the shorter of
    the two, so surplus keys or surplus arguments are silently ignored.
    The dict is modified in place and also returned.
    """
    for column, value in zip(dic, args):
        dic[column].append(value)
    return dic

def format_dict(original_dict, keys_to_keep):
    """Return a new dict holding only `keys_to_keep`, in that order.

    Raises KeyError if a requested key is missing from `original_dict`.
    """
    subset = {}
    for key in keys_to_keep:
        subset[key] = original_dict[key]
    return subset

def create_pdDf(list_of_dicts, csv_filename, save = True):
    """Build a DataFrame from `list_of_dicts` and explode every column.

    Each column is passed through `pd.Series.explode`, so list-valued cells
    become one row per element. When `save` is truthy the result is also
    written to `csv_filename`. The exploded frame is returned either way.
    """
    exploded = pd.DataFrame(list_of_dicts).apply(pd.Series.explode)
    if not save :
        return exploded
    exploded.to_csv(csv_filename)
    return exploded


### Obtain embeddings, vocab

In [None]:
def find_sublist(main_list, sub_list):
    """Locate the first occurrence of `sub_list` as a contiguous run in `main_list`.

    Returns:
        * the start index (an int) when `sub_list` has exactly one element;
        * `[start, end]` (end exclusive) for any other length;
        * `-1` when there is no match.
    """
    width = len(sub_list)
    for start in range(len(main_list) - width + 1):
        if main_list[start:start + width] != sub_list:
            continue
        # One-element matches are reported as a bare index, longer ones as a span.
        return start if width == 1 else [start, start + width]
    return -1

def get_embeddings(sentences_for_target, contexts_with_targets, toxicity_of_target, target):
    """Extract the embedding of `target` in each of its sentences.

    For every (sentence, context, toxicity) triple the full context is run
    through the global `model`; `hidden_states[10]` is sliced down to the
    sentence's tokens and then to the target's tokens. A target split into
    several tokens is averaged when "avg" is in VERSION, otherwise only its
    first token is kept. Each such split is also logged to
    `split_tokens_csvpath` (appended if the file already exists).

    Sentences that cannot be located in their context are reported with a
    print; sentences in which the target tokens are not found are skipped
    silently.

    Returns:
        dict with aligned lists under "sentence", "sentence_in_context",
        "target_embeddings" and "toxicity".
    """
    def _as_span(found, width):
        # find_sublist reports a one-token match as a bare int and anything
        # else as [start, end]; normalise both so slicing always works
        # (a one-token sentence used to crash on `sentence_indices[0]`).
        if isinstance(found, list):
            return found[0], found[1]
        return found, found + width

    model.eval()
    target_information = {"sentence": [], "sentence_in_context" : [], "target_embeddings" : [], "toxicity" : []} #before : [sentence, sentence_in_context, target_embeddings]
    tokens_split = {"term": [], "tokens" : []}
    print(target)
    with torch.no_grad():
        for sentence, sentence_in_context, toxicity in zip(sentences_for_target, contexts_with_targets, toxicity_of_target) :

            # Tokens are converted to ids directly, so no special tokens are added.
            complete_tokens = tokenizer.tokenize(sentence_in_context)
            complete_token_ids = tokenizer.convert_tokens_to_ids(complete_tokens)
            complete_tokens_tensor = torch.tensor([complete_token_ids])
            outputs = model(complete_tokens_tensor)

            # One vector per context token (batch dimension dropped).
            complete_embeddings = outputs.hidden_states[10][0]

            sentence_tokens = tokenizer.tokenize(sentence)
            target_tokens = tokenizer.tokenize(target)

            sentence_indices = find_sublist(complete_tokens, sentence_tokens)

            # Possibility of different tokenization with space in front of sentence / term
            if sentence_indices == -1 :
                sentence_tokens = tokenizer.tokenize(" "+sentence)
                sentence_indices = find_sublist(complete_tokens, sentence_tokens)

            if sentence_indices == -1 :
                print(f"sentence : {sentence_tokens} \n not found in context : {complete_tokens} ")
                continue

            sent_start, sent_end = _as_span(sentence_indices, len(sentence_tokens))
            sentence_embeddings = complete_embeddings[sent_start:sent_end]
            sentence_window = complete_tokens[sent_start:sent_end]

            target_indices = find_sublist(sentence_window, target_tokens)
            if target_indices == -1 :
                target_tokens = tokenizer.tokenize(" "+target)
                target_indices = find_sublist(sentence_window, target_tokens)

            if target_indices == -1 :
                continue

            if not isinstance(target_indices, list) :
                # Target is a single token.
                target_embedding = sentence_embeddings[target_indices]
            else :
                if "avg" in VERSION :
                    target_embedding = sentence_embeddings[target_indices[0]:target_indices[-1]].mean(dim=0)
                else :
                    target_embedding = sentence_embeddings[target_indices[0]]
                tokens_split["term"].append(target)
                tokens_split["tokens"].append(target_tokens)

            assign_to_dict(target_information, sentence, sentence_in_context, target_embedding, toxicity)

    # Token splits :
    split_df = pd.DataFrame(tokens_split)
    if os.path.exists(split_tokens_csvpath):
        split_df.to_csv(split_tokens_csvpath, mode='a', header=False, index=False)
    else :
        split_df.to_csv(split_tokens_csvpath, mode='w', header=True, index=False)

    return target_information

In [None]:
# Function to filter subword tokens
def filter_subwords(tokens, embeddings):
    """Drop every token starting with "##" together with its embedding.

    Returns the kept tokens as a list and the kept embeddings as a numpy array.
    """
    kept = [(tok, emb) for tok, emb in zip(tokens, embeddings) if not tok.startswith("##")]
    filtered_tokens = [tok for tok, _ in kept]
    filtered_embeddings = [emb for _, emb in kept]
    return filtered_tokens, np.array(filtered_embeddings)

### Similarity, clustering and metrics

In [None]:
# Function to calculate similarities
def calculate_similarities(target_embedding, vocab_embeddings, vocab_tokens, max = False):
    """Cosine similarity between `target_embedding` and each vocabulary embedding.

    `max` doubles as a flag and a count: when falsy, all entries are returned
    in input order; when truthy, embeddings are stored reshaped to (1, -1)
    and only the `max` highest-scoring entries are returned, best first.

    Returns a dict of aligned lists: "token", "similarity_score", "embedding".
    """
    similarities = {"token" : [], "similarity_score" : [], "embedding" : []}
    for position, candidate in enumerate(vocab_embeddings):
        score = 1 - cosine(target_embedding, candidate)
        stored = candidate.reshape(1,-1) if max else candidate
        assign_to_dict(similarities, vocab_tokens[position], score, stored)

    if not max:
        return similarities

    # Sort by similarity_score
    scores = similarities["similarity_score"]
    order = sorted(range(len(scores)), key=lambda x: scores[x], reverse=True)
    return {key: [values[i] for i in order][:max] for key, values in similarities.items()}

In [None]:
# Function to cluster
def clustering(embeddings, n_clusters) :
    """Fit k-means with `n_clusters` clusters on `embeddings`.

    Returns (cluster_labels, centroids, inertia) read from the fitted model.
    No random_state is set, so results can differ between runs.
    """
    fitted = KMeans(n_clusters = n_clusters)
    fitted.fit_predict(embeddings)
    return fitted.labels_, fitted.cluster_centers_, fitted.inertia_

def check_possible_clusters(len_embeddings, n_clusters_list) :
    """Cap the candidate cluster counts so they stay below the sample count.

    If the largest requested k is at least `len_embeddings`, the candidates
    become `range(1, len_embeddings)`; otherwise the list is returned as is.
    """
    largest_requested = n_clusters_list[-1]
    return range(1, len_embeddings) if len_embeddings <= largest_requested else n_clusters_list

# Function to identify tokens/sentences/embeddings of each cluster
def identify(cluster_labels, cluster_id, list_of_items) :
    """Return the items whose positional label equals `cluster_id`, in order."""
    members = []
    for label, item in zip(cluster_labels, list_of_items):
        if label == cluster_id:
            members.append(item)
    return members

#|--------------------------------------|
#| Functions to calculate some metrics  |
#|--------------------------------------|
def elbow_method(target, sentence, embeddings, n_clusters_list, pdf):
    """Run k-means for every k in `n_clusters_list` and plot inertia against k.

    The plot is appended as one page to the open `pdf` (a PdfPages object).

    Returns:
        dict mapping k -> {"cluster_labels": [labels], "centroids": [centroids],
        "inertia": [inertia], "embeddings": [embeddings]} (each value wrapped
        in a one-element list).
    """
    inertias = []
    clusters = {}

    for n in n_clusters_list:
        cluster_labels, centroids, inertia = clustering(embeddings, n)
        inertias.append(inertia)
        clusters[n] = {"cluster_labels" : [cluster_labels], "centroids" : [centroids], "inertia" : [inertia], "embeddings" : [embeddings]}

    # Plot against the actual k values: plotting positions 0..len-1 shifted
    # every point one step left of the "Number of clusters" it belongs to.
    tried_ks = list(clusters)
    # Largest k tried; 0 when no candidate was given (the loop variable
    # would be undefined in that case).
    largest_k = tried_ks[-1] if tried_ks else 0

    plt.figure()
    plt.plot(tried_ks, inertias, marker='o')
    plt.title(f'Target : {target} - N : {largest_k} clusters - Sentence : {sentence}')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')

    pdf.savefig()
    plt.clf()
    plt.close()

    return clusters

def calculate_variance(embeddings) :
    """Mean of the per-dimension variances (variance taken along axis 0)."""
    per_dimension = np.var(embeddings, axis=0)
    return per_dimension.mean()

def get_most_representative(cluster_embeddings, cluster_centroid, sentence_list):
    """Return the sentence whose embedding is closest to `cluster_centroid`.

    Closeness is `np.linalg.norm(embedding - cluster_centroid)`; on ties the
    first sentence wins. Returns "" when there is nothing to compare.
    """
    best = (float('inf'), "")
    for vector, candidate in zip(cluster_embeddings, sentence_list):
        gap = np.linalg.norm(vector - cluster_centroid)
        if gap < best[0]:
            best = (gap, candidate)
    return best[1]

def get_silhouette(embeddings_list, cluster_labels) :
    """Return sklearn's `silhouette_score` for the given samples and labels."""
    return silhouette_score(embeddings_list, cluster_labels)

def get_max_silhouette(target_silhouettes) :
    """Return (best_score, k) from a mapping k -> sequence whose first item is the score.

    On ties the first k in the mapping's order wins.
    """
    ks = list(target_silhouettes)
    scores = [target_silhouettes[k][0] for k in ks]
    top = max(scores)
    return top, ks[scores.index(top)]

### Visualizations

In [None]:
def convert_to_lit(val):
    """Turn a stringified Python/numpy literal (as read from CSV) back into an object.

    Non-string values are returned untouched. For strings, every
    `array(<content>)` / `array(<content>, dtype=<name>)` wrapper is reduced
    to `<content>` -- for any dtype, or none -- and the result is parsed with
    `ast.literal_eval`. If parsing fails, the ORIGINAL string is returned.
    """
    if not isinstance(val, str):
        return val

    candidate = val
    if "array" in candidate:
        # Unwrap numpy reprs. The content of an array repr only uses square
        # brackets, so the first ")" after "array(" closes the wrapper.
        # DOTALL because long arrays are printed over several lines.
        candidate = re.sub(r'array\((.*?)(?:,\s*dtype=[\w.]+)?\)', r'\1', candidate, flags=re.DOTALL)
    try:
        # literal_eval only accepts literals, so it is safe on file content.
        return ast.literal_eval(candidate)
    except (ValueError, SyntaxError):
        # Plain text (e.g. a sentence): hand back the input unchanged rather
        # than a half-stripped version of it.
        return val

def apply_lit_eval(df, list_of_columns) :
    """Run `convert_to_lit` over every cell of the listed columns.

    The frame is modified in place and also returned.
    """
    for column in list_of_columns :
        parsed = df[column].apply(convert_to_lit)
        df[column] = parsed
    return df

def generate_hover_text(row, hover_columns, index):
    """Build the "<br>"-joined hover label for point number `index` of `row`.

    Columns missing from `row` are skipped. A list-valued column contributes
    its `index`-th element when it is long enough; otherwise the whole value
    is shown.
    """
    parts = []
    for col in hover_columns:
        if col not in row:
            continue
        value = row[col]
        # If the column contains a list, get the specific value for the current index
        if isinstance(value, list) and len(value) > index:
            value = value[index]
        parts.append(f"{col}: {value}")
    return "<br>".join(parts)

### Main functions

In [None]:
def apply_elbow(embeddings, target, sentence_to_write, n_clusters, pdf, 
                targets_info, name_of_targets_info, 
                other_infos = False, name_of_other_info = "other"):
    """Cluster `embeddings` for every candidate k and describe each cluster.

    The embeddings are stacked and projected twice with the global PCA
    objects (`pca` for clustering, `pca_low` for 2D plotting). k-means is run
    for each feasible k through `elbow_method`, which also writes the elbow
    plot to `pdf`.

    Args:
        embeddings: sequence of vectors, one per item.
        target, sentence_to_write: labels copied into every cluster record.
        n_clusters: candidate k values (capped by `check_possible_clusters`).
        pdf: open PdfPages object receiving the elbow plot.
        targets_info: per-item values (aligned with `embeddings`) stored
            under the key `name_of_targets_info`.
        other_infos: optional second per-item list, stored under
            `name_of_other_info`; `False` is stored when it is not given.

    Returns:
        dict k -> dict of lists with one entry per cluster id.
    """
    stacked = np.vstack(embeddings)
    low_embeddings = pca_low.fit_transform(stacked)
    embeddings_to_cluster = pca.fit_transform(stacked)

    nb_of_clusters = check_possible_clusters(len(embeddings_to_cluster), n_clusters)

    clusters = elbow_method(target, sentence_to_write, embeddings_to_cluster, nb_of_clusters, pdf)

    clusters_results = {}
    for nb, cluster_info in clusters.items():
        clusters_results[nb] = {"target_term": [], "sentence": [],"cluster_id" : [], 
                                "most_representative" : [],
                                name_of_targets_info : [], name_of_other_info :[],
                                "cluster_labels" : [], "centroids" : [], "inertia" : [],
                                "embeddings_to_cluster" : [], "n_clusters" : [],
                                "cluster_embeddings" : [], "low_embeddings" : []}                               

        # elbow_method wraps each value in a one-element list.
        labels = cluster_info["cluster_labels"][0]
        all_centroids = cluster_info["centroids"][0]

        for cluster_id in range(nb):
            cluster_embeddings = identify(labels, cluster_id, embeddings_to_cluster)
            cluster_low_embeddings = identify(labels, cluster_id, low_embeddings)
            list_of_targets = identify(labels, cluster_id, targets_info)
            if other_infos :
                list_of_others = identify(labels, cluster_id, other_infos)
            else :
                list_of_others = False

            # Measure against THIS cluster's centroid. Passing the wrapped
            # array of all k centroids made the distance a norm over every
            # centroid at once.
            most_representative = get_most_representative(cluster_embeddings, all_centroids[cluster_id], list_of_targets)

            # Stored values keep their original (wrapped) shapes because
            # apply_silhouette indexes into them.
            assign_to_dict(clusters_results[nb], target, sentence_to_write, cluster_id, 
                            most_representative,
                            list_of_targets, list_of_others, 
                            cluster_info["cluster_labels"], cluster_info["centroids"], cluster_info["inertia"],
                            embeddings_to_cluster, nb_of_clusters, 
                            cluster_embeddings, cluster_low_embeddings)

    return clusters_results

def apply_silhouette(clusters, nb_clusters) :
    """Pick the k in `nb_clusters` with the highest silhouette score.

    k == 1 is skipped. `clusters` is the per-k dict produced by `apply_elbow`.
    Returns (best_k, best_score); on ties the first k wins. Raises ValueError
    when no candidate is left to score.
    """
    candidates = []
    scores = []
    for possible_k in nb_clusters : 
        if possible_k == 1 : 
            continue

        entry = clusters[possible_k]
        # Labels are stored doubly wrapped, embeddings singly wrapped.
        scores.append(get_silhouette(entry["embeddings_to_cluster"][0], entry["cluster_labels"][0][0]))
        candidates.append(possible_k)

    best_score = max(scores)
    return candidates[scores.index(best_score)], best_score

def get_final_cluster(clusters, nb_clusters):
    """Return the cluster record of the silhouette-optimal k.

    The chosen record (an entry of `clusters`) is extended in place with
    "k" and "silhouette_score", each repeated once per cluster.
    """
    best_k, best_score = apply_silhouette(clusters, nb_clusters)
    chosen = clusters[best_k]
    chosen["k"] = [best_k] * best_k
    chosen["silhouette_score"] = [best_score] * best_k
    return chosen


# Extracting embeddings

### Creating and saving

In [None]:
# Build {target: get_embeddings(...) result} for every term of tokens_sentences.
# get_embeddings also appends to the split-tokens CSV when that file exists,
# so re-running this cell adds duplicate rows to it.
target_embeddings = {target:get_embeddings(items["sentences"], items["contexts"], items["toxicity"], target) for target, items in tqdm(tokens_sentences.items())}

In [None]:
# Persist the extracted embeddings so later runs can skip the model pass.
save_to_pickle(target_embeddings_pkl, target_embeddings)

### Loading

In [None]:
# Reload previously extracted embeddings for the current MODEL_USED / VERSION.
target_embeddings = open_pickle(target_embeddings_pkl)

# RepeatTerms - Cluster sentences for each repeated term

Example :

"Gay" : "I'm gay", "he's gay", "this is so gay"
We will use the embeddings of all occurrences of "gay" and cluster them.

### Creating and saving

In [None]:
# Cluster, for every frequent term, the embeddings of all its occurrences.
# One elbow plot per term is written to the PDF.
repeatTerms_results = {}
pdf_file = elbow_method_plots_repeatTerms
with PdfPages(pdf_file) as pdf:
    for target, representations in tqdm(target_embeddings.items()) :
        if target not in specific_words :
            continue

        embeddings = representations["target_embeddings"]
        sentences = representations["sentence"]
        # Toxicity recorded at extraction time, aligned with `sentences`;
        # used only as a fallback below.
        stored_toxicity = representations.get("toxicity", [])

        toxicity = []
        for position, sent in enumerate(sentences) :
            if sent in sub_sentences :
                toxicity.append(sub_sentences[sent]["toxicity"][0])
                continue
            for info in sub_sentences.values() :
                if sent in info["sentences"] :
                    toxicity.append(info["toxicity"][0])
                    break
            else :
                # Sentence unknown to sub_sentences: still append one value so
                # `toxicity` stays aligned with `sentences` and `embeddings`
                # (it used to be skipped, shifting every later label).
                fallback = stored_toxicity[position] if position < len(stored_toxicity) else None
                toxicity.append(fallback)

        # PCA needs at least n_components samples; fewer would raise inside
        # apply_elbow (same guard as in the substitutes experiment).
        if len(embeddings) >= n_components :
            print(f"There are {len(embeddings)} embeddings for the term : {target}. Clustering...")

            repeatTerms_results[target] = apply_elbow(embeddings, target, "all", n_clusters,
                                                        pdf, sentences, "sentences", toxicity, "toxicity")
        else : 
            print(f"SKIPPING {target} - There are {len(embeddings)} embedding(s)")

In [None]:
# Save the per-term, per-k clustering results
save_to_pickle(clusters_repeatTerms_pkl, repeatTerms_results)

In [None]:
# For each term keep only the clustering whose k maximises the silhouette score.
final_repeatTerms_clusters = {}
for target, clusters in tqdm(repeatTerms_results.items()) :
    # clusters[1]["n_clusters"][0] is the range of k values that were tried;
    # [-1] is its largest k.
    # NOTE(review): range(3, largest) starts at k=3 and excludes the largest k,
    # and is empty when largest <= 3 (get_final_cluster then raises) -- confirm
    # both are intended.
    nb_cluster = range(3, clusters[1]["n_clusters"][0][-1]) # same in all clusters id

    final_repeatTerms_clusters[target] = get_final_cluster(clusters, nb_cluster)

In [None]:
# Reduce every final cluster record to the columns exported to CSV.
list_of_clusters = []
columns_to_keep = ["target_term", "k", "cluster_id", "sentences", 
                    "most_representative", "inertia", "silhouette_score", "toxicity",
                    "cluster_embeddings", "low_embeddings"]

for target_term, clusters in final_repeatTerms_clusters.items():
    list_of_clusters.append(format_dict(clusters, columns_to_keep))

In [None]:
# One row per (term, cluster); also written to final_results_repeatTerms.
repeatTerms_df = create_pdDf(list_of_clusters, final_results_repeatTerms)

### Loading

In [None]:
# Reload saved results. Columns read back from CSV are plain strings until
# passed through apply_lit_eval.
repeatTerms_results = open_pickle(clusters_repeatTerms_pkl)
repeatTerms_df = pd.read_csv(final_results_repeatTerms)

# Substitutes - Clustering substitutes for each main sentence

"gay for not letting us win" : cluster all variations with substitutes. Also get similarity score and plot that

### Clustering and saving

In [None]:
# For every main sentence: compare the embedding of its original term with the
# embeddings of all substitute terms, then cluster the substitutes.
substitutes_results = {}
similarities_for_subs = {}

pdf_file = elbow_method_plots_substitutes
with PdfPages(pdf_file) as pdf:
    for main, others in tqdm(sub_sentences.items()) :
        print(main)
        # Hard-coded override of the main token for this one sentence.
        if main == "yeah and I'm part of the gays" :
            main_token = "gAYS"
        else : 
            main_token = others["tokens"][0]
        # NOTE(review): debug output -- prints every stored embedding of the token.
        print(target_embeddings[main_token]["target_embeddings"])
        # Embedding of the main token in the main sentence itself.
        # NOTE(review): next() has no default, so a sentence missing from
        # target_embeddings raises StopIteration here (and in the loop below).
        embedding_of_main = next(target_embeddings[main_token]["target_embeddings"][sent_pos] for sent_pos, sent in enumerate(target_embeddings[main_token]["sentence"]) if main == sent)
        
        embeddings_list = []
        tokens_list = []
        sentence_list = []
        # others["tokens"][0] is the main token, so substitutes start at index 1.
        for sentence, token in zip(others["sentences"], others["tokens"][1:]) :
            embs = next(target_embeddings[token]["target_embeddings"][sent_pos] for sent_pos, sent in enumerate(target_embeddings[token]["sentence"]) if sentence == sent)

            embeddings_list.append(embs)
            tokens_list.append(token)
            sentence_list.append(sentence)
        
        # Computing similarities
        similarities_substitutes = calculate_similarities(embedding_of_main, embeddings_list, tokens_list)
        # The main term is appended last, with the marker "main_term" in place of a score.
        assign_to_dict(similarities_substitutes, main_token, "main_term", embedding_of_main)
        similarities_for_subs[main] = similarities_substitutes

        # PCA with n_components components needs at least that many samples.
        if len(embeddings_list) >= n_components:
            print(f"There are {len(embeddings_list)} embeddings for the sentence : {main}. Clustering...")


            
            similarity_sub_info = list(zip(similarities_substitutes["token"], similarities_substitutes["similarity_score"]))
            substitutes_results[main] = apply_elbow(embeddings_list, main, main, n_clusters, pdf,
                                                    sentence_list, "sentences", similarity_sub_info, "tokens_and_similarity")
        else :
            print(f"not enough embeddings, {len(embeddings_list)}")
            


In [None]:
# Save the per-sentence, per-k clustering results
save_to_pickle(clusters_substitutes_pkl, substitutes_results)

In [None]:
# Raw similarities (including embeddings) for every main sentence.
save_to_pickle(f"{VERSION}/similarities_for_sub.pkl", similarities_for_subs)

In [None]:
# Export-friendly view of the similarities: scores rounded to 3 decimals,
# substitutes sorted by decreasing similarity, and token frequencies sorted
# by decreasing count. The last entry of each list is the main term (score
# marker 'main_term'), which is why 'sorted_similarity' uses [:-1].
similarities_for_subs_final = {key:{'token' : value['token'], 
                                "similarity_score" : [round(float(score), 3)  if score != 'main_term' else 'main_term' for score in value['similarity_score']], 
                                'sorted_similarity' : sorted(zip(value['token'][:-1], [round(float(score), 3)  if score != 'main_term' else 'main_term' for score in value['similarity_score'][:-1]]), key=lambda x: x[1], reverse=True),
                                "token_freq" : dict(sorted(sub_sentences[key]["tokens_count"][0].items(), key=lambda item: item[1], reverse=True))
                                } 
                                
                                for key, value in similarities_for_subs.items()}

In [None]:
# One row per main sentence (the dict keys become the index), saved to CSV.
similarities_for_subs_df = pd.DataFrame.from_dict(similarities_for_subs_final, orient='index')
similarities_for_subs_df.to_csv(similarities_for_sub_csv)


#### Other calculations and saving

In [None]:
# For each main sentence keep the clustering with the best silhouette score.
final_substitutes_clusters= {}
for target, clusters in tqdm(substitutes_results.items()) :
    # All k values that were tried for this sentence (k == 1 is skipped by
    # apply_silhouette).
    nb_clusters = clusters[1]["n_clusters"][0]

    final_substitutes_clusters[target] = get_final_cluster(clusters, nb_clusters)

In [None]:
# Reduce every final cluster record to the columns exported to CSV.
list_of_clusters = []
columns_to_keep = ["target_term", "k", "cluster_id", "sentences", "tokens_and_similarity",
                    "tokens", "similarity_score", "most_representative", "inertia", "silhouette_score", 
                    "cluster_embeddings", "low_embeddings"]

for target, clusters in final_substitutes_clusters.items() :
    # Split the (token, similarity) pairs of each cluster into two parallel
    # columns; this adds the keys to the stored record in place.
    clusters["tokens"] = [[token for token, _ in tokens] for tokens in clusters["tokens_and_similarity"]]
    clusters["similarity_score"] = [[score for _, score in tokens] for tokens in clusters["tokens_and_similarity"]]
    list_of_clusters.append(format_dict(clusters, columns_to_keep))

In [None]:
# Creates and saves dataframe (one row per main sentence and cluster)
substitutes_df = create_pdDf(list_of_clusters, final_results_substitutes)

### Loading

In [None]:
# Reload saved results. Columns read back from CSV are plain strings until
# passed through apply_lit_eval.
substitutes_results = open_pickle(clusters_substitutes_pkl)
substitutes_df = pd.read_csv(final_results_substitutes)

# LOADALL

In [None]:
# Reload every saved artefact of the current MODEL_USED / VERSION in one go.
target_embeddings = open_pickle(target_embeddings_pkl)

repeatTerms_results = open_pickle(clusters_repeatTerms_pkl)
repeatTerms_df = pd.read_csv(final_results_repeatTerms)

substitutes_results = open_pickle(clusters_substitutes_pkl)
substitutes_df = pd.read_csv(final_results_substitutes)

# Visualisation

In [None]:
# Which experiment to plot, and how.
XP = "repeatTerms" # "repeatTerms", "substitutes"
UNIQUE_TERM = "gay" # False
TOXICITY = True
TEXT = False
if MODEL_USED == "RoPretrained" :
    MODEL = "RoBERTa"
else :
    MODEL = "BERT"

# If df is loaded, might need to eval some columns
df_loaded = True
hover_columns = ["cluster_id", "most_representative"]
columns_lit_eval = ["cluster_embeddings", "low_embeddings"]

plot_3d_html = f"{VERSION}/plot_3d_{XP}_{MODEL_USED}_{VERSION}.html"
plot_2d_html = f"{VERSION}/plot_2d_{XP}_{MODEL_USED}_{VERSION}.html"

if XP == "substitutes" :
    df = substitutes_df
    main_col = "target_term"
    marker_text = "tokens"
    other_cols = ["tokens", "similarity_score"]

elif XP == "repeatTerms" :
    TOXICITY = True
    df = repeatTerms_df
    main_col = "target_term"
    marker_text = "sentences"
    other_cols = ["sentences", "toxicity"]

elif XP == "vocabSim" :
    # NOTE(review): vocabSim_df is not created anywhere in this notebook.
    df = vocabSim_df.loc[vocabSim_df['target_term'] == UNIQUE_TERM]
    main_col = "sentence"
    marker_text = "tokens"
    other_cols = ["tokens", "similarity_score"]

else : 
    # Stop here: continuing would either fail on undefined names or silently
    # plot the dataframe left over from a previous run of this cell.
    raise ValueError(f"Wrong XP name: {XP!r}")

# Process df
# Experiment-specific columns go first in the hover text and are parsed too.
hover_columns[0:0] = other_cols
columns_lit_eval.extend(other_cols)

if df_loaded:
    df = apply_lit_eval(df, columns_lit_eval)

In [None]:
# Build one 3D and one 2D scatter trace per (target term, cluster) row.
# All traces start hidden.
unique_target_terms = df[main_col].unique()

# Marker symbol per toxicity label (repeatTerms experiment only).
symbol_map = {"toxic" : "diamond-open", "non-toxic" : "circle", "black" : "circle"}

# 3D plot overlay
fig_3d = go.Figure()

# 2D plot overlay
fig_2d = go.Figure()

# Loop through each unique target term
for i, target_term in enumerate(unique_target_terms):

    if XP == "vocabSim" and target_term != UNIQUE_TERM :
        continue

    subset_df = df[df[main_col] == target_term]


    # 3D plot
    for _, row in subset_df.iterrows():
        cluster_id = row['cluster_id']
    
        # Coordinates of every point of this cluster.
        x = [embed[0] for embed in row['cluster_embeddings']]
        y = [embed[1] for embed in row['cluster_embeddings']]
        z = [embed[2] for embed in row['cluster_embeddings']]
        hover_info = [generate_hover_text(row, hover_columns, i) for i in range(len(x))]
        if TEXT:
            text_marker = [tok for tok in row[marker_text]]
        else :
            text_marker = None

        if XP == "repeatTerms" :
            # Raises KeyError for a toxicity label missing from symbol_map.
            symbols = [symbol_map[val] for val in row['toxicity']]
        else : 
            symbols = None
        fig_3d.add_trace(go.Scatter3d(
            x=x, y=y, z=z,
            mode='markers+text',
            marker=dict(size=5, color=cluster_id, colorscale='Viridis', symbol=symbols),
            text = text_marker,
            textposition = "top center",
            textfont=dict(size=17),
            hovertext = hover_info,
            hoverinfo='text',
            name=f"Cluster {cluster_id} - {target_term}",
            visible=False
        ))

        # 2D plot (same row, same hover text and symbols as the 3D trace)
        x_low = [embed[0] for embed in row['low_embeddings']]
        y_low = [embed[1] for embed in row['low_embeddings']]


        fig_2d.add_trace(go.Scatter(
            x=x_low, y=y_low,
            mode='markers+text',
            marker=dict(size=15, color=cluster_id, colorscale='Viridis', symbol=symbols),
            text = text_marker,
            textposition = "top center",
            textfont=dict(size=17),
            hovertext = hover_info,
            hoverinfo='text',
            name=f"Cluster {cluster_id} - {target_term}",
            visible=False
        ))

# Buttons to select the traces of one target term.
# Visibility lists are computed now, while every trace is still hidden, so
# each button shows the traces of its own term and hides all the others.
buttons_3d = []
buttons_2d = []

for term in unique_target_terms:
    term_label = f"{term}"

    def _belongs_to_term(trace, term_label=term_label):
        # Trace names are "Cluster <id> - <term>"; compare the part after the
        # first " - " exactly. A plain endswith(term) also matched every other
        # term that merely ends with the same characters.
        return trace.name.split(" - ", 1)[-1] == term_label

    buttons_3d.append(dict(
        label=term_label,
        method="update",
        args=[{"visible": [not trace.visible if _belongs_to_term(trace) else trace.visible
                            for trace in fig_3d.data]}, {"title": f"3D {MODEL} - {term}"}]
    ))
    
    buttons_2d.append(dict(
        label=term_label,
        method="update",
        args=[{"visible": [not trace.visible if _belongs_to_term(trace) else trace.visible
                            for trace in fig_2d.data]}, {"title": f"2D {MODEL} - {term}"}]
    ))

# Default visibility (none visible at the start)
for trace in fig_3d.data:
    trace.visible = False

for trace in fig_2d.data:
    trace.visible = False

# Update layout with buttons for 3D and 2D plots
fig_3d.update_layout(
    title="3D Plot",
    updatemenus=[{
        "buttons": buttons_3d,
        "direction": "down",
        "showactive": True,
        "x": 1.3,  # Adjust the position to avoid overlap
        "xanchor": "left",
        "y": 1.05,
        "yanchor": "top",
    }],
    legend_title_text="Cluster ID",
    title_x=0.05,
    title_y=0.80,
    title_font=dict(size=20, color="black", weight="bold"),

    legend=dict(
        font=dict(size=16),
        x=1,          # Horizontal position
        y=0.7,        # Vertical position
        xanchor='left',   # Anchor position for x
        yanchor='middle'  # Anchor position for y
    )
)

fig_2d.update_layout(
    title="2D Plot",
    updatemenus=[{
        "buttons": buttons_2d,
        "direction": "down",
        "showactive": True,
        "x": 1.3,  # Adjust the position to avoid overlap
        "xanchor": "left",
        "y": 1.05,
        "yanchor": "top",
    }],
    legend_title_text="Cluster ID",
    legend=dict(font=dict(size=16)),
    title_font=dict(size=20, color="black", weight="bold")
)

# Adjusting the size of the plot window to fit all buttons
fig_3d.update_layout(height=800)
fig_2d.update_layout(height=800)

# Show the figures
fig_3d.show()
fig_2d.show()
# Export both figures as standalone HTML files under the VERSION directory.
fig_2d.write_html(plot_2d_html)
fig_3d.write_html(plot_3d_html)


# Other metrics random

In [None]:
def checksplit(df, term):
    """Return the "tokens" value of the first row whose "term" equals `term`.

    Returns None when no row matches.
    """
    matching_rows = df.loc[df["term"] == term]
    if matching_rows.empty :
        return None
    return matching_rows["tokens"].values[0]

In [None]:
import itertools
from collections import Counter

# Per-term statistics: occurrence counts by toxicity, tokenizer splits, and
# how often the term shares a substitutes cluster with other tokens.
# Column order matters: assign_to_dict fills the lists positionally.
freq_dict = {"term" : [], "frequency" : [], "toxic_freq" : [], "nonToxic_freq" : [], 
            "toxic_100": [], "black_sub": [], "cluster_freq" : [], "clusters" : [],
            "bi_clusters" : [], "tri_clusters" : [], "four_clsuters" : [],
            "all_occs" : [], "BERT_split": [], "Ro_split" : []}
split_tokens_BERT = pd.read_csv('avg_pca2/split_tokensBERT_avg_pca2.csv')
split_tokens_RoPretrained = pd.read_csv('avg_pca2/split_tokensRoPretrained_avg_pca2.csv')


def _by_count_desc(counts, minimum=1):
    # Entries with count >= minimum, most frequent first (ties keep insertion order).
    ordered = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return {key: count for key, count in ordered if count >= minimum}


for term, info in tokens_sentences_noblack.items() :
    print(term)
    freq = len(info["sentences"])
    toxic_freq = len([tox for tox in info["toxicity"] if tox == "toxic"])
    nontoxic_freq = len([tox for tox in info["toxicity"] if tox == "non-toxic"])
    black_freq = len([tox for tox in info["toxicity"] if tox == "black"])

    # Share of toxic occurrences among toxic + non-toxic ones.
    # NOT CONSIDERING BLACK
    # The old fallback was `toxic_100 == 0` (a comparison) under a bare
    # except, so terms with no rated occurrence reused the previous term's
    # value or raised NameError; they now get 0.
    rated = toxic_freq + nontoxic_freq
    toxic_100 = round((toxic_freq/rated)*100, 2) if rated else 0
    
    all_ocs = info["sentences"]
    BERT_split = checksplit(split_tokens_BERT, term)
    Ro_split = checksplit(split_tokens_RoPretrained, term)

    # Clusters (rows of substitutes_df) that contain this term.
    # NOTE(review): assumes the 'tokens' column holds lists; if substitutes_df
    # was only read from CSV the cells are strings, and both `term in x` and
    # the loops below would then work on characters -- confirm it went
    # through apply_lit_eval first.
    clusters_df = substitutes_df[substitutes_df['tokens'].apply(lambda x: term in x)]
    clusters_df = clusters_df[~clusters_df['target_term'].str.contains('black')]
    clusters_items = Counter()
    bi_counter = Counter()
    tri_counter = Counter()
    four_counter = Counter()
    how_many_clusers = len(clusters_df)
    # Loop through each row and the list within that row
    for idx, row in clusters_df.iterrows():
        for item in row['tokens']:
            clusters_items[item] += 1
    
        # Get the other items in the list besides the target term
        other_items = [item for item in row['tokens'] if item != term]
        
        # Count every pair, triple and quadruple of co-clustered tokens.
        for group in itertools.combinations(other_items, 2):
            bi_counter[group] += 1
            
        for group in itertools.combinations(other_items, 3):
            tri_counter[group] += 1
            
        for group in itertools.combinations(other_items, 4):
            four_counter[group] += 1

    print(dict(clusters_items))
    clusters_items = _by_count_desc(clusters_items)
    # Groups seen only once are dropped.
    bi_counter = _by_count_desc(bi_counter, minimum=2)
    tri_counter = _by_count_desc(tri_counter, minimum=2)
    four_counter = _by_count_desc(four_counter, minimum=2)

    assign_to_dict(freq_dict, term, freq, toxic_freq, nontoxic_freq, toxic_100, black_freq, how_many_clusers,
                    clusters_items, bi_counter, tri_counter, four_counter, all_ocs, BERT_split, Ro_split)

terms_freq = pd.DataFrame(freq_dict)
terms_freq.to_csv(f"{VERSION}/tokens_details_of_freq.csv")