In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [2]:
# Read the Study 1 table, then keep a plain-dict view of it shaped as
# {column name -> {row index -> value}} for the functions below.
data = pd.read_csv('../../Data/study_1.csv')
study_1 = data.to_dict(orient='dict')

## Define Functions

In [3]:
def cosines_by_image(model_name, data_dict):
    """Compute pairwise cosine similarities between text embeddings within each race.

    Every text is embedded with the given sentence-transformers model. Rows are
    then grouped by their 'race' value and, inside each group, every unordered
    pair of rows is compared once.

    Parameters
    ----------
    model_name : str
        Name passed to ``SentenceTransformer`` to load the embedding model.
    data_dict : dict
        Column-oriented mapping with 'text', 'race' and 'image' entries, each
        itself a mapping of row key -> value (the layout produced by
        ``DataFrame.to_dict()``). All three entries must share the same row
        keys. The mapping is only read; it is never modified.

    Returns
    -------
    pandas.DataFrame
        One row per within-race pair with the columns 'race', 'image_1',
        'image_2' and 'cosine_similarity'. Races are emitted in order of first
        appearance in the data, so the row order is reproducible between runs.
        The columns are present even when there are no pairs.
    """
    model = SentenceTransformer(model_name)

    # Fix the row order once: embeddings[pos] belongs to row_keys[pos]. Going
    # through the actual keys (rather than assuming 0..n-1) keeps this working
    # for data whose index is not a default RangeIndex.
    row_keys = list(data_dict['text'])
    embeddings = model.encode([data_dict['text'][key] for key in row_keys])

    # Group row positions by race. A dict preserves insertion order, unlike a
    # set, whose iteration order for strings changes from one kernel to the next.
    positions_by_race = {}
    for pos, key in enumerate(row_keys):
        race = data_dict['race'][key]
        if pd.isna(race):
            # A missing race never compares equal to anything, so such rows
            # cannot form a within-race pair.
            continue
        positions_by_race.setdefault(race, []).append(pos)

    cosine_sim_results = []
    for race, positions in positions_by_race.items():
        if len(positions) < 2:
            continue  # a single row has nothing to be paired with

        race_images = [data_dict['image'][row_keys[pos]] for pos in positions]

        # One call yields the full similarity matrix for this race; only the
        # strict upper triangle (i < j) is read so each pair appears once.
        sim_matrix = cosine_similarity([embeddings[pos] for pos in positions])

        for i in range(len(positions)):
            for j in range(i + 1, len(positions)):
                cosine_sim_results.append({
                    'race': race,
                    'image_1': race_images[i],
                    'image_2': race_images[j],
                    'cosine_similarity': sim_matrix[i][j]
                })

    results_df = pd.DataFrame(
        cosine_sim_results,
        columns=['race', 'image_1', 'image_2', 'cosine_similarity'],
    )
    return results_df

In [4]:
# Within-race pairwise similarities from the all-mpnet-base-v2 model,
# written to CSV without the DataFrame index.
mpnetbase_df = cosines_by_image(
    model_name='sentence-transformers/all-mpnet-base-v2',
    data_dict=study_1,
)
mpnetbase_df.to_csv('../Cosine/mpnetbase.csv', index=False)

  return self.fget.__get__(instance, owner)()


In [5]:
# Within-race pairwise similarities from the all-distilroberta-v1 model,
# written to CSV without the DataFrame index.
distilroberta_df = cosines_by_image(
    model_name='sentence-transformers/all-distilroberta-v1',
    data_dict=study_1,
)
distilroberta_df.to_csv('../Cosine/distilroberta.csv', index=False)

In [6]:
# Within-race pairwise similarities from the all-MiniLM-L12-v2 model,
# written to CSV without the DataFrame index.
allminilm_df = cosines_by_image(
    model_name='sentence-transformers/all-MiniLM-L12-v2',
    data_dict=study_1,
)
allminilm_df.to_csv('../Cosine/allminilm.csv', index=False)