In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [2]:
# Read the Study 2 table and convert it to a column-oriented mapping
# ({column: {row_label: value}}), the layout cosines_by_image indexes into.
data = pd.read_csv('../../Data/study_2.csv')
study_2 = data.to_dict(orient='dict')

## Define Functions

In [3]:
def cosines_by_image(model_name, data_dict):
    """Compute within-gender pairwise cosine similarities between text embeddings.

    Every text in ``data_dict['text']`` is encoded with the named
    SentenceTransformer model. Rows are then grouped by their ``'gender'``
    value and, inside each group, every unordered pair of rows produces one
    output record holding the two rows' ``'image'`` values and the cosine
    similarity of their embeddings.

    Parameters
    ----------
    model_name : str
        Model identifier handed to ``SentenceTransformer``.
    data_dict : dict
        Column-oriented mapping (the layout ``DataFrame.to_dict()`` produces)
        containing ``'text'``, ``'gender'`` and ``'image'`` columns, each a
        dict keyed by the same row labels in the same order. Row labels do not
        need to be 0..n-1. The mapping is read only; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``gender``, ``image_1``, ``image_2``, ``cosine_similarity``.
        Genders appear in order of first occurrence and pairs follow row
        order (earlier row as ``image_1``), so the output is reproducible
        across runs. If no pair exists the frame is empty but still carries
        the four columns.
    """
    result_columns = ['gender', 'image_1', 'image_2', 'cosine_similarity']

    model = SentenceTransformer(model_name)

    # encode() returns one embedding per input text, in input order, so the
    # position of a row label in `row_keys` is also its row in `embeddings`.
    row_keys = list(data_dict['text'])
    embeddings = np.asarray(model.encode([data_dict['text'][key] for key in row_keys]))

    cosine_sim_results = []

    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings would change order between interpreter runs.
    genders = list(dict.fromkeys(data_dict['gender'].values()))

    for gender in genders:
        # Positions (into row_keys / embeddings) of the rows with this gender.
        positions = [
            position
            for position, key in enumerate(row_keys)
            if data_dict['gender'][key] == gender
        ]
        if len(positions) < 2:
            # A group with fewer than two rows has no pairs to compare.
            continue

        gender_images = [data_dict['image'][row_keys[position]] for position in positions]

        # One call yields the full similarity matrix for the group; the strict
        # upper triangle enumerates each unordered pair (i < j) exactly once,
        # in the same order as a nested i / j loop.
        similarity_matrix = cosine_similarity(embeddings[positions])
        first_idx, second_idx = np.triu_indices(len(positions), k=1)

        for i, j in zip(first_idx.tolist(), second_idx.tolist()):
            cosine_sim_results.append({
                'gender': gender,
                'image_1': gender_images[i],
                'image_2': gender_images[j],
                'cosine_similarity': similarity_matrix[i, j]
            })

    # Explicit columns keep the schema stable even when there are no records.
    results_df = pd.DataFrame(cosine_sim_results, columns=result_columns)
    return results_df

In [4]:
# Within-gender pairwise similarities using all-mpnet-base-v2, saved as CSV.
mpnetbase_df = cosines_by_image(
    model_name='sentence-transformers/all-mpnet-base-v2',
    data_dict=study_2,
)
mpnetbase_df.to_csv('../Cosine/mpnetbase.csv', index=False)

  return self.fget.__get__(instance, owner)()


In [5]:
# Within-gender pairwise similarities using all-distilroberta-v1, saved as CSV.
distilroberta_df = cosines_by_image(
    model_name='sentence-transformers/all-distilroberta-v1',
    data_dict=study_2,
)
distilroberta_df.to_csv('../Cosine/distilroberta.csv', index=False)

In [6]:
# Within-gender pairwise similarities using all-MiniLM-L12-v2, saved as CSV.
allminilm_df = cosines_by_image(
    model_name='sentence-transformers/all-MiniLM-L12-v2',
    data_dict=study_2,
)
allminilm_df.to_csv('../Cosine/allminilm.csv', index=False)