In [1]:
import os
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Working directory of the kernel; the relative data paths used when loading
# the CSV files are resolved against it.
current_directory = os.getcwd()

In [3]:
# Read the source CSV and collapse it to one row per distinct `text`,
# keeping the first (non-null) `image` and `iter` value seen for each text.
data = pd.read_csv(os.path.join(current_directory, '../gpt4omini.csv'))
collapsed_df = (
    data
    .groupby('text', as_index=False)
    .agg({'image': 'first', 'iter': 'first'})
)

# Read the image lookup table and expose its `race_guess` column as `race`.
ganfd = (
    pd.read_csv(os.path.join(current_directory, '../../GANFD/image_lookup.csv'))
    .rename(columns={'race_guess': 'race'})
)

In [4]:
# Attach `race` and `gender` from the lookup table to every collapsed row.
# A left join keeps all rows of collapsed_df (unmatched images get NaN), and
# the join key `full_ID` is dropped afterwards because it duplicates `image`.
merged_df = (
    collapsed_df
    .merge(
        ganfd[['full_ID', 'race', 'gender']],
        how='left',
        left_on='image',
        right_on='full_ID',
    )
    .drop(columns=['full_ID'])
)

In [5]:
# Column-oriented dict: {column name -> {row index -> value}}.
study_1 = merged_df.to_dict(orient='dict')

## Define Functions

In [6]:
def cosines_by_image(model_name, data_dict):
    """Compute within-gender pairwise cosine similarities of text embeddings.

    Every text in ``data_dict`` is embedded with the given sentence-transformers
    model. For each gender, the cosine similarity of every unordered pair of
    embeddings belonging to that gender is returned.

    Parameters
    ----------
    model_name : str
        Name or path passed to ``SentenceTransformer``.
    data_dict : dict
        Column-oriented mapping ``{column -> {row key -> value}}`` (the shape
        produced by ``DataFrame.to_dict()``) with at least the columns
        ``'text'``, ``'gender'`` and ``'image'``.

    Returns
    -------
    pandas.DataFrame
        One row per pair, with columns ``gender``, ``image_1``, ``image_2`` and
        ``cosine_similarity``. Within a gender, pairs are ordered as
        (0, 1), (0, 2), ..., (1, 2), ... over the rows of that gender. Genders
        appear in order of first occurrence. The frame has these columns even
        when no pair exists.

    Notes
    -----
    * Rows whose gender is missing (NaN/None) are skipped.
    * Side effect kept for backward compatibility: the embeddings are stored
      on the caller's dict as ``data_dict['embedding']``.
    """
    model = SentenceTransformer(model_name)

    # Walk the rows by key so text, gender and image stay aligned even when
    # the row keys are not the integers 0..n-1.
    row_keys = list(data_dict['text'])
    embeddings = model.encode([data_dict['text'][key] for key in row_keys])
    data_dict['embedding'] = embeddings
    embedding_matrix = np.asarray(embeddings)

    # Group embedding positions by gender. A dict preserves first-seen order,
    # which makes the output row order reproducible across kernel sessions
    # (iterating a set of strings is not).
    positions_by_gender = {}
    for position, key in enumerate(row_keys):
        gender = data_dict['gender'][key]
        if pd.isna(gender):
            continue
        positions_by_gender.setdefault(gender, []).append(position)

    result_columns = ['gender', 'image_1', 'image_2', 'cosine_similarity']
    cosine_sim_results = []

    for gender, positions in positions_by_gender.items():
        if len(positions) < 2:
            # A single row cannot form a pair.
            continue

        gender_images = [data_dict['image'][row_keys[p]] for p in positions]

        # One similarity matrix per gender instead of one sklearn call per
        # pair; the strict upper triangle enumerates each unordered pair once,
        # in row-major order.
        sim_matrix = cosine_similarity(embedding_matrix[positions])
        first_idx, second_idx = np.triu_indices(len(positions), k=1)

        for i, j in zip(first_idx.tolist(), second_idx.tolist()):
            cosine_sim_results.append({
                'gender': gender,
                'image_1': gender_images[i],
                'image_2': gender_images[j],
                'cosine_similarity': sim_matrix[i, j]
            })

    return pd.DataFrame(cosine_sim_results, columns=result_columns)

In [7]:
# Within-gender pairwise similarities using the all-mpnet-base-v2 model,
# written to a CSV in the current working directory.
mpnetbase_df = cosines_by_image(
    model_name='sentence-transformers/all-mpnet-base-v2',
    data_dict=study_1,
)
mpnetbase_df.to_csv('mpnetbase_gender.csv', index=False)