In [1]:
import os, re, sys, math, base64
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# Remember the working directory at start-up; the last cell calls
# os.chdir(current_directory) before writing the output CSV.
current_directory = os.getcwd()

In [3]:
# Load the raw results and shorten the two identifier columns to the names
# ('image', 'iter') that the later cells index by.
data = pd.read_csv('llama.csv').rename(
    columns={'image_id': 'image', 'simulation': 'iter'}
)

## Define Measures

In [4]:
def calculate_pd(probs):
    """Return the probability of differentiation: ``1 - sum(p ** 2 for p in probs)``.

    ``probs`` may be any iterable of numbers (list, tuple, NumPy array).
    An empty iterable yields ``1`` because the sum of squares is ``0``.
    """
    squared_terms = (p ** 2 for p in probs)
    return 1 - sum(squared_terms)

def calculate_entropy(probs):
    """Return the Shannon entropy of ``probs`` in nats (natural logarithm).

    Parameters
    ----------
    probs : array-like of float
        Probabilities of a discrete distribution.

    Returns
    -------
    numpy.float64
        ``-sum(p * ln(p))``. Entries that are exactly zero contribute nothing
        (the usual ``0 * log(0) = 0`` convention) instead of producing NaN.
        Negative or NaN entries still propagate NaN.
    """
    probs = np.asarray(probs)
    # Drop exact zeros only: 0 * log(0) would evaluate to 0 * -inf = nan.
    nonzero = probs[probs != 0]
    return -np.sum(nonzero * np.log(nonzero))

def calculate_perplexity(probs):
    """Return the perplexity of ``probs``: ``exp(H)`` with ``H`` in nats.

    The exponent base must match the logarithm base used for the entropy.
    ``calculate_entropy`` uses the natural log, so the perplexity is
    ``e ** H`` (not ``2 ** H``). A uniform distribution over ``k`` outcomes
    therefore has perplexity ``k``.
    """
    ent = calculate_entropy(probs)
    perplexity = np.exp(ent)
    return perplexity

In [5]:
# Expand one serialized 'top_20_probs' value into per-position measure records
def explode_and_calculate_measures(row, max_positions=50):
    """Compute per-position uncertainty measures from one serialized cell.

    Parameters
    ----------
    row : str
        Text that evaluates to an iterable with one entry per position. Each
        entry is a sequence of items whose element ``[1]`` is a
        log-probability (presumably ``(token, logprob)`` pairs -- confirm
        against the code that produced the CSV).
    max_positions : int, optional
        Only positions ``1 .. max_positions`` are processed. Defaults to 50,
        the cut-off that was previously hard-coded.

    Returns
    -------
    list of dict
        One dict per processed position with the keys ``'position'``
        (1-based), ``'entropy'``, ``'perplexity'`` and
        ``'probability_of_differentiation'``.

    Raises
    ------
    ValueError
        If a position has no candidates (``np.max`` of an empty sequence).
    """
    # NOTE(review): eval() executes whatever code the CSV cell contains. If the
    # column only ever holds plain Python literals, ast.literal_eval is the
    # safe replacement -- confirm the serialized format before switching.
    nested_probs = eval(row)  # Convert the string to list of tuples

    exploded_rows = []
    for position, inner_list in enumerate(nested_probs, start=1):
        if position > max_positions:  # Ignore everything past the cut-off
            break
        log_probs = [item[1] for item in inner_list]

        # Softmax over the retained candidates; subtracting the maximum keeps
        # exp() from overflowing, and dividing by the sum renormalizes.
        shifted = log_probs - np.max(log_probs)
        probs = np.exp(shifted)
        probs /= np.sum(probs)

        # Calculate measures. The local is deliberately not called ``pd`` so
        # it cannot be confused with the pandas alias used by the notebook.
        entropy = calculate_entropy(probs)
        perplexity = calculate_perplexity(probs)
        prob_diff = calculate_pd(probs)

        exploded_rows.append({
            'position': position,
            'entropy': entropy,
            'perplexity': perplexity,
            'probability_of_differentiation': prob_diff
        })
    return exploded_rows

In [6]:
# Flatten `data`: every input row becomes one record per position, carrying
# the row's 'image' and 'iter' values alongside the computed measures.
exploded_data = []
for _, source_row in data.iterrows():
    per_position = explode_and_calculate_measures(source_row['top_20_probs'])
    exploded_data.extend(
        {
            'image': source_row['image'],
            'iter': source_row['iter'],
            'position': measures['position'],
            'entropy': measures['entropy'],
            'perplexity': measures['perplexity'],
            'pd': measures['probability_of_differentiation'],
        }
        for measures in per_position
    )

# Build the long-format DataFrame from the collected records
exploded_df = pd.DataFrame(exploded_data)

In [7]:
# Switch back to the directory recorded at start-up, then write the
# per-position measures there without the DataFrame index column.
os.chdir(current_directory)
output_filename = 'llama_probs.csv'
exploded_df.to_csv(output_filename, index=False)