Import necessary packages

In [29]:
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Download NLTK resources

In [32]:
import nltk

# Fetch the tokenizer models first; the stopword corpus is requested last as a
# bare expression so its return value remains the cell's displayed output.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jacksongeorge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jacksongeorge/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacksongeorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Results for Figure 4

Load the human-written and LLM-generated flitzes and compute the TF-IDF cosine similarity between each matched pair (data for Figure 4)

In [41]:
# Load the human-written flitzes (saved in an Excel file, one flitz per row).
# header=None makes pandas treat the first row as data rather than as column names.
human_written_flitz_file_path = "/Users/jacksongeorge/Desktop/human_written_flitzes.xlsx"
human_written_flitz_df = pd.read_excel(human_written_flitz_file_path, header=None)

# Load the LLM-generated flitzes.
ai_generated_flitz_folder = "/Users/jacksongeorge/Desktop/ai_flitz_samples"
# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# result rows reproducible from run to run.
ai_flitz_file_list = sorted(os.listdir(ai_generated_flitz_folder))

# Naming pattern of the LLM-generated files. The folder is expected to hold 170 Excel
# files: 10 samples at each of 17 temperature values.
# Example file: temp_0.00_sample_01.xlsx
pattern = r"temp_(\d+\.\d{2})_sample_(\d+)\.xlsx"

# Number of virtual students, i.e. rows compared per file.
num_students = 33

# One dict per AI file: temperature, sample number, and mean similarity over students.
averaged_results = []

for filename in ai_flitz_file_list:
    # fullmatch (not match) so names with trailing characters after ".xlsx"
    # (e.g. backup copies) are rejected instead of being read as samples.
    match = re.fullmatch(pattern, filename)
    if not match:
        continue  # skip files that don't match the pattern

    temperature = float(match.group(1))
    sample_number = int(match.group(2))

    file_path = os.path.join(ai_generated_flitz_folder, filename)
    ai_df = pd.read_excel(file_path)

    # Never index past the end of either sheet: a short workbook is reported
    # and compared on the rows it has rather than aborting the whole run.
    rows_available = min(num_students, len(human_written_flitz_df), len(ai_df))
    if rows_available < num_students:
        print(f"⚠️ Only {rows_available} of {num_students} rows available for {filename}")

    similarity_scores = []

    for i in range(rows_available):  # for each virtual student
        real_text = human_written_flitz_df.iloc[i, 0]
        ai_text = ai_df.loc[i, 'Flitz Output']

        if pd.isna(real_text) or pd.isna(ai_text):
            print(f"⚠️ Skipping index {i+1} in {filename} due to missing data")
            continue

        real_text = str(real_text)
        ai_text = str(ai_text)

        # Compute cosine similarity. The vectorizer is fitted on just this pair,
        # so the TF-IDF weights are specific to the two texts being compared.
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform([real_text, ai_text])
        score = cosine_similarity(tfidf[0], tfidf[1])[0][0]
        similarity_scores.append(score)

    # Files where every pair was skipped contribute no row.
    if similarity_scores:
        average_score = sum(similarity_scores) / len(similarity_scores)
        averaged_results.append({
            'temperature': temperature,
            'sample_number': sample_number,
            'average_cosine_similarity': average_score
        })

Save the cosine similarity results (Data for Figure 4)

In [43]:
# Tabulate the per-file averages and write them to the current user's Desktop.
averaged_df = pd.DataFrame(averaged_results)
desktop_dir = os.path.join(os.path.expanduser("~"), "Desktop")
output_path = os.path.join(desktop_dir, "average_cosine_similarity_results.xlsx")
averaged_df.to_excel(output_path, index=False)
print(f"Averaged results saved to: {output_path}")

Averaged results saved to: /Users/jacksongeorge/Desktop/average_cosine_similarity_results.xlsx


## Results for Figure 2D

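Compute the Wasserstein distance between each temperature's distribution of average cosine similarities and the distribution at the reference temperature, T = 0.00 (Data for Figure 2D)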

In [53]:
from scipy.stats import wasserstein_distance

# Wasserstein distance between the distribution of average cosine similarities
# at the reference temperature and the distribution at every temperature.
reference_temp = 0.00
temperatures = sorted(averaged_df['temperature'].unique())

# Similarities observed at the reference temperature
ref_data = averaged_df[averaged_df['temperature'] == reference_temp]['average_cosine_similarity']

# temperature -> list of distances to the reference distribution
all_distances = {temp: [] for temp in temperatures}

if len(ref_data) == 0:
    print(f"⚠️ No data for T={reference_temp}")
else:
    for temp in temperatures:
        temp_mask = averaged_df['temperature'] == temp
        other_data = averaged_df.loc[temp_mask, 'average_cosine_similarity']

        # NaN marks a temperature that has no samples
        all_distances[temp].append(
            wasserstein_distance(ref_data, other_data) if len(other_data) > 0 else np.nan
        )

# Mean and standard deviation of the distances per temperature.
# NOTE(review): each list above receives at most one value, so std_distance is
# always 0 and mean_distance is that single distance — confirm this is intended.
summary = {"temperature": [], "mean_distance": [], "std_distance": []}

for temp in temperatures:
    valid_dists = np.array(all_distances[temp])
    valid_dists = valid_dists[~np.isnan(valid_dists)]  # keep only real distances
    if len(valid_dists) == 0:
        continue
    summary["temperature"].append(temp)
    summary["mean_distance"].append(np.mean(valid_dists))
    summary["std_distance"].append(np.std(valid_dists))

Save Wasserstein distance from reference point results (Figure 2D)

In [56]:
# Tabulate the distance-to-reference summary and write it to the current user's Desktop.
summary_df = pd.DataFrame(summary)
desktop_dir = os.path.join(os.path.expanduser("~"), "Desktop")
output_path = os.path.join(desktop_dir, "wasserstein_distance_reference_point.xlsx")
summary_df.to_excel(output_path, index=False)
print(f"Wasserstein distance from reference point results saved to: {output_path}")

Wasserstein distance from reference point results saved to: /Users/jacksongeorge/Desktop/wasserstein_distance_reference_point.xlsx


## Results for Figure 2C

Compute Wasserstein Distance between Consecutive Temperature Values (Data for Figure 2C)

In [60]:
# Temperature values in ascending order
temperatures = sorted(averaged_df['temperature'].unique())

midpoints, distances = [], []

# Walk adjacent temperature pairs (t1, t2) and compare their similarity distributions
for t1, t2 in zip(temperatures, temperatures[1:]):
    data1 = averaged_df.loc[averaged_df['temperature'] == t1, 'average_cosine_similarity']
    data2 = averaged_df.loc[averaged_df['temperature'] == t2, 'average_cosine_similarity']

    # A pair is skipped when either side has no samples
    if len(data1) == 0 or len(data2) == 0:
        continue

    # Each distance is recorded against the midpoint of its two temperatures
    midpoints.append((t1 + t2) / 2)
    distances.append(wasserstein_distance(data1, data2))

# One row per adjacent pair
diff_df = pd.DataFrame({
    'temperature_midpoint': midpoints,
    'wasserstein_distance': distances
})

Save Wasserstein distance between consecutive temperature values results (Figure 2C)

In [62]:
# Write the consecutive-temperature distances to the current user's Desktop.
desktop_dir = os.path.join(os.path.expanduser("~"), "Desktop")
output_path = os.path.join(desktop_dir, "wasserstein_distance_consecutive_temps.xlsx")
diff_df.to_excel(output_path, index=False)
print(f"Wasserstein distance between consecutive temps results saved to: {output_path}")

Wasserstein distance between consecutive temps results saved to: /Users/jacksongeorge/Desktop/wasserstein_distance_consecutive_temps.xlsx
