# massive workflow

In [None]:
from google import genai
import glob
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import umap
import csv
import time
from sentence_transformers import SentenceTransformer
import random
import re
from openai import OpenAI
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# FUNCTIONS

In [None]:
def check_empty(text_list):
  """Return the indices of entries that contain no real text.

  An entry counts as empty when it is made up solely of whitespace and/or
  asterisk characters (the empty string included).
  """
  blank_pattern = re.compile(r"[\s*]*")
  return [idx for idx, text in enumerate(text_list) if blank_pattern.fullmatch(text)]

In [None]:
# Splits a response on a paired marker (e.g. "**") and glues each marked heading
# back onto the text that follows it.
def split_combine_doubleA(output, split_on):
  """Split `output` on `split_on` and merge the pieces pairwise into sections.

  After the optional preamble (the text before the first marker), pieces are
  joined two at a time: the text between a pair of markers plus the text that
  follows the closing marker. The markers themselves are not kept.

  Args:
    output: the full response text.
    split_on: the marker string, e.g. "**".

  Returns:
    A list of section strings. When `output` does not start with `split_on`,
    the preamble is the first section. A non-empty trailing piece that has no
    partner (odd number of pieces after the preamble, e.g. an unclosed marker)
    is kept as its own final section instead of being dropped.
  """
  pieces = output.split(split_on)
  preamble, rest = pieces[0], pieces[1:]

  # When the text starts with the marker, the preamble is just the empty string
  # produced by split(), so it is not a real section.
  sections = [] if output.startswith(split_on) else [preamble]

  sections.extend(rest[i] + rest[i+1] for i in range(0, len(rest)-1, 2))

  # Keep text after an unmatched marker rather than silently losing it.
  if len(rest) % 2 == 1 and rest[-1]:
    sections.append(rest[-1])

  return sections

In [None]:
# Groups sentences into overlapping segments.
def segment(output_sentences, comb_value, overlap_value):
  """Join sentences into space-separated segments with optional overlap.

  Each segment introduces the next `comb_value` sentences and is prefixed with
  the `overlap_value` sentences that precede them, so segments after the first
  hold up to `comb_value + overlap_value` sentences. A segment whose start
  index is smaller than `overlap_value` gets no overlap prefix.

  Every returned segment contains at least one sentence not seen in an earlier
  segment: no empty or overlap-only trailing segment is produced when
  `len(output_sentences)` is a multiple of `comb_value`.

  Args:
    output_sentences: list of sentence strings.
    comb_value: number of new sentences per segment; must be >= 1.
    overlap_value: number of preceding sentences repeated at the start of a
      segment; must be >= 0.

  Returns:
    List of segment strings; empty when `output_sentences` is empty.

  Raises:
    ValueError: if `comb_value` < 1 (the walk would never advance) or
      `overlap_value` < 0 (sentences would be skipped).
  """
  if comb_value < 1:
    raise ValueError("comb_value must be >= 1")
  if overlap_value < 0:
    raise ValueError("overlap_value must be >= 0")

  segments = []
  for start in range(0, len(output_sentences), comb_value):
    # Only reach back when a full overlap is available before `start`.
    first = start - overlap_value if start >= overlap_value else start
    segments.append(" ".join(output_sentences[first : start + comb_value]))

  return segments

In [None]:
# Token counting through the Gemini API.
# NOTE(review): GEMINI_API_KEY is not defined anywhere in the visible notebook —
# confirm it is set before this cell runs.
client = genai.Client(api_key=GEMINI_API_KEY)

def count_tokens(output_sentences):
  """Return the `total_tokens` reported by Gemini for each text, in input order.

  One `client.models.count_tokens` request (model "gemini-2.5-flash-lite") is
  issued per element of `output_sentences`.
  """
  return [
    client.models.count_tokens(model="gemini-2.5-flash-lite", contents=text).total_tokens
    for text in output_sentences
  ]

In [None]:
# Loaded SentenceTransformer instances keyed by checkpoint name, so the model is
# built once per kernel session instead of once per call.
_INTFLOAT_MODELS = {}

def get_embedding_intfloat(text, model="text-embedding-3-large"):
  """Embed `text` with the 'intfloat/e5-base-v2' checkpoint, L2-normalized.

  Args:
    text: passed unchanged to `SentenceTransformer.encode` (callers in this
      notebook pass either one string or a list of strings).
    model: unused. Kept only so existing calls keep working; the checkpoint is
      always 'intfloat/e5-base-v2' regardless of this value.

  Returns:
    The result of `encode(text, normalize_embeddings=True)`.
  """
  checkpoint = 'intfloat/e5-base-v2'
  if checkpoint not in _INTFLOAT_MODELS:
    _INTFLOAT_MODELS[checkpoint] = SentenceTransformer(checkpoint)

  # NOTE(review): inputs are encoded as-is; the e5 model card reportedly
  # recommends a "query: " / "passage: " prefix — confirm whether that matters here.
  return _INTFLOAT_MODELS[checkpoint].encode(text, normalize_embeddings=True)

# WORK

# WORKFLOW FOR SENTENCE SPLIT

In [None]:
# CoT1 response files, in sorted filename order.
output_files = glob.glob("cot1.*_output.txt")
output_files.sort()
output_files

In [None]:
# SPLIT BY SENTENCE
# For every (comb_value, overlap_value) combination: segment each response into
# sentence groups, count tokens and embed every segment, save the stacked
# embeddings to CSV, then plot and save a 2-D PCA of them.
# NOTE: these two assignments are immediately overwritten by the loops below.
comb_value = 3
overlap_value = 1

for comb_value in [3, 5]:
  for overlap_value in [0, 1]:

    embedding_fname = f"embeddings_cot1_splitSentence_comb{comb_value}_over{overlap_value}.csv"

    # Per-response accumulators, reset for each parameter combination.
    n_segments_per_response = []
    embeddings = []
    n_tokens = []

    for file in output_files:
      # read in file
      with open(file, "r") as f:
        output = f.read()

      # split at a whitespace character that follows a period (the period itself
      # must be preceded by a non-whitespace character)
      output_sentences = re.split(r'(?<=\S\.)\s', output)
      output_sentences_modified = segment(output_sentences, comb_value, overlap_value)
      n_tokens.append( count_tokens(output_sentences_modified) )

      # embeddings
      response_embeddings = get_embedding_intfloat(output_sentences_modified)

      n_segments_per_response.append(len(response_embeddings))
      embeddings.append(response_embeddings)

    # Stack all responses' segment embeddings into one 2-D array (rows stay in
    # response order, which the plotting loop below relies on).
    np_arrays = [np.array(arr) for arr in embeddings]
    all_embeddings = np.vstack(np_arrays)

    print(embedding_fname)
    print(all_embeddings.shape)
    print(n_segments_per_response)
    print(n_tokens)

    np.savetxt(embedding_fname, all_embeddings, delimiter=",")
    # TODO: check n_tokens against the embedding model's maximum input length


    png_fname = f"pca_cot1_splitSentence_comb{comb_value}_over{overlap_value}.png"

    # pca: project every segment embedding onto the first two principal components
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(all_embeddings)
    print(embeddings_2d.shape)


    %matplotlib inline
    sns.set(style='white', context='poster', rc={'figure.figsize':(14,10)})

    # `colors` is a placeholder here; it is reassigned for every response below.
    colors = []
    start_index = 0
    end_index = 0

    # Rows [start_index, end_index) of embeddings_2d belong to response i; each
    # response is drawn as its own line plus scatter.
    for i in range(0, len(output_files)):
      end_index = end_index + n_segments_per_response[i]

      # colour value runs from 0 (first segment) to 1 (last segment of the response)
      colors = np.linspace(0, 1, n_segments_per_response[i])
      indices = np.arange(start_index, end_index)

      plt.plot(embeddings_2d[indices, 0], embeddings_2d[indices, 1], color="black", alpha=1, linewidth=0.1)
      plt.scatter(embeddings_2d[indices,0], embeddings_2d[indices,1], c=colors, s=10, cmap='viridis')

      start_index = end_index # for next iteration


    plt.colorbar(label='Order gradient (first → last)')
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title(f"CoT1 PCA - Sentence Split - Combine {comb_value} - Overlap {overlap_value}")
    # save before show so the figure written to disk is the one displayed
    plt.savefig(png_fname, dpi=300, bbox_inches='tight')
    plt.show()

# WORKFLOW FOR ** SPLIT

In [None]:
# CoT2.1 response files, in sorted filename order.
output_files = glob.glob("cot2.1.*_output.txt")
output_files.sort()
output_files

In [None]:
# WORKFLOW FOR SPLIT ON **
# For every (comb_value, overlap_value) combination: split each response on "**",
# group the resulting sections into segments, count tokens and embed every
# segment, save the stacked embeddings to CSV, then plot and save a 2-D PCA.
# NOTE(review): the output file names and plot title below say "cot1", while the
# preceding cell globs "cot2.1.*_output.txt" — confirm which CoT this run is for
# so an existing cot1 CSV/PNG is not overwritten with other data.
for comb_value in [3, 5]:
  for overlap_value in [0, 1]:

    embedding_fname = f"embeddings_cot1_splitAA_comb{comb_value}_over{overlap_value}.csv"

    # Per-response accumulators, reset for each parameter combination.
    embeddings = []
    n_tokens = []
    n_segments_per_response = []
    for file in output_files:
      with open(file, "r") as f:
        output = f.read()

      # sections delimited by "**", then grouped comb_value at a time
      output_split = split_combine_doubleA(output, "**")
      output_sentences_modified = segment(output_split, comb_value, overlap_value)
      n_tokens.append( count_tokens(output_sentences_modified) )

      # embeddings
      response_embeddings = get_embedding_intfloat(output_sentences_modified)

      n_segments_per_response.append(len(response_embeddings))
      embeddings.append(response_embeddings)

    # Stack all responses' segment embeddings into one 2-D array (rows stay in
    # response order, which the plotting loop below relies on).
    np_arrays = [np.array(arr) for arr in embeddings]
    all_embeddings = np.vstack(np_arrays)
    print(embedding_fname)
    print(all_embeddings.shape)
    print(n_segments_per_response)
    print(n_tokens)

    np.savetxt(embedding_fname, all_embeddings, delimiter=",")

    png_fname = f"pca_cot1_splitAA_comb{comb_value}_over{overlap_value}.png"

    # pca: project every segment embedding onto the first two principal components
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(all_embeddings)
    print(embeddings_2d.shape)


    %matplotlib inline
    sns.set(style='white', context='poster', rc={'figure.figsize':(14,10)})

    # `colors` is a placeholder here; it is reassigned for every response below.
    colors = []
    start_index = 0
    end_index = 0

    # Rows [start_index, end_index) of embeddings_2d belong to response i; each
    # response is drawn as its own line plus scatter.
    for i in range(0, len(output_files)):
      end_index = end_index + n_segments_per_response[i]

      # colour value runs from 0 (first segment) to 1 (last segment of the response)
      colors = np.linspace(0, 1, n_segments_per_response[i])
      indices = np.arange(start_index, end_index)

      plt.plot(embeddings_2d[indices, 0], embeddings_2d[indices, 1], color="black", alpha=1, linewidth=0.1)
      plt.scatter(embeddings_2d[indices,0], embeddings_2d[indices,1], c=colors, s=10, cmap='viridis')

      start_index = end_index # for next iteration


    plt.colorbar(label='Order gradient (first → last)')
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title(f"CoT1 PCA - AA Split - Combine {comb_value} - Overlap {overlap_value}")
    # save before show so the figure written to disk is the one displayed
    plt.savefig(png_fname, dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Segment counts per response, as left behind by the final
# (comb_value, overlap_value) iteration of the workflow loop above.
n_segments_per_response

In [None]:
# Show the CSV and PNG names written by the last loop iteration, one per line.
print(embedding_fname, png_fname, sep="\n")

# PCA for all three CoTs

In [None]:
# Saved "**"-split embedding CSVs for every CoT, in sorted filename order.
embedding_files = glob.glob('embeddings_cot*_splitAA*.csv')
embedding_files.sort()
embedding_files

In [None]:
# Load each embedding CSV into its own array, keeping the order of `embedding_files`.
embeddings = [np.loadtxt(csv_path, delimiter=',') for csv_path in embedding_files]

In [None]:
# Hard-coded number of segments per response: one row per CoT (same order as
# `embeddings`), one column per response. Each row is used later to cut that
# CoT's PCA rows into per-response trajectories.
# NOTE(review): presumably copied from the `n_segments_per_response` printouts of
# the runs that wrote the CSVs — confirm the rows still match the files on disk.
segs = np.array(
    [[9, 10, 8, 9, 10, 10, 7, 10, 7, 6],
     [11, 10, 21, 15, 12, 16, 31, 14, 12, 12],
     [6, 9, 12, 5, 12, 8, 2, 10, 2, 6]]
)

# segs = segs - 1

In [None]:
# Stack the per-file embedding arrays row-wise (file order preserved) so a single
# PCA can be fitted on all CoTs together; the last line displays the shape.
all_embeddings_combined = np.vstack(embeddings)
all_embeddings_combined.shape

In [None]:
# pca: fit one 2-component PCA on the combined embeddings so every CoT is
# projected onto the same pair of axes
pca = PCA(n_components=2)
embeddings_2d_combined = pca.fit_transform(all_embeddings_combined)
print(embeddings_2d_combined.shape)

In [None]:
# Total number of segments per CoT (row sums of `segs`).
list(map(sum, segs))

In [None]:
# Cut the combined PCA projection back into one block per embedding file.
# The boundaries come from each file's own row count — the same arrays, in the
# same order, that were stacked into `all_embeddings_combined` — so nothing
# depends on hard-coded sizes.
rows_per_file = [np.atleast_2d(file_embeddings).shape[0] for file_embeddings in embeddings]

# Guard against stale kernel state: `embeddings` is also assigned by the
# workflow cells earlier in the notebook.
assert sum(rows_per_file) == len(embeddings_2d_combined), (
    "embeddings and embeddings_2d_combined are out of sync; re-run the load and PCA cells"
)

# np.split takes the interior cut points, i.e. the cumulative sizes without the last one.
pca_split = np.split(embeddings_2d_combined, np.cumsum(rows_per_file)[:-1])

In [None]:
# Number of projected rows in each per-CoT block.
list(map(len, pca_split))

In [None]:
# umaps for 2 cots

%matplotlib inline
sns.set(style='white', context='poster', rc={'figure.figsize':(14,10)})

colors = []

for j in range(0, len(embeddings)):
  start_index = 0
  end_index = 0
  lw = 0.3
  ptsize = 10
  marker = 'o'

  if j == 0:
    cot_color = 'blue'
  if j == 1:
    cot_color = 'red'
  if j == 2:
    cot_color = 'green'
    lw = 2
    ptsize = 100
    marker = '*'

  embeddings_2d = pca_split[j]
  n_segments_per_response = segs[j]

  for i in range(0, len(n_segments_per_response)):
    end_index = end_index + n_segments_per_response[i]

    colors = np.linspace(0, 1, n_segments_per_response[i])
    indices = np.arange(start_index, end_index)

    plt.plot(embeddings_2d[indices, 0], embeddings_2d[indices, 1], color=cot_color, alpha=1, linewidth=lw)
    plt.scatter(embeddings_2d[indices,0], embeddings_2d[indices,1], c=colors, s=ptsize, marker=marker, cmap='viridis')

    start_index = end_index # for next iteration

blue_line = mlines.Line2D([], [], color='blue', label='CoT1')
red_line = mlines.Line2D([], [], color='red', label='CoT2')
green_line = mlines.Line2D([], [], color='green', label='CoT3')
plt.legend(handles=[blue_line, red_line, green_line])

plt.colorbar(label='Order gradient (first → last)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("CoT1, 2.1,3.1 PCA - Bold Split - Combine 552 - Overlap 1")
plt.show()

# COSINE SIMILARITY BETWEEN RESPONSES

In [None]:
# Response files of every CoT, in sorted filename order.
output_files = glob.glob("cot*_output.txt")
output_files.sort()
output_files

In [None]:
# Embed every response as a single document: one vector per file.
# NOTE(review): each whole response is passed as one input; check that it fits
# the embedding model's maximum sequence length — confirm no truncation occurs.
doc_embedding_rows = []
for path in output_files:
  with open(path, 'r') as handle:
    doc_embedding_rows.append(get_embedding_intfloat(handle.read()))

# One row per file, in the order of `output_files`; the last line shows the shape.
all_doc_embs = np.vstack(np.array(doc_embedding_rows))
all_doc_embs.shape

In [None]:

# 1️⃣ Compute pairwise cosine similarity
# Pairwise cosine similarity between the whole-response embeddings
# (rows and columns follow the order of `output_files`).
sim_matrix = cosine_similarity(all_doc_embs)

# Tick labels are the response indices. They are derived from the matrix size so
# the plot stays correct for any number of files matched by the glob, instead of
# assuming exactly 20.
doc_indices = np.arange(sim_matrix.shape[0])

# Plot as heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(sim_matrix,
            cmap="viridis",
            square=True,
            cbar_kws={"label": "Cosine Similarity"},
            xticklabels=doc_indices,
            yticklabels=doc_indices)

plt.title("CoT1&2 Cosine Similarity")
plt.show()