# Load packages

In [1]:
import json 
import numpy as np
from scipy.spatial.distance import cosine
from scipy.spatial.distance import squareform
from sklearn.metrics import pairwise_distances as pdist
from sentence_transformers import SentenceTransformer
import itertools
import json
import pandas as pd
import itertools
import re
import numpy as np
import pickle
import random
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

model = SentenceTransformer("all-mpnet-base-v2")

np.random.seed(416)
random.seed(416)

# Clean AI responses

In [2]:
def fix_cat(x):
    if x == "startup":
        return "startups"
    elif x == "oped":
        return "opeds"
    elif x== "podcast":
        return "podcasts"
        
def extract_json(text):
    try:
        matches = re.findall(r'\{[^{}]*\}', text)
        if matches:
            json_str = matches[0]
            data = json.loads(json_str)
            if len(data.keys()) == 1:
                return list(data.values())[0]
            else:
                raise ValueError("More than 1 key")
        else:
            raise ValueError("No JSON found in the provided text.")
    except Exception as e:
        print(f"Error: {e}\nResponse: {text}\n")
        return np.nan

# Example usage

##############
# Read in and fix cat col
##############

ai_ideas = pd.read_json("ai_ideas.jsonl", lines=True)
ai_ideas['category'] = ai_ideas['category'].apply(fix_cat)

##############
# Simple stats
##############
print("====="*10)
print("Number of AI ideas", len(ai_ideas))
print("Number of AI ideas by domain and model", ai_ideas.groupby(by=['category', 'model']).count())
print("====="*10)


##############
# Coerce json data
##############
ai_ideas['text'] = ai_ideas['output'].apply(extract_json)

##############
# Drop NaNs
##############
print("====="*10)
print(f"Number of nans:", np.sum(ai_ideas['text'].isna()))
print(ai_ideas[ai_ideas['text'].isna()][['model', 'category']])
print("====="*10)

ai_ideas = ai_ideas.dropna(subset=['text'])


Number of AI ideas 900
Number of AI ideas by domain and model                         output  dataset_id
category model                            
opeds    claude-2          100         100
         gpt-3.5-turbo     100         100
         gpt-4-0613        100         100
podcasts claude-2          100         100
         gpt-3.5-turbo     100         100
         gpt-4-0613        100         100
startups claude-2          100         100
         gpt-3.5-turbo     100         100
         gpt-4-0613        100         100
Error: Expecting ',' delimiter: line 1 column 708 (char 707)
Response: {"description": "Welcome to 'Invisible Threads,' an immersive podcast journey where we unravel the unseen ties that connect us all. Each episode is a deep dive into the intricate world of culture, science, history, and the collective human experience. Join our host, renowned anthropologist Dr. Emma Robinson, as she navigates through compelling stories, mind-bending theories, and engaging dia

In [3]:
ai_ideas.sample(5)

Unnamed: 0,model,category,output,dataset_id,text
2,gpt-3.5-turbo,startups,"{""idea"": ""A platform that connects remote work...",gpt-3.5-turbo_startup_2,A platform that connects remote workers with l...
554,claude-2,opeds,Here is a suggested op-ed headline for the Ne...,claude-2_oped_54,America's Uncertain Future
33,gpt-3.5-turbo,startups,"{""idea"": ""A platform that helps users find and...",gpt-3.5-turbo_startup_33,A platform that helps users find and connect w...
810,claude-2,podcasts,"{""description"": ""Welcome to the Podcast Pros ...",claude-2_podcast_10,"Welcome to the Podcast Pros podcast, where we ..."
157,gpt-4-0613,startups,"{""idea"": ""An AI-powered platform that connects...",gpt-4-0613_startup_57,An AI-powered platform that connects young pro...


# Read in human responses

In [3]:
# Opeds
opeds = pd.read_json("2017-01-01_to_2024-04-15_nyt_headlines.jsonl", lines=True)
brief_opeds = opeds[['abstract', 'date', 'dataset_id']]
brief_opeds['date'] = pd.to_datetime(brief_opeds['date'])
brief_opeds.columns = ['text', 'date', 'dataset_id']
brief_opeds['category'] = 'opeds'

# # Startups
startups = pd.read_json("2017-01-01_to_2024-04-15_startups.jsonl", lines=True)
brief_startups = startups[['description', 'date', 'dataset_id']]
brief_startups.columns = ['text', 'date', 'dataset_id']
brief_startups['category'] = 'startups'

brief_human = pd.concat([brief_startups, brief_opeds])
brief_human['date'] = pd.to_datetime(brief_human['date'], utc=True).dt.date

brief_human.to_json("brief_human.jsonl", lines=True, orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brief_opeds['date'] = pd.to_datetime(brief_opeds['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brief_opeds['category'] = 'opeds'


In [4]:
brief_human.sample(5)

Unnamed: 0,text,date,dataset_id,category
5035,Collaboration and meeting software for Virtual...,2017-11-20,startup_5035,startups
19096,Remote work advice just for you,2020-05-05,startup_19096,startups
28218,Free polling and survey,2021-11-13,startup_28218,startups
23702,Free Typeform alternative with next-level calc...,2021-02-18,startup_23702,startups
1974,Some alterations to the A.C.A. proposed by the...,2017-07-13,oped_1974,opeds


# Sbert embeddings

In [None]:
human_embeddings = model.encode(brief_human['text'].tolist(), show_progress_bar=True)
ai_embeddings = model.encode(ai_ideas['output'].tolist(), show_progress_bar=True)

brief_human['vec'] = [i for i in human_embeddings]
ai_ideas['vec'] = [i for i in ai_embeddings]

brief_human.to_json("brief_human_w_vec.jsonl", lines=True, orient='records')
ai_ideas.to_json("ai_ideas_w_vec.jsonl", lines=True, orient='records')

Batches:   0%|          | 0/1536 [00:00<?, ?it/s]

# Save files for analysis

Object properties:
- `vec`: The SBERT embedding of text
- `dataset_id`: A textual id (e.g: `gpt4_startup_0`)
- `idx`: Every idea assigned a unique index, which is useful for slicing the similarity matrix

Pickle files created:
- `dataset_id2idx`
- `idx2dataset_id`
- `idx2vec`
- `dataset_id2vec`
  

In [None]:
dids = brief_human['dataset_id'].tolist() + ai_ideas['dataset_id'].tolist()
vecs = brief_human['vec'].tolist() + ai_ideas['vec'].tolist()
comb = pd.DataFrame({'dataset_id':dids, 'vec':vecs})
comb['idx'] = [i for i in range(len(comb))]

# Different dictionaries and mappings
data_dict = {
    "dataset_id2idx": comb.set_index('dataset_id')['idx'].to_dict(),
    "idx2dataset_id": comb.set_index('idx')['dataset_id'].to_dict(),
    "idx2vec": comb.set_index('idx')['vec'].to_dict(),
    "dataset_id2vec": comb.set_index('dataset_id')['vec'].to_dict()
}

for filename, data in data_dict.items():
    with open(f"{filename}.pkl", "wb") as file:
        pickle.dump(data, file)
        print(f'Created {filename}.pkl')


# One master similarity matrix so we don't have to re-compute
vec_array = np.array(comb['vec'].tolist())
cdist = pdist(vec_array, metric='cosine', n_jobs=-1)  
csim = 1 - cdist  
np.fill_diagonal(csim, 1)
with open('sim_mat.pkl', 'wb') as file:
    pickle.dump(csim, file)
    print("Created sim mat")

# Create domain-level datasets for regression

## Simple pairwise differences

`pw_data.csv` has triplets like `(human_id, ai_id, sim)`

In [None]:
import os 
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import os


cpus = os.cpu_count()
cpus_to_use = min(8, cpus-4)


with open('sim_mat.pkl', 'rb') as file:
    csim = pickle.load(file)  # Load the similarity matrix

with open('dataset_id2idx.pkl', 'rb') as file:
    dataset_id2idx = pickle.load(file)  

def process_ai_idea(ai_ids_batch, human_ids, csim, dataset_id2idx):
    results = []
    human_idxs = [dataset_id2idx[human_id] for human_id in human_ids]
    for ai_id in ai_ids_batch:
        ai_idx = dataset_id2idx[ai_id]
        sim_scores = csim[ai_idx][human_idxs]
        for human_id, sim in zip(human_ids, sim_scores):
            combined_row = {'ai_id': ai_id, 'human_id': human_id, 'sim': sim}
            results.append(combined_row)
    return results


domains = ['startups', 'opeds']
pw_data = []
cpus_to_use = min(8, os.cpu_count() - 4)
batch_size = 128  

for domain in domains:
    print("Processing domain: ", domain)
    human_ids = brief_human.query(f"category == '{domain}'")['dataset_id'].tolist()
    ai_ids = ai_ideas.query(f"category == '{domain}'")['dataset_id'].tolist()

    with ThreadPoolExecutor(max_workers=cpus_to_use) as executor:
        ai_id_batches = [ai_ids[i:i + batch_size] for i in range(0, len(ai_ids), batch_size)]
        futures = [executor.submit(process_ai_idea, batch, human_ids, csim, dataset_id2idx) for batch in ai_id_batches]
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing {domain} domain"):
            pw_data.extend(future.result())

pw_df = pd.DataFrame(pw_data)
print("Got pw data")
pw_df.to_csv("all_pw_data.csv")
print("Data has been written to all_pw_data.csv")

## Make enriched file

`pw_data_enriched` has enriched columns: 

`['date', 'dataset_id', 'category', 'ai_id', 'human_id', 'sim', 'model','in_window']`

In [None]:
merged = pd.merge(brief_human, pw_df, left_on=['dataset_id'], right_on=['human_id'])
merged = merged[[x for x in merged.columns if 'vec' not in x and 'text' not in x]]
merged['model'] = merged['ai_id'].apply(lambda x: x.split("_")[0])
merged['date'] = pd.to_datetime(merged['date'])
merged['claude_window'] = (merged['date'] <= "2023-04-01").astype(int)
merged['gpt_window'] = (merged['date'] <= "2021-10-01").astype(int)
merged['in_window'] = np.where(merged['model'] == 'chatgpt', merged['gpt_window'], merged['claude_window'])
merged['in_window_str'] =merged['in_window'].apply(lambda x: "Inside Training Window" if x==1 else "Outside Training Window")
bad_cols = [x for x in merged.columns if 'window' in x and x != 'in_window']
merged = merged[[x for x in merged.columns if x not in bad_cols]]
print("Made pw_data_enriched")
merged.to_csv("all_pw_data_meta.csv")