In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

In [15]:
# Story definition
from common_variables import *

In [13]:
# Root folder holding every data file this notebook reads or writes
path_data_files = "../data/"

# CSV with the full experiment data (input of the cleaning step)
path_final_data = path_data_files + "/final_experiment/full_data.csv"

# Folder where the text embeddings and their 2-D projections are saved
path_text_embeddings = path_data_files + "/text_embeddings/"

In [3]:
def _single_line(text):
    """Return `text` with line breaks and tabs replaced by spaces so it fits in one TSV field."""
    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ").replace("\t", " ")


def process_file(data, out_path, chain=True):
    """
    Flatten raw experiment rows into a tab-separated file, one line per response.

    Rows are grouped by 'replication' and, inside each group, visited in ascending
    'generation' order (rows of the same generation keep their input order).
    Every row produces one output line with the columns:
    layer_n (generation + 1), rep (the replication key), story1, story2, story3
    (the texts of the previous generation) and story_merged (the row's own response).
    Rows with generation < 1 have no previous generation, so their own response is
    repeated in story1..story3.

    Parameters:
    - data (DataFrame): must contain the columns 'replication', 'generation' and 'response'.
      'response' values must be strings (str.replace is called on them).
    - out_path (str): file to (over)write. The content is tab-separated and UTF-8 encoded.
    - chain (bool, optional): If True, the first response found for the previous generation
      fills all three story columns. If False, the previous generation must hold exactly
      three responses, which fill story1, story2 and story3 in row order. Default is True.

    Returns:
    None. The cleaned data is written to out_path.

    Raises:
    - IndexError: chain=True and the previous generation has no row in the same replication.
    - ValueError: chain=False and the previous generation does not have exactly three responses.
    """
    header = ["layer_n", "rep", "story1", "story2", "story3", "story_merged"]

    # Explicit UTF-8 so the file matches the default encoding pd.read_csv uses to read it back
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\t".join(header) + "\n")

        for i, rep in data.groupby("replication"):
            # Stable sort: rows of the same generation keep the order they had in `data`
            ordered = rep.sort_values(by="generation", kind="mergesort")

            for rn, row in ordered.iterrows():
                gen = row["generation"]
                story_merged = row["response"]

                if gen < 1:
                    # First generation: there is nothing to derive from
                    story1, story2, story3 = story_merged, story_merged, story_merged
                else:
                    previous = rep.loc[rep["generation"] == gen - 1, "response"].values
                    if chain:
                        # Raises IndexError when the previous generation is missing
                        pv = previous[0]
                        story1, story2, story3 = pv, pv, pv
                    else:
                        if len(previous) != 3:
                            raise ValueError(
                                "replication {!r}, generation {}: expected 3 responses in the "
                                "previous generation, found {}".format(i, gen, len(previous))
                            )
                        story1, story2, story3 = previous

                # Tabs and any kind of line break inside a story would split the field or the row
                stories = [_single_line(s) for s in (story1, story2, story3, story_merged)]
                f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(gen + 1, i, *stories))


In [9]:
# Read the data
exp = pd.read_csv(path_final_data)

# Separate into chain and network.
# .copy() makes each subset an independent frame, so the column assignment below
# never writes into a slice of `exp`.
chain_exp = exp.loc[exp["network_type"] == "chain"].copy()
chain_exp["replication"] = chain_exp["transmission_id"] #replication is not unique, use "transmission_id" as ID

network_exp = exp.loc[exp["network_type"] == "network"].copy()


# Clean data (the output files are tab-separated despite the .csv extension)
process_file(chain_exp,f'{path_data_files}/data_processing/cleaned_chain_exp.csv',chain=True)
process_file(network_exp,f'{path_data_files}/data_processing/cleaned_network_exp.csv',chain=False)

# Read the cleaned files back and concatenate them, tagging each row with its condition
network_exp = pd.read_csv(f"{path_data_files}/data_processing/cleaned_network_exp.csv",sep="\t")
network_exp["condition"] = "Network"

chain_exp = pd.read_csv(f"{path_data_files}/data_processing/cleaned_chain_exp.csv",sep="\t")
chain_exp["condition"] = "Chain"
results = pd.concat([network_exp, chain_exp])
# The index is reset BEFORE sorting, so each row keeps its position in the
# concatenated frame (network rows first) as its index label.
results = results.reset_index(drop=True)
# condition descending puts "Network" before "Chain"; rep and layer_n ascending
results = results.sort_values(by=["condition","rep","layer_n"],ascending=[False, True, True])


In [10]:
# Manual inspection (+ viz) to check whether bots got past the bot detection.
# The replications where that was the case are listed here and dropped below.
bot_reps = {"44-1","4-0","0-1"}
is_bot_row = results["rep"].isin(bot_reps)

print(len(results))

# Show the flagged stories before removing them
display(results.loc[is_bot_row, ["layer_n","rep","story_merged"]].values)
results = results.loc[~is_bot_row]
print(len(results))

864


array([[1, '0-1',
        'Its long been known that drawing something helps a person remember it. a new study shows that drawing is superior to activities such as reading or writing because it forces the person to process information in multiple ways; visually, kinesthetically, and semantically. across a series of experiments, researchers found drawing information to be a powerful way to boost memory, increasing recall by nearly double. Myra Fernandes, Jeffrey Warms, and Melissa Meade are experts in the science of memory  how people encode, retain, and recall information. at the university of waterloo, they conducted experiments to better understand how activities such as writing, looking at pictures, listening to lectures, drawing, and visualizing images affect a students ability to remember information. in an early experiment, they asked undergraduate students to study lists of common terms words like truck and pear and then either writes down or illustrate those words.'],
       [2,

846


In [11]:

# Persist the cleaned, combined table (index included) for the next analysis
path_cleaned_combined = f'{path_data_files}/data_final/cleaned_combined_data.csv'
results.to_csv(path_cleaned_combined)

In [12]:
# Preview the first five rows of the cleaned table
results.head(n=5)

Unnamed: 0,layer_n,rep,story1,story2,story3,story_merged,condition
0,1,1,People didn't use to die of heart disease and ...,People didn't use to die of heart disease and ...,People didn't use to die of heart disease and ...,People didn't use to die of heart disease and ...,Network
1,1,1,People didn't use to die or heart disease or c...,People didn't use to die or heart disease or c...,People didn't use to die or heart disease or c...,People didn't use to die or heart disease or c...,Network
2,1,1,Alexander Fleming discovered penicillin in 192...,Alexander Fleming discovered penicillin in 192...,Alexander Fleming discovered penicillin in 192...,Alexander Fleming discovered penicillin in 192...,Network
3,2,1,People didn't use to die of heart disease and ...,People didn't use to die or heart disease or c...,Alexander Fleming discovered penicillin in 192...,Before the discovery of lifesaving antibiotics...,Network
4,2,1,People didn't use to die of heart disease and ...,People didn't use to die or heart disease or c...,Alexander Fleming discovered penicillin in 192...,People didn't used to die of cancer or heart d...,Network


In [17]:
# Keep only the columns needed downstream, then append the original story as one extra row
# (index label 9909, layer 8, rep 0, condition "Full")
data = results[["layer_n","rep","condition","story_merged"]].copy()
data.loc[9909] = [8,0,"Full",story_original]

# Person ID (so we can compare persons of different replications later on):
# the row's index label modulo 3, except for chain rows, which always get 0
data["k"] = data.index % 3
is_chain_row = data["condition"].str.contains("Chain")
data.loc[is_chain_row, "k"] = 0

data

Unnamed: 0,layer_n,rep,condition,story_merged,k
0,1,1,Network,People didn't use to die of heart disease and ...,0
1,1,1,Network,People didn't use to die or heart disease or c...,1
2,1,1,Network,Alexander Fleming discovered penicillin in 192...,2
3,2,1,Network,Before the discovery of lifesaving antibiotics...,0
4,2,1,Network,People didn't used to die of cancer or heart d...,1
...,...,...,...,...,...
860,3,8-2,Chain,"In the old days, people did not live long enou...",0
861,4,8-2,Chain,In the past people didn't live long enough to ...,0
862,5,8-2,Chain,People used to live shorter lives because they...,0
863,6,8-2,Chain,In the past people did not have access to anti...,0


# Create text embeddings for each text

In [20]:
from sentence_transformers import SentenceTransformer, util
from sklearn.manifold import TSNE
import umap

# Open model
model_t = SentenceTransformer(transformer_model) # Defined in common_variables.py

# Embed results (kept as a tensor in `emb`)
emb = model_t.encode(data["story_merged"].values, convert_to_tensor=True)#, device="cuda") #if support for gpu

# NumPy copy on the CPU: pandas, scikit-learn and umap all need a host array,
# and a tensor living on a GPU cannot be converted implicitly.
emb_np = emb.detach().cpu().numpy()

# One row per text: embedding dimensions first, then the metadata columns
emb_df = pd.DataFrame(emb_np)
emb_df.loc[:,"condition"] = data["condition"].values
emb_df.loc[:,"layer_n"] = data["layer_n"].values
emb_df.loc[:,"k"] = data["k"].values
emb_df.loc[:,"rep"] = data["rep"].values

emb_df.to_csv(f"{path_text_embeddings}/story_embeddings.csv", index=None)


# Project to TSNE 
X_embedded = TSNE(random_state=42, n_components=2, learning_rate='auto', n_jobs=-1).fit_transform(emb_np)
np.save(f"{path_text_embeddings}/X_story_embedded_tsne.npy", X_embedded)
print(X_embedded.shape)

# Project to UMAP (X_embedded is reused, so it holds the UMAP projection after this cell)
X_embedded = umap.UMAP(random_state=42, metric="cosine").fit_transform(emb_np)
np.save(f"{path_text_embeddings}/X_story_embedded_umap.npy", X_embedded)
print(X_embedded.shape)



(847, 2)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


(847, 2)
