In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

In [6]:
# Story definition
from common_variables import *


# All functions for text analysis are in this script, load when code is run
%load_ext autoreload
%autoreload 1
%aimport common_functions

from common_data_processing import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Clean up data from Heroku

In [7]:
# Read the raw experiment export
exp = pd.read_csv(path_final_data)

# Separate into chain and network.
# .copy() gives independent frames, so adding a column below (and whatever
# process_file does with its input) never acts on a slice of `exp`.
chain_exp = exp.loc[exp["network_type"] == "chain"].copy()
# "replication" is not unique for chains, so use "transmission_id" as the ID
chain_exp["replication"] = chain_exp["transmission_id"]

network_exp = exp.loc[exp["network_type"] == "network"].copy()

# Clean data; process_file writes its result to the given path
cleaned_chain_path = f'{path_data_files}/data_processing/cleaned_chain_exp.csv'
cleaned_network_path = f'{path_data_files}/data_processing/cleaned_network_exp.csv'
process_file(chain_exp, cleaned_chain_path, chain=True)
process_file(network_exp, cleaned_network_path, chain=False)

# Read the cleaned (tab-separated) files back and label each condition
network_exp = pd.read_csv(cleaned_network_path, sep="\t")
network_exp["condition"] = "Network"

chain_exp = pd.read_csv(cleaned_chain_path, sep="\t")
chain_exp["condition"] = "Chain"

# Concatenate with a fresh 0..n-1 index (assigned before sorting, so the index
# keeps the concatenation order), then order rows as Network before Chain and
# by replication and layer within each condition.
results = pd.concat([network_exp, chain_exp], ignore_index=True)
results = results.sort_values(by=["condition", "rep", "layer_n"], ascending=[False, True, True])


In [8]:
# Manual inspection (+ viz) to see whether bots got past the bot detection.
# The replications listed here are shown and then removed from `results`.
bot_reps = {"44-1", "4-0", "0-1"}
is_bot_rep = results["rep"].isin(bot_reps)

# Row count before removal
print(len(results))

# Show the stories of the flagged replications
display(results.loc[is_bot_rep, ["layer_n", "rep", "story_merged"]].values)

# Keep everything else and report the row count after removal
results = results.loc[~is_bot_rep]
print(len(results))

864


array([[1, '0-1',
        'Its long been known that drawing something helps a person remember it. a new study shows that drawing is superior to activities such as reading or writing because it forces the person to process information in multiple ways; visually, kinesthetically, and semantically. across a series of experiments, researchers found drawing information to be a powerful way to boost memory, increasing recall by nearly double. Myra Fernandes, Jeffrey Warms, and Melissa Meade are experts in the science of memory  how people encode, retain, and recall information. at the university of waterloo, they conducted experiments to better understand how activities such as writing, looking at pictures, listening to lectures, drawing, and visualizing images affect a students ability to remember information. in an early experiment, they asked undergraduate students to study lists of common terms words like truck and pear and then either writes down or illustrate those words.'],
       [2,

846


In [9]:
# Persist the cleaned, combined data so the next analysis can load it
results.to_csv(
    path_or_buf=f'{path_data_files}/data_final/cleaned_combined_data.csv',
)

In [10]:
# Preview the first five rows of the combined data
results.head(n=5)

Unnamed: 0,layer_n,rep,story1,story2,story3,story_merged,condition
0,1,1,People didn't use to die of heart disease and ...,People didn't use to die of heart disease and ...,People didn't use to die of heart disease and ...,People didn't use to die of heart disease and ...,Network
1,1,1,People didn't use to die or heart disease or c...,People didn't use to die or heart disease or c...,People didn't use to die or heart disease or c...,People didn't use to die or heart disease or c...,Network
2,1,1,Alexander Fleming discovered penicillin in 192...,Alexander Fleming discovered penicillin in 192...,Alexander Fleming discovered penicillin in 192...,Alexander Fleming discovered penicillin in 192...,Network
3,2,1,People didn't use to die of heart disease and ...,People didn't use to die or heart disease or c...,Alexander Fleming discovered penicillin in 192...,Before the discovery of lifesaving antibiotics...,Network
4,2,1,People didn't use to die of heart disease and ...,People didn't use to die or heart disease or c...,Alexander Fleming discovered penicillin in 192...,People didn't used to die of cancer or heart d...,Network


# Create text embeddings for each text

In [None]:
# Embed the stories with the configured transformer model, then project the
# embeddings; both helpers receive a path prefix under path_text_embeddings
# (presumably where they store their output -- confirm in the helper module).
emb = create_embeddings(
    results,
    transformer_model=transformer_model,
    path=f"{path_text_embeddings}/story_embeddings",
)
project_embeddings(
    emb,
    path=f"{path_text_embeddings}/X_story_embedded_",
)

# Create data for R

In [12]:
# Create dataframe to calculate frequencies
story_cols = ["story1", "story2", "story3"]  # input stories; "story_merged" is the text checked for transmission

# Original story as a set of stemmed words
# (ps is presumably a Porter stemmer provided by the project imports -- confirm)
set_story_original = {
    ps.stem(word) for word in set(story2set(story_original, stop_words=eng_stopwords))
}

# Word lists of the transmitted stories: tokenised by story2set, stemmed, and
# restricted to words that occur in the original story.
# .copy() so the column assignments below act on an independent frame rather
# than on a slice of `results`.
results_prob = results[story_cols + ["story_merged", "layer_n", "rep", "condition"]].copy()

for s in story_cols + ["story_merged"]:
    results_prob[s] = results_prob[s].apply(story2set, create_set=False, stop_words=eng_stopwords)
    results_prob[s] = results_prob[s].apply(
        lambda words: [
            stem for stem in (ps.stem(word) for word in words) if stem in set_story_original
        ]
    )


# Tidy text structure (one word|replicate|condition per row)
person_frames = []

for _, row in results_prob.iterrows():
    read_stories = [row[col] for col in story_cols]

    # Total occurrences of each word across the three input stories
    word_counts = Counter(word for story in read_stories for word in story)
    if not word_counts:
        # No word of the original story was read: nothing to record
        continue

    # Number of input stories (0-3) in which each word occurs at least once
    story_counts = Counter(word for story in read_stories for word in set(story))

    # Total words read; divided by 3 for chains and for the first layer,
    # presumably because the three story columns hold the same single story
    # there (as the layer-1 rows of `results` suggest) -- confirm.
    len_stories = sum(len(story) for story in read_stories)
    if (row["condition"] == "Chain") or (row["layer_n"] == 1):
        len_stories /= 3

    # Both counters share the same keys, so the table is built directly in
    # first-occurrence order instead of merging two single-column frames.
    person = pd.DataFrame({
        "word": list(word_counts),
        "number_observed": list(word_counts.values()),
        "number_stories_observed": [story_counts[word] for word in word_counts],
    })
    person["transmitted"] = person["word"].isin(set(row["story_merged"]))
    person["number_words_read"] = len_stories
    person["condition"] = row["condition"]
    person["layer_n"] = row["layer_n"]
    person["rep"] = row["rep"]

    person_frames.append(person)


all_persons = pd.concat(person_frames)
all_persons["transmitted"] = all_persons["transmitted"].astype(int)

# First-layer rows of the network condition are relabelled as "Chain"
is_first_network_layer = (all_persons["condition"] == "Network") & (all_persons["layer_n"] == 1)
all_persons.loc[is_first_network_layer, "condition"] = "Chain"

# Make mean, and fraction of stories. The columns are reassigned rather than
# divided in place through .loc, because writing floats into integer columns
# that way is deprecated in newer pandas releases.
count_cols = ["number_observed", "number_stories_observed"]
all_persons[count_cols] = all_persons[count_cols] / 3
all_persons.head()

Unnamed: 0,word,number_observed,number_stories_observed,transmitted,number_words_read,condition,layer_n,rep
0,peopl,1.0,1.0,1,50.0,Chain,1,1
1,use,1.0,1.0,1,50.0,Chain,1,1
2,die,3.0,1.0,1,50.0,Chain,1,1
3,heart,1.0,1.0,1,50.0,Chain,1,1
4,diseas,1.0,1.0,1,50.0,Chain,1,1


In [13]:
# Persist the word-level transmission table so the next analysis can load it
all_persons.to_csv(
    path_or_buf=f'{path_data_files}/data_processing/transmissions_word_level.csv',
)