## Create environment 
- The code uses the f-strings available after python 3.6. To install python 3.7 run the code below in the terminal, then refresh this page (then Kernel -> Change kernel -> py37)
`
conda create --name py37 python=3.7 pandas numpy nltk matplotlib seaborn
source activate py37
conda install nb_conda
`



In [344]:
import os
import pandas as pd
import json
pd.set_option("display.max_columns",30)

!mkdir data_cleaned

mkdir: cannot create directory ‘data_cleaned’: File exists


In [345]:
cols = ["language","education","age","ideology","technical_issues","confusion","engagement","difficulty"] 

# General functions

In [346]:
def parse_response(response):
    """
    Parses the questionnaire
    """
    if isinstance(response,float):
        return [""]*8
    
    d = json.loads(response)  

    return  [d.get(_) for _ in cols]


In [356]:
def get_participants(exp):
    """
    Get the info on participants merging the node (tells the layer) and the questionnaire
    """
    n = pd.read_csv(f"data/{exp}/data/node.csv",usecols=["id","property2","participant_id"])
    n.columns = ["destination_id","layer_n","participant_id"]
    q = pd.read_csv(f"data/{exp}/data/question.csv",usecols=["participant_id","response"])
    participants = pd.merge(n,q,how="outer")

    cols = ["language","education","age","ideology","technical_issues","confusion","engagement","difficulty"] 

    parsed_response = participants["response"].apply(parse_response)
    
    parsed_response = pd.DataFrame(zip(*parsed_response), index=range(8)).T
    participants[cols] = parsed_response
    del participants["response"]
    
    
    return participants

def create_network(exp):
    """
    Format the network adding the stories
    """
    stories = pd.read_csv(f"data/{exp}/data/info.csv",usecols=["creation_time","id","origin_id","contents","failed"])
    stories = stories.loc[stories["failed"] == "f"]
    id2story = stories.set_index("origin_id")["contents"].drop_duplicates().to_dict()

    network = pd.read_csv(f"data/{exp}/data/vector.csv")
    data = network.groupby("destination_id")["origin_id"].apply(lambda x: list(x)).reset_index()
    data["story1"] = data["origin_id"].str[0].map(id2story)
    data["story2"] = data["origin_id"].str[-1].map(id2story)
    data["story_merged"] = data["destination_id"].map(id2story)
    
    return data.dropna(subset=["story_merged"])


# Clean the data

In [367]:
exps = os.listdir("data")

for exp in sorted(exps):
    
    #Questions and layer
    participants = get_participants(exp)
    #Network and stories
    network = create_network(exp)
    
    #Save
    data =pd.merge(network,participants)
    
    data.to_csv(f"data_cleaned/{exp}.csv",sep="\t",index=None)



In [368]:
data.loc[:,list(network.columns)+["layer_n","participant_id"]]

Unnamed: 0,destination_id,origin_id,story1,story2,story_merged,layer_n,participant_id
0,3,[1],"Through history, most people didn't die of can...","Through history, most people didn't die of can...",There have been so many deaths throughout hist...,0.0,2.0
1,4,"[3, 2]",There have been so many deaths throughout hist...,,Many have died in history from disease. It se...,1.0,3.0
2,5,"[3, 2]",There have been so many deaths throughout hist...,,A large number of people have died in the past...,1.0,4.0
3,6,"[5, 4]",A large number of people have died in the past...,Many have died in history from disease. It se...,In past people died for many diseases. After W...,2.0,5.0
4,7,"[5, 4]",A large number of people have died in the past...,Many have died in history from disease. It se...,"A long time ago, people didn't live long enoug...",2.0,6.0
5,8,"[7, 6]","A long time ago, people didn't live long enoug...",In past people died for many diseases. After W...,"In the past, people died from a lot of differe...",3.0,7.0
6,9,"[7, 6]","A long time ago, people didn't live long enoug...",In past people died for many diseases. After W...,That prior to Word War II people generally did...,3.0,8.0
7,10,"[9, 8]",That prior to Word War II people generally did...,"In the past, people died from a lot of differe...","Prior to World War II, people generally did no...",4.0,9.0
8,11,"[9, 8]",That prior to Word War II people generally did...,"In the past, people died from a lot of differe...",Before World War II people died of a lot of pr...,4.0,10.0
