## Create environment 
- The code uses the f-strings available after python 3.6. To install python 3.7 run the code below in the terminal, then refresh this page (then Kernel -> Change kernel -> py37)
`
conda create --name py37 python=3.7 pandas numpy nltk matplotlib seaborn
source activate py37
conda install nb_conda
`



In [1]:
import os
import pandas as pd
import json
pd.set_option("display.max_columns",30)

!mkdir data_cleaned

mkdir: cannot create directory ‘data_cleaned’: File exists


In [2]:
cols = ["language","education","age","ideology","technical_issues","confusion","engagement","difficulty"] 

# General functions

In [3]:
def parse_response(response):
    """
    Parses the questionnaire
    """
    if isinstance(response,float):
        return [""]*8
    
    d = json.loads(response)  

    return  [d.get(_) for _ in cols]


In [4]:
def get_participants(exp):
    """
    Get the info on participants merging the node (tells the layer) and the questionnaire
    """
    n = pd.read_csv(f"data/{exp}/data/node.csv",usecols=["id","property2","participant_id"])
    n.columns = ["destination_id","layer_n","participant_id"]
    q = pd.read_csv(f"data/{exp}/data/question.csv",usecols=["participant_id","response"])
    participants = pd.merge(n,q,how="outer")

    cols = ["language","education","age","ideology","technical_issues","confusion","engagement","difficulty"] 

    parsed_response = participants["response"].apply(parse_response)
    
    parsed_response = pd.DataFrame(zip(*parsed_response), index=range(8)).T
    participants[cols] = parsed_response
    del participants["response"]
    
    
    return participants

def create_network(exp):
    """
    Format the network adding the stories
    """
    stories = pd.read_csv(f"data/{exp}/data/info.csv",usecols=["creation_time","id","origin_id","contents","failed"])
    stories = stories.loc[stories["failed"] == "f"]
    id2story = stories.set_index("origin_id")["contents"].drop_duplicates().to_dict()

    network = pd.read_csv(f"data/{exp}/data/vector.csv")
    data = network.groupby("destination_id")["origin_id"].apply(lambda x: list(x)).reset_index()
    data["story1"] = data["origin_id"].str[0].map(id2story)
    data["story2"] = data["origin_id"].str[-1].map(id2story)
    data["story_merged"] = data["destination_id"].map(id2story)
    
    return data.dropna(subset=["story_merged"])


# Clean the data

In [5]:
exps = os.listdir("data")

for exp in sorted(exps):
    
    #Questions and layer
    participants = get_participants(exp)
    #Network and stories
    network = create_network(exp)
    
    #Save
    data =pd.merge(network,participants)
    
    data.to_csv(f"data_cleaned/{exp}.csv",sep="\t",index=None)



In [6]:
data.loc[:,list(network.columns)+["layer_n","participant_id"]]

Unnamed: 0,destination_id,origin_id,story1,story2,story_merged,layer_n,participant_id
0,2,[1],"Through history, most people didn't die of can...","Through history, most people didn't die of can...","Throughout time, we have used one form or anot...",0.0,1.0
1,3,[2],"Throughout time, we have used one form or anot...","Throughout time, we have used one form or anot...",Superbugs are a very real problems generated f...,1.0,2.0
2,6,[3],Superbugs are a very real problems generated f...,Superbugs are a very real problems generated f...,superbugs are caused by resistance to antibiot...,2.0,5.0
3,7,[6],superbugs are caused by resistance to antibiot...,superbugs are caused by resistance to antibiot...,Superbugs are when bacteria becomes resistant ...,3.0,6.0
4,8,[7],Superbugs are when bacteria becomes resistant ...,Superbugs are when bacteria becomes resistant ...,Superbugs are bacteria that grow resistant to ...,4.0,7.0
