In [1]:
import os
import pandas as pd
import json

In [4]:
# Root folder that holds one sub-folder per event.
directory = "./data"

# Each event folder contains one sub-folder per category listed here.
Categories = ["rumours", "non-rumours"]

# Event folder names; every one ends with the same "-all-rnr-threads" suffix,
# so only the distinguishing prefix is spelled out below.
Events = [
    topic + "-all-rnr-threads"
    for topic in (
        "ebola-essien",
        "charliehebdo",
        "ferguson",
        "germanwings-crash",
        "gurlitt",
        "ottawashooting",
        "prince-toronto",
        "putinmissing",
        "sydneysiege",
    )
]

In [3]:
# Merge the source tweet of every thread into one JSON file and one CSV file
# per event, and count how many threads are rumours / non-rumours overall.
#
# For each thread the source-tweet JSON is extended with three keys:
#   'event'     - name of the event folder the thread was read from
#   'is_rumour' - True when the thread sits in the "rumours" category folder
#   'structure' - parsed content of the thread's structure.json
rumour = 0
non_rumour = 0

# os.listdir() returns entries in arbitrary order and also yields stray files
# (e.g. macOS ".DS_Store"). Sort for reproducible output and keep only real,
# non-hidden folders so such files no longer crash the loop.
event_names = sorted(
    entry for entry in os.listdir(directory)
    if not entry.startswith(".") and os.path.isdir(os.path.join(directory, entry))
)

for event_name in event_names:
    print(event_name+": ")

    combined_json = []  # one enriched source-tweet dict per thread of this event
    sub_path = os.path.join(directory, event_name)
    for r in Categories:
        cate_path = os.path.join(sub_path, r)
        # Skip hidden / non-folder entries instead of deleting them from the
        # dataset; only thread folders are counted and processed.
        thread_list = sorted(
            entry for entry in os.listdir(cate_path)
            if not entry.startswith(".") and os.path.isdir(os.path.join(cate_path, entry))
        )
        print(r+":", len(thread_list))
        is_rumour = (r == "rumours")

        for thread_id in thread_list:
            thread_folder = os.path.join(cate_path, thread_id)

            # The source tweet file is named after the thread folder.
            source_path = os.path.join(thread_folder, "source-tweets", thread_id+".json")
            # Explicit UTF-8 so the result does not depend on the platform's
            # default text encoding.
            with open(source_path, encoding="utf-8") as json_file:
                meta_json = json.load(json_file)

            meta_json['event'] = event_name
            meta_json['is_rumour'] = is_rumour
            if is_rumour:
                rumour += 1
            else:
                non_rumour += 1

            with open(os.path.join(thread_folder, "structure.json"), encoding="utf-8") as json_file:
                meta_json['structure'] = json.load(json_file)

            combined_json.append(meta_json)

    # Create the output directory if it does not exist yet.
    output_path = os.path.join(os.getcwd(), "combined_data", event_name, "source_tweets")
    os.makedirs(output_path, exist_ok=True)

    with open(os.path.join(output_path, 'combined.json'), 'w', encoding="utf-8") as output_file:
        json.dump(combined_json, output_file)

    df = pd.DataFrame.from_records(combined_json)
    # An event without any thread yields a frame with no columns; indexing it
    # by 'id' would raise KeyError, so only set the index when it is present.
    if 'id' in df.columns:
        df = df.set_index('id')
    df.to_csv(os.path.join(output_path, 'combined.csv'))

    print()

print("In Total:")
print("rumours:", str(rumour))
print("non_rumours:", str(non_rumour))

charliehebdo-all-rnr-threads: 
rumours: 458
non-rumours: 1621

ebola-essien-all-rnr-threads: 
rumours: 14
non-rumours: 0

ferguson-all-rnr-threads: 
rumours: 284
non-rumours: 859

germanwings-crash-all-rnr-threads: 
rumours: 238
non-rumours: 231

gurlitt-all-rnr-threads: 
rumours: 61
non-rumours: 77

ottawashooting-all-rnr-threads: 
rumours: 470
non-rumours: 420

prince-toronto-all-rnr-threads: 
rumours: 229
non-rumours: 4

putinmissing-all-rnr-threads: 
rumours: 126
non-rumours: 112

sydneysiege-all-rnr-threads: 
rumours: 522
non-rumours: 699

In Total:
rumours: 2402
non_rumours: 4023


In [10]:
# Merge every reaction (reply) tweet of every thread into one JSON file and
# one CSV file per event. Each reaction dict is extended with two keys:
#   'event'  - name of the event folder it was read from
#   'thread' - name of the thread folder it belongs to
for event in Events:
    combined_json = []  # one enriched reaction dict per reply of this event
    for r in Categories:
        cate_path = os.path.join(directory, event, r)
        # Sorted for a reproducible row order (os.listdir() order is arbitrary).
        for thread_id in sorted(os.listdir(cate_path)):
            # Hidden entries (e.g. ".DS_Store") are not thread folders.
            if thread_id.startswith("."):
                continue
            reaction_path = os.path.join(cate_path, thread_id, "reactions")
            # A thread without a "reactions" folder is treated as having no
            # replies instead of aborting the whole run.
            if not os.path.isdir(reaction_path):
                continue
            for reaction_id in sorted(os.listdir(reaction_path)):
                if reaction_id.startswith("."):
                    continue
                # Explicit UTF-8 so the result does not depend on the
                # platform's default text encoding.
                with open(os.path.join(reaction_path, reaction_id), encoding="utf-8") as json_file:
                    meta_json = json.load(json_file)
                meta_json['event'] = event
                meta_json['thread'] = thread_id
                combined_json.append(meta_json)

    # Create the output directory if it does not exist yet.
    output_path = os.path.join(os.getcwd(), "combined_data", event, "reactions")
    os.makedirs(output_path, exist_ok=True)

    with open(os.path.join(output_path, 'combined.json'), 'w', encoding="utf-8") as output_file:
        json.dump(combined_json, output_file)

    df = pd.DataFrame.from_records(combined_json)
    # An event without any reaction yields a frame with no columns; indexing
    # it by 'id' would raise KeyError, so only set the index when present.
    if 'id' in df.columns:
        df = df.set_index('id')
    df.to_csv(os.path.join(output_path, 'combined.csv'))