In [1]:
import os
import json
import utils
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
def annotate(word, ents_list, output):
    for ents in ents_list:
        if word.idx >= ents['start'] and word.idx < ents['end']:
            if ents['count'] == 0:
                o = "{} {} {} {} {} {}".format(word.text, "B-" + ents['type'], word.pos_, word.lemma_, word.tag_, word.dep_)
                output.append(o)
                ents['count'] += 1
            else:
                o = "{} {} {} {} {} {}".format(word.text, "I-" + ents['type'], word.pos_, word.lemma_, word.tag_, word.dep_)
                output.append(o)
            return output
        
    o = "{} {} {} {} {} {}".format(word.text, "0", word.pos_, word.lemma_, word.tag_, word.dep_)
    output.append(o)
    return output

In [4]:
def process_file(filename, out_file):
    ents_list = []
    output = []
    ## open file
    with open(filename) as f:
        if os.stat(filename).st_size != 0:
            d = json.load(f)
            doc = nlp(d["content"]) ## content as the original text
            hopper = d["cyberevent"]["hopper"] ## hopper array
            for h in hopper:
                events = h["events"] ## events array
                for E in events:
                    event_type = E["type"] ## example=> type: Attack
                    nugget = E["nugget"]
                    event_subtype = E["subtype"] ## example=> subtype: Databreach
                    if "argument" in E:
                        arguments = E["argument"] ## arguments array
                        for T in arguments:
                            t = T["type"] ## example=> type: Organization or Person or System
                            role_type = T["role"]["type"] ## example=> type: Victim or Attacker or
                            startOffset = T["startOffset"]
                            endOffset = T["endOffset"]
                            ents = {'type': role_type, 'start': startOffset, 'end':endOffset, 'count': 0}
                            ents_list.append(ents)

            for word in doc:
                output = annotate(word, ents_list, output)

    return output

In [5]:
def process_dataset():
    directory = r'data/annotation/'
    directory_output = 'data/annotation_j/'
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            in_file = os.path.join(directory, filename)
            out_file = os.path.join(directory_output, filename.split(".")[0] + "_j.txt")
            print(in_file)
            output = process_file(in_file, out_file)
            with open(out_file, 'w') as filehandle:
                filehandle.writelines("%s\n" % line for line in output)
            print("Done.")

In [6]:
process_dataset()

data/annotation/2075.json
Done.
data/annotation/10222.json
Done.
data/annotation/1989.json
Done.
data/annotation/1120.json
Done.
data/annotation/1435.json
Done.
data/annotation/10367.json
Done.
data/annotation/953.json
Done.
data/annotation/10019.json
Done.
data/annotation/2167.json
Done.
data/annotation/10275.json
Done.
data/annotation/904.json
Done.
data/annotation/10330.json
Done.
data/annotation/10263.json
Done.
data/annotation/1531.json
Done.
data/annotation/1024.json
Done.
data/annotation/1474.json
Done.
data/annotation/10326.json
Done.
data/annotation/2258.json
Done.
data/annotation/10058.json
Done.
data/annotation/10234.json
Done.
data/annotation/800.json
Done.
data/annotation/10371.json
Done.
data/annotation/10166.json
Done.
data/annotation/10023.json
Done.
data/annotation/10189.json
Done.
data/annotation/894.json
Done.
data/annotation/197.json
Done.
data/annotation/2761.json
Done.
data/annotation/10131.json
Done.
data/annotation/1663.json
Done.
data/annotation/343.json
Done.


Done.
data/annotation/2602.json
Done.
data/annotation/332.json
Done.
data/annotation/10005.json
Done.
data/annotation/2194.json
Done.
data/annotation/10140.json
Done.
data/annotation/2897.json
Done.
data/annotation/627.json
Done.
data/annotation/559.json
Done.
data/annotation/10052.json
Done.
data/annotation/365.json
Done.
data/annotation/1350.json
Done.
data/annotation/220.json
Done.
data/annotation/10117.json
Done.
data/annotation/10300.json
Done.
data/annotation/1452.json
Done.
data/annotation/1147.json
Done.
data/annotation/1517.json
Done.
data/annotation/10245.json
Done.
data/annotation/10183.json
Done.
data/annotation/2157.json
Done.
data/annotation/10029.json
Done.
data/annotation/430.json
Done.
data/annotation/10357.json
Done.
data/annotation/10212.json
Done.
data/annotation/575.json
Done.
data/annotation/10091.json
Done.
data/annotation/2045.json
Done.
data/annotation/2100.json
Done.
data/annotation/84.json
Done.
data/annotation/10341.json
Done.
data/annotation/1413.json
Done.

Done.
data/annotation/127.json
Done.
data/annotation/10210.json
Done.
data/annotation/1811.json
Done.
data/annotation/10355.json
Done.
data/annotation/432.json
Done.
data/annotation/1329.json
Done.
data/annotation/10181.json
Done.
data/annotation/609.json
Done.
data/annotation/10247.json
Done.
data/annotation/2769.json
Done.
data/annotation/873.json
Done.
data/annotation/28.json
Done.
data/annotation/10302.json
Done.
data/annotation/10115.json
Done.
data/annotation/388.json
Done.
data/annotation/222.json
Done.
data/annotation/737.json
Done.
data/annotation/1352.json
Done.
data/annotation/10050.json
Done.
data/annotation/10379.json
Done.
data/annotation/2712.json
Done.
data/annotation/808.json
Done.
data/annotation/625.json
Done.
data/annotation/2196.json
Done.
data/annotation/10142.json
Done.
data/annotation/10007.json
Done.
data/annotation/10284.json
Done.
data/annotation/1539.json
Done.
data/annotation/2315.json
Done.
data/annotation/10154.json
Done.
data/annotation/263.json
Done.
da

Done.
data/annotation/10148.json
Done.
data/annotation/2173.json
Done.
data/annotation/290.json
Done.
data/annotation/455.json
Done.
data/annotation/1460.json
Done.
data/annotation/10298.json
Done.
data/annotation/1899.json
Done.
data/annotation/10332.json
Done.
data/annotation/18.json
Done.
data/annotation/10277.json
Done.
data/annotation/843.json
Done.
data/annotation/510.json
Done.
data/annotation/2020.json
Done.
data/annotation/1319.json
Done.
data/annotation/286.json
Done.
data/annotation/2165.json
Done.
data/annotation/10365.json
Done.
data/annotation/117.json
Done.
data/annotation/1088.json
Done.
data/annotation/10220.json
Done.
data/annotation/394.json
Done.
data/annotation/4009.json
Done.
data/annotation/10109.json
Done.
data/annotation/10191.json
Done.
data/annotation/1786.json
Done.
data/annotation/619.json
Done.
data/annotation/38.json
Done.
data/annotation/2329.json
Done.
data/annotation/2779.json
Done.
data/annotation/10257.json
Done.
data/annotation/863.json
Done.
data/a