In [1]:
import re
import json

In [2]:
with open("unstructured.txt", 'r') as file:
    lines = file.readlines()

source_list = [
    "Unit42",
    "Fireeye"
]

source_list = [source.lower() for source in source_list]

In [5]:
def check_new_entry(list_of_sources, line):
    first_substr = line.split(':')[0]

    is_new_entry = first_substr.lower() in list_of_sources

    return is_new_entry

def collect_ioc(entry_text):
    substr_format = r'\([^\(:\)]+: [^\(:\)]*(?:\([^\(:\)]*\)[^\(:\)]*)*\)'
    matches = re.findall(substr_format, entry_text)
    return matches

def parse_data(list_of_sources, lines):
    entries = []
    entry_text = lines[0]
    for line in lines[1:]:
        if check_new_entry(list_of_sources, line):
            entries.append(entry_text)
            entry_text = line
            source = line.split(':')[0]
        else:
            entry_text = entry_text + line
    entries.append(entry_text)

    return entries

def entry_to_dict(entry):
    substr_format = r'([^:]+):([^\s]+) - ([.\S\s]+)'
    match = re.search(substr_format, entry)
    if match:
        source = match.group(1).strip()
        name = match.group(2).strip()
        text = match.group(3).strip()
        list_ioc_dicts = []
        iocs = collect_ioc(text)
        
        for ioc in iocs:
            ioc_no_semicolon = ioc.replace(';', ':')
            ioc_elements = ioc_no_semicolon[1:-1].split(':')
            ioc_type = ioc_elements[0]
            ioc_value = ioc_elements[1][1:]
            list_ioc_dicts.append(
                {
                    'ioc_type':ioc_type,
                    'ioc_value':ioc_value
                }
            )

    entry_dict = {
        'name':name,
        'source':source,
        'ioc':list_ioc_dicts,
        'text':text
    }

    return entry_dict


parsed_data = parse_data(source_list, lines)
data_dicts = []
for data in parsed_data:
    data_dicts.append(entry_to_dict(data))

for data_dict in data_dicts:
    source = data_dict['source']
    name = data_dict['name']
    output_fname = source + ', ' + name
    with open(output_fname, 'w') as outfile:
        json.dump(data_dict, outfile)


In [9]:
f = open('Fireeye, UNC1945')
data = json.load(f)
data

{'name': 'UNC1945',
 'source': 'Fireeye',
 'ioc': [{'ioc_type': 'MD5',
   'ioc_value': 'd5b9a1845152d8ad2b91af044ff16d0b (SLAPSTICK)'},
  {'ioc_type': 'MD5', 'ioc_value': '6983f7001de10f4d19fc2d794c3eb534'},
  {'ioc_type': 'IP', 'ioc_value': '46.30.189.0/24'},
  {'ioc_type': 'IP', 'ioc_value': '66.172.12.0/24'}],
 'text': 'PUPYRAT (aka Pupy) is an open source, multi-platform (Windows, Linux, OSX, Android), multi-function RAT (Remote Administration Tool) and post-exploitation tool mainly written in Python. It features an all-in-memory execution guideline and leaves very low footprint. It can communicate using various transports, migrate into processes (reflective injection), and load remote Python code, Python packages and Python C-extensions from memory.(MD5: d5b9a1845152d8ad2b91af044ff16d0b (SLAPSTICK)) (MD5; 0845835e18a3ed4057498250d30a11b1 (STEELCORGI)) (MD5: 6983f7001de10f4d19fc2d794c3eb534) (IP: 46.30.189.0/24) (IP: 66.172.12.0/24)'}