In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re

In [2]:
import pickle

# loading only
# item_dict is restored from a local pickle file; its keys are the item titles
# used further down. pickle.load can execute arbitrary code while
# deserializing, so only load a file that comes from a trusted source.
with open('item_type.pkl', 'rb') as handle:
    item_dict = pickle.load(handle)

In [3]:
# dumping only
# Serialize before opening the file: mode 'wb' truncates 'item_type.pkl'
# immediately, so a failure while pickling (or an undefined item_dict on a
# fresh kernel) would otherwise leave an empty, unloadable file behind.
# pickle.dump returns None, so its result is not kept.
item_dict_bytes = pickle.dumps(item_dict)
with open('item_type.pkl', 'wb') as handle:
    handle.write(item_dict_bytes)


In [4]:
# Item titles to search for. Dict keys are already unique, so keep their
# insertion order instead of round-tripping through set(): the iteration order
# of a set of strings changes between kernel sessions (hash randomization),
# which made this list differ from run to run.
item_value = list(item_dict)
item_value

['termination of a material definitive agreement',
 'change in credit enhancement or other external support',
 'shareholder director nominations',
 'acquisition or disposition of assets',
 'unregistered sales of equity securities',
 'material impairments',
 "resignations of registrant's directors",
 'changes in control of registrant',
 "amendment to registrant's code of ethics, or waiver of a provision of the code of ethics",
 'regulation fd disclosure',
 'completion of acquisition or disposition of assets',
 'triggering events that accelerate or increase a direct financial obligation or an obligation under an off-balance sheet arrangement',
 'non-reliance on previously issued financial statements or a related audit report or completed interim review',
 "changes in registrant's certifying accountant",
 'material modifications to rights of security holders',
 'departure of directors or certain officers; election of directors; appointment of certain officers; compensatory arrangements of

In [5]:
# CHECK IF AN ITEM IS IN THE TEXT AND GET THE START INDEX
def is_item_exist(text, item):
    """
        Check whether the item title occurs in the text.

        The match is case-insensitive and accepts any run of spaces
        (including none) wherever the title contains a space. Every other
        character of the title is matched literally.

        Parameters:
            text: string to search
            item: item title to look for (formatted to a string first)

        Returns:
            (True, start index of the first match) if the item is in the text,
            otherwise (False, None). A title without any non-space character
            is never reported as found.
    """
    words = f'{item}'.split(' ')
    if not any(words):
        # an empty pattern would "match" at index 0 of every text
        return False, None

    # escape each word so characters such as '.', '(' or '?' stay literal,
    # then allow multiple spaces between words
    start = r'[ ]*'.join(re.escape(word) for word in words)

    # a single search gives both the yes/no answer and the position
    start_search = re.search(start, text, re.IGNORECASE)
    if start_search is None:
        return False, None
    return True, start_search.start()
            

        

    

In [6]:
def get_items_section_index(text, items_list):
    """
        Locate the known item titles and the closing signature marker.

        Parameters:
            text: string to search
            items_list: iterable of item titles, each looked up with
                is_item_exist

        Returns a tuple of
            - dict mapping every item found to its start index,
            - list of the found items ordered by start index,
            - end index: the position right after the first occurrence of
              'Pursuant to the requirements of the Securities Exchange Act',
              or len(text) when that marker is absent.
    """
    item_start_location = {}
    for item in items_list:
        has_item, start_index = is_item_exist(text, item)
        if has_item:
            item_start_location[item] = start_index

    # sorted() is stable, so items sharing a start index keep the order in
    # which they were found
    item_start_order = sorted(item_start_location, key=item_start_location.get)

    # find end location; tolerate runs of spaces between the words the same
    # way item titles do, otherwise the marker is missed in text whose
    # whitespace was padded
    end = r'Pursuant to the requirements of the Securities Exchange Act'
    end_section_pattern = re.compile(end.replace(' ', r'[ ]*'), re.IGNORECASE)
    end_search = end_section_pattern.search(text)
    if end_search is not None:
        end_index = end_search.end()
    else:
        end_index = len(text)

    return item_start_location, item_start_order, end_index
        
    
     

In [7]:
def extract_8k_text(f):
    """
    Given a string of html, return the raw text part of the string

    Steps, in order: lower-case everything, drop <document> blocks whose
    type is graphic, replace every tag with a space, turn &nbsp; plus tabs
    and newlines into spaces, and delete any remaining &...; entity.
    """
    lowered = f.lower()

    # Judge each <document>...</document> block on its own. A single pattern
    # running from '<document>' to '<type>graphic' starts at the FIRST
    # document in the string and therefore also deletes any text document
    # that precedes the graphic one.
    def _blank_if_graphic(document):
        block = document.group(0)
        return ' ' if '<type>graphic' in block else block

    without_graphics = re.sub('<document>.*?</document>', _blank_if_graphic,
                              lowered, flags=re.DOTALL)
    without_tags = re.sub('<.*?>', ' ', without_graphics, flags=re.DOTALL)
    without_nbsp = re.sub('&nbsp;', " ", without_tags)
    single_line = re.sub('\t|\n', ' ', without_nbsp)
    # remaining entities (&amp;, &#8220;, ...) are removed, not decoded
    return re.sub(r"&[a-z0-9#]+;", "", single_line)

In [8]:
def get_item_section_text(text, item_start_location, item_start_order, end_index):
    """
        Cut the text into one section per item.

        Parameters:
            text: the full text the indexes refer to
            item_start_location: dict mapping item -> start index in text
            item_start_order: sequence of the items ordered by start index
            end_index: index at which the last section stops

        Returns a dict mapping item -> stripped section text. A section runs
        from its item's start to the start of the next item; the last one
        runs to end_index and has the
        'Pursuant to the requirements of the Securities Exchange Act'
        marker removed. Sections that are empty after stripping are left out.
    """
    # Remove the marker regardless of case and of repeated spaces between
    # its words; a lower-case, single-space pattern leaves it in place for
    # any other spelling.
    end = r'Pursuant to the requirements of the Securities Exchange Act'
    end_section_pattern = re.compile(end.replace(' ', r'[ ]*'), re.IGNORECASE)

    item_section = {}
    last_position = len(item_start_order) - 1
    for position, item in enumerate(item_start_order):
        start = item_start_location[item]
        if position == last_position:
            # last item found
            new_text = end_section_pattern.sub('', text[start:end_index])
        else:
            next_item = item_start_order[position + 1]
            new_text = text[start:item_start_location[next_item]]

        new_text = new_text.strip()
        if new_text:
            item_section[item] = new_text
    return item_section


In [9]:
# File to parse; swap in one of the commented-out names to try another sample.
d_dir ='a8-kq1201912292018.htm'
#d_dir = 'd843403d8k.htm'
#d_dir = 'divendend.htm'
with open(d_dir, encoding='utf-8') as file:
    f = file.read()
# Reduce the raw HTML to lower-cased plain text; `text` is what the item
# lookups in the following cells operate on.
text = extract_8k_text(f)
#text

In [12]:
# Look up every known item title in the extracted text. The result is
# (start index per item found, found items ordered by start index, end index).
get_items_section_index(text, item_value)

({'financial statements and exhibits': 8123,
  'results of operations and financial condition': 7122},
 ['results of operations and financial condition',
  'financial statements and exhibits'],
 9967)

In [15]:
# Recompute the item positions, then slice the text into one section per item.
item_start_location, item_start_order, end_index =get_items_section_index(text, item_value)
get_item_section_text(text,item_start_location, item_start_order, end_index)

{'results of operations and financial condition': 'results of operations and financial condition.                                           on                           january\xa029, 2019                           , apple\xa0inc. (“apple”) issued a press release regarding apple’s financial results for its                           first                           fiscal quarter ended                           december\xa029, 2018                           . a copy of apple’s press release is attached hereto as exhibit\xa099.1.                                           the information contained in this current report shall not be deemed “filed” for purposes of section 18 of the securities exchange act of 1934, as amended (the “exchange act”), or incorporated by reference in any filing under the securities act of 1933, as amended, or the exchange act, except as shall be expressly set forth by specific reference in such a filing.                                                            

In [14]:
def testing_item_parsing(test_file,items_type):
    error_file = []
    parsed_file = []
    for filename in test_file:
        with open(filename, encoding='utf-8') as file:
            f = file.read()
        text = extract_8k_text(f)
        item_start_location, item_start_order, end_index = get_items_section_index(text, items_type)
        item_text = get_item_section_text(text,item_start_location, item_start_order, end_index)
        if(len(item_text)==0):
            print(filename)
            error_file=[filename]
        else:
            parsed_file += [(filename, item_text)]
    return parsed_file,error_file 
    

In [988]:
# Sample files fed to testing_item_parsing. The bare file names are resolved
# against the current working directory; the commented-out entry is skipped.
test_file = ['a8-kq1201912292018.htm',
            'd843403d8k.htm',
            'divendend.htm',
            #'v89153e8vk.htm',
             'd376136d8k.htm',
             'form8kdirectors.htm'
            ]

In [989]:
# Parse every sample file; the result is (parsed files with their sections,
# files for which no section was extracted).
testing_item_parsing(test_file,item_value )

no end


([('a8-kq1201912292018.htm',
   {'results of operations and financial condition': 'results of operations and financial condition.                                           on                           january\xa029, 2019                           , apple\xa0inc. (“apple”) issued a press release regarding apple’s financial results for its                           first                           fiscal quarter ended                           december\xa029, 2018                           . a copy of apple’s press release is attached hereto as exhibit\xa099.1.                                           the information contained in this current report shall not be deemed “filed” for purposes of section 18 of the securities exchange act of 1934, as amended (the “exchange act”), or incorporated by reference in any filing under the securities act of 1933, as amended, or the exchange act, except as shall be expressly set forth by specific reference in such a filing.                            