In [31]:
import re
from bs4 import BeautifulSoup
from bs4 import Comment

In [99]:
# Sample 10-K filings (absolute paths under /development/data) used to exercise the parser.
filelist = [
    '/development/data/sec/edgar/extract/cik/1680142/10-K_20170628_bodynar16c10k033117.htm',
    '/development/data/sec/edgar/extract/cik/1051251/10-K_20170321_jcg-10k_20170128.htm',
]

In [100]:
class HTMLParse:
    """Break an HTML filing into text fragments grouped by "part" and "item" headings.

    The work happens in two passes:

    1. ``get_html_parts`` walks the document body and collects stripped text
       into ``{root_key: {tag_path: text}}``, where ``tag_path`` is a
       slash-separated chain of tag names such as ``'/div/p/'``.
    2. ``get_form_parts`` scans those fragments in order, treating text that
       starts with ``part `` / ``item `` (after cleaning) as headings and
       filing every other fragment under the most recent heading.
    """

    def __init__(self, filename):
        """Store the path of the document and the tag-handling rules.

        Args:
            filename: path of the HTML file that ``get_html_parts`` opens.
        """
        self.filename = filename
        # Tags listed here are not descended into; they are handled as leaves
        # (only their ``.string``, when there is one, is recorded).
        self.tags_to_skip = []
        # Inline tags that neither extend the tag path nor advance the
        # fragment key, so their text is merged with the enclosing element's.
        self.tags_to_consolidate = ['a', 'b', 'font', 'small']
        # Not read by any method of this class.
        self.consolidated_name = 'X'

    def parse_children(self, parent_tag, root_key, html_parts, tag):
        """Recursively collect the text below ``tag`` into ``html_parts``.

        A node that has a name, is not in ``tags_to_skip`` and has
        ``contents`` is treated as a container: unless its name is in
        ``tags_to_consolidate`` the name is appended to the path and the key
        is incremented, then every child that is not a ``Comment`` is visited.
        Any other node is treated as a leaf whose non-blank ``.string`` is
        stored under ``html_parts[root_key][parent_tag]``.

        Args:
            parent_tag: slash-terminated tag path accumulated so far.
            root_key: integer key of the fragment bucket currently being filled.
            html_parts: ``{root_key: {tag_path: text}}`` dict, updated in place.
            tag: the node to process.

        Returns:
            The (possibly advanced) ``root_key`` for the caller to continue with.
        """
        if tag.name is not None and tag.name not in self.tags_to_skip and tag.contents is not None:
            child_tag = parent_tag
            if tag.name not in self.tags_to_consolidate:
                child_tag += tag.name + "/"
                root_key += 1
            for child in tag.contents:
                if not isinstance(child, Comment):
                    root_key = self.parse_children(child_tag, root_key, html_parts, child)
        else:
            if tag.string is not None and len(tag.string.strip()) > 0:
                data = tag.string.strip()
                bucket = html_parts.setdefault(root_key, {})
                if parent_tag not in bucket:
                    bucket[parent_tag] = data
                else:
                    # Several leaves sharing a key and path are joined with a space.
                    bucket[parent_tag] += ' ' + data
        return root_key

    def get_html_parts(self):
        """Parse ``self.filename`` with html5lib and collect its text fragments.

        Only direct children of ``<body>`` that have a tag name and
        ``contents`` are walked; each one starts from the path ``'/'`` and the
        key is advanced once more after it has been processed.

        Returns:
            ``{root_key: {tag_path: text}}`` in document order.
        """
        html_parts = {}
        with open(self.filename) as fp:
            soup = BeautifulSoup(fp, "html5lib")
            root_key = 0
            for tag in soup.body.contents:
                if tag.name is not None and tag.contents is not None:
                    root_key = self.parse_children('/', root_key, html_parts, tag)
                    root_key += 1
        return html_parts

    def remove_nonascii_string(self, s):
        """Return ``s`` with every non-ASCII character replaced by one space."""
        return re.sub(r'[^\x00-\x7f]', r' ', s)

    def get_cleaned_string(self, s):
        """Normalise a fragment so it can be compared against heading prefixes.

        Non-ASCII characters and newlines become spaces, the text is
        lower-cased and stripped, and a leading ``'i tem '`` is repaired to
        ``'item '``.

        Args:
            s: raw fragment text.

        Returns:
            The cleaned, lower-case string.
        """
        cleaned = self.remove_nonascii_string(s).replace('\n', ' ').lower().strip()
        broken_prefix = 'i tem '
        if cleaned.startswith(broken_prefix):
            # Repair the prefix only; a blanket str.replace would also rewrite
            # later text such as 'semi tem ...' into 'semitem ...'.
            cleaned = 'item ' + cleaned[len(broken_prefix):]
        return cleaned

    def get_form_parts(self, html_parts):
        """Group fragments under the "part" / "item" headings that precede them.

        Fragments are scanned in iteration order. Cleaned text starting with
        ``'part '`` opens a new part (replacing any earlier part with the same
        key); cleaned text starting with ``'item '`` opens a new item inside
        the current part. If an ``item 1...`` heading appears before any part
        heading, an implicit part keyed ``'PART_INS'`` is opened for it. Other
        item headings seen before any part are ignored. All remaining
        fragments are appended, with non-ASCII characters replaced by spaces,
        to the current item.

        Args:
            html_parts: ``{root_key: {tag_path: text}}`` as built by
                ``get_html_parts``.

        Returns:
            ``{part_key: {item_key: [text, ...]}}``.
        """
        form_parts = {}
        part_key = ''
        item_key = ''
        for key in html_parts:
            for tag in html_parts[key]:
                data = html_parts[key][tag]
                cleaned_data = self.get_cleaned_string(data)
                if cleaned_data.startswith('part '):
                    part_key = cleaned_data
                    item_key = ''
                    form_parts[part_key] = {}
                elif cleaned_data.startswith('item '):
                    # Replacing non-ASCII spaces can leave runs of spaces
                    # ('item  1.'), so test the number on a whitespace-collapsed
                    # copy. The stored key keeps the cleaned text unchanged.
                    collapsed = ' '.join(cleaned_data.split())
                    if part_key == '' and collapsed.startswith('item 1'):
                        part_key = "PART_INS"
                        form_parts[part_key] = {}
                    if part_key != '':
                        item_key = cleaned_data
                        form_parts[part_key][item_key] = []
                else:
                    if part_key != '' and item_key != '':
                        if part_key in form_parts and item_key in form_parts[part_key]:
                            form_parts[part_key][item_key].append(self.remove_nonascii_string(data))
        return form_parts

    def parse_html(self, cik, filingdt):
        """Parse the file and wrap the result with the supplied metadata.

        Prints a progress line, then runs ``get_html_parts`` followed by
        ``get_form_parts``.

        Args:
            cik: value stored unchanged under the ``'cik'`` key.
            filingdt: value stored unchanged under the ``'filingdt'`` key.

        Returns:
            A dict with keys ``'cik'``, ``'filingdt'``, ``'fullpath'`` (the
            file name) and ``'text'`` (the output of ``get_form_parts``).
        """
        print('...parsing: {}'.format(self.filename))
        parsed_data = {}
        parsed_data['cik'] = cik
        parsed_data['filingdt'] = filingdt
        parsed_data['fullpath'] = self.filename
        html_parts = self.get_html_parts()
        form_parts = self.get_form_parts(html_parts)
        parsed_data['text'] = form_parts
        return parsed_data


In [101]:
# Run the parser on the second entry of filelist; cik and filingdt are
# placeholder values that are simply carried through into the result dict.
cik, filingdt = 'a', 'b'
filename = filelist[1]
print(filename)

htmlParse = HTMLParse(filename)
parsed_data = htmlParse.parse_html(cik, filingdt)

/development/data/sec/edgar/extract/cik/1051251/10-K_20170321_jcg-10k_20170128.htm
...parsing: /development/data/sec/edgar/extract/cik/1051251/10-K_20170321_jcg-10k_20170128.htm


In [103]:
# Dump the parsed structure: one line per part, per item heading, and per
# text fragment filed under that item.
for part_key, items in parsed_data['text'].items():
    print(part_key)
    for item_key, fragments in items.items():
        print('...#{}#'.format(item_key))
        for data in fragments:
            print('-->{}'.format(data))

part i
...#item  1.#
-->BUSINESS.
--> J.Crew,  the  Company,   we,   us  and  our  refer to J.Crew Group, Inc. ( Group ) and its wholly owned subsidiaries.  Parent  refers to Group s ultimate parent, Chinos Holdings, Inc.
-->Overview
-->J.Crew is an internationally recognized multi-brand apparel and accessories retailer that differentiates itself through high standards of quality, style, design and fabrics. We are a vertically-integrated omni-channel specialty retailer that operates stores and websites both domestically and internationally. We design, market and sell our products, including those under the J.Crew
--> 
-->and Madewell
--> 
-->brands, offering complete assortments of women s, men s and children s apparel and accessories. We believe our customer base consists primarily of college-educated, professional and fashion-conscious women and men.
-->We sell our J.Crew and Madewell merchandise primarily through our retail and factory stores, our websites and our catalogs. As of Ja

--> 
-->1,500,000
--> 
-->Jenna Lyons
-->2016
-->1,830,450
-->3,047,850
-->3,043,500
-->0.10
-->6/29/26
-->1,000,000
--> 
-->1,000,000
--> 
-->2011
-->1,777,777
--> 
--> 
-->0.25
-->9/15/17
--> 
--> 
--> 
--> 
-->Total
-->3,608,227
-->3,047,850
-->3,043,500
--> 
--> 
-->1,000,000
--> 
-->1,000,000
--> 
-->Libby Wadle
-->2016
-->838,050
-->1,038,050
-->1,876,100
-->0.10
-->6/29/26
-->1,000,000
--> 
-->1,000,000
--> 
-->Lynda Markoe
-->2016
-->476,375
-->576,375
-->1,052,750
-->0.10
-->6/29/26
-->500,000
--> 
-->500,000
--> 
-->(1)
-->Represents (i) stock options awarded to Ms. Lyons prior to the Acquisition, which were rolled over into vested options of Parent, effective March 7, 2011, (ii) stock options that were granted to Mr. Nicholson on January 11, 2016 in connection with his hiring, and (iii) stock options that were subject to re-pricing in June 2016. All Named Executive Officers (except Mr. Nicholson) participated in the stock option re-pricing offer and, in June 2016, stock opti

-->12.7
-->Transaction costs
-->7.4
-->8.2
-->State taxes and interest
-->5.5
-->5.1
-->Sales returns
-->4.3
-->4.1
-->State net operating losses
-->1.6
-->1.2
-->Tax credit carryforward
-->1.1
--> 
-->Other
-->3.7
-->4.3
-->114.3
-->117.2
-->Less: Valuation allowance
-->(20.3
-->)
-->(12.1
-->)
-->Deferred tax assets, net of valuation allowance
-->94.0
-->105.1
-->Deferred tax liabilities:
-->Intangible assets
-->(170.8
-->)
-->(172.6
-->)
-->Difference in book and tax basis for property and equipment
-->(55.2
-->)
-->(65.5
-->)
-->Prepaid catalog and other prepaid expenses
-->(16.2
-->)
-->(15.8
-->)
-->Deferred tax liabilities
-->(242.2
-->)
-->(253.9
-->)
-->Net deferred income tax liability
-->$
-->(148.2
-->)
-->$
-->(148.8
-->)
-->The financial statements of the Company reflect a benefit for income taxes at the Group level. The federal tax return, however, is filed at the Parent level. The difference between the entity at which the provision is calculated and the entity which fi