# Libraries

In [1]:
import os
from io import StringIO 
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
import re
import plotly.plotly as py
import plotly.graph_objs as go
import requests
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import nltk
from nltk import word_tokenize, everygrams
from nltk.util import ngrams

# Obtain data

## Function Definitions

In [3]:
def single_read_in_text(file_path,file_name):
    '''
    input:
    file_path: path of a individule csv file e.g. location of 21centfoxinc_sec_files.csv
    file_name: name of the file e.g 21centfoxinc_sec_files.csv
    
    output:
    a sigle pandas dataframe with all the original columns from the input file + a column for the 'cleaned' data
    
    dev:
    1) can add more output columns for features
    2) can further clean the data 
    
    '''
    
    ## read in single csv to pandas 
    individule_csv = pd.read_csv(file_path+"/"+file_name)
    processed_file = []
    
    ## create a connection with the url link and readin the raw file
    for url in individule_csv['sec_full_path']:
        print(url)
        text = request.urlopen(url).read().decode('utf8')
        soup = BeautifulSoup(text,'html.parser')
        
        ## clean raw text
        
        ## 1. remove html tags 
        ## 2. break the text by "\n"
        ## 3. remove the spaces in the front of and after each "\n"

        cleaned_text = '%%'.join(list(filter(None,
                                             [re.sub('[\t]+', ' ', i.strip()) for 
                                              i in soup.text.split('\n')])))
        
        ## find all tables
        raw_table = soup.find_all('table')
        
        
        ## clean all tables
        if not raw_table:

            cleaned_table = []
        else:
            
            '''
            cleaned_table = [[k[0],''.join(k[1:])] 
                             for k in [[j.replace('\n',"").strip() for j in re.split('  +',i)] for i in raw_table[0].get_text().split('\n\n')]]
            
            '''
     
            
            #cleaned_table = [re.split('  +',j) for j in [i for i in [h.get_text().split('\n\n') for h in raw_table]]]
            cleaned_table = [[[k[0],''.join(k[1:])] for k in 
             [[j.replace('\n',"").strip() 
               for j in re.split('  +',i)] 
              for i in l.get_text().split('\n\n')]] for l in raw_table]
            
            
            '''
            
            cleaned_table = [[k[0],''.join(k[1:])] 
                             for k in [[j.replace('\n',"").strip() 
                                        for j in re.split('  +',i)] 
                                       for i in [h.get_text().split('\n\n') for h in raw_table]]]            
            '''
        ## search the text
        Item_text = soup.findAll(text=re.compile("Item "))
        Item_category = [re.search('(?<=Item ).*', i).group(0) for i in Item_text]

        
        processed_file.append([url,
                               text,
                               cleaned_text,
                               raw_table,
                               cleaned_table,
                               Item_text,
                               Item_category])
    
    
    ## merge back to the original read in dataframe 
    clean_texts_df = pd.DataFrame(processed_file)
    clean_texts_df.columns = ['sec_full_path', 'text', 'cleaned_text',
                              'raw_table','cleaned_table',
                              'Item_text', 'Item_category']
    merged_df = pd.merge(left = individule_csv, right = clean_texts_df, on = 'sec_full_path')
    
    ## add one more column to indicate the file name 
    merged_df['file_name'] = file_name
    return merged_df
def folder_read_in_text(folder_path, ext = '.csv'):
    '''
    input: 
    folder_path: path of a individule csv file e.g. location of 21centfoxinc_sec_files.csv
    ext: extension of the files that are interested, default to be .csv 
    
    output:
    a sigle pandas dataframe with all the original columns from all the input files inside the folder
    + a column for the 'cleaned' data
    + a column for the file name 
    
    Utilize the single_read_in_text function 
    '''
    file = []
    direc = folder_path 

    # Select only files with the ext extension
    txt_files = [i for i in os.listdir(direc) if os.path.splitext(i)[1] == ext]
    temp_df = pd.DataFrame()
    
    ## Utilize the single_read_in_text function to process data

    for i in txt_files:
        temp_df = temp_df.append(single_read_in_text(file_path = folder_path,file_name = i), ignore_index=True)
    return temp_df

## Function Call

In [4]:
## test file 
working_file = folder_read_in_text(folder_path='C:/Users/li haoran/Documents/GitHub/Capstone_Vanguard_NLP_Prediction/sample_inputs',
                                   ext = '.csv')


https://www.sec.gov/Archives/edgar/data/732712/0000950133-94-000018.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950172-94-000010.txt
https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000061.txt
https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000079.txt
https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000108.txt
https://www.sec.gov/Archives/edgar/data/732712/0000732712-94-000006.txt


In [10]:
working_file

Unnamed: 0,fdate,cik,findexdate,form,coname,fsize,doccount,sec_full_path,text,cleaned_text,raw_table,cleaned_table,Item_text,Item_category,file_name
0,1/21/1994,732712,3/31/1994,8-K,BELL ATLANTIC CORP,3399,1,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,[<table> <s> <c> Date of Report: ...,"[[[Date of Report:, January 20, 1994], [Exact ...",[ 2 Item 5. Other Events.  Bell At...,[5. Other Events.],sample_inputs.csv
1,2/1/1994,732712,3/31/1994,8-K,BELL ATLANTIC CORP,2923,1,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,[],[],[  SECURITIES AND EXCHANGE COMMISSION...,[5. Other Events.],sample_inputs.csv
2,2/9/1994,732712,3/31/1994,8-K,BELL ATLANTIC CORP,18580,2,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,[<table> <caption>  ...,"[[[, ], [, Three months endedYear endedDecembe...",[ 2 Item 5. Other Events. Bell Atlantic Cor...,"[5. Other Events., 7. Financial Statements and...",sample_inputs.csv
3,2/14/1994,732712,3/31/1994,8-K,BELL ATLANTIC CORP,2930,1,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,[<table> <s> <c> Date of Report: ...,"[[[Date of Report:, February 14, 1994], [Exact...",[ 3 Item 5. Other Events.  Bell At...,[5. Other Events.],sample_inputs.csv
4,2/24/1994,732712,3/31/1994,8-K,BELL ATLANTIC CORP,3196,1,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,[],[],[ 2 Item 5. Other Events.  Bell Atlant...,[5. Other Events.],sample_inputs.csv
5,3/23/1994,732712,3/31/1994,8-K,BELL ATLANTIC CORP,6014,2,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,[],[],[ 2 Item 5. Other Events.  Attached as...,[5. Other Events.],sample_inputs.csv


In [20]:
working_file = folder_read_in_text(folder_path='C:/Users/li haoran/Documents/GitHub/Capstone_Vanguard_NLP_Prediction/Inputs',
                                   ext = '.csv')

https://www.sec.gov/Archives/edgar/data/732712/0000950133-94-000018.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950172-94-000010.txt
https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000061.txt
https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000079.txt
https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000108.txt
https://www.sec.gov/Archives/edgar/data/732712/0000732712-94-000006.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-000587.txt
https://www.sec.gov/Archives/edgar/data/732712/0000732712-94-000009.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-000824.txt
https://www.sec.gov/Archives/edgar/data/732712/0000732712-94-000014.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-001534.txt
https://www.sec.gov/Archives/edgar/data/732712/0000732712-94-000017.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-002093.txt
https://www.sec.gov/Archives/edgar/data/732712/0000732712-95-000

https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-004377.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-005648.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950130-02-003892.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950123-02-006330.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-008709.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-008892.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-009527.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-009530.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-010485.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-011940.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-012975.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-014129.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-014130.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950134-02-015

https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-238309.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-245448.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-247678.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-248715.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-251425.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-06-256227.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-001162.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-006937.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-014634.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-016525.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-020398.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-044126.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-048544.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-07-094

https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-057878.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-093344.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-104504.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-113952.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-134085.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-173294.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-194187.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-196160.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-201011.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-248070.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-276677.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-280401.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-328252.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-11-332

https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-433999.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-473367.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-527195.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-549165.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-551222.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-562197.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-586418.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-616451.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-617604.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-633746.txt
https://www.sec.gov/Archives/edgar/data/732712/0000898822-16-000404.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-657235.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-664131.txt
https://www.sec.gov/Archives/edgar/data/732712/0001193125-16-664

In [21]:
working_file.to_csv('C:/Users/li haoran/Desktop/sec files/full_data_verizon.csv', sep='|', encoding='utf-8')

In [22]:
working_file_subset = working_file[['fdate','cik','findexdate','form','coname','raw_table','cleaned_table','Item_text','Item_category']]
working_file_subset.to_csv('C:/Users/li haoran/Desktop/sec files/table_item_data_verizon.csv', sep='|', encoding='utf-8')

In [16]:
working_file['cleaned_text'][2].split('%%')

"-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc-Type: 2001,MIC-CLEAR%%Originator-Name: keymaster@town.hall.org%%Originator-Key-Asymmetric:%%MFkwCgYEVQgBAQICAgADSwAwSAJBALeWW4xDV4i7+b6+UyPn5RtObb1cJ7VkACDq%%pKb9/DClgTKIm08lCfoilvi9Wl4SODbR1+1waHhiGmeZO8OdgLUCAwEAAQ==%%MIC-Info: RSA-MD5,RSA,%%GMQAlRnJvVLSy5tYNW+mtKTV/65v5fFNLQ176DahHwPr/SOIt/Pk794jNRnCxAXZ%%98Wfg25vcdOAKEqvBAUf7w==%%0000893220-94-000061.txt : 19940210%%0000893220-94-000061.hdr.sgml : 19940210%%ACCESSION NUMBER: 0000893220-94-000061%%CONFORMED SUBMISSION TYPE: 8-K%%PUBLIC DOCUMENT COUNT: 2%%CONFORMED PERIOD OF REPORT: 19940121%%ITEM INFORMATION: 5%%FILED AS OF DATE: 19940209%%FILER:%%COMPANY DATA:%%COMPANY CONFORMED NAME: BELL ATLANTIC CORP%%CENTRAL INDEX KEY: 0000732712%%STANDARD INDUSTRIAL CLASSIFICATION: 4813%%IRS NUMBER: 232259884%%STATE OF INCORPORATION: DE%%FISCAL YEAR END: 1231%%FILING VALUES:%%FORM TYPE: 8-K%%SEC ACT: 34%%SEC FILE NUMBER: 001-08606%%FILM NUMBER: 94505225%%BUSINESS ADDRESS:%%STREET 1: 1717 ARCH ST%

In [13]:
working_file['Item_text'][2]

["   2\nItem 5. Other Events.\n\nBell Atlantic Corporation (the Company) on January 21, 1994 announced that it\nwas reporting 1993 earnings of $3.22 per share, which included a one-time,\nnon-cash charge of $.19 per share, or $85.0 million, for the adoption of a\nchange in the method of accounting for postemployment benefits, versus $3.13\nper share in 1992.  Excluding this charge and the impact of 1993 tax\nlegislation, 1993 earnings per share would have been $3.44.\n\n         In addition to the one-time, non-cash charge for the change in the\nmethod of accounting for postemployment benefits (Statement of Financial\nAccounting Standards No. 112 [FAS 112]) and a net decrease of $.03 per share,\nor $11.4 million, for the effect of 1993 federal tax legislation, results for\n1993 also included extraordinary charges of $.13 per share, or $58.4 million,\nfor early extinguishment of debt; and a net benefit of $.11 per share, or $51.0\nmillion, for other previously reported items.  Earnings 

In [4]:
working_file['raw_table'][0]

[<table>
 <s> <c>
 Date of Report:                              January 20, 1994
 
 Exact name of registrant
 as specified in its charter:                 BELL ATLANTIC CORPORATION
 
 Commission File No.:                         1-8606
 
 State of Incorporation:                      Delaware
 
 IRS Employer Identification No.:             23-2259884
 
 Address of principal
 executive offices:                           1717 Arch Street
                                              Philadelphia, Pennsylvania
 Zip Code                                     19103
 
 Registrant's telephone number,
 including area code:                         (215) 963-6000
 
 Former name or former address,
 if changed since last report:                N/A
 </c></s></table>]

In [5]:
working_file['cleaned_table'][0]

[[['Date of Report:', 'January 20, 1994'],
  ['Exact name of registrantas specified in its charter:',
   'BELL ATLANTIC CORPORATION'],
  ['Commission File No.:', '1-8606'],
  ['State of Incorporation:', 'Delaware'],
  ['IRS Employer Identification No.:', '23-2259884'],
  ['Address of principalexecutive offices:',
   '1717 Arch StreetPhiladelphia, PennsylvaniaZip Code19103'],
  ["Registrant's telephone number,including area code:", '(215) 963-6000'],
  ['Former name or former address,if changed since last report:', 'N/A']]]

In [7]:
pd.DataFrame(working_file['cleaned_table'][0][0])

Unnamed: 0,0,1
0,Date of Report:,"January 20, 1994"
1,Exact name of registrantas specified in its ch...,BELL ATLANTIC CORPORATION
2,Commission File No.:,1-8606
3,State of Incorporation:,Delaware
4,IRS Employer Identification No.:,23-2259884
5,Address of principalexecutive offices:,"1717 Arch StreetPhiladelphia, PennsylvaniaZip ..."
6,"Registrant's telephone number,including area c...",(215) 963-6000
7,"Former name or former address,if changed since...",


In [8]:
working_file['cleaned_table'][1]

[]

In [9]:
working_file['cleaned_table'][2][0]

[['', ''],
 ['',
  'Three months endedYear endedDecember 31December 31-------------------------------------19931992*19931992*'],
 ['',
  'OPERATING REVENUES Communications andRelated Services$3,194.5$3,117.9$12,534.8$12,164.6 Financial, Real Estate,and Other Services122.7156.3455.4553.8'],
 ['', 'Total operating revenues3,317.23,274.212,990.212,718.4'],
 ['OPERATING EXPENSES Employee costs, including',
  'benefits and taxes1,019.01,004.34,027.63,941.5 Depreciation andamortization649.8597.22,545.12,417.4 Other1,041.21,154.63,619.93,853.3'],
 ['', 'Total operating expenses2,710.02,756.110,192.610,212.2'],
 ['OPERATING INCOME', '607.2518.12,797.62,506.2'],
 ['Other income and',
  'expense, net29.142.388.1214.4Interest expense, excludingFinancial Services141.3162.3612.1694.9Income before provision for incometaxes, extraordinary item, andcumulative effect of changes inaccounting principles495.0398.12,273.62,025.7Provision for income taxes157.870.6792.0643.5Income before extraordinary iteman

In [7]:
working_file['cleaned_table'][3]

[[['Date of Report:', 'February 14, 1994'],
  ['Exact name of registrantas specified in its charter:',
   'BELL ATLANTIC CORPORATION'],
  ['Commission File No.:', '1-8606'],
  ['State of Incorporation:', 'Delaware'],
  ['IRS Employer Identification No.:', '23-2259884'],
  ['Address of principalexecutive offices:',
   '1717 Arch StreetPhiladelphia, PennsylvaniaZip Code19103'],
  ["Registrant's telephone number,including area code:", '(215) 963-6000'],
  ['Former name or former address,if changed since last report:', 'N/A']]]

In [14]:
working_file['raw_table'][2][0]

<table>
<caption>
                                                           Three months ended                         Year ended
                                                               December 31                            December 31    
                                                           ------------------                     -------------------
                                                        1993                1992*              1993                1992*

<s> <c> <c> <c> <c>
OPERATING REVENUES
 Communications and
  Related Services                                    $3,194.5             $3,117.9          $12,534.8            $12,164.6
 Financial, Real Estate,
  and Other Services                                     122.7                156.3              455.4                553.8

     Total operating revenues                          3,317.2              3,274.2           12,990.2             12,718.4

OPERATING EXPENSES
 Employee costs, including
   bene

In [15]:
working_file['raw_table'][2][0].get_text()

'\n\n                                                           Three months ended                         Year ended\n                                                               December 31                            December 31    \n                                                           ------------------                     -------------------\n                                                        1993                1992*              1993                1992*\n\n    \nOPERATING REVENUES\n Communications and\n  Related Services                                    $3,194.5             $3,117.9          $12,534.8            $12,164.6\n Financial, Real Estate,\n  and Other Services                                     122.7                156.3              455.4                553.8\n\n     Total operating revenues                          3,317.2              3,274.2           12,990.2             12,718.4\n\nOPERATING EXPENSES\n Employee costs, including\n   benefits and tax

In [20]:
working_file['raw_table'][2][0].get_text().split('\n\n')[0]

''

In [22]:
working_file['raw_table'][2][0].get_text().split('\n\n')[1]

'                                                           Three months ended                         Year ended\n                                                               December 31                            December 31    \n                                                           ------------------                     -------------------\n                                                        1993                1992*              1993                1992*'

In [23]:
working_file['raw_table'][2][0].get_text().split('\n\n')[3]

'     Total operating revenues                          3,317.2              3,274.2           12,990.2             12,718.4'

In [26]:
working_file['raw_table'][2][0].get_text().split('\n\n')[4]

'OPERATING EXPENSES\n Employee costs, including\n   benefits and taxes                                  1,019.0              1,004.3            4,027.6              3,941.5\n Depreciation and\n   amortization                                          649.8                597.2            2,545.1              2,417.4\n Other                                                 1,041.2              1,154.6            3,619.9              3,853.3'

In [25]:
print(working_file['raw_table'][2][0].get_text().split('\n\n')[4])

OPERATING EXPENSES
 Employee costs, including
   benefits and taxes                                  1,019.0              1,004.3            4,027.6              3,941.5
 Depreciation and
   amortization                                          649.8                597.2            2,545.1              2,417.4
 Other                                                 1,041.2              1,154.6            3,619.9              3,853.3


In [12]:
[ [[j.replace('\n',"").strip() 
   for j in re.split('  +',i)] 
  for i in working_file['raw_table'][2][0].get_text().split('\n\n')]]

[[[''],
  ['',
   'Three months ended',
   'Year ended',
   'December 31',
   'December 31',
   '',
   '------------------',
   '-------------------',
   '1993',
   '1992*',
   '1993',
   '1992*'],
  ['',
   'OPERATING REVENUES Communications and',
   'Related Services',
   '$3,194.5',
   '$3,117.9',
   '$12,534.8',
   '$12,164.6 Financial, Real Estate,',
   'and Other Services',
   '122.7',
   '156.3',
   '455.4',
   '553.8'],
  ['',
   'Total operating revenues',
   '3,317.2',
   '3,274.2',
   '12,990.2',
   '12,718.4'],
  ['OPERATING EXPENSES Employee costs, including',
   'benefits and taxes',
   '1,019.0',
   '1,004.3',
   '4,027.6',
   '3,941.5 Depreciation and',
   'amortization',
   '649.8',
   '597.2',
   '2,545.1',
   '2,417.4 Other',
   '1,041.2',
   '1,154.6',
   '3,619.9',
   '3,853.3'],
  ['',
   'Total operating expenses',
   '2,710.0',
   '2,756.1',
   '10,192.6',
   '10,212.2'],
  ['OPERATING INCOME', '607.2', '518.1', '2,797.6', '2,506.2'],
  ['Other income and',
   '

In [11]:
[[k[0],''.join(k[1:])] for k in 
 [[j.replace('\n',"").strip() 
   for j in re.split('  +',i)] 
  for i in working_file['raw_table'][2][0].get_text().split('\n\n')]]

[['', ''],
 ['',
  'Three months endedYear endedDecember 31December 31-------------------------------------19931992*19931992*'],
 ['',
  'OPERATING REVENUES Communications andRelated Services$3,194.5$3,117.9$12,534.8$12,164.6 Financial, Real Estate,and Other Services122.7156.3455.4553.8'],
 ['', 'Total operating revenues3,317.23,274.212,990.212,718.4'],
 ['OPERATING EXPENSES Employee costs, including',
  'benefits and taxes1,019.01,004.34,027.63,941.5 Depreciation andamortization649.8597.22,545.12,417.4 Other1,041.21,154.63,619.93,853.3'],
 ['', 'Total operating expenses2,710.02,756.110,192.610,212.2'],
 ['OPERATING INCOME', '607.2518.12,797.62,506.2'],
 ['Other income and',
  'expense, net29.142.388.1214.4Interest expense, excludingFinancial Services141.3162.3612.1694.9Income before provision for incometaxes, extraordinary item, andcumulative effect of changes inaccounting principles495.0398.12,273.62,025.7Provision for income taxes157.870.6792.0643.5Income before extraordinary iteman

In [330]:
mystr = StringIO(working_file[0:1]['table'][0][0].get_text())
df = pd.read_csv(mystr, header=None, delimiter=':')
df

Unnamed: 0,0,1
0,Date of Report,"January 20, 1994"
1,Exact name of registrant,
2,as specified in its charter,BELL ATLANTIC CORPORATION
3,Commission File No.,1-8606
4,State of Incorporation,Delaware
5,IRS Employer Identification No.,23-2259884
6,Address of principal,
7,executive offices,1717 Arch Street
8,P...,
9,Zip Code 1...,


In [364]:
working_file[0:1]['raw_table'][0][0]

<table>
<s> <c>
Date of Report:                              January 20, 1994

Exact name of registrant
as specified in its charter:                 BELL ATLANTIC CORPORATION

Commission File No.:                         1-8606

State of Incorporation:                      Delaware

IRS Employer Identification No.:             23-2259884

Address of principal
executive offices:                           1717 Arch Street
                                             Philadelphia, Pennsylvania
Zip Code                                     19103

Registrant's telephone number,
including area code:                         (215) 963-6000

Former name or former address,
if changed since last report:                N/A
</c></s></table>

In [337]:
working_file[0:1]['raw_table'][0][0].get_text()

"\n \nDate of Report:                              January 20, 1994\n\nExact name of registrant\nas specified in its charter:                 BELL ATLANTIC CORPORATION\n\nCommission File No.:                         1-8606\n\nState of Incorporation:                      Delaware\n\nIRS Employer Identification No.:             23-2259884\n\nAddress of principal\nexecutive offices:                           1717 Arch Street\n                                             Philadelphia, Pennsylvania\nZip Code                                     19103\n\nRegistrant's telephone number,\nincluding area code:                         (215) 963-6000\n\nFormer name or former address,\nif changed since last report:                N/A\n"

In [338]:
pd.DataFrame([[k[0],''.join(k[1:])] for k in 
 [[j.replace('\n',"").strip() 
   for j in re.split('  +',i)] 
  for i in working_file[0:1]['raw_table'][0][0].get_text().split('\n\n')]])

Unnamed: 0,0,1
0,Date of Report:,"January 20, 1994"
1,Exact name of registrantas specified in its ch...,BELL ATLANTIC CORPORATION
2,Commission File No.:,1-8606
3,State of Incorporation:,Delaware
4,IRS Employer Identification No.:,23-2259884
5,Address of principalexecutive offices:,"1717 Arch StreetPhiladelphia, PennsylvaniaZip ..."
6,"Registrant's telephone number,including area c...",(215) 963-6000
7,"Former name or former address,if changed since...",


In [419]:
working_file['raw_table'][2]

[<table>
 <caption>
                                                            Three months ended                         Year ended
                                                                December 31                            December 31    
                                                            ------------------                     -------------------
                                                         1993                1992*              1993                1992*
 
 <s> <c> <c> <c> <c>
 OPERATING REVENUES
  Communications and
   Related Services                                    $3,194.5             $3,117.9          $12,534.8            $12,164.6
  Financial, Real Estate,
   and Other Services                                     122.7                156.3              455.4                553.8
 
      Total operating revenues                          3,317.2              3,274.2           12,990.2             12,718.4
 
 OPERATING EXPENSES
  Employee costs,

In [420]:
working_file['cleaned_table'][2]

[[['', ''],
  ['',
   'Three months endedYear endedDecember 31December 31-------------------------------------19931992*19931992*'],
  ['',
   'OPERATING REVENUES Communications andRelated Services$3,194.5$3,117.9$12,534.8$12,164.6 Financial, Real Estate,and Other Services122.7156.3455.4553.8'],
  ['', 'Total operating revenues3,317.23,274.212,990.212,718.4'],
  ['OPERATING EXPENSES Employee costs, including',
   'benefits and taxes1,019.01,004.34,027.63,941.5 Depreciation andamortization649.8597.22,545.12,417.4 Other1,041.21,154.63,619.93,853.3'],
  ['', 'Total operating expenses2,710.02,756.110,192.610,212.2'],
  ['OPERATING INCOME', '607.2518.12,797.62,506.2'],
  ['Other income and',
   'expense, net29.142.388.1214.4Interest expense, excludingFinancial Services141.3162.3612.1694.9Income before provision for incometaxes, extraordinary item, andcumulative effect of changes inaccounting principles495.0398.12,273.62,025.7Provision for income taxes157.870.6792.0643.5Income before extraord

In [429]:
[[j for j in re.split('  +',i)] 
           for i in working_file['raw_table'][2][0].get_text().split('\n\n')]

[[''],
 ['',
  'Three months ended',
  'Year ended\n',
  'December 31',
  'December 31',
  '\n',
  '------------------',
  '-------------------\n',
  '1993',
  '1992*',
  '1993',
  '1992*'],
 ['',
  '\nOPERATING REVENUES\n Communications and\n',
  'Related Services',
  '$3,194.5',
  '$3,117.9',
  '$12,534.8',
  '$12,164.6\n Financial, Real Estate,\n',
  'and Other Services',
  '122.7',
  '156.3',
  '455.4',
  '553.8'],
 ['',
  'Total operating revenues',
  '3,317.2',
  '3,274.2',
  '12,990.2',
  '12,718.4'],
 ['OPERATING EXPENSES\n Employee costs, including\n',
  'benefits and taxes',
  '1,019.0',
  '1,004.3',
  '4,027.6',
  '3,941.5\n Depreciation and\n',
  'amortization',
  '649.8',
  '597.2',
  '2,545.1',
  '2,417.4\n Other',
  '1,041.2',
  '1,154.6',
  '3,619.9',
  '3,853.3'],
 ['',
  'Total operating expenses',
  '2,710.0',
  '2,756.1',
  '10,192.6',
  '10,212.2'],
 ['OPERATING INCOME', '607.2', '518.1', '2,797.6', '2,506.2'],
 ['Other income and\n',
  'expense, net',
  '29.1',
  

In [422]:
[[k[0],''.join(k[1:])] 
 for k in [[j.replace('\n',"").strip() 
   for j in re.split('  +',i)] 
           for i in working_file['raw_table'][2][0].get_text().split('\n\n')]]

[['', ''],
 ['',
  'Three months endedYear endedDecember 31December 31-------------------------------------19931992*19931992*'],
 ['',
  'OPERATING REVENUES Communications andRelated Services$3,194.5$3,117.9$12,534.8$12,164.6 Financial, Real Estate,and Other Services122.7156.3455.4553.8'],
 ['', 'Total operating revenues3,317.23,274.212,990.212,718.4'],
 ['OPERATING EXPENSES Employee costs, including',
  'benefits and taxes1,019.01,004.34,027.63,941.5 Depreciation andamortization649.8597.22,545.12,417.4 Other1,041.21,154.63,619.93,853.3'],
 ['', 'Total operating expenses2,710.02,756.110,192.610,212.2'],
 ['OPERATING INCOME', '607.2518.12,797.62,506.2'],
 ['Other income and',
  'expense, net29.142.388.1214.4Interest expense, excludingFinancial Services141.3162.3612.1694.9Income before provision for incometaxes, extraordinary item, andcumulative effect of changes inaccounting principles495.0398.12,273.62,025.7Provision for income taxes157.870.6792.0643.5Income before extraordinary iteman

In [None]:
#single_read_in_text('C:/Users/li haoran/Desktop/New folder','21centfoxinc_sec_files.csv')
working_file = folder_read_in_text(folder_path='C:/Users/li haoran/Documents/GitHub/Capstone_Vanguard_NLP_Prediction/Inputs',
                                   ext = '.csv')

## Write Output

In [None]:
working_file.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data.csv', sep='|', encoding='utf-8')

# Further processing 

## Read documents

In [3]:
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(file_path, sep='|', encoding='utf-8')

## Further clean the data

In [4]:
words = set(nltk.corpus.words.words())
cleaned_data['only_eng_words'] = cleaned_data['text'].apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) \
                                      if w.lower() in words))
cleaned_data['evy_gram_1_3'] =cleaned_data['only_eng_words'].apply(lambda x: [' '.join(ng) for ng in everygrams(word_tokenize(x), 1, 4)])

In [7]:
cleaned_data[['file_name', 'doccount']]\
.groupby(['file_name'])\
.agg(['count','sum'])\
.reset_index()

Unnamed: 0_level_0,file_name,doccount,doccount
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
0,21centfoxinc_sec_files.csv,1048,3978.0
1,attinc_sec_files.csv,555,3383.0
2,cbscorp_sec_files.csv,385,2578.0
3,comcastcorp_sec_files.csv,277,2735.0
4,verizoncom_sec_files.csv,525,3319.0


## Write Output

In [None]:
cleaned_data.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated.csv', sep='|', encoding='utf-8')

In [9]:
cleaned_data[cleaned_data['file_name']=='21centfoxinc_sec_files.csv']\
.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_21centfoxinc_sec_files.csv', sep='|', encoding='utf-8')

cleaned_data[cleaned_data['file_name']=='attinc_sec_files.csv']\
.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_attinc_sec_files.csv', sep='|', encoding='utf-8')

cleaned_data[cleaned_data['file_name']=='cbscorp_sec_files.csv']\
.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_cbscorp_sec_files.csv', sep='|', encoding='utf-8')

cleaned_data[cleaned_data['file_name']=='comcastcorp_sec_files.csv']\
.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_comcastcorp_sec_files.csv', sep='|', encoding='utf-8')

cleaned_data[cleaned_data['file_name']=='verizoncom_sec_files.csv']\
.to_csv('C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_verizoncom_sec_files.csv', sep='|', encoding='utf-8')

# Analysis

## Read documents

In [None]:
## Full data
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data.csv'
cleaned_data = pd.read_csv(file_path, sep='|', encoding='utf-8')

In [2]:
## cleaned_data_updated_verizoncom_sec_files.csv
file_path = 'C:/Users/li haoran/Desktop/sec files/cleaned_data_updated_verizoncom_sec_files.csv'
cleaned_verizon_data = pd.read_csv(file_path, sep='|', encoding='utf-8')

## EDA

In [3]:
table1 = cleaned_data[['file_name', 'doccount']]\
.groupby(['file_name'])\
.agg(['count','sum'])\
.reset_index()
print(list(table1))
table1

NameError: name 'cleaned_data' is not defined

In [None]:
trace1 = go.Bar(x = table1['file_name'], 
                y = table1['doccount']['count'],
                name = 'filing count')

trace2 = go.Bar(x = table1['file_name'],
                y = table1['doccount']['sum'],
                name = 'document count')

data = [trace1,trace2]


iplot({
    "data":data,
    "layout":go.Layout(title="Distribution of number of emails received", 
                       xaxis={'title':'Num of emails'}, 
                       yaxis={'title':'Percentage of people received %'})})

In [None]:
table2 = cleaned_data[['file_name','form','doccount']]\
.groupby(['file_name','form'])\
.agg(['count','sum'])\
.sort_values(by=['file_name'])\
.reset_index()
print(list(table2))
table2

## Extract Features

Notes:
It is very very very very hard to parse old sec financial table, more recent files are easier 

https://www.sec.gov/investor/pubs/readan8k.pdf

https://github.com/ragraw26/Edgar-COMPANY-FILINGS-Web-Scrapping-Data-Analysis/blob/master/Data%20Scrapping/Team_5_Part1_Report.pdf

https://www.sec.gov/cgi-bin/viewer?action=view&cik=732712&accession_number=0001193125-10-041685&xbrl_type=v

https://www.codeproject.com/Articles/1227268/Accessing-Financial-Reports-in-the-EDGAR-Database

https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all-next-and-find-next

In [4]:
cleaned_verizon_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,fdate,cik,findexdate,form,coname,fsize,doccount,sec_full_path,text,file_name,only_eng_words,evy_gram_1_3
0,2265,2265,1994-01-21,732712,1994-03-31,8-K,BELL ATLANTIC CORP,3399.0,1.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
1,2266,2266,1994-02-01,732712,1994-03-31,8-K,BELL ATLANTIC CORP,2923.0,1.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
2,2267,2267,1994-02-09,732712,1994-03-31,8-K,BELL ATLANTIC CORP,18580.0,2.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
3,2268,2268,1994-02-14,732712,1994-03-31,8-K,BELL ATLANTIC CORP,2930.0,1.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
4,2269,2269,1994-02-24,732712,1994-03-31,8-K,BELL ATLANTIC CORP,3196.0,1.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
5,2270,2270,1994-03-23,732712,1994-03-31,8-K,BELL ATLANTIC CORP,6014.0,2.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
6,2271,2271,1994-03-31,732712,1994-03-31,10-K,BELL ATLANTIC CORP,533905.0,17.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
7,2272,2272,1994-04-22,732712,1994-06-30,8-K,BELL ATLANTIC CORP,13572.0,2.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
8,2273,2273,1994-05-13,732712,1994-06-30,10-Q,BELL ATLANTIC CORP,67565.0,3.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."
9,2274,2274,1994-07-21,732712,1994-07-21,8-K,BELL ATLANTIC CORP,17411.0,2.0,https://www.sec.gov/Archives/edgar/data/732712...,-----BEGIN PRIVACY-ENHANCED MESSAGE-----%%Proc...,verizoncom_sec_files.csv,BEGIN PRIVACY ENHANCED MESSAGE Type CLEAR Orig...,"['BEGIN', 'PRIVACY', 'ENHANCED', 'MESSAGE', 'T..."


In [10]:
print(cleaned_verizon_data[cleaned_verizon_data['form']=='8-K']['sec_full_path'][0])
print(cleaned_verizon_data[cleaned_verizon_data['form']=='8-K']['sec_full_path'][1])
print(cleaned_verizon_data[cleaned_verizon_data['form']=='8-K']['sec_full_path'][2])

https://www.sec.gov/Archives/edgar/data/732712/0000950133-94-000018.txt
https://www.sec.gov/Archives/edgar/data/732712/0000950172-94-000010.txt
https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000061.txt


In [11]:
cleaned_verizon_data[['file_name','form']]\
.groupby(['form'])\
.agg(['count'])\
.reset_index()

Unnamed: 0_level_0,form,file_name
Unnamed: 0_level_1,Unnamed: 1_level_1,count
0,10-K,21
1,10-Q,74
2,8-K,430


## Explore a 10K

In [39]:
test_html = 'https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-000587.txt'
test_file = request.urlopen(test_html).read().decode('utf8')

In [13]:
#r = re.compile(r'\table\b | \bCAPTION\b', flags=re.I | re.X)
#r = re.compile(r'\table\b\bCAPTION\b', flags=re.I | re.X)
#r = re.compile(r'\<table\>\s*\<caption\>')

In [40]:
soup = BeautifulSoup(test_file,'html.parser')
table = soup.find_all('table')

In [41]:
table[3]

<table>
<caption> 
                                                                                          Held
        Name                  Age                      Office                             Since
        ----                  ---                      ------                             -----
<s> <c> <c> <c>  
Raymond W. Smith...........   56  Chairman of the Board and Chief Executive Officer        1989
James G. Cullen............   51  President                                                1993
William O. Albertini.......   50  Vice President and Chief Financial Officer               1991
Joseph T. Ambrozy..........   54  Vice President - Strategic Planning                      1992
Lawrence T. Babbio, Jr.....   49  Chairman, President and Chief Executive Officer,         1991
                                        Bell Atlantic Enterprises International, Inc.
P. Alan Bulliner...........   50  Vice President - Corporate Secretary and Counsel         1992  
Barbara L. C

## Explore a 8-K

In [125]:
test_html = 'https://www.sec.gov/Archives/edgar/data/732712/0000950133-94-000018.txt'
test_file = request.urlopen(test_html).read().decode('utf8')

In [126]:
soup = BeautifulSoup(test_file,'html.parser')
## find all tables
table = soup.find_all('table')
## search the text
Item_text = soup.findAll(text=re.compile("Item "))

In [127]:
table

[<table>
 <s> <c>
 Date of Report:                              January 20, 1994
 
 Exact name of registrant
 as specified in its charter:                 BELL ATLANTIC CORPORATION
 
 Commission File No.:                         1-8606
 
 State of Incorporation:                      Delaware
 
 IRS Employer Identification No.:             23-2259884
 
 Address of principal
 executive offices:                           1717 Arch Street
                                              Philadelphia, Pennsylvania
 Zip Code                                     19103
 
 Registrant's telephone number,
 including area code:                         (215) 963-6000
 
 Former name or former address,
 if changed since last report:                N/A
 </c></s></table>]

In [128]:
Item_text

['   2\n\n\n\n\nItem 5.  Other Events.\n\n     Bell Atlantic Corporation (the "Company") today said that rumors that the\nCompany intends to cut its dividend are totally false. The Company does not\nintend to cut its dividend as a result of its pending merger with\nTeleCommunications, Inc. ("TCI") and Liberty Media Corporation ("Liberty").\n\n     The Company also confirmed that negotiations on that transaction are\nproceeding satisfactorily. The proposed merger is an exceedingly complex\ntransaction, and, as in any negotiation, various elements may be modified. It\nis the Company\'s intention that the final financial terms of the transaction\nwill not differ significantly from the Letter of Intent executed by the\nCompany, TCI and Liberty on October 12, 1993.\n']

In [130]:
re.search('(?<=Item )\d', Item_text_id[0])

<_sre.SRE_Match object; span=(10, 11), match='5'>

## Another 8-K with two Items

In [131]:
test_html = 'https://www.sec.gov/Archives/edgar/data/732712/0000893220-94-000061.txt'
test_file = request.urlopen(test_html).read().decode('utf8')

In [132]:
soup = BeautifulSoup(test_file,'html.parser')
## find all tables
table = soup.find_all('table')
## search the text
Item_text = soup.findAll(text=re.compile("Item "))

In [133]:
Item_text[1]

'   4\nItem 7. Financial Statements and Exhibits.\n\n(c)  Exhibits.\n\nThe exhibit identified below is filed as an exhibit hereto.\n\nExhibit 99:\n\nUnaudited condensed consolidated statements of income, unaudited other selected\ndata and unaudited selected operating statistics for the three months and year\nended December 31, 1993 and 1992.\n'

In [142]:
[re.search('(?<=Item ).*', i).group(0) for i in Item_text]

['5. Other Events.', '7. Financial Statements and Exhibits.']

In [None]:
soup = BeautifulSoup(html)
table = soup.find("table", attrs={"class":"details"})

# The first tr contains the field names.
headings = [th.get_text() for th in table.find("tr").find_all("th")]

datasets = []
for row in table.find_all("tr")[1:]:
    dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
    datasets.append(dataset)

In [6]:
##8-k
test_html = 'https://www.sec.gov/Archives/edgar/data/732712/0000950109-94-000587.txt'
test_file = request.urlopen(test_html).read().decode('utf8')