In [1]:
import os
import pandas as pd
from lxml import html
import requests
import re
import sys
from datetime import datetime

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import time


# Input the CIK list for the fund family you are looking at

In [118]:
fund_name = 'seligman.xlsx'

# select CIK 1398078 as test file
CIK_LIST = ['316411']




In [119]:
cur_wd = os.getcwd()
if 'trunk' in cur_wd:
    output_directory = os.path.join("\\".join(cur_wd.split('\\')[0:-1]),'output')
else:
    output_directory = os.path.join("\\".join(cur_wd.split('\\')[0:-1]),'output','working')
    
cur_wd = os.path.join("\\".join(cur_wd.split('\\')[0:-1]),'data')
print('data directory')
print(cur_wd)
print('output directory')
print(output_directory)


data directory
data
output directory
output/working


# Helper Functions for Analysis

In [120]:
def get_cleaned(lines):
    cleaned = []
    for i in lines:
        #gets rid of tags
        cleaned_i = re.sub('<[^>]*>', ' ', i)
        #gets rid of &nbsp; and replaces with space
        cleaned_i = re.sub('&nbsp;', ' ', cleaned_i)
        #re.sub("<.*?>","",st)
        if not re.match(r'^\s*$', cleaned_i):
            cleaned.append(cleaned_i)
    return cleaned

def find(s, ch):
    return [i for i, ltr in enumerate(s) if ltr == ch]


def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))


def remove_junk(addline):
    addline = re.sub('<[^>]*>', ' ', addline)
    addline = re.sub('&nbsp;', ' ', addline)
    addline = re.sub('&#174;', ' ', addline)
    addline = re.sub('&#8480;', ' ', addline)
    addline = re.sub(r'&reg;',' ',addline)
    addline = re.sub(r'\n', ' ', addline)
    addline = re.sub(r'\t', ' ', addline)
    return addline


In [121]:
weblink = {}
for CIK in CIK_LIST:
    weblink[CIK]  = []
    txt_files = os.listdir(os.path.join(cur_wd, CIK, 'n-q'))
    for i in txt_files:
        weblink[CIK].append(r"https://www.sec.gov/Archives/edgar/data/" + str(CIK)+r"/"  + i)

In [122]:
weblink

{'316411': ['https://www.sec.gov/Archives/edgar/data/316411/0001169232-08-002193.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0000950123-11-054303.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0000950123-09-065935.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0000950123-10-109224.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0001169232-07-004458.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0000936329-06-000167.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0001145443-06-001745.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0001169232-09-002769.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0000950123-10-053029.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0001145443-04-001859.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0000936329-07-000060.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0000936329-05-000066.txt',
  'https://www.sec.gov/Archives/edgar/data/316411/0001169232-08-00

get panel 2

In [123]:
panel2 = pd.DataFrame()
error_panel2 = []

    
for CIK in CIK_LIST:
    txt_files = os.listdir(os.path.join(cur_wd, CIK, 'n-q'))
    
    for i in txt_files:
        if '.csv' not in i:
            try:
                series = {}
                series_keys = ['SERIES-ID', 'OWNER-CIK', 'SERIES-NAME']
                for s in series_keys:
                    series[s] = []

                contract = {}
                contract_keys = ['CLASS-CONTRACT-ID','CLASS-CONTRACT-NAME','CLASS-CONTRACT-TICKER-SYMBOL', 'LAST-READ-SERIES']
                for c in contract_keys:
                    contract[c] = []

                uncleaned_file = open(os.path.join(cur_wd, CIK, 'n-q',i), mode='r') 
                for u in uncleaned_file:
                    

                    if 'COMPANY CONFORMED NAME' in u:
                        conformed_name = u.replace('COMPANY CONFORMED NAME:', '').strip()

                    if 'FILED AS OF DATE:' in u:
                        filed_date = str(u).replace('FILED AS OF DATE:','').strip()

                    #series information
                    for s in series_keys:
                        if s in u:
                                series[s].append(str(u).replace('<' + s + '>', "").strip())
                    for c in contract_keys:
                        if c != 'LAST-READ-SERIES':
                            if c in u:
                                contract[c].append(str(u).replace('<' + c + '>', "").strip())
                                if c == 'CLASS-CONTRACT-ID':
                                    contract['LAST-READ-SERIES'].append(series['SERIES-ID'][-1])


                series = pd.DataFrame.from_dict(series, orient = 'index').T
                contract = pd.DataFrame.from_dict(contract, orient = 'index').T


                for s in series_keys:
                    contract.loc[:, s] = ''

                for ix,row in contract.iterrows():
                    for s in series_keys:
                        contract.loc[contract.index == ix, s] = series.loc[series['SERIES-ID'] == row['LAST-READ-SERIES'],s].values[0]
                contract.loc[:,'file_read'] = i
                contract.loc[:,'date_filed'] = filed_date
                contract.loc[:, 'company conformed name'] = conformed_name
                contract = contract.drop(['LAST-READ-SERIES'], axis=1)

                if panel2.empty:
                    panel2 = contract.copy()
                else:
                    panel2 = pd.concat([panel2, contract], axis = 0)
            except ValueError:
                error_panel2.append(i)
                

In [124]:
print(error_panel2)

['0001145443-04-001859.txt', '0000936329-05-000066.txt', '0000936329-05-000174.txt']


In [125]:
print(panel2)
panel2.to_csv(os.path.join(output_directory, fund_name + '_panel2.csv'), sep = ',')

   CLASS-CONTRACT-ID                      CLASS-CONTRACT-NAME  \
0         C000026136      SELIGMAN GROWTH FUND, INC., CLASS A   
1         C000026137      SELIGMAN GROWTH FUND, INC., CLASS B   
2         C000026138      SELIGMAN GROWTH FUND, INC., CLASS C   
3         C000026139      SELIGMAN GROWTH FUND, INC., CLASS D   
4         C000026140      SELIGMAN GROWTH FUND, INC., CLASS I   
5         C000026141      SELIGMAN GROWTH FUND, INC., CLASS R   
0         C000026136      SELIGMAN GROWTH FUND, INC., CLASS A   
1         C000026137      SELIGMAN GROWTH FUND, INC., CLASS B   
2         C000026138      SELIGMAN GROWTH FUND, INC., CLASS C   
3         C000026140     SELIGMAN GROWTH FUND, INC., CLASS R5   
4         C000026141      SELIGMAN GROWTH FUND, INC., CLASS R   
5         C000076414      SELIGMAN GROWTH FUND, INC., CLASS I   
6         C000076416     SELIGMAN GROWTH FUND, INC., CLASS R4   
0         C000026136      SELIGMAN GROWTH FUND, INC., CLASS A   
1         C000026137     

FileNotFoundError: [Errno 2] No such file or directory: 'output/working/seligman_panel2.csv'

## Test on One FILE

In [126]:
CIK  ='316411'
text =  '0000950123-11-054303.txt'


In [476]:
# url2 = 'https://www.sec.gov/Archives/edgar/data/316411/0000950123-09-065935.txt'
url2

'https://www.sec.gov/Archives/edgar/data/316411/0000950123-09-065935.txt'

In [127]:
url = r'https://www.sec.gov/Archives/edgar/data/' + CIK + r'/' + text.replace('.txt', '').replace('-','')  + r'/' + text
print(url)

https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt


In [543]:
def bs(url):
    source = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(source,'lxml')
    return soup

In [545]:
import bs4 as bs
import urllib.request

In [1015]:
source = urllib.request.urlopen('https://www.sec.gov/Archives/edgar/data/881466/000095012311030523/0000950123-11-030523.txt').read()
soup = bs.BeautifulSoup(source,'lxml')
div = soup.find_all('td')
# print(div)
isfound = 0
list_legend = []
dic_legend = {}
for i in div:
    if isfound == 2:
        isfound = 0
        list_legend.append(i.text)
#         print(i.text)
    else:
        if isfound == 1:
            isfound+=1
            continue
        else:
            z = re.match(r'\(([^()]*)\)',i.text)
            
            if z:
                isfound = 1
                list_legend.append(i.text)
# print(list_legend)
# legend_dict.update([(i.text[1:2],legend_data)])
p = re.compile('\(([\w]*)\)')
for counter, value in enumerate(list_legend):
    if p.match(value):
        if value not in dic_legend.keys():
            dic_legend[value] = list_legend[counter + 1]
#             print(dic_legend)
#             print(list_legend[counter + 1])
#     print(counter, value)
#             print(i.text)
#     if i.text.find('(a)') != -1:
#     print(i.text)
# dic_legend['(c)']     
# legend_data = i.text[6:].replace(u'\xa0', u' ')
#     legend_data = legend_data.replace(u'\x97', u' ')
#     legend_data = legend_data.replace(u'\x92', u' ')
#     legend_data = legend_data.replace(u'\n', u' ')
#         report_date = i.text
#         report_date = i.text.split(":")[1]
# pattern = re.compile(r'Unaudited')
# report_date = "";
# for i in date:
#     m = re.match(pattern, i.text)
#     if m:
#         report_date = i.text.split(":")[1]

'Foreign security values are stated in U.S. dollars. At Jan. 31, 2011, the value of foreign\nsecurities, excluding short-term securities, represented 34.24% of net assets.'

In [1025]:
# source = urllib.request.urlopen('https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt').read()
# soup = bs.BeautifulSoup(source,'lxml')
# div = soup.find_all('div')
# list_legend = []
# dic_legend = {}
# for i in div:
#     z = re.match(r'\(([^()]*)\)',i.text)
#     if z:
#         list_legend.append(i.text)
# #         print(i.text)
# # print(list_legend)
# p = re.compile('\(([\w]*)\)')
# for counter, value in enumerate(list_legend):
#     if p.match(value):
# #         print(value)
#         if value not in dic_legend.keys():
# #             print(value[4:])
#             dic_legend[value[0:3]] = value[4:]
# #             if counter < len(list_legend):
# #                 dic_legend[value] = list_legend[counter + 1]
# dic_legend

{'(a)': '\xa0\xa0\xa0Securities are valued by using policies described in Note 2 to the financial statements in the\nmost recent Annual Report dated Dec. 31, 2010.\n',
 '(b)': '\xa0\xa0\xa0Non-income producing.\n',
 '(c)': '\xa0\xa0\xa0Foreign security values are stated in U.S. dollars. At March\xa031, 2011, the value of foreign\nsecurities, excluding short-term securities, represented 5.35% of net assets.\n',
 '(d)': '\xa0\xa0Represents fractional shares.\n',
 '(e)': '\xa0\xa0At March\xa031, 2011, security was partially or fully on loan.\n',
 '(f)': '\xa0\xa0Affiliated Money Market Fund \x97 The Fund may invest its daily cash balance in Columbia\nShort-Term Cash Fund, a money market fund established for the exclusive use of funds and other\ninstitutional clients of Columbia Management. The rate shown is the seven-day current annualized\nyield at March\xa031, 2011.\n',
 '(g)': '\xa0\xa0The table below represents securities received as collateral for repurchase agreements. This\ncollate

In [1066]:
def get_fundname_time(url):
    source = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(source,'lxml')
    div = soup.find_all('div')
    hybrid_list = []
    # fund_name
    for i in div:
        if "Unaudited" in i.text:
            hybrid_list.append(i.text[0:i.text.find("Unaudited")-1]);
    for i in hybrid_list:
        items = i.split("\n")
        for counter, value in enumerate(items):
            if value == "Portfolio of Investments":
                return items[counter + 1], items[counter + 2]

In [1068]:
url = 'https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt'
# url = 'https://www.sec.gov/Archives/edgar/data/316411/000095012309065935/0000950123-09-065935.txt'
print(get_fundname_time(url))

('Seligman Growth Fund', 'March\xa031, 2011 ')


In [1065]:
# url = 'https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt'
# source = urllib.request.urlopen(url).read()
# soup = bs.BeautifulSoup(source,'lxml')
# div = soup.find_all('div')
# # pattern = re.compile('Unaudited')
# # report_date = "";
# hybrid_list = []
# # fund_name
# for i in div:
#     if "Unaudited" in i.text:
#         hybrid_list.append(i.text[0:i.text.find("Unaudited")-1]);
# hybrid_list
# for i in hybrid_list:
#     print(i.split("\n"))
#         print(i.text[0:i.text.find("Unaudited")])
#     m = re.match(pattern, i.text)
#     if m:
#         print(i.text)
#         report_date = i.text.split(":")[1].strip()

['', 'Portfolio of Investments', 'Seligman Growth Fund', 'March\xa031, 2011 ']
['Portfolio of Investments', 'Seligman Growth Fund', 'March\xa031, 2011 ']


In [1049]:
# report_date

''

In [519]:
# def parse(table, output):
#     for i in range(1,5):
#         df = table[i].dropna(axis = 0, how = 'all')
#         df = df.drop(columns=[1,2,5,6,8])
#         df = df.dropna(thresh = 2)
#         df = df.rename(columns={0: "Holding", 3:"Share", 4 : "legend" , 7:"Value"})
#         df = df.drop(1)
#         df = df[df.Holding != 'Total']
#         df = df[df['Holding'].notna()]
#         # add a filter in case there is a total file in the end
#         filterF = df['Holding'].str.contains("Total")
#         df = df[~filterF]
# #         df.reset_index()
# #         df.drop(df.index)
#         output = output.append(df)
#     return output

In [206]:
def parse(table, output, i):
    df = table[i].dropna(axis = 0, how = 'all')
    df = df.drop(columns=[1,2,5,6,8])
    df = df.dropna(thresh = 2)
    df = df.rename(columns={0: "Holding", 3:"Share", 4 : "legend" , 7:"Value"})
    df = df.drop(1)
    df = df[df.Holding != 'Total']
    df.reset_index()
    df.drop(df.index)
    # add a filter in case there is a total file in the end
    filter = df['Holding'].str.contains("Total")
    df = df[~filter]
    output.append(df)
    return df

In [None]:
# 'https://www.sec.gov/Archives/edgar/data/316411/0001169232-08-002193.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0000950123-11-054303.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0000950123-09-065935.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0000950123-10-109224.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0001169232-07-004458.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0000936329-06-000167.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0001145443-06-001745.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0001169232-09-002769.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0000950123-10-053029.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0000936329-07-000060.txt',
#   'https://www.sec.gov/Archives/edgar/data/316411/0001169232-08-004392.txt',

In [526]:
import bs4 as bs
import urllib.request
import unicodedata

In [1070]:
def get_legend_for_td(url):
    source = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(source,'lxml')
    div = soup.find_all('td')
    isfound = 0
    list_legend = []
    dic_legend = {}
    for i in div:
        if isfound == 2:
            isfound = 0
            list_legend.append(i.text)
        else:
            if isfound == 1:
                isfound+=1
                continue
            else:
                z = re.match(r'\(([^()]*)\)',i.text)

                if z:
                    isfound = 1
                    list_legend.append(i.text)
                    
    p = re.compile('\(([\w]*)\)')
    for counter, value in enumerate(list_legend):
        if p.match(value):
            if value not in dic_legend.keys():
                dic_legend[value] = list_legend[counter + 1]
    return dic_legend

In [1072]:
def get_legend_for_div(url):
    source = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(source,'lxml')
    div = soup.find_all('div')
    list_legend = []
    dic_legend = {}
    for i in div:
        z = re.match(r'\(([^()]*)\)',i.text)
        if z:
            list_legend.append(i.text)
    p = re.compile('\(([\w]*)\)')
    for counter, value in enumerate(list_legend):
        if p.match(value):
            if value not in dic_legend.keys():
                dic_legend[value[0:3]] = value[4:]
    return dic_legend

In [1073]:
print(get_legend_for_div(url))

{'(a)': '\xa0\xa0\xa0Securities are valued by using policies described in Note 2 to the financial statements in the\nmost recent Annual Report dated Dec. 31, 2010.\n', '(b)': '\xa0\xa0\xa0Non-income producing.\n', '(c)': '\xa0\xa0\xa0Foreign security values are stated in U.S. dollars. At March\xa031, 2011, the value of foreign\nsecurities, excluding short-term securities, represented 5.35% of net assets.\n', '(d)': '\xa0\xa0Represents fractional shares.\n', '(e)': '\xa0\xa0At March\xa031, 2011, security was partially or fully on loan.\n', '(f)': '\xa0\xa0Affiliated Money Market Fund \x97 The Fund may invest its daily cash balance in Columbia\nShort-Term Cash Fund, a money market fund established for the exclusive use of funds and other\ninstitutional clients of Columbia Management. The rate shown is the seven-day current annualized\nyield at March\xa031, 2011.\n', '(g)': '\xa0\xa0The table below represents securities received as collateral for repurchase agreements. This\ncollateral, w

In [754]:
def parseAll(table, output):
    colDic = {}
    for i in range(len(table)):
        df = table[i].dropna(axis = 1, how = 'all')
        df = df.dropna(axis = 0, how = 'all')
        df = df.dropna(thresh = 3)
        output = output.append(df)
    for col in output.columns:
        colDic[col] = "Col" + str(col)
#         output = output[output[col].notna()]
    output = output.rename(columns=colDic)
    
    return output

In [704]:
def getHolding(output):
    colDic = {}
    for i in output.columns:
        if i == 0:
            colDic[i] = "Holding"
        elif i == 3:
            colDic[i] = "Share"
        elif i == 4:
            colDic[i] = "legend"
        elif i == 7:
            colDic[i] = "Value"
        else:
            colDic[i] = "Col" + str(i)
    output = output.rename(columns=colDic)
#     output = output.dropna(subset=["Holding", "Share", "Value"])
#     output = output.dropna(axis = 0, how = 'all') # drop row if all row is NAN
#     output = output.dropna(thresh = 3)
#     output = output.dropna(axis = 1, how = 'all') # drop col if all col is NAN
#     for i in output.columns:
#         if i == 'Holding':
#             continue
#         elif i == 'Share':
#             continue
#         elif i == 'legend':
#             continue
#         elif i == 'Col6':
#             continue
#         elif i == 'Value':
#             continue
#         else:
#             bool_series = pd.isnull(output[i])
#             output = output[bool_series]
    return output

In [1163]:
def rename_clean_df(df, share_col, value_col, legend_col):
    num = 0
    colDic = {}
    has_legend = False
    if legend_col is not None:
        has_legend = True
    for i in df.columns:
        if num == 0:
            colDic[i] = "Holding"
            df = df[df[i].notnull()]
            num+=1
        elif i == share_col:
            colDic[i] = "Share"
            df = df[df[i].notnull()]
        elif i == value_col:
            colDic[i] = "Value"
            df = df[df[i].notnull()]
        elif has_legend and i == legend_col:
            colDic[i] = "Legend"
        else:
            df = df[~df[i].notnull()]
            df = df.drop(i,axis=1)
        
    df = df.rename(columns=colDic)    
    return df

In [860]:
def clean_data(df):
    # this function will drop column that has more than 80% nan value
    for i in df.columns:
        if sum(df[i].isnull())/len(df) >= 0.8:
            del df[i]
    # drop any row contains any nan
    df = df.dropna(how='any')
    # drop any row contains duplicate values
    df = df[~df.nunique(axis = 1).eq(1)]
    return df

In [940]:
#designed for table has legend data info:
# will keep 4 rows of data
def clean_data_2(df):
    # this function will drop column that has more than 80% nan value
    for i in df.columns:
        if sum(df[i].isnull())/len(df) >= 0.8:
            del df[i]
    # drop any row contains any nan
#     df = df.dropna(how='any')
    # drop any row contains duplicate values
    df = df[~df.nunique(axis = 1).eq(1)]
    return df

In [1074]:
def find_legend_col(df):
    count = 0
    for i in df.columns:
        count = 0
        for items in df[i]:
            if count == 10:
                break
            z = re.match(r'\(([^()]*)\)',str(items))
            count+=1
            if z:
                return i;
    return None

In [1093]:
def find_value_shares_col(df):
    count = 0
    col1 = -1
    col2 = -1
    for i in df.columns:
        count = 0
        for items in df[i]:
            if count == 10:
                break
            z = any(i.isdigit() for i in str(items))
#             print(any(i.isdigit() for i in str(items)))
            count+=1
            if z:
#                 print()
                if col1 == -1: 
                    col1 = i
                else:
                    if col1 == i:
                        continue
                    if col2 == -1:
                        col2 = i
    return col1,col2

In [1171]:
def construct_table_on_one_file(CIK, text, url):
    fundname, time = get_fundname_time(url)
    table = pd.read_html(url)
    df = pd.DataFrame()
    df = parseAll(table,df)
    df = clean_data_2(df) # or use clean_data_1
    share_col, value_col = find_value_shares_col(df)
    legend_col = find_legend_col(df)
    legend_dic = {}
    if legend_col is not None:
        legend_dic = get_legend_for_div(url)
    df = rename_clean_df(df, share_col,value_col, legend_col)
    return df
        
    

In [1175]:
url = 'https://www.sec.gov/Archives/edgar/data/316411/000095012309065935/0000950123-09-065935.txt'

In [1176]:
construct_table_on_one_file(CIK, text, url)

Unnamed: 0,Holding,Share,Legend,Value
4,Honeywell Intl,311757,,11581773
5,ITT,249000,(d),12985350
6,Lockheed Martin,105288,,8220887
7,Precision Castparts,110700,,11277009
13,United Parcel Service Cl B,167800,,9475666
16,Amgen,551956,(b),33244310
17,Biogen Idec,225256,(b),11379933
18,Gilead Sciences,486446,"(b,d)",22658655
19,Vertex Pharmaceuticals,228400,"(b,d)",8656360
25,Invesco,764790,,17406620


In [1177]:
url = 'https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt'
construct_table_on_one_file(CIK, text, url)

Unnamed: 0,Holding,Share,Legend,Value
3,Precision Castparts Corp.,127100,(e),18706578
4,United Technologies Corp.,319100,,27011815
10,"United Parcel Service, Inc., Class B",226100,,16803752
13,"Autoliv, Inc.",260800,(c),19359184
16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
17,Celgene Corp.,151700,(b),8727301
18,Dendreon Corp.,329900,(b),12348157
24,"BlackRock, Inc.",90400,,18171304
25,"Franklin Resources, Inc.",106200,,13283496
26,Morgan Stanley,445100,,12160132


In [None]:
https://www.sec.gov/Archives/edgar/data/881466/000116923207003736/0001169232-07-003736.txt

In [972]:
url_cik = 'https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt'
tb1 = pd.read_html(url_cik)
len(tb1)

46

In [1164]:
df_cik = pd.DataFrame()
df_cik = parseAll(tb1,df_cik)
df_cik

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,Col13,Col14,Col15,Col16
1,Issuer,,Shares,Shares,,,Value(a),Value(a),,,,,,,,,
3,Precision Castparts Corp.,,,127100,(e),,$,18706578,,,,,,,,,
4,United Technologies Corp.,,,319100,,,,27011815,,,,,,,,,
10,"United Parcel Service, Inc., Class B",,,226100,,,,16803752,,,,,,,,,
13,"Autoliv, Inc.",,,260800,(c),,,19359184,,,,,,,,,
16,"Alexion Pharmaceuticals, Inc.",,,155200,"(b,e)",,,15315136,,,,,,,,,
17,Celgene Corp.,,,151700,(b),,,8727301,,,,,,,,,
18,Dendreon Corp.,,,329900,(b),,,12348157,,,,,,,,,
24,"BlackRock, Inc.",,,90400,,,,18171304,,,,,,,,,
25,"Franklin Resources, Inc.",,,106200,,,,13283496,,,,,,,,,


In [1165]:
df_cik = clean_data_2(df_cik)
df_cik

Unnamed: 0,Col0,Col2,Col3,Col4,Col7,Col11
1,Issuer,Shares,Shares,,Value(a),
3,Precision Castparts Corp.,,127100,(e),18706578,
4,United Technologies Corp.,,319100,,27011815,
10,"United Parcel Service, Inc., Class B",,226100,,16803752,
13,"Autoliv, Inc.",,260800,(c),19359184,
16,"Alexion Pharmaceuticals, Inc.",,155200,"(b,e)",15315136,
17,Celgene Corp.,,151700,(b),8727301,
18,Dendreon Corp.,,329900,(b),12348157,
24,"BlackRock, Inc.",,90400,,18171304,
25,"Franklin Resources, Inc.",,106200,,13283496,


In [1166]:
share_col, value_col = find_value_shares_col(df_cik)

In [1167]:
legend_col = find_legend_col(df_cik)

In [1168]:
print(legend_col)

Col4


In [1169]:
# col2 = df_cik['Col2'].notnull

In [1170]:
df_cik = rename_clean_df(df_cik, share_col,value_col, legend_col)
df_cik

Unnamed: 0,Holding,Share,Legend,Value
3,Precision Castparts Corp.,127100,(e),18706578
4,United Technologies Corp.,319100,,27011815
10,"United Parcel Service, Inc., Class B",226100,,16803752
13,"Autoliv, Inc.",260800,(c),19359184
16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
17,Celgene Corp.,151700,(b),8727301
18,Dendreon Corp.,329900,(b),12348157
24,"BlackRock, Inc.",90400,,18171304
25,"Franklin Resources, Inc.",106200,,13283496
26,Morgan Stanley,445100,,12160132


In [1154]:
# df_cik = df_cik.drop('Col2',axis=1)
# df_cik

Unnamed: 0,Holding,Share,Legend,Value,Col11
3,Precision Castparts Corp.,127100,(e),18706578,
4,United Technologies Corp.,319100,,27011815,
10,"United Parcel Service, Inc., Class B",226100,,16803752,
13,"Autoliv, Inc.",260800,(c),19359184,
16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136,
17,Celgene Corp.,151700,(b),8727301,
18,Dendreon Corp.,329900,(b),12348157,
24,"BlackRock, Inc.",90400,,18171304,
25,"Franklin Resources, Inc.",106200,,13283496,
26,Morgan Stanley,445100,,12160132,


In [1151]:
# df_cik = df_cik[~df_cik.Col2.notnull()]
# df_cik

In [983]:
tb1[7]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,,,,,,,,,,,,
1,,,Effective,Effective,Effective,,Amount payable,Amount payable,Amount payable,,,,
2,Issuer,,yield,yield,yield,,at maturity,at maturity,at maturity,,Value(a),Value(a),Value(a)
3,Sumitomo Mitsui Banking Corp.,,,,,,,,,,,,
4,04-14-11,,,0.330,,,,10000000,,,,10000000,
5,Union Bank of Switzerland,,,,,,,,,,,,
6,04-18-11,,,0.334,,,,7000000,,,,7000000,
7,United Overseas Bank Ltd.,,,,,,,,,,,,
8,04-18-11,,,0.390,,,,6000000,,,,6000000,
9,Westpac Banking Corp.,,,,,,,,,,,,


In [946]:
url_c = 'https://www.sec.gov/Archives/edgar/data/881466/000116923207003736/0001169232-07-003736.txt'
tb2 = pd.read_html(url_c)
len(tb2)

22

In [956]:
df_cik1 = pd.DataFrame()
df_cik1 = parseAll(tb2,df_cik1)
df_cik1

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col8,Col9
1,,,Shares,Shares,,Value,Value,,
5,Banco Macro (ADR)* (Commercial Banks),,,28200,,$,799470,,
8,Companhia Vale do Rio Doce “CVRD” (ADR) (Meta...,,,80858,,,3962851,,
9,Cyrela Brazil Realty (Household Durables),,,91200,,,1067785,,
10,Localiza Rent A CAR (Road and Rail),,,49800,,,511671,,
11,Lupatech (Machinery),,,48400,,,1151341,,
12,OdontoPrev* (Insurance),,,28100,,,805713,,
13,"Petroleo Brasileiro “Petrobras” (ADR) (Oil, G...",,,60200,,,3906980,,
14,Redecard* (IT Services),,,76400,,,1310317,,
15,Submarino* (Internet and Catalog Retail),,,41500,,,1740827,,


In [957]:
df_cik1 = clean_data_2(df_cik1)
df_cik1

Unnamed: 0,Col0,Col3,Col6
1,,Shares,Value
5,Banco Macro (ADR)* (Commercial Banks),28200,799470
8,Companhia Vale do Rio Doce “CVRD” (ADR) (Meta...,80858,3962851
9,Cyrela Brazil Realty (Household Durables),91200,1067785
10,Localiza Rent A CAR (Road and Rail),49800,511671
11,Lupatech (Machinery),48400,1151341
12,OdontoPrev* (Insurance),28100,805713
13,"Petroleo Brasileiro “Petrobras” (ADR) (Oil, G...",60200,3906980
14,Redecard* (IT Services),76400,1310317
15,Submarino* (Internet and Catalog Retail),41500,1740827


In [886]:
ur = 'https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt'
tb = pd.read_html(ur)
len(tb)

46

In [941]:
df = pd.DataFrame()
df = parseAll(table,df)
df

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,Col13,Col14,Col15,Col16
1,Issuer,,Shares,Shares,,,Value(a),Value(a),,,,,,,,,
3,Precision Castparts Corp.,,,127100,(e),,$,18706578,,,,,,,,,
4,United Technologies Corp.,,,319100,,,,27011815,,,,,,,,,
10,"United Parcel Service, Inc., Class B",,,226100,,,,16803752,,,,,,,,,
13,"Autoliv, Inc.",,,260800,(c),,,19359184,,,,,,,,,
16,"Alexion Pharmaceuticals, Inc.",,,155200,"(b,e)",,,15315136,,,,,,,,,
17,Celgene Corp.,,,151700,(b),,,8727301,,,,,,,,,
18,Dendreon Corp.,,,329900,(b),,,12348157,,,,,,,,,
24,"BlackRock, Inc.",,,90400,,,,18171304,,,,,,,,,
25,"Franklin Resources, Inc.",,,106200,,,,13283496,,,,,,,,,


In [942]:
df = clean_data_2(df)
df

Unnamed: 0,Col0,Col2,Col3,Col4,Col7,Col11
1,Issuer,Shares,Shares,,Value(a),
3,Precision Castparts Corp.,,127100,(e),18706578,
4,United Technologies Corp.,,319100,,27011815,
10,"United Parcel Service, Inc., Class B",,226100,,16803752,
13,"Autoliv, Inc.",,260800,(c),19359184,
16,"Alexion Pharmaceuticals, Inc.",,155200,"(b,e)",15315136,
17,Celgene Corp.,,151700,(b),8727301,
18,Dendreon Corp.,,329900,(b),12348157,
24,"BlackRock, Inc.",,90400,,18171304,
25,"Franklin Resources, Inc.",,106200,,13283496,


In [888]:
for i in df.columns:
        if sum(df[i].isnull())/len(df) >= 0.8:
            del df[i]

In [889]:
df

Unnamed: 0,Col0,Col2,Col3,Col4,Col7,Col11
1,Issuer,Shares,Shares,,Value(a),
3,Precision Castparts Corp.,,127100,(e),18706578,
4,United Technologies Corp.,,319100,,27011815,
10,"United Parcel Service, Inc., Class B",,226100,,16803752,
13,"Autoliv, Inc.",,260800,(c),19359184,
16,"Alexion Pharmaceuticals, Inc.",,155200,"(b,e)",15315136,
17,Celgene Corp.,,151700,(b),8727301,
18,Dendreon Corp.,,329900,(b),12348157,
24,"BlackRock, Inc.",,90400,,18171304,
25,"Franklin Resources, Inc.",,106200,,13283496,


In [892]:
print(sum(df['Col2'].isnull())/len(df))

0.7602040816326531


In [865]:
url2 = 'https://www.sec.gov/Archives/edgar/data/316411/000095012310053029/0000950123-10-053029.txt'
table2 = pd.read_html(url2)
len(table2)

28

In [875]:
df2 = pd.DataFrame()
df2 = parseAll(table2,df2)
df2

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,Col13,Col14,Col15
1,Issuer,,Shares,Shares,,,Value(a),Value(a),,,,,,,,
3,Goodrich Corp.,,,137498,(d),,$,9696359,,,,,,,,
4,"Honeywell International, Inc.",,,311757,,,,14113239,,,,,,,,
5,Precision Castparts Corp.,,,177400,(d),,,22478354,,,,,,,,
11,"United Parcel Service, Inc., Class B",,,167800,,,,10807998,,,,,,,,
14,"PepsiCo, Inc.",,,548000,,,,36255680,,,,,,,,
17,"Amgen, Inc.",,,493556,(b),,,29494907,,,,,,,,
18,Dendreon Corp.,,,326900,"(b,d)",,,11922043,,,,,,,,
19,Genzyme Corp.,,,306900,(b),,,15906627,,,,,,,,
20,"Gilead Sciences, Inc.",,,265446,(b),,,12072484,,,,,,,,


In [876]:
for i in df2.columns:
        if sum(df2[i].isnull())/len(df2) >= 0.8:
            del df2[i]

In [885]:
print(sum(df2['Col11'].isnull())/len(df2))

0.6353591160220995


In [912]:
df2

Unnamed: 0,Col0,Col3,Col4,Col7,Col11
1,Issuer,Shares,,Value(a),
3,Goodrich Corp.,137498,(d),9696359,
4,"Honeywell International, Inc.",311757,,14113239,
5,Precision Castparts Corp.,177400,(d),22478354,
11,"United Parcel Service, Inc., Class B",167800,,10807998,
14,"PepsiCo, Inc.",548000,,36255680,
17,"Amgen, Inc.",493556,(b),29494907,
18,Dendreon Corp.,326900,"(b,d)",11922043,
19,Genzyme Corp.,306900,(b),15906627,
20,"Gilead Sciences, Inc.",265446,(b),12072484,


In [928]:
# for i in df.columns:
#     print(df[i][1][1])
#         for j in rangdf[i]:
#             z = re.match(r'\(([^()]*)\)',str(items))
#             if z:
#                 return i;

In [934]:
print(find_legend_col(df3))

None


In [867]:
df2 = clean_data(df2)
df2

Unnamed: 0,Col0,Col3,Col4,Col7,Col11
6,04-08-10,0.48,%,9995867,9995867


# Test parse on url3 (002769)

In [861]:
url3 = 'https://www.sec.gov/Archives/edgar/data/316411/0001169232-09-002769.txt'
table3 = pd.read_html(url3)
len(table3)

40

In [862]:
df3 = pd.DataFrame()
df3 = parseAll(table3,df3)
df3

Unnamed: 0,Col0,Col1,Col2,Col4,Col5,Col6,Col7,Col8
0,,,Shares,,,Value,Value,
4,Honeywell International,,39400,,,$,1097684,
5,Lockheed Martin,,27900,,,,1925937,
6,Precision Castparts,,20500,,,,1227950,
9,United Parcel Service (Class B),,25800,,,,1269876,
12,Delta Air Lines*,,164900,,,,928387,
15,PepsiCo,,90800,,,,4674384,
18,Amgen*,,20300,,,,1005256,
19,Biogen Idec*,,25200,,,,1320984,
20,Celgene*,,31800,,,,1411920,


In [863]:
df3 = clean_data(df3)
df3

Unnamed: 0,Col0,Col2,Col7
4,Honeywell International,39400,1097684
5,Lockheed Martin,27900,1925937
6,Precision Castparts,20500,1227950
9,United Parcel Service (Class B),25800,1269876
12,Delta Air Lines*,164900,928387
15,PepsiCo,90800,4674384
18,Amgen*,20300,1005256
19,Biogen Idec*,25200,1320984
20,Celgene*,31800,1411920
21,Gilead Sciences*,36400,1686048


In [864]:
df3 = rename_df(df3)
df3

Unnamed: 0,Holding,Share,Value
4,Honeywell International,39400,1097684
5,Lockheed Martin,27900,1925937
6,Precision Castparts,20500,1227950
9,United Parcel Service (Class B),25800,1269876
12,Delta Air Lines*,164900,928387
15,PepsiCo,90800,4674384
18,Amgen*,20300,1005256
19,Biogen Idec*,25200,1320984
20,Celgene*,31800,1411920
21,Gilead Sciences*,36400,1686048


# Test parse on url4 (004392)

In [595]:
url4 = 'https://www.sec.gov/Archives/edgar/data/316411/000116923208004392/0001169232-08-004392.txt'
table4 = pd.read_html(url4)
len(table4)
# table4

45

In [729]:
table4[0]

Unnamed: 0,0,1,2,3,4,5,6
0,,Shares or Principal Amount,Shares or Principal Amount,,Value,Value,
1,Common Stocks 98.1%,,,,,,
2,,,,,,,
3,Aerospace and Defense 1.0%,,,,,,
4,Raytheon,,59800,shs.,$,3199898,
5,,,,,,,
6,,,,,,,
7,Airlines 1.3%,,,,,,
8,Delta Air Lines*,,547900,,,4081855,
9,,,,,,,


In [818]:
output_df4 = pd.DataFrame()

In [819]:
output_df4 = parseAll(table4,output_df4)

In [820]:
output_df4

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6
0,,Shares or Principal Amount,Shares or Principal Amount,,Value,Value,
4,Raytheon,,59800,shs.,$,3199898,
8,Delta Air Lines*,,547900,,,4081855,
12,PepsiCo,,55200,,,3934104,
15,Amgen*,,104500,,,6193715,
16,Celgene*,,49800,,,3151344,
17,Cephalon*,,50400,,,3905496,
18,Gilead Sciences*,,71700,,,3268086,
23,Lazard (Class A),,118400,,,5062784,
26,Monsanto,,34100,,,3375218,


In [811]:
for i in output_df4.columns:
    if sum(output_df4[i].isnull())/len(output_df4) >= 0.8:
        del output_df4[i]

In [826]:
output_df4 = clean_data(output_df4)

In [827]:
# output_df4 = output_df4.dropna(how='any')

In [828]:
output_df4

Unnamed: 0,Col0,Col2,Col5
4,Raytheon,59800,3199898
8,Delta Air Lines*,547900,4081855
12,PepsiCo,55200,3934104
15,Amgen*,104500,6193715
16,Celgene*,49800,3151344
17,Cephalon*,50400,3905496
18,Gilead Sciences*,71700,3268086
23,Lazard (Class A),118400,5062784
26,Monsanto,34100,3375218
27,Potash Corp. of Saskatchewan,25700,3392657


In [829]:
output_df4 = rename_df(output_df4)

In [830]:
output_df4

Unnamed: 0,Holding,Share,Value
4,Raytheon,59800,3199898
8,Delta Air Lines*,547900,4081855
12,PepsiCo,55200,3934104
15,Amgen*,104500,6193715
16,Celgene*,49800,3151344
17,Cephalon*,50400,3905496
18,Gilead Sciences*,71700,3268086
23,Lazard (Class A),118400,5062784
26,Monsanto,34100,3375218
27,Potash Corp. of Saskatchewan,25700,3392657


In [857]:
# output_df4 = getHolding(output_df4)

In [858]:
# output_df4

# Test parse on url5 (001745)

In [851]:
url5 = 'https://www.sec.gov/Archives/edgar/data/316411/0001145443-06-001745.txt'
table5 = pd.read_html(url5)

In [852]:
df5 = pd.DataFrame()
df5 = parseAll(table5,df5)
df5

Unnamed: 0,Col0,Col1,Col2,Col3,Col4
8,Boeing,"98,100 shs.",,,"$7,644,933"
9,United Technologies,157700,,,9141869
14,United Parcel Service (Class B),171800,,,13637484
17,Harley-Davidson,49400,,,2562872
20,PepsiCo,150100,,,8674279
23,Amgen*,134230,,,9796777
24,Celgene*,108400,,,4783692
25,Gilead Sciences*,82200,,,5123526
30,Goldman Sachs Group,32200,,,5054112
31,Merrill Lynch,116200,,,9151912


In [853]:
df5 = clean_data(df5)

In [854]:
df5

Unnamed: 0,Col0,Col1,Col4
8,Boeing,"98,100 shs.","$7,644,933"
9,United Technologies,157700,9141869
14,United Parcel Service (Class B),171800,13637484
17,Harley-Davidson,49400,2562872
20,PepsiCo,150100,8674279
23,Amgen*,134230,9796777
24,Celgene*,108400,4783692
25,Gilead Sciences*,82200,5123526
30,Goldman Sachs Group,32200,5054112
31,Merrill Lynch,116200,9151912


In [855]:
df5 = rename_df(df5)

In [856]:
df5

Unnamed: 0,Holding,Share,Value
8,Boeing,"98,100 shs.","$7,644,933"
9,United Technologies,157700,9141869
14,United Parcel Service (Class B),171800,13637484
17,Harley-Davidson,49400,2562872
20,PepsiCo,150100,8674279
23,Amgen*,134230,9796777
24,Celgene*,108400,4783692
25,Gilead Sciences*,82200,5123526
30,Goldman Sachs Group,32200,5054112
31,Merrill Lynch,116200,9151912


In [779]:
df = pd.DataFrame()
df = parseAll(table,df)
df

Unnamed: 0,Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,Col8,Col9,Col10,Col11,Col12,Col13,Col14,Col15,Col16
1,Issuer,,Shares,Shares,,,Value(a),Value(a),,,,,,,,,
3,Precision Castparts Corp.,,,127100,(e),,$,18706578,,,,,,,,,
4,United Technologies Corp.,,,319100,,,,27011815,,,,,,,,,
10,"United Parcel Service, Inc., Class B",,,226100,,,,16803752,,,,,,,,,
13,"Autoliv, Inc.",,,260800,(c),,,19359184,,,,,,,,,
16,"Alexion Pharmaceuticals, Inc.",,,155200,"(b,e)",,,15315136,,,,,,,,,
17,Celgene Corp.,,,151700,(b),,,8727301,,,,,,,,,
18,Dendreon Corp.,,,329900,(b),,,12348157,,,,,,,,,
24,"BlackRock, Inc.",,,90400,,,,18171304,,,,,,,,,
25,"Franklin Resources, Inc.",,,106200,,,,13283496,,,,,,,,,


In [780]:
for i in df.columns:
    if sum(df[i].isnull())/len(df) >= 0.8:
        del df[i]

In [781]:
df

Unnamed: 0,Col0,Col2,Col3,Col4,Col7,Col11
1,Issuer,Shares,Shares,,Value(a),
3,Precision Castparts Corp.,,127100,(e),18706578,
4,United Technologies Corp.,,319100,,27011815,
10,"United Parcel Service, Inc., Class B",,226100,,16803752,
13,"Autoliv, Inc.",,260800,(c),19359184,
16,"Alexion Pharmaceuticals, Inc.",,155200,"(b,e)",15315136,
17,Celgene Corp.,,151700,(b),8727301,
18,Dendreon Corp.,,329900,(b),12348157,
24,"BlackRock, Inc.",,90400,,18171304,
25,"Franklin Resources, Inc.",,106200,,13283496,


In [612]:
# table[1].dropna(axis = 0, how = 'all')

In [719]:
for i in range(len(table4)):
    df = table4[i].dropna(axis = 1, how = 'all')
    df = df.dropna(axis = 0, how = 'all')
    df = df.dropna(thresh = 3)
#     print(df)
    output_df4 = output_df4.append(df)
output_df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,,Shares or Principal Amount,Shares or Principal Amount,,Value,Value,,,,,,,,,,,
4,Raytheon,,59800,shs.,$,3199898,,,,,,,,,,,
8,Delta Air Lines*,,547900,,,4081855,,,,,,,,,,,
12,PepsiCo,,55200,,,3934104,,,,,,,,,,,
15,Amgen*,,104500,,,6193715,,,,,,,,,,,
16,Celgene*,,49800,,,3151344,,,,,,,,,,,
17,Cephalon*,,50400,,,3905496,,,,,,,,,,,
18,Gilead Sciences*,,71700,,,3268086,,,,,,,,,,,
23,Lazard (Class A),,118400,,,5062784,,,,,,,,,,,
26,Monsanto,,34100,,,3375218,,,,,,,,,,,


In [687]:
colDic = {}
for i in output_df4.columns:
#     print(i);
    if i == 0:
        colDic[i] = "Holding"
    elif i == 3:
        colDic[i] = "Share"
    elif i == 4:
        colDic[i] = "legend"
    elif i == 7:
        colDic[i] = "Value"
    else:
        colDic[i] = "Col" + str(i)

In [688]:
output_df4 = output_df4.rename(columns=colDic)

In [689]:
# output_df4 = output_df4.rename(columns={0: "Holding", 3:"Share" ,4 : "legend", 7:"Value"})
output_df4 = output_df4.dropna(subset=["Holding", "Share", "Value"])

In [690]:
output_df4 = output_df4.dropna(axis = 0, how = 'all')
output_df4 = output_df4.dropna(thresh = 3)
output_df4

Unnamed: 0,Holding,Col1,Col2,Share,legend,Col5,Col6,Value,Col8,Col9,...,ColCol15,ColCol16,ColCol2,ColCol5,ColCol6,ColCol8,ColHolding,ColShare,ColValue,Collegend
1,Issuer,,Shares,Shares,,,Value(a),Value(a),,,...,,,,,,,,,,
3,Precision Castparts Corp.,,,127100,(e),,$,18706578,,,...,,,,,,,,,,
4,United Technologies Corp.,,,319100,,,,27011815,,,...,,,,,,,,,,
10,"United Parcel Service, Inc., Class B",,,226100,,,,16803752,,,...,,,,,,,,,,
13,"Autoliv, Inc.",,,260800,(c),,,19359184,,,...,,,,,,,,,,
16,"Alexion Pharmaceuticals, Inc.",,,155200,"(b,e)",,,15315136,,,...,,,,,,,,,,
17,Celgene Corp.,,,151700,(b),,,8727301,,,...,,,,,,,,,,
18,Dendreon Corp.,,,329900,(b),,,12348157,,,...,,,,,,,,,,
24,"BlackRock, Inc.",,,90400,,,,18171304,,,...,,,,,,,,,,
25,"Franklin Resources, Inc.",,,106200,,,,13283496,,,...,,,,,,,,,,


In [691]:
output_df4 = output_df4.dropna(axis = 1, how = 'all')
output_df4

Unnamed: 0,Holding,Col1,Col2,Share,legend,Col5,Col6,Value,Col8,Col10,Col11,Col12,Col14,Col15,Col16
1,Issuer,,Shares,Shares,,,Value(a),Value(a),,,,,,,
3,Precision Castparts Corp.,,,127100,(e),,$,18706578,,,,,,,
4,United Technologies Corp.,,,319100,,,,27011815,,,,,,,
10,"United Parcel Service, Inc., Class B",,,226100,,,,16803752,,,,,,,
13,"Autoliv, Inc.",,,260800,(c),,,19359184,,,,,,,
16,"Alexion Pharmaceuticals, Inc.",,,155200,"(b,e)",,,15315136,,,,,,,
17,Celgene Corp.,,,151700,(b),,,8727301,,,,,,,
18,Dendreon Corp.,,,329900,(b),,,12348157,,,,,,,
24,"BlackRock, Inc.",,,90400,,,,18171304,,,,,,,
25,"Franklin Resources, Inc.",,,106200,,,,13283496,,,,,,,


In [692]:
for i in output_df4.columns:
    if i == 'Holding':
        continue
    elif i == 'Share':
        continue
    elif i == 'legend':
        continue
    elif i == 'Col6':
        continue
    elif i == 'Value':
        continue
    else:
        bool_series = pd.isnull(output_df4[i])
        output_df4 = output_df4[bool_series]

In [693]:
# output_df4 = output_df4[bool_series]
output_df4

Unnamed: 0,Holding,Col1,Col2,Share,legend,Col5,Col6,Value,Col8,Col10,Col11,Col12,Col14,Col15,Col16
3,Precision Castparts Corp.,,,127100,(e),,$,18706578,,,,,,,
4,United Technologies Corp.,,,319100,,,,27011815,,,,,,,
10,"United Parcel Service, Inc., Class B",,,226100,,,,16803752,,,,,,,
13,"Autoliv, Inc.",,,260800,(c),,,19359184,,,,,,,
16,"Alexion Pharmaceuticals, Inc.",,,155200,"(b,e)",,,15315136,,,,,,,
17,Celgene Corp.,,,151700,(b),,,8727301,,,,,,,
18,Dendreon Corp.,,,329900,(b),,,12348157,,,,,,,
24,"BlackRock, Inc.",,,90400,,,,18171304,,,,,,,
25,"Franklin Resources, Inc.",,,106200,,,,13283496,,,,,,,
26,Morgan Stanley,,,445100,,,,12160132,,,,,,,


In [694]:
# output_df4 = output_df4[pd.isnull(output_df4["Col2"])]
# output_df4

In [637]:
write_to_csv(output_df4, "output_df4.csv")

In [636]:
# output_df4 = pd.DataFrame()
# output_df4 = parse(table4,output_df4)
# output_df4

In [204]:
# import bs4 as bs
# import urllib.request

# source = urllib.request.urlopen(url).read()
# soup = bs.BeautifulSoup(source,'lxml')
# table = soup.find_all('table')[1] # Grab the first table
# table_rows = table.find_all('tr')
# for tr in table_rows:
#     td = tr.find_all('td')
#     row = [i.text for i in td]
#     print(row)

In [720]:
table = pd.read_html(url)
len(table)

46

In [522]:
table2 = pd.read_html(url2)
len(table2)

23

In [520]:
output_df = pd.DataFrame()
output_df = parse(table,output_df)
output_df

Unnamed: 0,Holding,Share,legend,Value
3,Precision Castparts Corp.,127100,(e),18706578
4,United Technologies Corp.,319100,,27011815
10,"United Parcel Service, Inc., Class B",226100,,16803752
13,"Autoliv, Inc.",260800,(c),19359184
16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
17,Celgene Corp.,151700,(b),8727301
18,Dendreon Corp.,329900,(b),12348157
24,"BlackRock, Inc.",90400,,18171304
25,"Franklin Resources, Inc.",106200,,13283496
26,Morgan Stanley,445100,,12160132


In [521]:
output_df2 = pd.DataFrame()
output_df2 = parse(table2,output_df2)
output_df2

Unnamed: 0,Holding,Share,legend,Value
3,Goodrich,137498,,7471641
4,Honeywell Intl,311757,,11581773
5,ITT,249000,(d),12985350
6,Lockheed Martin,105288,,8220887
7,Precision Castparts,110700,,11277009
13,United Parcel Service Cl B,167800,,9475666
16,Amgen,551956,(b),33244310
17,Biogen Idec,225256,(b),11379933
18,Gilead Sciences,486446,"(b,d)",22658655
19,Vertex Pharmaceuticals,228400,"(b,d)",8656360


In [523]:
url3 = 'https://www.sec.gov/Archives/edgar/data/316411/0000950123-10-109224.txt'

In [524]:
table3 = pd.read_html(url3)
len(table3)

26

In [525]:
output_df3 = pd.DataFrame()
output_df3 = parse(table3,output_df3)
output_df3

Unnamed: 0,Holding,Share,legend,Value
3,Goodrich Corp.,137498,,10137728
4,Precision Castparts Corp.,98400,(d),12531240
5,United Technologies Corp.,318400,,22679632
11,"United Parcel Service, Inc., Class B",167800,,11190582
14,"Autoliv, Inc.",311600,"(c,d)",20356828
17,Ford Motor Co.,1007100,"(b,d)",12326904
20,"Dr Pepper Snapple Group, Inc.",101029,,3588550
21,"PepsiCo, Inc.",243500,,16178140
22,The Coca-Cola Co.,324300,,18978036
28,"Alexion Pharmaceuticals, Inc.",208700,(b),13431932


In [517]:
# print(output_df2['Holding'].str.contains("Total"))
filter = output_df2['Holding'].str.contains("Total")
# print(filter)
output_df2 = output_df2[~filter]
output_df2

Unnamed: 0,Holding,Share,legend,Value
3,Goodrich,137498,,7471641
4,Honeywell Intl,311757,,11581773
5,ITT,249000,(d),12985350
6,Lockheed Martin,105288,,8220887
7,Precision Castparts,110700,,11277009
13,United Parcel Service Cl B,167800,,9475666
16,Amgen,551956,(b),33244310
17,Biogen Idec,225256,(b),11379933
18,Gilead Sciences,486446,"(b,d)",22658655
19,Vertex Pharmaceuticals,228400,"(b,d)",8656360


In [478]:
output_df2 = pd.DataFrame()

In [479]:
output_df2 = parse(table2,output_df2)

In [480]:
output_df2

Unnamed: 0,Holding,Share,legend,Value
3,Goodrich,137498,,7471641
4,Honeywell Intl,311757,,11581773
5,ITT,249000,(d),12985350
6,Lockheed Martin,105288,,8220887
7,Precision Castparts,110700,,11277009
13,United Parcel Service Cl B,167800,,9475666
16,Amgen,551956,(b),33244310
17,Biogen Idec,225256,(b),11379933
18,Gilead Sciences,486446,"(b,d)",22658655
19,Vertex Pharmaceuticals,228400,"(b,d)",8656360


In [148]:
df = table[1].dropna(axis = 0, how = 'all')

In [149]:
df = df.drop(columns=[1,2,5,6,8])

In [153]:
df = df.dropna(thresh = 2)

In [174]:
df = df.rename(columns={0: "Holding", 3:"Share", 4 : "legend" , 7:"Value"})

In [180]:
df = df.drop(1)

In [182]:
df = df[df.Holding != 'Total']

In [214]:
df.reset_index()
df.drop(df.index)

Unnamed: 0,Holding,Share,legend,Value


In [215]:
df

Unnamed: 0,Holding,Share,legend,Value
3,Precision Castparts Corp.,127100,(e),18706578
4,United Technologies Corp.,319100,,27011815
10,"United Parcel Service, Inc., Class B",226100,,16803752
13,"Autoliv, Inc.",260800,(c),19359184
16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
17,Celgene Corp.,151700,(b),8727301
18,Dendreon Corp.,329900,(b),12348157
24,"BlackRock, Inc.",90400,,18171304
25,"Franklin Resources, Inc.",106200,,13283496
26,Morgan Stanley,445100,,12160132


In [185]:
# for index, row in df.iterrows():
#     if(row[0] == 'Total'):
#         print("yes")
#         df.drop(level = index)
#     print(index, row[0], row[3],row[7])

In [167]:
df

Unnamed: 0,0,3,4,7
1,Issuer,Shares,,Value(a)
3,Precision Castparts Corp.,127100,(e),18706578
4,United Technologies Corp.,319100,,27011815
6,Total,,,45718393
10,"United Parcel Service, Inc., Class B",226100,,16803752
13,"Autoliv, Inc.",260800,(c),19359184
16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
17,Celgene Corp.,151700,(b),8727301
18,Dendreon Corp.,329900,(b),12348157
20,Total,,,36390594


In [446]:
output_df = pd.DataFrame()
# output_df = parse(table,output_df)
# output_df = output_df.append(df)
# output_df
# output_df = df

In [447]:
output_df = parse(table,output_df)

In [448]:
output_df = output_df[:-1]

In [449]:
output_df

Unnamed: 0,Holding,Share,legend,Value
3,Precision Castparts Corp.,127100,(e),18706578
4,United Technologies Corp.,319100,,27011815
10,"United Parcel Service, Inc., Class B",226100,,16803752
13,"Autoliv, Inc.",260800,(c),19359184
16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
17,Celgene Corp.,151700,(b),8727301
18,Dendreon Corp.,329900,(b),12348157
24,"BlackRock, Inc.",90400,,18171304
25,"Franklin Resources, Inc.",106200,,13283496
26,Morgan Stanley,445100,,12160132


In [450]:
output_df = output_df.reset_index()

In [451]:
output_df

Unnamed: 0,index,Holding,Share,legend,Value
0,3,Precision Castparts Corp.,127100,(e),18706578
1,4,United Technologies Corp.,319100,,27011815
2,10,"United Parcel Service, Inc., Class B",226100,,16803752
3,13,"Autoliv, Inc.",260800,(c),19359184
4,16,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
5,17,Celgene Corp.,151700,(b),8727301
6,18,Dendreon Corp.,329900,(b),12348157
7,24,"BlackRock, Inc.",90400,,18171304
8,25,"Franklin Resources, Inc.",106200,,13283496
9,26,Morgan Stanley,445100,,12160132


In [452]:
output_df = output_df.drop(columns = ["index"])

In [453]:
output_df

Unnamed: 0,Holding,Share,legend,Value
0,Precision Castparts Corp.,127100,(e),18706578
1,United Technologies Corp.,319100,,27011815
2,"United Parcel Service, Inc., Class B",226100,,16803752
3,"Autoliv, Inc.",260800,(c),19359184
4,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136
5,Celgene Corp.,151700,(b),8727301
6,Dendreon Corp.,329900,(b),12348157
7,"BlackRock, Inc.",90400,,18171304
8,"Franklin Resources, Inc.",106200,,13283496
9,Morgan Stanley,445100,,12160132


In [454]:
output_df["CIK"] = CIK

In [455]:
output_df["weblink"] = url

In [456]:
output_df

Unnamed: 0,Holding,Share,legend,Value,CIK,weblink
0,Precision Castparts Corp.,127100,(e),18706578,316411,https://www.sec.gov/Archives/edgar/data/316411...
1,United Technologies Corp.,319100,,27011815,316411,https://www.sec.gov/Archives/edgar/data/316411...
2,"United Parcel Service, Inc., Class B",226100,,16803752,316411,https://www.sec.gov/Archives/edgar/data/316411...
3,"Autoliv, Inc.",260800,(c),19359184,316411,https://www.sec.gov/Archives/edgar/data/316411...
4,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136,316411,https://www.sec.gov/Archives/edgar/data/316411...
5,Celgene Corp.,151700,(b),8727301,316411,https://www.sec.gov/Archives/edgar/data/316411...
6,Dendreon Corp.,329900,(b),12348157,316411,https://www.sec.gov/Archives/edgar/data/316411...
7,"BlackRock, Inc.",90400,,18171304,316411,https://www.sec.gov/Archives/edgar/data/316411...
8,"Franklin Resources, Inc.",106200,,13283496,316411,https://www.sec.gov/Archives/edgar/data/316411...
9,Morgan Stanley,445100,,12160132,316411,https://www.sec.gov/Archives/edgar/data/316411...


In [457]:
date = soup.find_all('div', {"style" : "font-size: 10pt; margin-top: 18pt"})
pattern = re.compile(r'Date of reporting period')
report_date = "";
for i in date:
    m = re.match(pattern, i.text)
    if m:
        report_date = i.text.split(":")[1].strip()
#         print(i.text.split(":")[1].strip())

In [458]:
report_date

'March\xa031, 2011'

In [459]:
output_df["reporting_date"] = report_date

In [460]:
output_df

Unnamed: 0,Holding,Share,legend,Value,CIK,weblink,reporting_date
0,Precision Castparts Corp.,127100,(e),18706578,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
1,United Technologies Corp.,319100,,27011815,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
2,"United Parcel Service, Inc., Class B",226100,,16803752,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
3,"Autoliv, Inc.",260800,(c),19359184,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
4,"Alexion Pharmaceuticals, Inc.",155200,"(b,e)",15315136,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
5,Celgene Corp.,151700,(b),8727301,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
6,Dendreon Corp.,329900,(b),12348157,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
7,"BlackRock, Inc.",90400,,18171304,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
8,"Franklin Resources, Inc.",106200,,13283496,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"
9,Morgan Stanley,445100,,12160132,316411,https://www.sec.gov/Archives/edgar/data/316411...,"March 31, 2011"


In [None]:
soup = BeautifulSoup(data, "html.parser")

In [262]:
url = 'https://www.sec.gov/Archives/edgar/data/316411/000095012311054303/0000950123-11-054303.txt'

In [313]:
import bs4 as bs
import urllib.request

source = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(source,'lxml')
legend = []
label = soup.find_all('div', {"style" : "margin-left:28px; text-indent:-28px"})
for i in label:
    legend.append(i)
rest = soup.find_all('div', {"style" : "margin-left:24px; text-indent:-24px"})
for j in rest:
    legend.append(j)
# legend = label
# legend = legend.extend(label)
# print(label.extend(rest))
# print(rest)
# legend = legend.extend(rest)
# print(label[0])
print(legend)

[<div style="margin-left:28px; text-indent:-28px"><b>(a) </b>   Securities are valued by using policies described in Note 2 to the financial statements in the
most recent Annual Report dated Dec. 31, 2010.
</div>, <div style="margin-left:28px; text-indent:-28px"><b>(b) </b>   Non-income producing.
</div>, <div style="margin-left:28px; text-indent:-28px"><b>(c) </b>   Foreign security values are stated in U.S. dollars. At March 31, 2011, the value of foreign
securities, excluding short-term securities, represented 5.35% of net assets.
</div>, <div style="margin-left:24px; text-indent:-24px"><b>(d) </b>  Represents fractional shares.
</div>, <div style="margin-left:24px; text-indent:-24px"><b>(e) </b>  At March 31, 2011, security was partially or fully on loan.
</div>, <div style="margin-left:24px; text-indent:-24px"><b>(f) </b>  Affiliated Money Market Fund  The Fund may invest its daily cash balance in Columbia
Short-Term Cash Fund, a money market fund established for the exclusive us

In [320]:
str1 = "".join(str(e) for e in legend)

In [322]:
str1

'<div style="margin-left:28px; text-indent:-28px"><b>(a) </b>\xa0\xa0\xa0Securities are valued by using policies described in Note 2 to the financial statements in the\nmost recent Annual Report dated Dec. 31, 2010.\n</div><div style="margin-left:28px; text-indent:-28px"><b>(b) </b>\xa0\xa0\xa0Non-income producing.\n</div><div style="margin-left:28px; text-indent:-28px"><b>(c) </b>\xa0\xa0\xa0Foreign security values are stated in U.S. dollars. At March\xa031, 2011, the value of foreign\nsecurities, excluding short-term securities, represented 5.35% of net assets.\n</div><div style="margin-left:24px; text-indent:-24px"><b>(d) </b>\xa0\xa0Represents fractional shares.\n</div><div style="margin-left:24px; text-indent:-24px"><b>(e) </b>\xa0\xa0At March\xa031, 2011, security was partially or fully on loan.\n</div><div style="margin-left:24px; text-indent:-24px"><b>(f) </b>\xa0\xa0Affiliated Money Market Fund \x97 The Fund may invest its daily cash balance in Columbia\nShort-Term Cash Fund, 

In [372]:
import unicodedata
for i in legend:
    print(unicodedata.normalize("NFKD", i.text[6:]))

 Securities are valued by using policies described in Note 2 to the financial statements in the
most recent Annual Report dated Dec. 31, 2010.

 Non-income producing.

 Foreign security values are stated in U.S. dollars. At March 31, 2011, the value of foreign
securities, excluding short-term securities, represented 5.35% of net assets.

Represents fractional shares.

At March 31, 2011, security was partially or fully on loan.

Affiliated Money Market Fund  The Fund may invest its daily cash balance in Columbia
Short-Term Cash Fund, a money market fund established for the exclusive use of funds and other
institutional clients of Columbia Management. The rate shown is the seven-day current annualized
yield at March 31, 2011.

The table below represents securities received as collateral for repurchase agreements. This
collateral, which is generally high quality short-term obligations, is deposited with the Funds
custodian and, pursuant to the terms of the repurchase agreement, must hav

In [385]:
legend_dict = {}

In [389]:
for i in legend:
    legend_data = i.text[6:].replace(u'\xa0', u' ')
    legend_data = legend_data.replace(u'\x97', u' ')
    legend_data = legend_data.replace(u'\x92', u' ')
    legend_data = legend_data.replace(u'\n', u' ')
    legend_dict.update([(i.text[1:2],legend_data)])
#     print(i.text[:3])
#     print(i.text[6:])
#     f = pd.read_html(i)

In [390]:
legend_dict

{'a': ' Securities are valued by using policies described in Note 2 to the financial statements in the most recent Annual Report dated Dec. 31, 2010. ',
 'b': ' Non-income producing. ',
 'c': ' Foreign security values are stated in U.S. dollars. At March 31, 2011, the value of foreign securities, excluding short-term securities, represented 5.35% of net assets. ',
 'd': 'Represents fractional shares. ',
 'e': 'At March 31, 2011, security was partially or fully on loan. ',
 'f': 'Affiliated Money Market Fund   The Fund may invest its daily cash balance in Columbia Short-Term Cash Fund, a money market fund established for the exclusive use of funds and other institutional clients of Columbia Management. The rate shown is the seven-day current annualized yield at March 31, 2011. ',
 'g': 'The table below represents securities received as collateral for repurchase agreements. This collateral, which is generally high quality short-term obligations, is deposited with the Fund s custodian and

In [463]:
legend_table = pd.DataFrame.from_dict(legend_dict, orient='index')
legend_table = legend_table.rename(columns={0: "Legend"})
legend_table

Unnamed: 0,Legend
a,Securities are valued by using policies descr...
b,Non-income producing.
c,Foreign security values are stated in U.S. do...
d,Represents fractional shares.
e,"At March 31, 2011, security was partially or f..."
f,Affiliated Money Market Fund The Fund may in...
g,The table below represents securities received...
h,"At March 31, 2011, the cost of securities for ..."


In [474]:
def write_to_csv(df, file_name):
    df.to_csv(file_name, encoding='utf-8')

In [467]:
# legend_name = "seligman_legend"

In [473]:
write_to_csv(legend_table, "seligman_legend.csv")

In [475]:
write_to_csv(output_df, "seligman_holdings.csv")

In [462]:
# for i in legend:
#     print(i.text)
# legend_table.columns = [‘key’, ‘legend’]

In [332]:
# url = 'https://www.sec.gov/Archives/edgar/data/1097293/000119312519013629/0001193125-19-013629.txt'
# res = requests.get(url)
# res.content
# df = pd.read_html(url,skiprows = [1,-1])
# df
# tables = pd.read_html(url)
# print(len(tables))


In [48]:
"""

from bs4 import BeautifulSoup
import requests

url=r'https://www.sec.gov/Archives/edgar/data/' + CIK +  r'/' + text

# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html

table_tags = soup.find_all("table")
table = []

for i in table_tags:
    print(table_tags)

"""


'\n\nfrom bs4 import BeautifulSoup\nimport requests\n\nurl=r\'https://www.sec.gov/Archives/edgar/data/\' + CIK +  r\'/\' + text\n\n# Make a GET request to fetch the raw HTML content\nhtml_content = requests.get(url).text\n\n# Parse the html content\nsoup = BeautifulSoup(html_content, "lxml")\n#print(soup.prettify()) # print the parsed data of html\n\ntable_tags = soup.find_all("table")\ntable = []\n\nfor i in table_tags:\n    print(table_tags)\n\n'

In [106]:
#read a table

In [107]:
df = tables[0].dropna(axis = 0, how = 'all')
df = df.dropna(axis = 0, subset = [df.columns[0]])
cleaned_df = pd.DataFrame()
df = df.dropna(thresh = 3, axis = 0)
headername = ''
for count, row in df.iterrows():
    name = row[0]
    if ':' in name:
        headername = name
    if name.split(' ')[0] == 'Series' or name[0].isdigit():
        name = headername + ' ' + name
        
    if len(row.dropna().values) < 3:
        print(row)
        
    add = pd.DataFrame(row[1:].dropna())
    add = add[~add[add.columns[0]].str.contains('\D+')]
    if len(add) == 2:
        add = add.transpose().dropna()
        add.columns = ['holdings shares', 'holdings value']
        add['holdings name'] = name
        
        cleaned_df = pd.concat([cleaned_df, add], axis = 0, sort = False)
        
cleaned_df.index = pd.RangeIndex(len(cleaned_df.index))
print(cleaned_df)

Empty DataFrame
Columns: []
Index: []


In [132]:
count = 0
for df in tables:
    df = df.dropna(axis = 0, how = 'all')
    df = df.dropna(axis = 1, how = 'all')
    
    if not df.empty:
        print('count')
        print(count)
        print(' ')
        print(df)
    count = count + 1

count
0
 
                                                   0  \
1  Schedule of Investments (unaudited) December 3...   

                                                   2  
1  BlackRock Advantage U.S. Total Market Fund, In...  
count
1
 
                                                 0                     1  \
1                                         Security                   NaN   
2                             Mutual Fund  100.1%  Mutual Fund  100.1%   
3           Master Advantage U.S. Total Market LLC                   NaN   
6   Total Investments  100.1%(Cost: $422,075,661)                   NaN   
8   Liabilities in Excess of Other Assets  (0.1)%                   NaN   
11                             Net Assets  100.0%                   NaN   

                       2                     3    4  
1                  Value                 Value  NaN  
2   Mutual Fund  100.1%  Mutual Fund  100.1%  NaN  
3                      $             392978243  NaN  
6       

1  4.0
count
30
 
                                                   0  \
1  Schedule of Investments (unaudited) (continued...   

                                        2  
1  Master Advantage U.S. Total Market LLC  
count
31
 
                     0                   2                   3   \
2           Description  Number ofContracts  Number ofContracts   
4             Contracts                 NaN                 NaN   
5  S&P 500 E-Mini Index                 NaN                  43   

               6               7                    10  \
2  ExpirationDate  ExpirationDate  NotionalAmount(000)   
4             NaN             NaN                  NaN   
5             NaN        03/15/19               $5,386   

                                           12  \
2  Value/UnrealizedAppreciation(Depreciation)   
4                                         NaN   
5                                           $   

                                           13  
2  Value/UnrealizedAppr

In [109]:
def get_report_fund(df):
    reportdate = ''
    fundname = ''
    
    col_count = 0
    for col in df.columns:
        check_name = df[col][df[col].apply(lambda x: re.sub(r'\s', '', str(x)).lower()).str.contains('scheduleofinvestments')]
        if len(check_name) >= 1:
            if 'continued' not in str(df.loc[df.index == check_name.index, col].values[0]) and 'audited' in str(df.loc[df.index == check_name.index, col].values[0]):
                print()
                reportdate = str(df.loc[df.index == check_name.index, col].values[0])
                reportdate = re.sub(r'\s', '',reportdate.encode('ascii','ignore').decode().lower()).replace(r'(unaudited)',"").replace(r'schedule',"").replace(r'of',"").replace(r'investments',"")

                fundname = str(df.loc[df.index == check_name.index, df.columns[col_count + 1]].values[0])
                fundname = re.sub(r'\s', '',fundname.encode('ascii','ignore').decode().lower()).replace('(percentagesshownarebasedonnetassets)', '')

        col_count = col_count + 1
    return reportdate, fundname

In [110]:
def check_level_columns(df):
    check_found = False
    for col in df.columns:
        check_name = df[col][df[col].apply(lambda x: re.sub(r'\s', '', str(x)).lower()).str.contains('level1')]
        if len(check_name) > 0:
            check_found = True
    return check_found

In [111]:
def get_holdings(df, reportdate, fundname, headername):
    
    cleaned_df = pd.DataFrame()

    check_level = check_level_columns(df)

    if len(df) > 2 and len(df.columns) >= 3 and not check_level:

        for count, row in df.iterrows():
            name = str(row[0])
            if ':' in name:
                headername = name
            if name.split(' ')[0] == 'Series' or name[0].isdigit():
                name = headername + ' ' + name

            add = pd.DataFrame(row[1:].dropna())
            add = add[~add[add.columns[0]].str.contains('\D+')]

            if name[-1] != r'%' and name != 'adr' and name != 'class' and name != 'securities' \
            and name != 'total' and name != 'affiliate':

                if len(add) == 2:
                    add = add.transpose().dropna()
                    add.columns = ['holdings shares', 'holdings value']
                    add['holdings name'] = name
                    add.loc[:,'reporting_date'] = reportdate
                    add.loc[:,'fund name'] = fundname
                    add.loc[:, 'identifier'] = ''

                    cleaned_df = pd.concat([cleaned_df, add], axis = 0, sort = False)
                    cleaned_df.index = pd.RangeIndex(len(cleaned_df.index))
                    
        
    return cleaned_df, headername
    

In [112]:
def get_keys(holdings):
    for count, row in holdings.iterrows():
        keys = re.findall(r'\([^()]*\)', row['holdings name'])
        keylist = ''
        keycheck = []
        for check in keys:
            if len(check)<= 3:
                keycheck.append(check)
        if len(keys) > 0:
            for k in keys:
                if 'Acquired' not in k:
                    if len(k) <= 3:
                        keylist = keylist + k
                        holdings.loc[count, 'holdings name'] = holdings.loc[count, 'holdings name'].replace(k,'')

                else:
                    holdings.loc[count, 'holdings name'] = holdings.loc[count, 'holdings name'].replace(k,'')
                    holdings.loc[count, 'acq name'] = row['holdings name'].split(',')[0]
                    acq_data = k.split(' ')
                    holdings.loc[count, 'acq date'] = acq_data[1].replace(',','')
                    holdings.loc[count, 'acq cost'] = ''.join(re.findall("\d+", acq_data[-1]))

        holdings.loc[count, 'key'] = ",".join(keycheck)
    return holdings
    

In [113]:
for df in tables:
    #remove all rows and columns of nas
    df = df.dropna(axis = 0, how = 'all')
    df = df.dropna(axis = 1, how = 'all')
    
    if not df.empty:
        
        reportdate, fundname = get_report_fund(df)
        if reportdate != '':
            print(reportdate)
            print(fundname)
            found = True


december31,2018
blackrockadvantageu.s.totalmarketfund,inc.

december31,2018
masteradvantageu.s.totalmarketllc


In [114]:


holdings = pd.DataFrame()
reportdate = ''
fundname = ''

legend = pd.DataFrame()

identifiercount = 0

headername = ''


for df in tables:
    #remove all rows and columns of nas
    df = df.dropna(axis = 0, how = 'all')
    df = df.dropna(axis = 1, how = 'all')
    
    if not df.empty:
        
        #get the report dates and fund names of this dataframe
        reportdate_test, fundname_test = get_report_fund(df)
        if reportdate_test != '':
            found = True
            reportdate = reportdate_test
            fundname = fundname_test
        
        
        
        for count, row in df.iterrows():
            firstrow = re.sub(r'\s', '', str(row[df.columns[0]])).encode('ascii','ignore').decode().lower()
            if '(' in firstrow and ')' in firstrow and len(firstrow) == 3:
                
                if len(holdings.loc[holdings['identifier'] == '']) > 0:
                    identifiercount = identifiercount + 1
                    found = False
                    holdings.loc[holdings['identifier'] == '', 'identifier'] = text + str(identifiercount)
                    
                add_legend = pd.DataFrame(row).transpose()
                add_legend.columns = ['symbol', 'code']
                add_legend.loc[:, 'identifier'] = text + str(identifiercount)
                
                if len(legend) == 0:
                    legend = add_legend.copy()

                else:
                    if add_legend['symbol'].values[0] not in legend.loc[legend['identifier'] ==\
                                                                    text + str(identifiercount), 'symbol'].tolist():
                            legend = pd.concat([legend, add_legend], axis = 0)
                    
                    

        cleaned_df, headername = get_holdings(df, reportdate, fundname, headername)



        if len(cleaned_df) > 0 and found:
            security = cleaned_df.copy()
            security['holdings name'] = security['holdings name'].apply(lambda x: x.encode('ascii','ignore').decode())
            if len(security) > 0:
                if holdings.empty:
                    holdings = security.copy()
                else:
                    holdings = pd.concat([holdings, security], axis = 0)



legend.index = pd.RangeIndex(len(legend.index))
holdings.index = pd.RangeIndex(len(holdings.index))


            
                







In [115]:

holdings

Unnamed: 0,holdings shares,holdings value,holdings name,reporting_date,fund name,identifier
0,8887,2866057,Boeing Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
1,2851,291144,Curtiss-Wright Corp.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
2,7403,1163826,General Dynamics Corp.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
3,1330,83790,"HEICO Corp., ClassA","december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
4,1973,516610,Lockheed Martin Corp.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
5,26665,4089078,Raytheon Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
6,2965,95206,"American Airlines Group, Inc.","december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
7,32646,1517386,Southwest Airlines Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
8,25315,345043,"Dana, Inc.","december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1
9,57795,442132,Ford Motor Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1


In [116]:
holdings['value multiplier'] = ''
holdings['key'] = ''
holdings['textfile'] = text
holdings['CIK'] = CIK
holdings['acq name'] = ''
holdings['acq date'] = ''
holdings['acq cost'] = ''
holdings['date_filed'] = '' 
holdings['company conformed name'] = ''

date_filed = panel2.loc[panel2['file_read'] == text, 'date_filed'].unique()[0]
conformed_name = panel2.loc[panel2['file_read'] == text, 'company conformed name'].unique()[0]
holdings.loc[:, 'date_filed'] = date_filed
holdings.loc[:, 'company conformed name'] = conformed_name

holdings.index = pd.RangeIndex(len(holdings.index))
print(holdings)

for z in weblink[CIK]:
    if text in z:
        matching_link = z

holdings['weblink'] = matching_link
legend['weblink'] = matching_link
legend['textfile'] = text

    holdings shares holdings value  \
0              8887        2866057   
1              2851         291144   
2              7403        1163826   
3              1330          83790   
4              1973         516610   
5             26665        4089078   
6              2965          95206   
7             32646        1517386   
8             25315         345043   
9             57795         442132   
10            18439         616785   
11           327665        8073666   
12            39538        1712786   
13            26999        1405568   
14            51324        1525863   
15             9524         414580   
16            83787        1102637   
17            28372        2465527   
18            44448         934297   
19            36467        3559909   
20            63588         379620   
21             8714         126702   
22            12804         409600   
23             2272          44213   
24            47011        2166267   
25          

get the acquisition data

In [117]:
holdings = get_keys(holdings)

In [118]:
holdings

Unnamed: 0,holdings shares,holdings value,holdings name,reporting_date,fund name,identifier,value multiplier,key,textfile,CIK,acq name,acq date,acq cost,date_filed,company conformed name,weblink
0,8887,2866057,Boeing Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
1,2851,291144,Curtiss-Wright Corp.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
2,7403,1163826,General Dynamics Corp.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
3,1330,83790,"HEICO Corp., ClassA","december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
4,1973,516610,Lockheed Martin Corp.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
5,26665,4089078,Raytheon Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
6,2965,95206,"American Airlines Group, Inc.","december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
7,32646,1517386,Southwest Airlines Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
8,25315,345043,"Dana, Inc.","december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...
9,57795,442132,Ford Motor Co.,"december31,2018",masteradvantageu.s.totalmarketllc,0001193125-19-049881.txt1,,,0001193125-19-049881.txt,230382,,,,20190225,"BlackRock Advantage U.S. Total Market Fund, Inc.",https://www.sec.gov/Archives/edgar/data/230382...


In [119]:
holdings['identifier']

0      0001193125-19-049881.txt1
1      0001193125-19-049881.txt1
2      0001193125-19-049881.txt1
3      0001193125-19-049881.txt1
4      0001193125-19-049881.txt1
5      0001193125-19-049881.txt1
6      0001193125-19-049881.txt1
7      0001193125-19-049881.txt1
8      0001193125-19-049881.txt1
9      0001193125-19-049881.txt1
10     0001193125-19-049881.txt1
11     0001193125-19-049881.txt1
12     0001193125-19-049881.txt1
13     0001193125-19-049881.txt1
14     0001193125-19-049881.txt1
15     0001193125-19-049881.txt1
16     0001193125-19-049881.txt1
17     0001193125-19-049881.txt1
18     0001193125-19-049881.txt1
19     0001193125-19-049881.txt1
20     0001193125-19-049881.txt1
21     0001193125-19-049881.txt1
22     0001193125-19-049881.txt1
23     0001193125-19-049881.txt1
24     0001193125-19-049881.txt1
25     0001193125-19-049881.txt1
26     0001193125-19-049881.txt1
27     0001193125-19-049881.txt1
28     0001193125-19-049881.txt1
29     0001193125-19-049881.txt1
          

In [120]:
legend.loc[:,'restricted'] = 'no'

legend.loc[legend['code'].str.contains("restrict") | legend['code'].str.contains("level 3 security") \
              | legend['code'].str.contains("exempt from registration"), 'restricted'] = 'yes'

restricted = legend.loc[legend['restricted'].str.contains("yes")]
print(restricted)

holdings.loc[:,'restricted'] = 'no'


Empty DataFrame
Columns: [symbol, code, identifier, weblink, textfile, restricted]
Index: []


In [121]:
#get the restricted legend and restricted securities

In [122]:
legend.loc[:,'restricted'] = 'no'

legend.loc[legend['code'].str.contains("restrict") | legend['code'].str.contains("level 3 security") \
              | legend['code'].str.contains("exempt from registration"), 'restricted'] = 'yes'

restricted = legend.loc[legend['restricted'].str.contains("yes")]
print(restricted)

holdings.loc[:,'restricted'] = 'no'

for index, row in restricted.iterrows():
    holdings.loc[(holdings['identifier'] == row['identifier']) & (holdings['textfile'] == row.textfile) \
                   & (holdings['key'].str.contains(row['symbol'])), 'restricted'] = 'yes'
    
restricted_holdings = holdings.loc[(holdings['restricted'] == 'yes')]

Empty DataFrame
Columns: [symbol, code, identifier, weblink, textfile, restricted]
Index: []


In [123]:
#get the restricted holdings

In [124]:
print(restricted_holdings)

Empty DataFrame
Columns: [holdings shares, holdings value, holdings name, reporting_date, fund name, identifier, value multiplier, key, textfile, CIK, acq name, acq date, acq cost, date_filed, company conformed name, weblink, restricted]
Index: []


In [125]:
def getpanel1(CIK, text, panel2, weblink):
    
    url = r'https://www.sec.gov/Archives/edgar/data/' + CIK +  r'/' + text
    tables = pd.read_html(url,encoding="utf-8")
    
    holdings = pd.DataFrame()
    reportdate = ''
    fundname = ''

    legend = pd.DataFrame()

    identifiercount = 0

    headername = ''


    for df in tables:
        #remove all rows and columns of nas
        df = df.dropna(axis = 0, how = 'all')
        df = df.dropna(axis = 1, how = 'all')

        if not df.empty:

            #get the report dates and fund names of this dataframe
            reportdate_test, fundname_test = get_report_fund(df)
            if reportdate_test != '':
                found = True
                reportdate = reportdate_test
                fundname = fundname_test



            for count, row in df.iterrows():
                firstrow = re.sub(r'\s', '', str(row[df.columns[0]])).encode('ascii','ignore').decode().lower()
                if '(' in firstrow and ')' in firstrow and len(firstrow) == 3:

                    if len(holdings.loc[holdings['identifier'] == '']) > 0:
                        identifiercount = identifiercount + 1
                        found = False
                        holdings.loc[holdings['identifier'] == '', 'identifier'] = text + str(identifiercount)

                    add_legend = pd.DataFrame(row).transpose()
                    add_legend.columns = ['symbol', 'code']
                    add_legend.loc[:, 'identifier'] = text + str(identifiercount)

                    if len(legend) == 0:
                        legend = add_legend.copy()

                    else:
                        if add_legend['symbol'].values[0] not in legend.loc[legend['identifier'] ==\
                                                                        text + str(identifiercount), 'symbol'].tolist():
                                legend = pd.concat([legend, add_legend], axis = 0)



            cleaned_df, headername = get_holdings(df, reportdate, fundname, headername)



            if len(cleaned_df) > 0 and found:
                security = cleaned_df.copy()
                security['holdings name'] = security['holdings name'].apply(lambda x: x.encode('ascii','ignore').decode())
                if len(security) > 0:
                    if holdings.empty:
                        holdings = security.copy()
                    else:
                        holdings = pd.concat([holdings, security], axis = 0)



    legend.index = pd.RangeIndex(len(legend.index))
    holdings.index = pd.RangeIndex(len(holdings.index))
    
    
    
    holdings['value multiplier'] = ''
    holdings['key'] = ''
    holdings['textfile'] = text
    holdings['CIK'] = CIK
    holdings['acq name'] = ''
    holdings['acq date'] = ''
    holdings['acq cost'] = ''
    holdings['date_filed'] = '' 
    holdings['company conformed name'] = ''

    date_filed = panel2.loc[panel2['file_read'] == text, 'date_filed'].unique()[0]
    conformed_name = panel2.loc[panel2['file_read'] == text, 'company conformed name'].unique()[0]
    holdings.loc[:, 'date_filed'] = date_filed
    holdings.loc[:, 'company conformed name'] = conformed_name

    holdings.index = pd.RangeIndex(len(holdings.index))

    for z in weblink[CIK]:
        if text in z:
            matching_link = z

    holdings['weblink'] = matching_link
    legend['weblink'] = matching_link
    legend['textfile'] = text
    
    
    holdings = get_keys(holdings)
        

        
    return holdings, legend




## Run on All CIKS

In [None]:
panel1 = pd.DataFrame()


panel1 = pd.DataFrame()
panel1_legend = pd.DataFrame()
error_panel1 = []

#CIK_LIST = ['319108','1097293','230382']

CIK_LIST = ['230382','319108']

for CIK in CIK_LIST:
    txt_files = os.listdir(os.path.join(cur_wd, CIK, 'n-q'))
    
    for i in txt_files:
        if '.csv' not in i and i not in error_panel2:
            try:
                holdings, legend = getpanel1(CIK, i, panel2, weblink)

                if len(holdings) == 0:
                    #print(i)
                    pass
                if panel1.empty:
                    panel1 = holdings.copy()
                    panel1_legend = legend.copy()
                else:
                    panel1 = pd.concat([panel1, holdings], axis = 0, sort = True)
                    panel1_legend = pd.concat([panel1_legend , legend], axis = 0, sort = True)
            except:
                error_panel1.append(r'https://www.sec.gov/Archives/edgar/data/' + CIK +  r'/' + i)

In [None]:
print(panel1.dropna(axis =1, how = 'all'))
panel1.to_csv(os.path.join(output_directory, fund_name + '_panel1.csv'), sep = ',')
panel1_legend.to_csv(os.path.join(output_directory, fund_name + '_panel1_legend.csv'), sep = ',')

In [None]:
panel1_legend.loc[:,'restricted'] = 'no'

panel1_legend.loc[panel1_legend['code'].str.contains("restrict") | panel1_legend['code'].str.contains("level 3 security") \
              | panel1_legend['code'].str.contains("exempt from registration"), 'restricted'] = 'yes'

restricted = panel1_legend.loc[panel1_legend['restricted'].str.contains("yes")]

restricted.to_csv(os.path.join(output_directory, fund_name + '_panel1_restricted_legend.csv'), sep = ',')
print(restricted)

panel1.loc[:,'restricted'] = 'no'

for index, row in restricted.iterrows():
    panel1.loc[(panel1['identifier'] == row['identifier']) & (panel1['textfile'] == row.textfile) \
                   & (panel1['key'].str.contains(row['symbol'])), 'restricted'] = 'yes'
    
restricted_holdings = panel1.loc[(panel1['restricted'] == 'yes')]
restricted_holdings.to_csv(os.path.join(output_directory, fund_name + '_panel1_restricted.csv'), sep = ',')