In [64]:
import os
import pandas as pd
from lxml import html
import requests
import re
import sys
from datetime import datetime

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import time

# Input the CIK list for the fund family you are looking at

In [65]:
fund_name = 'fidelity'


CIK_LIST = ['24238','35331','35315', '722574', '225322', '795422', '35315', '803013', '729218', '205323', '1303459', '751199', \
            '81205', '320351', '354046', '1401097', '35341', '754510', '35348', '275309', '819118', '880195', \
            '1364924', '1061130', '744822', '278001', '719451', '225323', '880709', '708191', '917286', '61397']



get the current data working directory

In [66]:
cur_wd = os.getcwd()
if 'trunk' in cur_wd:
    output_directory = os.path.join("\\".join(cur_wd.split('\\')[0:-1]),'output')
else:
    output_directory = os.path.join("\\".join(cur_wd.split('\\')[0:-1]),'output','working')
    
cur_wd = os.path.join("\\".join(cur_wd.split('\\')[0:-1]),'data')
print('data directory')
print(cur_wd)
print('output directory')
print(output_directory)

data directory
D:\wei\wei_mutual_fund_project\data
output directory
D:\wei\wei_mutual_fund_project\output


# Helper Functions for Analysis

In [67]:
def get_cleaned(lines):
    cleaned = []
    for i in lines:
        #gets rid of tags
        cleaned_i = re.sub('<[^>]*>', ' ', i)
        #gets rid of &nbsp; and replaces with space
        cleaned_i = re.sub('&nbsp;', ' ', cleaned_i)
        #re.sub("<.*?>","",st)
        if not re.match(r'^\s*$', cleaned_i):
            cleaned.append(cleaned_i)
    return cleaned

def find(s, ch):
    return [i for i, ltr in enumerate(s) if ltr == ch]


def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))


def remove_junk(addline):
    addline = re.sub('<[^>]*>', ' ', addline)
    addline = re.sub('&nbsp;', ' ', addline)
    addline = re.sub('&#174;', ' ', addline)
    addline = re.sub('&#8480;', ' ', addline)
    addline = re.sub(r'&reg;',' ',addline)
    addline = re.sub(r'\n', ' ', addline)
    addline = re.sub(r'\t', ' ', addline)
    return addline


In [68]:
weblink = {}
for CIK in CIK_LIST:
    weblink[CIK]  = []
    txt_files = os.listdir(os.path.join(cur_wd, CIK, 'n-q'))
    for i in txt_files:
        weblink[CIK].append(r"https://www.sec.gov/Archives/edgar/data/" + str(CIK)+r"/"  + i)

# Get Panel 2 Information- This should be same for every file

get panel 2

In [69]:
panel2 = pd.DataFrame()
error_panel2 = []

    
for CIK in CIK_LIST:
    txt_files = os.listdir(os.path.join(cur_wd, CIK, 'n-q'))
    
    for i in txt_files:
        if '.csv' not in i:
            try:
                series = {}
                series_keys = ['SERIES-ID', 'OWNER-CIK', 'SERIES-NAME']
                for s in series_keys:
                    series[s] = []

                contract = {}
                contract_keys = ['CLASS-CONTRACT-ID','CLASS-CONTRACT-NAME','CLASS-CONTRACT-TICKER-SYMBOL', 'LAST-READ-SERIES']
                for c in contract_keys:
                    contract[c] = []

                uncleaned_file = open(os.path.join(cur_wd, CIK, 'n-q',i), mode='r') 
                for u in uncleaned_file:
                    

                    if 'COMPANY CONFORMED NAME' in u:
                        conformed_name = u.replace('COMPANY CONFORMED NAME:', '').strip()

                    if 'FILED AS OF DATE:' in u:
                        filed_date = str(u).replace('FILED AS OF DATE:','').strip()

                    #series information
                    for s in series_keys:
                        if s in u:
                                series[s].append(str(u).replace('<' + s + '>', "").strip())
                    for c in contract_keys:
                        if c != 'LAST-READ-SERIES':
                            if c in u:
                                contract[c].append(str(u).replace('<' + c + '>', "").strip())
                                if c == 'CLASS-CONTRACT-ID':
                                    contract['LAST-READ-SERIES'].append(series['SERIES-ID'][-1])


                series = pd.DataFrame.from_dict(series, orient = 'index').T
                contract = pd.DataFrame.from_dict(contract, orient = 'index').T


                for s in series_keys:
                    contract.loc[:, s] = ''

                for ix,row in contract.iterrows():
                    for s in series_keys:
                        contract.loc[contract.index == ix, s] = series.loc[series['SERIES-ID'] == row['LAST-READ-SERIES'],s].values[0]
                contract.loc[:,'file_read'] = i
                contract.loc[:,'date_filed'] = filed_date
                contract.loc[:, 'company conformed name'] = conformed_name
                contract = contract.drop(['LAST-READ-SERIES'], axis=1)

                if panel2.empty:
                    panel2 = contract.copy()
                else:
                    panel2 = pd.concat([panel2, contract], axis = 0)
            except ValueError:
                error_panel2.append(i)
                

get files that did not have panel 2 data 

In [70]:
print(error_panel2)

['0000024238-04-000014.txt', '0000024238-05-000005.txt', '0000795422-05-000048.txt']


# Output Panel 2

In [71]:
print(panel2)
panel2.to_csv(os.path.join(output_directory, fund_name + '_panel2.csv'), sep = ',')

   CLASS-CONTRACT-ID                                CLASS-CONTRACT-NAME  \
0         C000016596                                            Class A   
1         C000016597                                            Class B   
2         C000016598                                            Class C   
3         C000016599                                            Class T   
4         C000016600                                Institutional Class   
5         C000016601                                Fidelity Contrafund   
0         C000016596                                            Class A   
1         C000016597                                            Class B   
2         C000016598                                            Class C   
3         C000016599                                            Class T   
4         C000016600                                Institutional Class   
5         C000016601                                Fidelity Contrafund   
0         C000016596     

# Run Cleaning on One File

In [72]:
def get_info_dict(i):
    
    cf = open(os.path.join(cur_wd, CIK, 'n-q',i), mode='r') 
    tables = {}
    r_add = False
    add = False
    split_qtly = {}
    
    hold = False
    legendFound = False
    acqFound = False
    getNexyLine = False
    
    value_mult = ''
    
    reporting_date = 'not found'
    
    for j in cf:
        j = j.lower()
        
        if reporting_date is 'not found':
            
            if 'date of reporting period' in j and r'<div' in j:
                r_addstring = remove_junk(j)
                date = re.search("(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)\s+\d{1,2},\s+\d{4}",r_addstring)
                if date is not None:
                    reporting_date = date.group()
                
            
            if r'<tr>' in j and r'</tr>' not in j:
                r_addstring = ''
                r_add = True
                
            try:
                if r'</tr>' in j and r'<tr>' not in j:
                    r_addstring = " ".join(r_addstring.split())
                    r_add = False
                    if 'date of reporting period' in r_addstring:
                        date = re.search("(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)\s+\d{1,2},\s+\d{4}",r_addstring)
                        if date is not None:
                            reporting_date = date.group()
            except:
                pass
            
            
            if r_add:
                r_addline = remove_junk(j)
                r_addstring = r_addstring + ' ' + r_addline
                
            
        
        
        if r'value (' in j:
            try:
                value_mult = remove_junk(j)
                value_mult = re.findall('\((.*?)\)',value_mult)
                split_qtly[key]['holdings'].append(value_mult[0])
            except IndexError:
                #this is a value we do not want to scrap
                pass
            
        if 'quarterly holdings' in j:
            getNexyLine = True
            
        if getNexyLine and 'quarterly holdings' not in j:
            if remove_junk(j) != '' and ('fidelity' in j or 'fund' in j or 'index' in j):
                fund_quarterly_name = remove_junk(j)
                fund_quarterly_name = " ".join(fund_quarterly_name.split())
                getNexyLine = False
            
        if '-qtly-' in j:
            key = remove_junk(j)
            split_qtly[key] = {}
            split_qtly[key]['holdings'] = []
            split_qtly[key]['legend'] = []
            split_qtly[key]['acq'] = []
            split_qtly[key]['name'] = fund_quarterly_name
            
            hold = True
            legendFound = False
            acqFound = False
            
        
            
        if r'>legend<' in j:
            hold = False
            legendFound = True
            acqFound = False

        if 'acquisition cost' in j:
            hold = False
            legendFound = False
            acqFound = True
        
        if hold or legendFound or acqFound:
            
            if r'<tr>' in j:
                addstring = ''
                add = True

            try:
                if r'</tr>' in j:
                    addstring = " ".join(addstring.split())
                    add = False
                    if addstring != '':
                        if hold:
                            split_qtly[key]['holdings'].append(addstring)
                        if legendFound and addstring not in split_qtly[key]['legend']:
                            split_qtly[key]['legend'].append(addstring)
                        if acqFound and addstring not in split_qtly[key]['acq']:
                            split_qtly[key]['acq'].append(addstring)
                            
                        if 'date of reporting period' in addstring:
                            print(addstring)
            except:
                pass
                        
                        

            if add:
                addline = remove_junk(j)
                addstring = addstring + ' ' + addline
                

            if legendFound and r'<p' in j:
                try:
                    add_leg = remove_junk(j)
                    if add_leg != '' :
                        if add_leg not in split_qtly[key]['legend']:
                            split_qtly[key]['legend'].append(add_leg)
                except UnboundLocalError:
                    pass
                    #the key has not been found yet
                
            if acqFound and r'<td' in j:
                add_acq = remove_junk(j)
                if add_acq != '':
                    if add_acq not in split_qtly[key]['acq']:
                        split_qtly[key]['acq'].append(add_acq)
                        
    return split_qtly, reporting_date
    
    

In [73]:
CIK  ='24238'
i = '0000024238-04-000014.txt'
text = i
fundNames = panel2.loc[panel2['file_read'] == text, 'SERIES-NAME'].unique()

series_names = panel2.loc[panel2['file_read'] == text, 'SERIES-NAME'].unique().tolist()
series_names = list(map(lambda x:x.lower(),series_names))

if '.csv' not in i:
    for z in weblink[CIK]:
        if i in z:
            matching_link = z
    print(matching_link)
    
    split_qtly, reporting_date = get_info_dict(i)
    key = list(split_qtly.keys())[0]
    print('reporting date')
    print(reporting_date)

https://www.sec.gov/Archives/edgar/data/24238/0000024238-04-000014.txt
reporting date
september 30, 2004


In [74]:
series_names = panel2.loc[panel2['file_read'] == text, 'SERIES-NAME'].unique().tolist()
series_names = list(map(lambda x:x.lower(),series_names))
series_names = list(map(lambda x: "".join(x.split()),series_names))

print(series_names)

print(panel2.loc[panel2['file_read'] == text])

[]
Empty DataFrame
Columns: [CLASS-CONTRACT-ID, CLASS-CONTRACT-NAME, CLASS-CONTRACT-TICKER-SYMBOL, SERIES-ID, OWNER-CIK, SERIES-NAME, file_read, date_filed, company conformed name]
Index: []


clean the legend

In [75]:
len(split_qtly[key]['legend'])

leg_dict = {}

for j in split_qtly.keys():
    count = 0
    split_qtly[j]['cleaned legend'] = []
    leg_dict[j]= {}
    
    restricted_securities = []
    for i in split_qtly[j]['legend']:
        i = i.rstrip().lstrip()
        keys = re.search('\(([^()]*)\)',i)
        try: 
            keys = keys.group()
            if r'(' in i and r')' in i and len(keys) == 3:
                split_qtly[j]['cleaned legend'].append(i.rstrip().lstrip()) 
                if i.rstrip().lstrip()[0:3] not in leg_dict[j].keys():
                    print(i)
                    leg_dict[j][i.rstrip().lstrip()[0:3]] = i.rstrip().lstrip()[3:]
        except:
            pass
        count = count + 1
        
df_legend = pd.DataFrame()

for j in leg_dict.keys():
    try:
        add = pd.DataFrame.from_dict(leg_dict[j], orient = 'index')
        add.columns = ['code']
        add.loc[:, 'identifer'] = j
        add.loc[:, 'weblink'] = matching_link
        add.loc[:, 'textfile'] = text

        if df_legend.empty:
            df_legend = add.copy()
        else:
            df_legend = pd.concat([df_legend, add], axis = 0)
    except ValueError:
        #there is no legend
        pass
        
print(df_legend)


(a) non-income producing
(b) affiliated fund that is available only to investment companies and other accounts managed
(c) includes investment made with cash collateral received from securities on loan.
(d) security exempt from registration under rule 144a of the securities act of 1933.  these
(a) non-income producing
(b) affiliated fund that is available only to investment companies and other accounts managed
(c) includes investment made with cash collateral received from securities on loan.
(d) affiliated company
(e) security exempt from registration under rule 144a of the securities act of 1933.  these
(f) restricted securities - investment in securities not registered under the securities act of 1933
                                                  code           identifer  \
(a)                               non-income producing    anif-qtly-1104     
(b)   affiliated fund that is available only to inv...    anif-qtly-1104     
(c)   includes investment made with cash collateral.

clean the acquisition data

In [76]:
len(split_qtly[key]['acq'])
acq_dict = {}

for j in split_qtly.keys():
    split_qtly[j]['cleaned acq'] = []
    for i in split_qtly[j]['acq']:
        if hasNumbers(i) and r'$' in str(i):
            clean_i = str(i).replace(r'$', r' $')
            if clean_i.count(r"$") == 1 and clean_i.count(r"/") >= 2:
                clean_i = clean_i.split(r' $')
                cost = clean_i[1]
                date = re.findall('\d{1,2}/\d{1,2}/\d{2,4}', clean_i[0])
                date = " - ".join(date)
                clean_i[0] = clean_i[0].replace(date, '')
                clean_i[0] = clean_i[0].lstrip()
                clean_i[0] = clean_i[0].rstrip()
                if clean_i[0] != 'equities':
                    split_qtly[j]['cleaned acq'].append([clean_i[0], date, cost])
    acq_dict[j] = split_qtly[j]['cleaned acq']
    

df_acq = pd.DataFrame()
for j in acq_dict.keys():
    if len(acq_dict[j]) > 0:
        add = pd.DataFrame(acq_dict[j])
        add.columns = ['name', 'acq date', 'acq cost']
        add.loc[:, 'identifer'] = j
        add.loc[:, 'weblink'] = matching_link
        add.loc[:, 'textfile'] = text

        if df_acq.empty:
            df_acq = add.copy()
        else:
            df_acq = pd.concat([df_acq, add], axis = 0)
        

print(df_acq)

                    name acq date acq cost          identifer  \
0  jetsgo corp. warrants   3/1/04        0    con-qtly-1104     
1   jetsgo corp. class b   3/1/04    9,334    con-qtly-1104     

                                             weblink                  textfile  
0  https://www.sec.gov/Archives/edgar/data/24238/...  0000024238-04-000014.txt  
1  https://www.sec.gov/Archives/edgar/data/24238/...  0000024238-04-000014.txt  


get the holdings

In [77]:
def get_keys(leg_dict, j, i):
    legend_keys = list(leg_dict[j].keys())
    keys_to_add = ''

    for k in legend_keys:
        if k in i:
            #i = i.replace(k,'')
            if keys_to_add == '':
                keys_to_add = k
            else:
                keys_to_add = keys_to_add + ',' + k

    return keys_to_add, i

In [78]:
    len(split_qtly[key]['holdings'])
    holdings_dict = {}
    for j in split_qtly.keys():
        
        split_qtly[j]['cleaned holdings'] = []
        for i in split_qtly[j]['holdings']:

            if hasNumbers(i) and i[-1] != r'%' and 'total investment' not in i and 'principalamount' not in i:
                keys_found, cleaned_i = get_keys(leg_dict, j, i)
                cleaned_i  = cleaned_i.replace(r'$', ' ')
                cleaned_i  = cleaned_i.replace(r',', '')
                cleaned_i = cleaned_i.lstrip()
                cleaned_i = cleaned_i.rstrip()
                cleaned_i = " ".join(cleaned_i.split())
                isplit = cleaned_i.split(' ')
                if len(isplit) >= 3:
                    if hasNumbers(isplit[-1]) and hasNumbers(isplit[-2]):
                        if 'warrant' in cleaned_i or 'loan' in cleaned_i or 'lending' in cleaned_i or 'tranche' in \
                        cleaned_i or 'cash central' in cleaned_i :
                            if 'warrant' in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'warrant', ''])
                            elif 'loan' in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'loan', ''])
                            elif 'trance' in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'pooled-security', ''])
                            else:
                                split_qtly[j]['cleaned holdings'].append([i, 'non-equity', '',keys_found])
                            
                        else:
                            non_dig = re.findall(r'\D+', isplit[-2])
                            non_dig = " ".join(non_dig)
                            isplit[-3] = isplit[-3] +' ' + non_dig
                            isplit[-2] = isplit[-2].replace(non_dig, '')
                            name = ' '.join(isplit[0:-2])
                            name = name.lstrip()
                            name = name.rstrip()
                            split_qtly[j]['cleaned holdings'].append([name , isplit[-2], isplit[-1], keys_found])
            elif ':' in i:
                keys_found, cleaned_i = get_keys(leg_dict, j, i)
                cleaned_i  = cleaned_i.replace(r'$', ' ')
                cleaned_i  = cleaned_i.replace(r',', '')
                cleaned_i = cleaned_i.lstrip()
                cleaned_i = cleaned_i.rstrip()
                cleaned_i = " ".join(cleaned_i.split())
                split_qtly[j]['cleaned holdings'].append([i, 'header', '',keys_found])
                
        holdings_dict[j] = split_qtly[j]['cleaned holdings']

    df_holdings = pd.DataFrame()
    for j in holdings_dict.keys():
        add = pd.DataFrame(holdings_dict[j])
        add.columns = ['holdings name', 'holdings shares', 'holdings value', 'keys']
        add.loc[:, 'identifer'] = j
        add.loc[:, 'weblink'] = matching_link
        add.loc[:, 'textfile'] = text

        if df_holdings.empty:
            df_holdings = add.copy()
        else:
            df_holdings = pd.concat([df_holdings, add], axis = 0)

    df_holdings = df_holdings.drop_duplicates(subset = ['holdings name', 'holdings shares', 'holdings value'])
    
    print(df_holdings)

                                         holdings name holdings shares  \
0                                johnson controls inc.            8100   
1                                            lkq corp.             100   
2                                 honda motor co. ltd.            2200   
3                                   toyota motor corp.           24600   
4                              aristocrat leisure ltd.          175000   
5                                    boyd gaming corp.           16500   
6                              buffalo wild wings inc.            2400   
7                             four seasons hotels inc.           29700   
8                                 gtech holdings corp.            6200   
9                                     hilton group plc          150470   
10                      kerzner international ltd. (a)           14700   
11                     krispy kreme doughnuts inc. (a)            2400   
12                              life t

# Get Panel 1 FOR ALL CIKS

In [79]:

def getPanel1_add(split_qtly, matching_link, text, cik):
    
    #get the security legend
    
    leg_dict = {}

    for j in split_qtly.keys():
        count = 0
        split_qtly[j]['cleaned legend'] = []
        leg_dict[j]= {}

        restricted_securities = []
        for i in split_qtly[j]['legend']:
            i = i.rstrip().lstrip()
            keys = re.search('\(([^()]*)\)',i)
            try: 
                keys = keys.group()
                if r'(' in i and r')' in i and len(keys) == 3:
                    split_qtly[j]['cleaned legend'].append(i.rstrip().lstrip()) 
                    if i.rstrip().lstrip()[0:3] not in leg_dict[j].keys():
                        leg_dict[j][i.rstrip().lstrip()[0:3]] = i.rstrip().lstrip()[3:]
            except:
                pass
            count = count + 1

    df_legend = pd.DataFrame()

    for j in leg_dict.keys():
        try:
            add = pd.DataFrame.from_dict(leg_dict[j], orient = 'index')
            add.columns = ['code']
            add.loc[:, 'identifer'] = j
            add.loc[:, 'weblink'] = matching_link
            add.loc[:, 'textfile'] = text

            if df_legend.empty:
                df_legend = add.copy()
            else:
                df_legend = pd.concat([df_legend, add], axis = 0)
        except ValueError:
            #there is no legend
            pass
            
            
    #get the security acquisition data
    
    acq_dict = {}

    for j in split_qtly.keys():
        split_qtly[j]['cleaned acq'] = []
        for i in split_qtly[j]['acq']:
            if hasNumbers(i) and r'$' in str(i):
                clean_i = str(i).replace(r'$', r' $')
                if clean_i.count(r"$") == 1 and clean_i.count(r"/") >= 2:
                    clean_i = clean_i.split(r' $')
                    cost = clean_i[1].lstrip().rstrip()
                    date = re.findall('\d{1,2}/\d{1,2}/\d{2,4}', clean_i[0])
                    date = " - ".join(date)
                    date = date.lstrip().rstrip()
                    name = clean_i[0].replace(date, '')
                    name = name.lstrip()
                    name = name.rstrip()
                    if name != 'equities':
                        split_qtly[j]['cleaned acq'].append([name, date, cost])
        acq_dict[j] = split_qtly[j]['cleaned acq']

    df_acq = pd.DataFrame()
    for j in acq_dict.keys():
        if len(acq_dict[j]) > 0:
            add = pd.DataFrame(acq_dict[j])
            add.columns = ['acq name', 'acq date', 'acq cost']
            add.loc[:, 'identifer'] = j
            add.loc[:, 'weblink'] = matching_link
            add.loc[:, 'textfile'] = text
            add.loc[:, 'CIK'] = cik
            add.loc[:, 'fund name'] = split_qtly[j]['name']

            if df_acq.empty:
                df_acq = add.copy()
            else:
                df_acq = pd.concat([df_acq, add], axis = 0)
            
    #get the holdings data
    
    holdings_dict = {}
    value_mult = ''
    for j in split_qtly.keys():
        
        leg_dict[j]
        split_qtly[j]['cleaned holdings'] = []
        for i in split_qtly[j]['holdings']:
            
            if r'value (' in i:
                value_mult = remove_junk(i)
                value_mult = str(re.findall('\((.*?)\)',value_mult)[0])

            elif hasNumbers(i) and i[-1] != r'%' and 'total investment' not in i and 'principalamount' not in i \
            and r'cost)' not in i and r'(cost' not in i and 'net asset' not in i:
                keys_found, cleaned_i = get_keys(leg_dict, j, i)
                cleaned_i  = cleaned_i.replace(r'$', ' ')
                cleaned_i  = cleaned_i.replace(r',', '')
                cleaned_i = cleaned_i.lstrip()
                cleaned_i = cleaned_i.rstrip()
                cleaned_i = " ".join(cleaned_i.split())
                isplit = cleaned_i.split(' ')
                if len(isplit) >= 3:
                    if hasNumbers(isplit[-1]) and hasNumbers(isplit[-2]):
                        if 'warrant' in cleaned_i or 'loan' in cleaned_i or 'lending' in cleaned_i or 'tranche' in \
                        cleaned_i or 'cash central' in cleaned_i or r'%' in cleaned_i:
                            if 'warrant' in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'warrant', '',keys_found, value_mult])
                            elif 'loan' in cleaned_i and 'tranche' not in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'loan', '',keys_found, value_mult])
                            elif 'tranche' in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'pooled-security', '',keys_found, value_mult])
                            elif 'cash central' in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'cash central fund', '',keys_found, value_mult])
                            elif r'%' in cleaned_i and r'/' in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'corporate bond', '',keys_found, value_mult])
                            elif r'%' in cleaned_i and r'/' not in cleaned_i:
                                split_qtly[j]['cleaned holdings'].append([i, 'pref stock', '',keys_found, value_mult])
                            else:
                                split_qtly[j]['cleaned holdings'].append([i, 'non-equity', '',keys_found, value_mult])
                        else:
                            non_dig = re.findall(r'\D+', isplit[-2])
                            non_dig = " ".join(non_dig)
                            isplit[-3] = isplit[-3] +' ' + non_dig
                            isplit[-2] = isplit[-2].replace(non_dig, '')
                            name = ' '.join(isplit[0:-2])
                            name = name.lstrip()
                            name = name.rstrip()
                            split_qtly[j]['cleaned holdings'].append([name , isplit[-2], isplit[-1],keys_found, value_mult])
            elif ':' in i:
                if 'total investment' not in i and 'principalamount' not in i \
                and r'cost)' not in i and r'(cost' not in i and 'net asset' not in i:
                    keys_found, cleaned_i = get_keys(leg_dict, j, i)
                    cleaned_i  = cleaned_i.replace(r'$', ' ')
                    cleaned_i  = cleaned_i.replace(r',', '')
                    cleaned_i = cleaned_i.lstrip()
                    cleaned_i = cleaned_i.rstrip()
                    cleaned_i  = cleaned_i.replace(r':', '')
                    cleaned_i = " ".join(cleaned_i.split())
                    split_qtly[j]['cleaned holdings'].append([i, 'header', '',keys_found, value_mult])
            else:
                pass
                
        holdings_dict[j] = split_qtly[j]['cleaned holdings']

    df_holdings = pd.DataFrame()
    for j in holdings_dict.keys():
        add = pd.DataFrame(holdings_dict[j])
        add.columns = ['holdings name', 'holdings shares', 'holdings value', 'key', 'value multiplier']
        add.loc[:, 'identifer'] = j
        add.loc[:, 'weblink'] = matching_link
        add.loc[:, 'textfile'] = text
        add.loc[:, 'CIK'] = cik
        add.loc[:, 'fund name'] = split_qtly[j]['name']

        if df_holdings.empty:
            df_holdings = add.copy()
        else:
            df_holdings = pd.concat([df_holdings, add], axis = 0)

    df_holdings = df_holdings.drop_duplicates(subset = ['holdings name', 'holdings shares', 'holdings value'])
    df_acq = df_acq.drop_duplicates(subset = ['acq name', 'acq date', 'acq cost'])
    
    
    return df_legend, df_acq, df_holdings
        
    

In [80]:
panel1 = pd.DataFrame()
panel1_legend = pd.DataFrame()
error_panel1 = []

for CIK in CIK_LIST:
    txt_files = os.listdir(os.path.join(cur_wd, CIK, 'n-q'))
    
    for i in txt_files:
        if '.csv' not in i and i not in error_panel2:
            for z in weblink[CIK]:
                if i in z:
                    matching_link = z
            print(matching_link)
            
            try:
                split_qtly, reporting_date = get_info_dict(i)
                
                #get the panel1 data
                df_legend, df_acq, df_holdings = getPanel1_add(split_qtly, matching_link, i, CIK)
                
                #combine the holdings and acq data in one panel
                df_holdings = pd.concat([df_holdings, df_acq], axis = 0)
                
                #add the conformed data and date filed to panel 1 by reading the date_filed
                date_filed = panel2.loc[panel2['file_read'] == i, 'date_filed'].unique()[0]
                conformed_name = panel2.loc[panel2['file_read'] == i, 'company conformed name'].unique()[0]
                
                df_holdings.loc[:, 'date_filed'] = date_filed
                df_holdings.loc[:, 'reporting_date'] = reporting_date
                df_holdings.loc[:, 'company conformed name'] = conformed_name
                
                #add the new holdings data modified to panel 1 and also add legend information to output panels
                if panel1.empty:
                    panel1 = df_holdings.copy()
                    panel1_legend = df_legend.copy()
                else:
                    panel1 = pd.concat([panel1, df_holdings] , axis = 0)
                    panel1_legend = pd.concat([panel1_legend, df_legend], axis = 0)
            except:
                print("Unexpected error:", sys.exc_info()[0])
                error_panel1.append(i)
                print(CIK)
                print(i)
         
print('errors')
print(error_panel1)

panel1_legend.to_csv(os.path.join(output_directory, fund_name + '_panel1_legend.csv'), sep = ',')

https://www.sec.gov/Archives/edgar/data/24238/0000024238-06-000007.txt
https://www.sec.gov/Archives/edgar/data/24238/0000024238-07-000011.txt


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




https://www.sec.gov/Archives/edgar/data/24238/0000024238-07-000024.txt
https://www.sec.gov/Archives/edgar/data/24238/0000024238-13-000025.txt
https://www.sec.gov/Archives/edgar/data/24238/0000035348-11-000024.txt
https://www.sec.gov/Archives/edgar/data/24238/0000035402-12-000012.txt
https://www.sec.gov/Archives/edgar/data/24238/0000315700-15-000076.txt
https://www.sec.gov/Archives/edgar/data/24238/0000354046-10-000009.txt
https://www.sec.gov/Archives/edgar/data/24238/0000356494-14-000054.txt
https://www.sec.gov/Archives/edgar/data/24238/0000722574-08-000117.txt
https://www.sec.gov/Archives/edgar/data/24238/0000722574-09-000089.txt


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




https://www.sec.gov/Archives/edgar/data/24238/0000722574-09-000267.txt
https://www.sec.gov/Archives/edgar/data/24238/0000722574-10-000311.txt
https://www.sec.gov/Archives/edgar/data/24238/0000722574-11-000359.txt
https://www.sec.gov/Archives/edgar/data/24238/0000729218-08-000022.txt
https://www.sec.gov/Archives/edgar/data/24238/0000795422-12-000136.txt
https://www.sec.gov/Archives/edgar/data/24238/0000878467-06-000026.txt
https://www.sec.gov/Archives/edgar/data/24238/0000878467-14-000949.txt
https://www.sec.gov/Archives/edgar/data/24238/0000880195-13-000956.txt
https://www.sec.gov/Archives/edgar/data/24238/0001379491-15-001530.txt
https://www.sec.gov/Archives/edgar/data/24238/0001379491-16-004311.txt
https://www.sec.gov/Archives/edgar/data/24238/0001379491-16-007548.txt
https://www.sec.gov/Archives/edgar/data/24238/0001379491-17-003201.txt
https://www.sec.gov/Archives/edgar/data/24238/0001379491-17-007752.txt
https://www.sec.gov/Archives/edgar/data/24238/0001379491-18-002500.txt
https:

map the header information

In [81]:
panel1 = panel1.reset_index(drop=True)
for index, row in panel1.iterrows():
    if row['holdings shares'] == 'header':
        lastheader = str(row['holdings name'].replace(r':', ''))
    if not pd.isnull(row['holdings name']):
        if str(row['holdings name'][0:len('class')]).lower() == 'class':
            panel1.loc[panel1.index == index, 'holdings name'] = lastheader + r'_' + str(row['holdings name'])
        elif str(row['holdings name'][0:len('series')]).lower() == 'series':
            panel1.loc[panel1.index == index, 'holdings name'] = lastheader + r'_' + str(row['holdings name'])
        elif str(row['holdings name'][0:len('warrants')]).lower() == 'warrants':
            panel1.loc[panel1.index == index, 'holdings name'] = lastheader + r'_' + str(row['holdings name'])
        elif str(row['holdings name'][0:len('adr')]).lower() == 'adr':
            panel1.loc[panel1.index == index, 'holdings name'] = lastheader + r'_' + str(row['holdings name'])            
        elif r'%' in row['holdings name'] and  hasNumbers(row['holdings name'][0]):
            panel1.loc[panel1.index == index, 'holdings name'] = lastheader + r'_' + str(row['holdings name'])

panel1 = panel1.loc[panel1['holdings shares'] != 'header']  
panel1.to_csv(os.path.join(output_directory, fund_name + '_panel1.csv'), sep = ',')

get the restricted securities

In [82]:
panel1_legend.loc[:,'restricted'] = 'no'

acq = panel1.loc[~panel1['acq date'].isnull()]

panel1_legend.loc[panel1_legend['code'].str.contains("restrict") | panel1_legend['code'].str.contains("level 3 security") \
              | panel1_legend['code'].str.contains("exempt from registration"), 'restricted'] = 'yes'

restricted = panel1_legend.loc[panel1_legend['restricted'].str.contains("yes")]
restricted.to_csv(os.path.join(output_directory, fund_name + '_panel1_restricted_legend.csv'), sep = ',')

panel1.loc[:,'restricted'] = 'no'

for index, row in restricted.iterrows():
    panel1.loc[(panel1['identifer'] == row['identifer']) & (panel1['textfile'] == row.textfile) \
                   & (panel1['key'].str.contains(index)), 'restricted'] = 'yes'

#add the acquisition data

print(len(panel1))
restricted_panel1 = panel1.loc[(panel1['restricted'] == 'yes')]
print(len(restricted_panel1))
restricted_panel1 = pd.concat([panel1.loc[(panel1['restricted'] == 'yes')], acq], axis = 0)
print(len(restricted_panel1))
restricted_panel1.to_csv(os.path.join(output_directory, fund_name + '_panel1_restricted.csv'), sep = ',')

  from ipykernel import kernelapp as app


39029
1924
3372


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


