In [16]:
import pandas as pd
import numpy as np
import json
from bs4 import BeautifulSoup
import re
import urllib.request
import urllib.parse

In [99]:
def get_all_urls(base_url,interval,count=-1):
    """Open the first ownership page at ``base_url`` and its follow-up pages.

    Follow-up pages are addressed by appending
    ``&type=&dateb=&owner=include&start=<offset>`` to ``base_url``, e.g.
    https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=80

    Args:
        base_url: URL of the first page, e.g.
            https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193
        interval: amount added to the ``start`` offset for each further page.
        count: maximum number of pages to open, first page included.  Any
            value below 1 (the default is -1) keeps the previous fixed cap
            of 11 pages.

    Returns:
        A list of the objects returned by ``urllib.request.urlopen``, in page
        order.  The list is empty when the first page cannot be opened and is
        cut short at the first follow-up page that fails to open.
    """
    default_page_cap = 11
    max_pages = count if count > 0 else default_page_cap
    header = {'User-Agent':'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    url_ext = '&type=&dateb=&owner=include&start='

    def open_page(url):
        # Every page, not only the first, is requested with the same
        # User-Agent header so all requests look alike to the server.
        return urllib.request.urlopen(urllib.request.Request(url, headers=header))

    all_urls = []
    try:
        all_urls.append(open_page(base_url))
    except Exception as err:
        # Best effort: report the reason and hand back what we have.
        # ``Exception`` (unlike a bare except) lets KeyboardInterrupt through.
        print("Fail to open", base_url, err)
        return all_urls

    index = interval
    while len(all_urls) < max_pages:
        new_url = base_url+url_ext+str(index)
        print('opening',new_url)
        try:
            all_urls.append(open_page(new_url))
        except Exception as err:
            print("Fail to open",new_url, err)
            break
        index += interval
    return all_urls

def parse_officer_info(tags, sub_tag='td'):
    """Extract one officer record from a table row.

    Args:
        tags: a parsed row element exposing ``findAll`` (a BeautifulSoup tag
            in this notebook).
        sub_tag: name of the cell element to collect; defaults to ``'td'``.

    Returns:
        ``(name, date, title)`` when the row has exactly four cells and the
        fourth cell's text starts with ``officer:`` followed by whitespace.
        ``name`` (first cell) and ``title`` (the text after ``officer:``) are
        wrapped in double quotes; ``date`` is the third cell's ``.string``
        as-is.  Any other row yields an empty list, so callers can test the
        result with ``len()``.
    """
    owner_info = []
    all_tds = tags.findAll(sub_tag)
    if len(all_tds) != 4:
        return owner_info
    relation = all_tds[3].string
    name = all_tds[0].string
    # ``.string`` is None when a cell has no single text child (empty cell or
    # several children).  Such a row can be neither matched nor named, so it
    # is skipped instead of raising TypeError in re.search / concatenation.
    if relation is None or name is None:
        return owner_info
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    matchOfficer = re.search(r'^officer:\s+(.*?)$', relation)
    if not matchOfficer:
        return owner_info
    title = '"'+matchOfficer.group(1)+'"'
    date = all_tds[2].string
    return ('"'+name+'"', date, title)
def parse_stock_transaction(tags, sub_tag='td'):
    """Extract one transaction record from a table row.

    A row is accepted only when it has exactly twelve ``sub_tag`` cells and
    the fifth cell's ``.string`` equals ``'4'`` (presumably the form type --
    TODO confirm against the page layout).

    Returns:
        ``(name, date, sell_or_buy, transaction_type, nums_of_transacted,
        total_securities)`` taken from cells 3, 1, 0, 5, 7 and 8, with the
        name wrapped in double quotes; an empty list for any other row.
    """
    cells = tags.findAll(sub_tag)
    wanted_row = len(cells) == 12 and cells[4].string == '4'
    if not wanted_row:
        return []

    quoted_owner = '"' + cells[3].string + '"'
    return (
        quoted_owner,
        cells[1].string,   # date
        cells[0].string,   # sell_or_buy
        cells[5].string,   # transaction_type
        cells[7].string,   # nums_of_transacted
        cells[8].string,   # total_securities
    )

def write_list_to_file(file_obj, to_write_list):
    """Write every record in ``to_write_list`` to ``file_obj`` as one line.

    Each record is an iterable of fields; the fields are converted with
    ``str`` and joined by single spaces, and a newline ends the line.
    """
    for record in to_write_list:
        fields = [str(field) for field in record]
        file_obj.write(' '.join(fields) + "\n")
def parse_sec_url_obj(file_obj):
    """Parse fetched ownership pages and write officers and transactions to a file.

    The output file name comes from the first page's ``<title>``: the text
    after ``Ownership Information:`` with whitespace runs turned into ``_``
    and ``.csv`` appended.  Officer rows from the first page are written
    first, followed by the transaction rows of every page in order, all via
    ``write_list_to_file``.

    Args:
        file_obj: sequence of readable response objects (as produced by
            ``get_all_urls``); each must offer ``read()`` returning bytes.

    Returns:
        0 when ``file_obj`` is None or empty, -1 when the company name or the
        officer rows cannot be found on the first page (no file is created in
        that case), and None after a successful write.
    """
    # One truthiness test covers both cases; calling len() first raised
    # TypeError on None before the None comparison could run.
    if not file_obj:
        return 0
    # parse the 1st page to get the owner info once
    owner_html = file_obj[0].read().decode('utf-8')
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    owner_sp = BeautifulSoup(owner_html, 'html.parser')
    title_text = owner_sp.title.string if owner_sp.title is not None else None
    owner_pattern = re.search(r'Ownership Information:\s+(.*?)$', title_text or '')
    if owner_pattern is None:
        print("Error: no company name in page title")
        return -1
    company_name = '_'.join(owner_pattern.group(1).split())+".csv"
    # The name comes from a remote page; keep path separators out of it so
    # the file is always created in the current directory.
    company_name = re.sub(r'[\\/]', '_', company_name)

    owner_infos = []
    for tr in owner_sp.findAll('tr'):
        owner_info = parse_officer_info(tr)
        if len(owner_info) > 0:
            owner_infos.append(owner_info)
    if len(owner_infos)==0:
        print("Error: no owner info")
        return -1

    print("Writing",company_name,"file ...")
    # ``with`` closes the file even if a later page fails to read or parse.
    with open(company_name,'w', encoding='utf-8') as output:
        write_list_to_file(output, owner_infos)
        for index, single_url_obj in enumerate(file_obj):
            if index == 0:
                # The first response was already consumed and parsed above.
                sp = owner_sp
            else:
                my_html = single_url_obj.read().decode('utf-8')
                sp = BeautifulSoup(my_html, 'html.parser')
            transaction_infos = []
            for tr in sp.findAll('tr'):
                transaction_info = parse_stock_transaction(tr)
                if len(transaction_info) > 0:
                    transaction_infos.append(transaction_info)
            write_list_to_file(output, transaction_infos)
    return

In [101]:
# Fetch the ownership pages for CIK 0000320193 (the run below wrote
# APPLE_INC.csv), stepping the `start` offset by 80 per follow-up page,
# then parse them into the output file.
myurl = 'https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193'
all_urls = get_all_urls(myurl,80)
parse_sec_url_obj(all_urls)






opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=80
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=160
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=240
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=320
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=400
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=480
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=560
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=&owner=include&start=640
opening https://www.sec.gov/cgi-bin/own-disp?action=getissuer&CIK=0000320193&type=&dateb=

Writing APPLE_INC.csv file ...


In [91]:
# Scratch check: split on whitespace and re-join with '_' (the same
# expression shape parse_sec_url_obj uses to build the output file name).
hello = "my name is hujin zhen"
print('_'.join(hello.split()))

my_name_is_hujin_zhen
