Visualizing insider connections for Twitter, built from Form 3, 4, and 5 regulatory disclosures, as an animated network graph.

In [1]:
from datamule import PremiumDownloader, Downloader, Portfolio

def _normalize_cik(cik):
    """Return `cik` as a 10-digit, zero-padded string.

    Parsed filings report CIKs zero-padded (e.g. '0001418091'), whereas the
    seed CIK was written unpadded ('1418091'). Normalizing every CIK to one
    format keeps the set-based de-duplication below from treating the same
    entity as two different ones (and paying to download it twice).
    """
    return str(cik).strip().zfill(10)


# Expand the insider network level by level: iteration 0 downloads the
# Form 3/4/5 filings of the seed company (Twitter), and each later iteration
# downloads the filings of every reporting owner discovered so far.
downloader = PremiumDownloader()
ciks_to_download = {_normalize_cik('1418091')}  # seed CIK; a set avoids duplicates
ciks_downloaded = set()  # CIKs whose filings have already been fetched

for iteration in range(2):
    print(f"Starting iteration {iteration}")
    output_dir = f'twitter_345/{iteration}'

    # Only request CIKs that have not been downloaded yet. Sorting makes the
    # request (and the log line below) reproducible from run to run.
    ciks_to_process = sorted(ciks_to_download - ciks_downloaded)
    if not ciks_to_process:
        # Never call the paid downloader with an empty CIK filter.
        print("No new CIKs to download; stopping.")
        break

    print(f"Processing {ciks_to_process} to {output_dir}")
    downloader.download_submissions(
        submission_type=['3','4','5'],
        output_dir=output_dir,
        cik=ciks_to_process,
    )

    # Mark this batch as done so it is not requested again.
    ciks_downloaded.update(ciks_to_process)

    # Collect the reporting-owner CIK of every filing just downloaded; these
    # become the download targets of the next iteration.
    portfolio = Portfolio(output_dir)
    for submission in portfolio.submissions:
        for document in submission.document_type(['3','4','5']):
            try:
                document.parse()
                cik = document.data['metadata']['reportingOwner']['cik']
            except Exception as e:
                # Best effort: the 3,4,5 parser currently can only parse XML,
                # so 2005-era filings will fail. Report and move on.
                print(f"Error parsing document {document.filename}: {e}")
                continue
            if cik:  # skip filings with an empty/missing owner CIK
                ciks_to_download.add(_normalize_cik(cik))


Starting iteration 0
Processing ['1418091'] to twitter_345/0

Cost: $0.003980000000 downloads + $0.004853621000 row reads = $0.008833621000
Balance: $9.675979633000


Processing files: 100%|██████████| 398/398 [00:03<00:00, 118.22it/s]



Processing completed in 3.38 seconds
Loading 398 submissions


Loading submissions: 100%|██████████| 398/398 [00:00<00:00, 6803.77it/s]


Starting iteration 1
Processing ['0001219230', '0001590849', '0001590850', '0001197957', '0001590848', '0001590847', '0001623646', '0001610312', '0001504326', '0001672336', '0001590953', '0001590945', '0001119838', '0001363391', '0001377489', '0001590401', '0001294397', '0001237860', '0001590852', '0001609815', '0001590803', '0001590851', '0001635648', '0001613438', '0001513441', '0001593642', '0001193339', '0001624030', '0001644368', '0001591174'] to twitter_345/1

Cost: $0.010180000000 downloads + $0.004853621000 row reads = $0.015033621000
Balance: $9.660946012000


Processing files: 100%|██████████| 1018/1018 [00:08<00:00, 126.50it/s]



Processing completed in 8.05 seconds
Loading 1018 submissions


Loading submissions: 100%|██████████| 1018/1018 [00:00<00:00, 1422.62it/s]


In [10]:
from datamule import Document 

# Inspect a single parsed filing to see the structure the 3/4/5 parser produces
# ('metadata' with issuer / reportingOwner, plus 'holdings').
# The file's own parsed metadata reports documentType '4', so it is labelled
# as a Form 4 here rather than a Form 3.
# NOTE(review): this cell reads a file written by the download cell, so it must
# be run after it; its execution count is out of sequence with its position.
document = Document(filename='twitter_345/0/000110465914036575/a4.xml', type='4')
document.parse()
print(document.data)

{'metadata': {'schemaVersion': 'X0306', 'documentType': '4', 'periodOfReport': '2014-05-06', 'dateOfOriginalSubmission': '', 'form3HoldingsReported': '', 'form4TransactionsReported': '', 'issuer': {'cik': '0001418091', 'name': 'TWITTER, INC.', 'tradingSymbol': 'TWTR'}, 'reportingOwner': {'cik': '0001590803', 'name': 'RTLC Management, LLC', 'address': {'street1': '260 EAST BROWN STREET', 'street2': 'SUITE 380', 'city': 'BIRMINGHAM', 'state': 'MI', 'zip': '48009', 'stateDescription': ''}, 'relationship': {'isDirector': '0', 'isOfficer': '0', 'isTenPercentOwner': '1', 'isOther': '0', 'officerTitle': ''}}, 'signature': {'name': '/s/ Viqar Shariff, Vice President, RTLC Management, LLC', 'date': '2014-05-08'}}, 'holdings': [{'type': 'non-derivative', 'securityTitle': {'value': 'Common Stock'}, 'postTransactionAmounts': {'sharesOwned': {'value': '0'}}, 'ownershipNature': {'directOrIndirect': 'I'}, 'transactionDate': {'value': '2014-05-06'}, 'transactionCoding': {'formType': '4', 'code': 'J', 

In [2]:
# Parse every 3,4,5 document and record one issuer -> reporting-owner edge per
# filing, then save the de-duplicated edge list as CSV.
import os

import pandas as pd
from tqdm import tqdm

# NOTE: `portfolio` is whatever the download cell assigned last, i.e. the
# Portfolio for the final iteration's output directory.
d_list = []
total_submissions = len(portfolio.submissions)
for submission in tqdm(portfolio, total=total_submissions, desc="Processing submissions"):
    for form in submission.document_type(['3','4','5']):
        try:
            form.parse()
            edge = {
                'origin': form.data['metadata']['issuer']['cik'],
                'destination': form.data['metadata']['reportingOwner']['cik'],
            }
        except Exception as e:
            # Same best-effort policy as the download cell: one unparseable
            # filing should not abort the run and discard the edges so far.
            print(f"Error parsing document {form.filename}: {e}")
            continue
        d_list.append(edge)

# Create DataFrame; explicit columns keep the CSV header stable even when no
# edges were collected.
df = pd.DataFrame(d_list, columns=['origin', 'destination'])

# Write to CSV file without duplicates, creating the target directory first so
# the export also works on a fresh checkout.
output_path = 'data/twitter_345_edge_list.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.drop_duplicates().to_csv(output_path, index=False)

Processing submissions: 100%|██████████| 1018/1018 [00:00<00:00, 1558.96it/s]
