In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import re
import csv
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite

# Get the biggest investment managers
* scraping the website https://www.advratings.com/top-asset-management-firms, which contains the list of the top asset management firms
* processing the name of each company
* storing the list of all the companies in funds_list

In [4]:
# Scrape the ranking table of asset-management firms and collect the
# normalised firm names (alphanumeric characters only, upper-cased) in funds_list.
funds_list = []

url = 'https://www.advratings.com/top-asset-management-firms'
# A timeout keeps a stalled connection from hanging the kernel indefinitely;
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the ranking table.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

for row in soup.find_all('table')[0].tbody.find_all('tr'):
    # String form of the second cell's children; the name is cut out of this
    # text rather than read from the parsed nodes.
    cell_repr = str(row.find_all('td')[1].contents)
    pieces = re.split(r'<|>', cell_repr)
    if len(pieces) <= 2:
        # No '<' / '>' in the cell (presumably plain text without markup):
        # split on the quote characters of the list repr instead.
        pieces = re.split(r'([\'|\'])', pieces[0])
    # excluding any special chars and white spaces from company names
    company = ''.join(ch for ch in pieces[2] if ch.isalnum())
    funds_list.append(company.upper())

# deleting the first record (remainder of the header)
funds_list = funds_list[1:]
len(funds_list)


56

# Creating a dictionary of {Year : URL list} 
* Getting the path to every 13F-HR filing listed in each quarterly index file
* Each file corresponds to quarter 1; the files cover the years 2018 - 2021
* Choosing only filings from managers in **funds_list** - the list of the top investment managers

In [5]:
# Build all_years_urls: {index file name : list of filing URLs} restricted to
# filers whose name matches an entry of funds_list.
all_years_urls = {}
path = 'https://www.sec.gov/Archives/'
# The three names below are not used in this cell; they are kept in case
# other cells of the notebook reference them.
companies = []

count1 = 0
count2 = 0

# Directory holding the per-year index files, and the files to read from it.
data_dir = '/home/ivana/Environments/env/Data_Preprocessing'
index_files = ['13F_2021.txt', '13F_2020.txt', '13F_2019.txt', '13F_2018.txt']

for index_file in index_files:
    forms_url = []
    # The context manager closes each index file as soon as it has been read.
    with open(data_dir + '/' + index_file, 'r') as file:
        for line in file:

            # parsing out the company name from the line
            company = re.findall(r'13F-HR\s*\d*([\D+\s\D+]*)\s*\d*', line)

            # string processing to get uniform formatting
            company = ''.join(company)
            company = company.replace(' ', '')
            company = re.sub(r'\d', '', company)
            company = company.upper()

            # Very short names would match almost any fund by substring.
            if len(company) <= 3:
                continue

            # Keep the filing if the name matches any top investment manager.
            # any() records each line at most once even when several names
            # match, and empty entries in funds_list are ignored because an
            # empty string is a substring of every company name.
            if any(name and (company in name or name in company) for name in funds_list):
                # the last whitespace-separated field of the line is appended to the archive root
                forms_url.append(path + line.split()[-1])

    # adding a key:value pair to the dict - index file name : list of filing URLs
    all_years_urls[index_file] = forms_url

# number of matching filings found in the 2021 index
len(all_years_urls.get('13F_2021.txt'))


77

# Getting a nested dictionary {cik : { issuer : total_amount } }
* {cik1 : { issuer1 : total_amount, issuer2 : total_amount ...etc}, cik2 : {} ..etc }
* for each investment manager *cik* we get a dictionary of all of the companies it invested into = *issuer*
* for each issuer company *issuer* we get a value corresponding to the **total amount** of stocks
* you can obtain data for a desired year (1st quarter of 2018 - 2021) by changing the index into `years` in the loop below (the code uses `years[0]`)

In [6]:

# list of all years (keys in all_years_urls)
years = ['13F_2021.txt', '13F_2020.txt', '13F_2019.txt', '13F_2018.txt']

data_dict = {}  # nested dictionary: for each CIK key, a dictionary of {issuer : total_amount}


#networksCollection = db.createCollection(name = 'Networks') only 1st time run

# Download every filing selected for one year (years[0]) and sum the reported
# amounts per issuer.
# NOTE(review): SEC EDGAR's fair-access guidance asks automated clients to send
# a descriptive User-Agent header - confirm whether one must be added here.
for url in all_years_urls.get(years[0]):
    # The timeout prevents one stalled download from blocking the whole loop;
    # raise_for_status() surfaces HTTP errors instead of silently recording an
    # empty holdings dictionary for that filing.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    data = page.text
    soup = BeautifulSoup(data, "lxml")

    # Key = first dash-separated field of the URL's last path component.
    # NOTE(review): two URLs sharing this prefix overwrite each other, because
    # the entry is reset to an empty dict for every URL - confirm this is intended.
    cik_key = url.split('/')[-1].split('-')[0]
    holdings = data_dict[cik_key] = {}

    stocklist = soup.find_all('infotable')

    for s in stocklist:
        # Filings tag their fields either with or without an "ns1:" prefix;
        # pick whichever form this entry uses and apply it to every lookup.
        prefix = "ns1:" if s.find("ns1:nameofissuer") is not None else ""

        # Company name
        n = s.find(prefix + "nameofissuer").string
        amount = int(s.find(prefix + "shrsorprnamt").find(prefix + "sshprnamt").string)

        # One record per unique issuer: repeated issuers have their amounts summed.
        holdings[n] = holdings.get(n, 0) + amount
                



In [None]:
# Display the nested {cik : {issuer : total_amount}} dictionary built above
data_dict

# Making Graphs with NetworkX

In [None]:
# Build a graph with two node sets: the keys of data_dict (bipartite=0) and the
# issuers they hold (bipartite=1). Each edge carries the summed amount in its
# 'amount' attribute.
G = nx.Graph()
G.add_nodes_from(list(data_dict.keys()), bipartite=0)
for cik, issuers_dict in data_dict.items():
    G.add_nodes_from(list(issuers_dict.keys()), bipartite=1)
    # Build the edge list without binding the name `tuple`, which would shadow
    # the builtin for every later cell in the kernel.
    edges = [
        (cik, issuer_name, {'amount': amount})
        for issuer_name, amount in issuers_dict.items()
    ]
    G.add_edges_from(edges)


#print(G.edges)
    