In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import re
import csv
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Get the biggest investment managers
* scraping the website https://www.advratings.com/top-asset-management-firms, which contains the list of the top asset management firms
* processing the name of each company
* storing the list of all the companies in funds_list

In [2]:
funds_list = []

url = 'https://www.advratings.com/top-asset-management-firms'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

ranking_rows = soup.findAll('table')[0].tbody.findAll('tr')
for row in ranking_rows:
    # Work on the string form of the second cell's contents list.
    cell_repr = str(row.findAll('td')[1].contents)
    pieces = re.split(r'<|>', cell_repr)
    if len(pieces) <= 2:
        # No markup inside the cell: the name sits between the quote
        # characters of the list repr, so split on those instead.
        pieces = re.split(r'([\'|\'])', pieces[0])
    # Exclude any special characters and white space from the company name.
    company = ''.join(filter(str.isalnum, pieces[2]))
    funds_list.append(company.upper())

# Delete the first record (remainder of the table header).
funds_list = funds_list[1:]
len(funds_list)


56

# Creating a dictionary of {Year : URL list} 
* Getting the path to every 13F-HR filing per quarter 
* Each `13F_20XX.txt` file corresponds to quarter 1 of the years 2021 - 2018; the `2020QX.txt` files cover quarters 2 - 4 of 2020 
* Choosing only filings of companies from **funds_list** - the list of the top investment managers

In [3]:
# Maps each index file name (e.g. '13F_2021.txt') to the list of 13F-HR
# filing URLs belonging to companies that match an entry of funds_list.
all_years_urls = {}
path = 'https://www.sec.gov/Archives/'
companies = []

# Directory holding the form-index listings; adjust when running elsewhere.
index_dir = '/home/ivana/Environments/Data_Preprocessing/13F/'

# The file names double as the keys of all_years_urls.
index_files = [
    '13F_2021.txt', '13F_2020.txt', '13F_2019.txt', '13F_2018.txt',
    '2020Q2.txt', '2020Q3.txt', '2020Q4.txt',
]

# Captures the non-digit text that follows the '13F-HR' form type on a line.
company_pattern = re.compile(r'13F-HR\s*\d*([\D+\s\D+]*)\s*\d*')

for file_name in index_files:
    forms_url = []
    # The context manager closes the file even if parsing a line fails.
    with open(index_dir + file_name, 'r') as file:
        for line in file:
            # Parse the company name out of the line and normalise it to the
            # funds_list format: no spaces, no digits, upper case.
            company = ''.join(company_pattern.findall(line))
            company = company.replace(' ', '')
            company = re.sub(r'\d', '', company)
            company = company.upper()

            # Very short names would match almost every fund as a substring.
            if len(company) <= 3:
                continue

            # Record the filing once, even when several fund names match it.
            # Empty fund names are ignored because '' is a substring of
            # every company name.
            if any(name and (company in name or name in company)
                   for name in funds_list):
                # The last whitespace-separated field is appended to the
                # archive base URL.
                forms_url.append(path + line.split()[-1])

    all_years_urls[file_name] = forms_url


# Number of matching filings found for 2020 Q4.
len(all_years_urls.get('2020Q4.txt'))


61

# Getting a nested dictionary {cik : { issuer : total_amount } }

### Long running time, no need to run -> the results per year are stored in **data20XX.json**

* {cik1 : { issuer1 : total_amount, issuer2 : total_amount ...etc}, cik2 : {} ..etc }
* for each investment manager *cik* we get a dictionary of all of the companies it invested into = *issuer*
* for each issuer company *issuer* we get a value corresponding to the **total amount** of stocks
* you can obtain data for a desired year (1st quarter of 2021 - 2018)

In [None]:

# List of all periods (the keys of all_years_urls).
years = ['13F_2021.txt', '13F_2020.txt', '13F_2019.txt', '13F_2018.txt', '2020Q2.txt', '2020Q3.txt', '2020Q4.txt']

data_dict = {}   # nested dictionary: for each CIK a dictionary of {issuer : total_amount}
names_dict = {}  # a dictionary linking each CIK to the filer name

# The CIK is captured as a run of digits; the name is captured up to the end
# of its line so punctuation such as '.', ',' or '&' is kept.
cik_pattern = r'CENTRAL INDEX KEY:\s*(\d+)'
name_pattern = r'COMPANY CONFORMED NAME:[ \t]*([^\r\n]+)'


def _issuer_and_amount(info_table):
    """Return ``(issuer, amount)`` for one ``infotable`` element.

    The child tags are looked up with the ``ns1:`` prefix first and without a
    prefix second. Returns ``None`` when no issuer tag is found under either
    spelling, so the caller can skip the entry instead of crashing.
    """
    for prefix in ('ns1:', ''):
        issuer_tag = info_table.find(prefix + 'nameofissuer')
        if issuer_tag is None:
            continue
        amount_tag = info_table.find(prefix + 'shrsorprnamt').find(prefix + 'sshprnamt')
        return issuer_tag.string, int(amount_tag.string)
    return None


# Collect the holdings for one period; change the index into `years` to
# process a different one.
# NOTE(review): sec.gov may reject requests that do not declare a User-Agent
# header - confirm, and pass headers=... to requests.get if needed.
for url in all_years_urls.get(years[1]):
    page = requests.get(url)
    data = page.text
    soup = BeautifulSoup(data, "lxml")

    cik_match = re.search(cik_pattern, data)
    cik_key = cik_match.group(1) if cik_match else None
    # Each filing starts from an empty holdings dictionary for its CIK.
    holdings = data_dict[cik_key] = {}

    name_match = re.search(name_pattern, data)
    names_dict[cik_key] = name_match.group(1).strip() if name_match else None

    for s in soup.find_all('infotable'):
        entry = _issuer_and_amount(s)
        if entry is None:
            continue
        n, amount = entry
        # Create a record only if the issuer is new, otherwise sum the amounts.
        holdings[n] = holdings.get(n, 0) + amount
                



## Storing data in json format
*No need to run if json files are present in the directory*

In [14]:
# Drop entries whose value is falsy: CIKs with an empty holdings dictionary
# and CIKs without a name.
data_dict = {cik: holdings for cik, holdings in data_dict.items() if holdings}
names_dict = {cik: filer for cik, filer in names_dict.items() if filer}

# Write each dictionary to its own JSON file.
for target, payload in (
    ('/home/ivana/Environments/Data_Preprocessing/json/data2020.json', data_dict),
    ('/home/ivana/Environments/Data_Preprocessing/json/names2020.json', names_dict),
):
    with open(target, 'w') as json_file:
        json.dump(payload, json_file)

## Loading the json data into dict
### Just copy this step to notebook where you do your analysis to retrieve a dict of data for desired year

In [2]:
# NOTE(review): the holdings come from the 2021 file while the names come
# from the 2020 Q4 file - confirm this pairing is intended.
data_path = '/home/ivana/Environments/Data_Preprocessing/json/data2021.json'
names_path = '/home/ivana/Environments/Data_Preprocessing/json/names2020_Q4.json'

# Holdings dictionary read back from JSON.
with open(data_path) as data_file:
    data_dict = json.load(data_file)

# Names dictionary read back from JSON.
with open(names_path) as names_file:
    names_dict = json.load(names_file)

# Making Graphs with NetworkX

In [5]:
# Bipartite graph: the keys of data_dict (CIKs) are tagged bipartite=0 and
# the issuers they hold are tagged bipartite=1.
G = nx.Graph()
G.add_nodes_from(data_dict, bipartite=0)
for cik, issuers_dict in data_dict.items():
    G.add_nodes_from(issuers_dict, bipartite=1)
    # One edge per holding, carrying the stored value as the 'amount' attribute.
    # (The loop variable is no longer called `tuple`, which overwrote the
    # builtin for the rest of the kernel session.)
    edges = [
        (cik, issuer, {'amount': amount})
        for issuer, amount in issuers_dict.items()
    ]
    G.add_edges_from(edges)

    

# Visualization with PyVis
## Making a smaller sample of data for better visualization
* considering only issuers that more than one investment manager invested in
* once two investment managers are connected - only one of their investments is displayed
* RED NODES = issuers
* BLUE NODES = CIKs

In [9]:
#Making a smaller graph

def visulailze_network(data_dict):
    """Build a network linking investment managers through one shared issuer.

    For every pair of distinct keys of ``data_dict`` whose issuer collections
    overlap, both keys are added as blue nodes, one common issuer is added as
    a red node, and each key is connected to that issuer.

    The common issuer is chosen deterministically (the smallest one by its
    string form), so repeated runs draw the same graph.

    Requires ``Network`` to be available in the notebook namespace
    (presumably ``pyvis.network.Network``; it is not imported in the visible
    imports cell).

    Parameters
    ----------
    data_dict : dict
        Mapping ``{cik: {issuer: amount}}``.

    Returns
    -------
    Network
        The populated network object.
    """
    nt = Network()

    # Build every issuer set once instead of once per compared pair.
    issuer_sets = [(cik, set(issuers)) for cik, issuers in data_dict.items()]

    # Visit each unordered pair once; the mirrored pair would only repeat
    # the same nodes and edges.
    for position, (cik1, issuers_set1) in enumerate(issuer_sets):
        for cik2, issuers_set2 in issuer_sets[position + 1:]:
            common = issuers_set1 & issuers_set2
            if not common:
                continue

            issuer = min(common, key=str)
            nt.add_node(cik1, color="blue")
            nt.add_node(cik2, color="blue")
            nt.add_node(issuer, color="red")
            nt.add_edge(cik1, issuer)
            nt.add_edge(cik2, issuer)

    return nt





In [3]:
def get_sizes(data_dict):
    """Count, for every key, how many other keys share at least one issuer.

    Parameters
    ----------
    data_dict : dict
        Mapping ``{cik: {issuer: amount}}``.

    Returns
    -------
    dict
        ``{cik: number_of_other_ciks_with_a_common_issuer}``. Keys that share
        no issuer with any other key are left out. Keys appear in the same
        order as in ``data_dict``.
    """
    # Build every issuer set once instead of once per compared pair.
    issuer_sets = {cik: set(issuers) for cik, issuers in data_dict.items()}

    sizes = {}
    for cik1, issuers_set1 in issuer_sets.items():
        connected = sum(
            1
            for cik2, issuers_set2 in issuer_sets.items()
            if cik1 != cik2 and not issuers_set1.isdisjoint(issuers_set2)
        )
        if connected:
            sizes[cik1] = connected

    return sizes

                


In [4]:
def visualize_network(data_dict):
    """Build a manager-to-manager network of shared investments.

    Two keys of ``data_dict`` are connected when their issuer collections
    overlap. Each node is blue and sized by the value ``get_sizes`` returns
    for it. Keys missing from the ``get_sizes`` result are not drawn.

    Requires ``Network`` to be available in the notebook namespace
    (presumably ``pyvis.network.Network``; it is not imported in the visible
    imports cell).

    Parameters
    ----------
    data_dict : dict
        Mapping ``{cik: {issuer: amount}}``.

    Returns
    -------
    Network
        The populated network object.
    """
    nt = Network()
    sizes = get_sizes(data_dict)

    # Build every issuer set once instead of once per compared pair.
    issuer_sets = [(cik, set(data_dict[cik])) for cik in sizes]

    # Visit each unordered pair once; the mirrored pair would only repeat
    # the same nodes and edge.
    for position, (cik1, issuers_set1) in enumerate(issuer_sets):
        for cik2, issuers_set2 in issuer_sets[position + 1:]:
            if issuers_set1.isdisjoint(issuers_set2):
                continue
            nt.add_node(cik1, color="blue", size=sizes[cik1])
            nt.add_node(cik2, color="blue", size=sizes[cik2])
            nt.add_edge(cik1, cik2)

    return nt

In [5]:
# Build the manager-to-manager network from the loaded holdings dictionary.
nt = visualize_network(data_dict)

# Visualization of the graph
# Request only the 'physics' group of buttons, then render to nx.html.
nt.show_buttons(filter_=['physics'])
nt.show('nx.html')

NameError: name 'nt' is not defined