In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import os
import csv
import json
import requests
import pandas as pd
from pandas_datareader import data as pd_data
from yahooquery import Ticker

# Setup: key snippets reused from Parse_CSV_for_companies

## Methods

In [11]:
def is_academia(organization: str):
    """Return True when the organization name looks like an academic body.

    The check is a case-sensitive substring search for 'University',
    'College' or 'Academy' anywhere in ``organization``.
    """
    academic_markers = ('University', 'College', 'Academy')
    return any(marker in organization for marker in academic_markers)





def check_usa_mkts(data):
    """Pick the preferred listing from a set of symbol-search results.

    Parameters
    ----------
    data : list of dict (or anything ``pd.DataFrame`` accepts)
        Search results; each record must provide the keys ``'symbol'``,
        ``'exchDisp'`` and ``'name'``. Must contain at least one record.

    Returns
    -------
    tuple
        ``(symbol, exchange, name, usa)`` where ``usa`` is ``'Y'`` when the
        chosen listing is on one of the exchanges in ``usa_mkts`` and ``'N'``
        otherwise. The first US-listed record wins; if there is none, the
        first record overall is returned.
    """
    usa_mkts = ['NYSE', 'NASDAQ', 'AMEX',
                'BSE', 'CBOE', 'CBOT',
                'CME', 'CHX', 'ISE',
                'MS4X', 'NSX', 'PHLX']

    df = pd.DataFrame(data)
    match = df.loc[df['exchDisp'].isin(usa_mkts)]

    if match.empty:
        chosen, usa = df, 'N'
    else:
        chosen, usa = match, 'Y'

    # Positional access is required here: the filtered frame keeps the
    # original row labels, so label 0 is missing whenever the first result
    # is not a US listing.
    symbol = chosen['symbol'].iloc[0]
    exchange = chosen['exchDisp'].iloc[0]
    name = chosen['name'].iloc[0]

    return symbol, exchange, name, usa

def get_and_parse_query(query, timeout=10):
    """Fetch a JSONP response and return its ``ResultSet.Result`` payload.

    Parameters
    ----------
    query : str
        Full URL to request. The response body is expected to be JSONP,
        i.e. ``callbackName({...json...})``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on an unresponsive host.

    Returns
    -------
    The value stored under ``['ResultSet']['Result']`` in the decoded JSON.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    IndexError
        If the body contains no ``(`` and so is not a JSONP wrapper.
    KeyError
        If the decoded JSON lacks ``ResultSet`` / ``Result``.
    """
    r = requests.get(query, timeout=timeout)
    # Fail with a clear HTTP error rather than an opaque IndexError when the
    # server answers with an error page instead of JSONP.
    r.raise_for_status()

    # Strip the JSONP wrapper: keep what lies between the first '(' and the
    # last ')'.
    payload = r.text.split('(', 1)[1].rsplit(')', 1)[0]

    return json.loads(payload)['ResultSet']['Result']




def id_company_size(market_cap):
    """Classify a market capitalisation into a size label.

    Bands (lower bound exclusive):
        > 50e9  -> 'v-large'
        > 25e9  -> 'large'
        > 10e9  -> 'medium'
        otherwise 'small'
    """
    # Checked from the largest floor downwards so the first hit is the answer.
    bands = (
        (50e9, 'v-large'),
        (25e9, 'large'),
        (10e9, 'medium'),
    )

    for floor, label in bands:
        if market_cap > floor:
            return label

    return 'small'


def get_market_cap(symbol):
    """Return the ``marketCap`` value Yahoo! reports for ``symbol``.

    Uses ``pandas_datareader``'s ``get_quote_yahoo`` and picks the entry
    keyed by ``symbol`` out of the ``'marketCap'`` field.
    """
    quote = pd_data.get_quote_yahoo(symbol)
    return quote['marketCap'][symbol]

In [53]:
# Build a table of non-academic developers from the WHO clinical-trials CSV,
# with each one's Yahoo! Finance listing, market cap and size class.
path_data = '../datasets/WHO-covid19-clinicaltrials.csv'
df = pd.read_csv(path_data)
# Rows with no developer are read by pandas as NaN (a float), which has no
# .split(); drop them up front.
developers = df['Developer'].dropna().tolist()
companies = []


# GET LIST OF COMPANIES
# Each row may list several organizations separated by '/'.
for developer in developers:
    for organization in developer.split('/'):
        # Clean up formatting and keep the organization unless it is academia
        organization = organization.replace('\n', ' ')
        if not is_academia(organization):
            companies.append(organization)


# GET COMPANY SYMBOLS AND EXCHANGES
symbols = []
exchanges = []
ynames = []
am_tf = []

for i, company in enumerate(companies):
    fco = company.replace('.', '').replace(' ', '%20')
    query = f'http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={fco}&region=1&lang=en&callback=YAHOO.Finance.SymbolSuggest.ssCallback'

    entry = get_and_parse_query(query)

    if len(entry) == 0:
        print(f'[{i}] {company} found no matches')
        symbols.append('n/a')
        exchanges.append('n/a')
        ynames.append('n/a')
        am_tf.append('n/a')

    else:
        symbol, exchange, yname, am = check_usa_mkts(entry)
        symbols.append(symbol)
        exchanges.append(exchange)
        ynames.append(yname)
        am_tf.append(am)
        print(f'[{i}] {company} matched with {yname} as {symbol} on {exchange}')


# GET MARKET CAPS AND CLASSIFY SIZE OF COMPANY
market_caps = []
market_sizes = []

for company, symbol in zip(companies, symbols):
    if symbol != 'n/a':
        try:
            cap = get_market_cap(symbol)
            size = id_company_size(cap)
            message = f'{company} [Market Cap]: ${cap:,} - {size.upper()}'
        except Exception:
            # Best-effort lookup: any failure is recorded as 'n/a'.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
            # still stop the loop.
            cap = 'n/a'
            size = 'n/a'
            message = f'{company} not found on Yahoo! Finance'
    else:
        cap = 'n/a'
        size = 'n/a'
        message = f'{company} not available'

    # Append exactly once per company so both lists stay aligned with
    # `companies` whatever happened above.
    market_caps.append(cap)
    market_sizes.append(size)
    print(message)


# Format dollar entries
fmarket_caps = [f'${cap:,}' if cap != 'n/a' else cap for cap in market_caps]


# CREATE NEW DATAFRAME
df2 = pd.DataFrame({'Company': companies, 'Yahoo Listed Co.': ynames,
                    'Symbol': symbols, 'Exchange': exchanges,
                    'Market Cap': fmarket_caps, 'Company Size': market_sizes,
                    'Is American': am_tf})
df2.head(23)

[0] AstraZeneca matched with AstraZeneca PLC as AZN on NYSE
[1] CanSino Biological Inc. found no matches
[2] Beijing Institute of Biotechnology found no matches
[3] Moderna matched with Moderna, Inc. as MRNA on OTC Markets
[4] NIAID found no matches
[5] Wuhan Institute of Biological Products found no matches
[6] Sinopharm matched with Sinopharm Group Co., Ltd. as SHTDY on OTC Markets
[7] Beijing Institute of Biological Products found no matches
[8] Sinopharm matched with Sinopharm Group Co., Ltd. as SHTDY on OTC Markets
[9] Sinovac matched with Sinovac Biotech Ltd. as SVA on NASDAQ
[10] Novavax matched with Novavax, Inc. as NVAX on NASDAQ
[11] BioNTech matched with BioNTech SE as BNTX on NASDAQ
[12] Fosun Pharma matched with Shanghai Fosun Pharmaceutical (Group) Co., Ltd. as SFOSF on OTC Markets
[13] Pfizer matched with Pfizer Inc. as PFE on NYSE
[14] Inovio Pharmaceuticals matched with Inovio Pharmaceuticals, Inc. as INO on NASDAQ
[15] Genexine Consortium found no matches
[16] Gamaley

Unnamed: 0,Company,Yahoo Listed Co.,Symbol,Exchange,Market Cap,Company Size,Is American
0,AstraZeneca,AstraZeneca PLC,AZN,NYSE,"$137,126,248,448",v-large,Y
1,CanSino Biological Inc.,,,,,,
2,Beijing Institute of Biotechnology,,,,,,
3,Moderna,"Moderna, Inc.",MRNA,OTC Markets,"$22,863,685,632",medium,N
4,NIAID,,,,,,
5,Wuhan Institute of Biological Products,,,,,,
6,Sinopharm,"Sinopharm Group Co., Ltd.",SHTDY,OTC Markets,"$8,092,185,600",small,N
7,Beijing Institute of Biological Products,,,,,,
8,Sinopharm,"Sinopharm Group Co., Ltd.",SHTDY,OTC Markets,"$8,092,185,600",small,N
9,Sinovac,Sinovac Biotech Ltd.,SVA,NASDAQ,"$460,249,280",small,Y


# Look up each company

## Attempt by using the clearbit autocomplete API
Method recommended in this article: https://medium.com/the-red-fish/automate-finding-a-company-url-with-a-company-name-on-google-sheets-for-free-in-3-easy-steps-7ea77280bcdc

In [54]:
# def get_company_url(name):
#     QUERY_BASE = 'https://autocomplete.clearbit.com/v1/companies/suggest?query=' 
#     QUERY = f'{QUERY_BASE}{name}'

#     r = requests.get(QUERY)
#     entry = json.loads(r.text)
    
#     if len(entry) > 0:
#         return entry[0]['domain']
#     else:
#         return 'n/a'

In [66]:
# domains = []

# for ii in range(len(df2)):
#     co, yco, sym, exch, cap, size, usa = df2.iloc[ii].T.values
    
#     if yco != 'n/a':
#         print(f'Checking {yco}...')
#         domain = get_company_url(yco.replace('.', '').replace(' ', '%20'))
        
#         if domain == 'n/a':
#             domain = get_company_url(co.replace('.', '').replace(' ', '%20'))
            
#         print(f'Returned {domain}')
#         domains.append(domain)
#     else:
#         print(f'Skipping {co}...')
#         domains.append('n/a')

Checking AstraZeneca PLC...
Returned astrazeneca.com
Skipping CanSino Biological Inc....
Skipping Beijing Institute of Biotechnology...
Checking Moderna, Inc....
Returned modernatx.com
Skipping NIAID...
Skipping Wuhan Institute of Biological Products...
Checking Sinopharm Group Co., Ltd....
Returned sinopharm.com
Skipping Beijing Institute of Biological Products...
Checking Sinopharm Group Co., Ltd....
Returned sinopharm.com
Checking Sinovac Biotech Ltd....
Returned sinovac.com
Checking Novavax, Inc....
Returned novavax.com
Checking BioNTech SE...
Returned biontech.de
Checking Shanghai Fosun Pharmaceutical (Group) Co., Ltd....
Returned fosunpharma.com
Checking Pfizer Inc....
Returned pfizer.com
Checking Inovio Pharmaceuticals, Inc....
Returned inovio.com
Skipping Genexine Consortium...
Skipping Gamaleya Research Institute...
Skipping Clover Biopharmaceuticals Inc....
Checking GlaxoSmithKline plc...
Returned gsk.com
Checking Dynavax Technologies Corporation...
Returned n/a
Skipping Anhu

## Approach by searching via Ticker with yahooquery package
Package found here https://pypi.org/project/yahooquery/

Snippet example found here https://stackoverflow.com/questions/41527912/how-to-get-company-website-from-a-finance-ticker-stock-symbol

In [75]:
# Look up each listed company's website via its Yahoo! Finance asset profile.
# Companies without a Yahoo listing (marked 'n/a' in df2) are skipped.
domains = []
for yco, sym in zip(df2['Yahoo Listed Co.'], df2['Symbol']):
    if yco == 'n/a':
        domains.append('n/a')
        continue

    t = Ticker(sym, asynchronous=True)
    data = t.asset_profile

    # NOTE(review): yahooquery appears to report a failed lookup as a plain
    # message string in place of the per-symbol profile dict — confirm against
    # its docs. Anything that is not a dict, or has no 'website' key, is
    # recorded as 'n/a' rather than raising.
    profile = data.get(sym) if isinstance(data, dict) else None
    if isinstance(profile, dict):
        domains.append(profile.get('website', 'n/a'))
    else:
        domains.append('n/a')

In [76]:
# Show every column when rendering, then display a copy of the company table
# with the looked-up websites appended as a 'URLs' column.
pd.options.display.max_columns = None
df2_urls = df2.assign(URLs=domains)
df2_urls

Unnamed: 0,Company,Yahoo Listed Co.,Symbol,Exchange,Market Cap,Company Size,Is American,URLs
0,AstraZeneca,AstraZeneca PLC,AZN,NYSE,"$137,126,248,448",v-large,Y,http://www.astrazeneca.com
1,CanSino Biological Inc.,,,,,,,
2,Beijing Institute of Biotechnology,,,,,,,
3,Moderna,"Moderna, Inc.",MRNA,OTC Markets,"$22,863,685,632",medium,N,http://www.modernatx.com
4,NIAID,,,,,,,
5,Wuhan Institute of Biological Products,,,,,,,
6,Sinopharm,"Sinopharm Group Co., Ltd.",SHTDY,OTC Markets,"$8,092,185,600",small,N,http://www.sinopharmgroup.com.cn
7,Beijing Institute of Biological Products,,,,,,,
8,Sinopharm,"Sinopharm Group Co., Ltd.",SHTDY,OTC Markets,"$8,092,185,600",small,N,http://www.sinopharmgroup.com.cn
9,Sinovac,Sinovac Biotech Ltd.,SVA,NASDAQ,"$460,249,280",small,Y,http://www.sinovacbio.com


http://www.astrazeneca.com
