In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import os
import csv
import json
import urllib
import requests
import time
import pandas as pd
from pandas_datareader import data as pd_data
from yahooquery import Ticker
from bs4 import BeautifulSoup

  from pandas.util.testing import assert_frame_equal


# Get setup from key snippets from Parse_CSV_for_companies 

## Methods

In [4]:
# Function to see if organization is a university
def is_academia(organization: str):
    keywords = ['University', 'College', 'Academy']
    
    # https://stackoverflow.com/questions/8122079/python-how-to-check-a-string-for-substrings-from-a-list
    tf = any(map(organization.__contains__, keywords))
    return tf





def check_usa_mkts(data):
    usa_mkts = ['NYSE', 'NASDAQ', 'AMEX', 
               'BSE', 'CBOE', 'CBOT', 
               'CME', 'CHX', 'ISE', 
               'MS4X', 'NSX', 'PHLX']
    
    df = pd.DataFrame(data)
    match = df.loc[df['exchDisp'].isin(usa_mkts)]
    
    if len(match) == 0:
        symbol = df['symbol'][0]
        exchange = df['exchDisp'][0]
        name = df['name'][0]
        usa = 'N'
    
    else:
        symbol = match['symbol'][0]
        exchange = match['exchDisp'][0]
        name = match['name'][0]
        usa = 'Y'
        
    return symbol, exchange, name, usa

def get_and_parse_query(query):
    r = requests.get(query)
    fdata = r.text.split('(', 1)[1]
    fdata = fdata.rsplit(')', 1)[0]
    data = json.loads(fdata)
    data = data['ResultSet']['Result']
    
    return data




def id_company_size(market_cap):
    sizes = ['small', 'medium', 'large', 'v-large']
    thresh = [0, 10e9, 25e9, 50e9]
    
    if market_cap > thresh[3]:
        size = sizes[3]
    elif market_cap > thresh[2]:
        size = sizes[2]
    elif market_cap > thresh[1]:
        size = sizes[1]
    else:
        size = sizes[0]
        
    return size


def get_market_cap(symbol):
    cap = pd_data.get_quote_yahoo(symbol)['marketCap']
    cap = cap[symbol]
    
    return cap

In [5]:
path_data = '../datasets/WHO-covid19-clinicaltrials.csv'
df = pd.read_csv(path_data)
developers = df['Developer'].tolist()
companies = []


# GET LIST OF COMPANIES
# For each row, split to individual organizations
for developer in developers:
    organizations = developer.split('/')
    
    # For each organization, clean up formatting and add to companies list if not academia
    for organization in organizations:
        organization = organization.replace('\n', ' ')
        if not is_academia(organization):
            companies.append(organization)

      
    
# GET COMPANY SYMBOLS AND EXCHANGES    
symbols = []
exchanges = []
ynames = []
am_tf = []

for i, company in enumerate(companies):
    fco = company.replace('.', '').replace(' ', '%20')
    query = f'http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={fco}&region=1&lang=en&callback=YAHOO.Finance.SymbolSuggest.ssCallback'
    
    entry = get_and_parse_query(query)        

    if len(entry) == 0:
        print(f'[{i}] {company} found no matches')
        symbols.append('n/a')
        exchanges.append('n/a')
        ynames.append('n/a')
        am_tf.append('n/a')
        
    
    else:
        symbol, exchange, yname, am = check_usa_mkts(entry)
        symbols.append(symbol)
        exchanges.append(exchange)
        ynames.append(yname)
        am_tf.append(am)
        print(f'[{i}] {company} matched with {yname} as {symbol} on {exchange}')
        
        
        
# GET MARKET CAPS AND CLASSIFY SIZE OF COMPANY
market_caps = []
market_sizes = []


for company, symbol in zip(companies, symbols):
    if symbol != 'n/a':
        try:
            cap = get_market_cap(symbol)
            size = id_company_size(cap)
            
            market_caps.append(cap)
            market_sizes.append(size)
            
            print(f'{company} [Market Cap]: ${cap:,} - {size.upper()}')
            
            
        except:
            market_caps.append('n/a')
            market_sizes.append('n/a')
            print(f'{company} not found on Yahoo! Finance')
    else:
        market_caps.append('n/a')
        market_sizes.append('n/a')
        print(f'{company} not available')
        
        
        
        
# Format dollar entries
fmarket_caps = []
for cap in market_caps:
    if cap != 'n/a':
        fcap = f'${cap:,}'
    else:
        fcap = cap
    
    fmarket_caps.append(fcap)
    
    

# CREATE NEW DATAFRAME
df2 = pd.DataFrame({'Company': companies, 'Yahoo Listed Co.': ynames, 
                    'Symbol': symbols, 'Exchange': exchanges, 
                    'Market Cap': fmarket_caps, 'Company Size': market_sizes, 
                    'Is American': am_tf})
df2.head(23)

[0] AstraZeneca matched with AstraZeneca PLC as AZN on NYSE
[1] CanSino Biological Inc. found no matches
[2] Beijing Institute of Biotechnology found no matches
[3] Moderna matched with Moderna, Inc. as MRNA on NASDAQ
[4] NIAID found no matches
[5] Wuhan Institute of Biological Products found no matches
[6] Sinopharm matched with Sinopharm Group Co., Ltd. as SHTDY on OTC Markets
[7] Beijing Institute of Biological Products found no matches
[8] Sinopharm matched with Sinopharm Group Co., Ltd. as SHTDY on OTC Markets
[9] Sinovac matched with Sinovac Biotech Ltd. as SVA on NASDAQ
[10] Novavax matched with Novavax, Inc. as NVAX on NASDAQ
[11] BioNTech matched with BioNTech SE as BNTX on NASDAQ
[12] Fosun Pharma matched with Shanghai Fosun Pharmaceutical (Group) Co., Ltd. as SFOSF on OTC Markets
[13] Pfizer matched with Pfizer Inc. as PFE on NYSE
[14] Inovio Pharmaceuticals matched with Inovio Pharmaceuticals, Inc. as INO on NASDAQ
[15] Genexine Consortium found no matches
[16] Gamaleya Res

Unnamed: 0,Company,Yahoo Listed Co.,Symbol,Exchange,Market Cap,Company Size,Is American
0,AstraZeneca,AstraZeneca PLC,AZN,NYSE,"$140,554,649,600",v-large,Y
1,CanSino Biological Inc.,,,,,,
2,Beijing Institute of Biotechnology,,,,,,
3,Moderna,"Moderna, Inc.",MRNA,NASDAQ,"$21,742,589,952",medium,Y
4,NIAID,,,,,,
5,Wuhan Institute of Biological Products,,,,,,
6,Sinopharm,"Sinopharm Group Co., Ltd.",SHTDY,OTC Markets,"$8,092,427,776",small,N
7,Beijing Institute of Biological Products,,,,,,
8,Sinopharm,"Sinopharm Group Co., Ltd.",SHTDY,OTC Markets,"$8,092,427,776",small,N
9,Sinovac,Sinovac Biotech Ltd.,SVA,NASDAQ,"$460,249,280",small,Y


# Look up each company

## Attempt by using the clearbit autocomplete API
method recommended on https://medium.com/the-red-fish/automate-finding-a-company-url-with-a-company-name-on-google-sheets-for-free-in-3-easy-steps-7ea77280bcdc

In [6]:
# def get_company_url(name):
#     QUERY_BASE = 'https://autocomplete.clearbit.com/v1/companies/suggest?query=' 
#     QUERY = f'{QUERY_BASE}{name}'

#     r = requests.get(QUERY)
#     entry = json.loads(r.text)
    
#     if len(entry) > 0:
#         return entry[0]['domain']
#     else:
#         return 'n/a'

In [7]:
# domains = []

# for ii in range(len(df2)):
#     co, yco, sym, exch, cap, size, usa = df2.iloc[ii].T.values
    
#     if yco != 'n/a':
#         print(f'Checking {yco}...')
#         domain = get_company_url(yco.replace('.', '').replace(' ', '%20'))
        
#         if domain == 'n/a':
#             domain = get_company_url(co.replace('.', '').replace(' ', '%20'))
            
#         print(f'Returned {domain}')
#         domains.append(domain)
#     else:
#         print(f'Skipping {co}...')
#         domains.append('n/a')

## Approach by searching via Ticker with yahooquery package
Package found here https://pypi.org/project/yahooquery/

Snippet example found here https://stackoverflow.com/questions/41527912/how-to-get-company-website-from-a-finance-ticker-stock-symbol

In [8]:
domains = []
for ii in range(len(df2)):
    co, yco, sym, exch, cap, size, usa = df2.iloc[ii].T.values
    
    if yco != 'n/a':
        t = Ticker(sym, asynchronous=True)
        data = t.asset_profile
        domains.append(data[sym]['website'])
    else:
        domains.append('n/a')

In [9]:
# df2_urls = df2.copy()
# df2_urls['URLs'] = domains
# pd.set_option('display.max_columns', None)
# df2_urls

# Get each company's press release landing page

In [10]:
from googlesearch import search 

def search_google(query, num_results = 1):
    results = []
    num_ppg = min([num_results, 10])
    for i in search(query,           # The query you want to run
                tld = 'com',         # The top level domain
                lang = 'en',         # The language
                num = num_ppg,       # Number of results per page
                start = 0,           # First result to retrieve
                stop = num_results,  # Last result to retrieve
                pause = 3.0,         # Lapse between HTTP requests
               ):
        
        results.append(i)
    
    if len(results) == 0:
        results = ['n/a']
        
    return results


def prune_domain(url:str):
    cut_chars = ['https://', 'http://', 'www.']
    
    for cut in cut_chars:
        url = url.replace(cut, '')
    
    if url[-1] == '/':
        url = url[0:-1]
    
    return url


    

In [11]:
domains_pr = []
for co, domain in zip(companies, domains):
    if domain != 'n/a':
        print('----------------------------------------')
        print(f'Searching for PR page on {domain} ...')
        
        QUERY = f'site:{prune_domain(domain)} press releases'
        print(f'QUERY: \'{QUERY}\'')
        pr_url = search_google(QUERY)
        
        print(f'Found: {pr_url[0]}\n\n')
        
        domains_pr.append(pr_url[0])
        
        time.sleep(5)
        
    else:
        print('----------------------------------------')
        print(f'Skipping {co}')
        print('...\n\n')
        domains_pr.append('n/a')

print(domains_pr)

----------------------------------------
Searching for PR page on http://www.astrazeneca.com ...
QUERY: 'site:astrazeneca.com press releases'
Found: https://www.astrazeneca.com/media-centre/press-releases.html


----------------------------------------
Skipping CanSino Biological Inc.
...


----------------------------------------
Skipping Beijing Institute of Biotechnology
...


----------------------------------------
Searching for PR page on http://www.modernatx.com ...
QUERY: 'site:modernatx.com press releases'
Found: https://investors.modernatx.com/news-releases/


----------------------------------------
Skipping NIAID
...


----------------------------------------
Skipping Wuhan Institute of Biological Products
...


----------------------------------------
Searching for PR page on http://www.sinopharmgroup.com.cn ...
QUERY: 'site:sinopharmgroup.com.cn press releases'
Found: http://ir.sinopharmgroup.com.cn/


----------------------------------------
Skipping Beijing Institute of

In [12]:
# For some reason, calling this on its own will give me the actual press release page, but in the loop it doesn't work
#domain = 'http://dynavax.com'
domain = 'http://www.novavax.com'
QUERY = f'site:{domain} press releases'
print(QUERY)
pr_url = search_google(QUERY)
print(pr_url[0])

# FIXED

site:http://www.novavax.com press releases
https://www.novavax.com/page/17/cadila-pharmaceuticals-collaboration


In [13]:
# query = "selenium"
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
# url = 'https://www.google.com?q=' + query
# res = requests.get(url, headers=headers)
# print(res)

In [14]:
# Method based off https://hackernoon.com/how-to-scrape-google-with-python-bo7d2tal
# def scrape_google(query):
#     query = query.replace(' ', '%20')
#     URL_base = f'https://google.com/search?q={query}'
#     USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
#     headers = {'user-agent' : USER_AGENT}
#     r = requests.get(URL_base, headers = headers)
    
#     if r.status_code == 200:
#         soup = BeautifulSoup(r.content, 'html.parser')
        
#         results = []
#         for i, g in enumerate(soup.find_all('div', class_='r')):
#             anchors = g.find_all('a')
#             if anchors:
#                 link = anchors[0]['href']
#                 title = g.find('h3').text
#                 item = {
#                     "title": title,
#                     "link": link
#                 }
#                 results.append(item)
        
#         return results
    
#     else:
#         print(r.status_code)
#         return 'n/a'

In [15]:
# domain = 'https://www.dynavax.com/'
# QUERY = f'site:{domain} press releases'
# print(QUERY)
# r = scrape_google(QUERY)

In [16]:
# for result in r:
#     print(result['link'])

# print(r)

In [17]:
# import requests
# from bs4 import BeautifulSoup
# import re

# def scrape_google2(query, num_results = 10):
    
#     search = query
#     results = num_results
    
#     page = requests.get(f"https://www.google.com/search?q={search}&num={results}")
#     soup = BeautifulSoup(page.content, "html.parser")
#     links = soup.findAll("a")
    
#     results = []
#     for link in links :
#         link_href = link.get('href')
#         if "url?q=" in link_href and not "webcache" in link_href:
#             results.append(link.get('href').split("?q=")[1].split("&sa=U")[0])
            
#     return results

In [18]:
# https://stackoverflow.com/questions/43530930/how-to-access-top-five-google-result-links-using-beautifulsoup
# domain = 'dynavax.com'
# QUERY = f'site:{domain} press releases'
# print(QUERY)
# results = scrape_google2(QUERY)
# for result in results:
#     print(result)


In [19]:
df2_urls = df2.copy()
df2_urls['Home URL'] = domains
df2_urls['Press Release URL'] = domains_pr
pd.set_option('display.max_columns', None)
df2_urls = df2_urls.drop_duplicates()
df2_urls.to_csv('../datasets/compiled_company_info.csv', index = False)

In [21]:
df2_urls.head(len(df2_urls))

Unnamed: 0,Company,Yahoo Listed Co.,Symbol,Exchange,Market Cap,Company Size,Is American,Home URL,Press Release URL
0,AstraZeneca,AstraZeneca PLC,AZN,NYSE,"$140,554,649,600",v-large,Y,http://www.astrazeneca.com,https://www.astrazeneca.com/media-centre/press...
1,CanSino Biological Inc.,,,,,,,,
2,Beijing Institute of Biotechnology,,,,,,,,
3,Moderna,"Moderna, Inc.",MRNA,NASDAQ,"$21,742,589,952",medium,Y,http://www.modernatx.com,https://investors.modernatx.com/news-releases/
4,NIAID,,,,,,,,
5,Wuhan Institute of Biological Products,,,,,,,,
6,Sinopharm,"Sinopharm Group Co., Ltd.",SHTDY,OTC Markets,"$8,092,427,776",small,N,http://www.sinopharmgroup.com.cn,http://ir.sinopharmgroup.com.cn/
7,Beijing Institute of Biological Products,,,,,,,,
9,Sinovac,Sinovac Biotech Ltd.,SVA,NASDAQ,"$460,249,280",small,Y,http://www.sinovacbio.com,http://www.sinovacbio.com/?optionid=754
10,Novavax,"Novavax, Inc.",NVAX,NASDAQ,"$4,791,271,936",small,Y,http://www.novavax.com,https://ir.novavax.com/press-releases


# Parse through press release pages

In [None]:
# scrapy vs beautifulsoup