In [1]:
# || Header ||
import numpy as np
import pandas as pd
import math
from scipy import stats
import linecache
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
from importlib import reload

In [2]:
# Search terms to look for in the transcripts.
keywords = [
    'Biosimilar', 'Otezla', 'Omecamtiv mecarbil', 'Aimovig', 'AMG 510', 'Neulasta', 'Onpro', 'M&A', 'Enbrel',
    'Expense', 'Guidance', 'Payer mix', 'BiTE', 'Parsabiv', 'BD', 'margin', 'COVID Impact', 'Adaptive colab',
    'Inventory', 'Telemedicine', 'PD1', 'Aimovig', 'IL2', 'Drug pricing', 'Omecamtiv mecarbil', 'Tezepelumab',
    'prolia', 'executive orders', 'neulasta onpro', 'Sotorasib', 'COVID', 'AMG510', 'Repatha', 'EVENITY',
]
keywords = [i.lower() for i in keywords]    # make case insensitive
# Remove duplicates but keep first-seen order. list(set(...)) orders strings by
# hash, which differs between kernel runs and so reshuffles ties in the
# sorted results produced further down.
keywords = list(dict.fromkeys(keywords))

In [3]:
# Notebook functions
def pdfparser(data):
    '''
    Converts .pdf to one long string
    Input should be the path of the .pdf file
    Returns the text of all pages as a single string (empty string if the
        file contains no pages)
    '''
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # The context manager closes the file even if a page fails to parse.
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once, after the last page; the converter appends
        # every processed page to the same StringIO.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    return text

def findCompany(my_list):
    '''
    Reads company name along with its abbreviation
    Input should be first 100 words of file as a list of strings
    Outputs as a string, e.g. 'Amgen, Inc, (AMGN)'

    The name is taken to be the words between the first date-like word
        (longer than 8 characters, ending in three letters, '-', four digits)
        and the next 'Q1'/'Q2'/'Q3'/'Q4' word
    Raises ValueError if either of those two markers cannot be found
    '''
    date_ind = None
    for i, word in enumerate(my_list):
        # The length check keeps the negative indices below inside the word.
        if len(word) > 8 and word[-4:].isnumeric() and word[-5] == '-' and word[-8:-5].isalpha():
            date_ind = i
            break
    if date_ind is None:
        raise ValueError('findCompany: no date-like word (e.g. 30-Apr-2020) found in input')

    quarter_ind = None
    for i in range(date_ind, len(my_list)):
        if my_list[i] in ('Q1', 'Q2', 'Q3', 'Q4'):
            quarter_ind = i
            break
    if quarter_ind is None:
        raise ValueError('findCompany: no quarter marker (Q1-Q4) found after the date')

    return ' '.join(my_list[date_ind+1: quarter_ind])

def removePunctuation(my_list, ref=',.?!:;\''):
    '''
    Strips every character listed in ref from a list of strings, wherever
        in a string it occurs
    The strings are joined with spaces first and re-split on whitespace
        afterwards, so a string made up only of punctuation disappears
    Returns as list of strings
    '''
    unwanted = list(ref)
    joined = ' '.join(my_list)
    kept = [ch for ch in joined if ch not in unwanted]
    return ''.join(kept).split()

def removeHeaderFooter(qa_paras, company):
    '''
    Removes header/footer found between pages.
    Input should be list of paragraphs and company name, ref findCompany()
    Outputs as list of paragraphs

    The footer is located by the words 'FactSet CallStreet, LLC'; it and the
        words before it (len(company.split()) + 16 words in total, ending at
        'LLC') are cut out. Only the first occurrence in a paragraph is removed.
    The input list is updated in place and also returned.
    '''
    # Total number of words removed per header/footer, counted back from 'LLC'.
    # NOTE(review): the 16 presumably matches the fixed page header/footer text
    # of the transcripts - confirm against a sample PDF.
    len_footer = len(company.split()) + 16

    for i in range(len(qa_paras)):
        my_para = qa_paras[i].split()

        footer_end = 0
        # Stop two words early so the two-word look-ahead stays in range when
        # 'FactSet' is one of the last words of the paragraph.
        for j in range(len(my_para) - 2):
            if my_para[j] == 'FactSet' and my_para[j+1] == 'CallStreet,' and my_para[j+2] == 'LLC':
                footer_end = j+2
                break

        if footer_end > 0:
            # Clamp at 0: a negative start would be read as an index from the
            # end of the list and keep (or duplicate) words it should not.
            footer_start = max(footer_end - len_footer + 1, 0)
            my_para = my_para[:footer_start] + my_para[footer_end+1:]
            qa_paras[i] = ' '.join(my_para)

    return qa_paras

def checkKeyword(my_list, keyword):
    '''
    Checks if a given keyword or pair of keywords is found in the list of strings.
    Simple plural/possessive forms of the keyword also count as a match.
    A multi-word keyword has to appear as consecutive words of the list.
    Returns either True or False (False for an empty keyword)
    '''
    def checkPlurals(my_str, keyword):
        # The keyword itself plus its simple plural/possessive variants.
        variants = [keyword, keyword+'s', keyword+'es', keyword+'\'s']
        if keyword.endswith('y'):
            # y -> ies only makes sense for keywords that end in 'y'
            variants.append(keyword[:-1]+'ies')
        return my_str in variants

    len_pair = len(keyword.split())
    if len_pair == 0:
        return False

    if len_pair == 1:
        return any(checkPlurals(word, keyword) for word in my_list)

    # Slide a window of len_pair words over the list. Only real words are
    # compared, so no filler word is needed at the end of the list.
    for start in range(len(my_list) - len_pair + 1):
        if checkPlurals(' '.join(my_list[start:start+len_pair]), keyword):
            return True
    return False

def searchPairs(my_list, search_pair):
    '''
    Searches list of words for pair of search words and returns the index
        at which the pair begins
    The comparison ignores case; search_pair may hold any number of words
    Returns a list with the start index of every match in ascending order,
        so element [0] is the first occurrence (empty list if no match)
    '''
    target_words = search_pair.lower().split()
    len_pair = len(target_words)
    if len_pair == 0:
        return []
    target = ' '.join(target_words)

    # Slide a window of len_pair words over the list; scanning left to right
    # gives the indices in ascending order.
    return [start for start in range(len(my_list) - len_pair + 1)
            if ' '.join(my_list[start:start+len_pair]).lower() == target]

def _splitParagraphs(section_words):
    '''
    Splits a list of words into paragraphs at separator words, i.e. words
        starting with '.....'
    The words before the first separator (the section heading) and the words
        after the last separator are not part of the result
    Returns as list of paragraphs (empty list if there are no separators)
    '''
    paras = []
    start = 0
    for i in range(len(section_words)):
        if section_words[i][:5] == '.....':
            paras += [' '.join(section_words[start:i])]
            start = i + 1
    # Drop the leading chunk; slicing also copes with an empty list.
    return paras[1:]

def pullCleanManagementDiscussion(my_dir, filename):
    '''
    Given directory of file and filename, reads .pdf and pulls management discussion section
        while removing header/footer
    Returns as list of paragraphs
    Raises IndexError if either section heading is missing from the file
    '''
    # pulls all text from pdf
    words = pdfparser(my_dir+filename).split()

    # isolates management discussion section: from its heading up to the Q&A heading
    section_start = searchPairs(words, 'management discussion section')[0]
    section_end = searchPairs(words, 'question and answer section')[0]
    management_section = words[section_start:section_end]

    # separates management discussion section into paragraphs
    management_paras = _splitParagraphs(management_section)

    # Removes header/footer from management discussion paragraphs
    company = findCompany(words[:50])
    management_paras = removeHeaderFooter(management_paras, company)

    return management_paras

def pullCleanQA(my_dir, filename):
    '''
    Given directory of file and filename, reads .pdf and pulls Q&A paragraphs
        while removing header/footer
    Returns as list of paragraphs
    Raises IndexError if the Q&A section heading is missing from the file
    '''
    # pulls all text from pdf
    words = pdfparser(my_dir+filename).split()

    # isolates Q&A section: from its heading to the end of the file
    qa_section = words[searchPairs(words, 'question and answer section')[0]:]

    # separates Q&A section into paragraphs
    qa_paras = _splitParagraphs(qa_section)

    # Removes header/footer from Q&A paragraphs
    company = findCompany(words[:50])
    qa_paras = removeHeaderFooter(qa_paras, company)

    return qa_paras

def keywordsByQuestioner(qa_paras, keywords=keywords):
    '''
    Goes through Q&A and finds which questioners mention which keyword
    A paragraph counts as a question if it contains the stand-alone word 'q';
        the words before it are used as the questioner's name ('q' itself if
        it is the first word)
    keywords are compared against lower-cased, punctuation-free text
    Returns two lists, both sorted from most to fewest questioners and both
        leaving out keywords nobody mentioned:
        - (keyword, [names of questioners]) tuples, names in order of first mention
        - (keyword, number of distinct questioners) tuples
    '''
    # A keyword listed twice must not be processed (or removed) twice.
    unique_keywords = list(dict.fromkeys(keywords))
    keyword_dict = {k: [] for k in unique_keywords}

    for para in qa_paras:
        raw_words = para.split()
        my_para = removePunctuation([n.lower() for n in raw_words])

        # determine if question paragraph and pull name
        if 'q' not in my_para:
            continue
        q_ind = my_para.index('q')
        # NOTE(review): q_ind is a position in the punctuation-stripped list but
        # slices the unstripped words; the two can differ if a word made up only
        # of punctuation comes before the 'q'.
        name = 'q' if q_ind == 0 else ' '.join(raw_words[:q_ind])

        # search through paragraph for keywords, recording each name once
        for k in unique_keywords:
            if checkKeyword(my_para, k) and name not in keyword_dict[k]:
                keyword_dict[k] += [name]

    # keep only keywords that were mentioned, then count and sort
    keyword_dict = {k: names for k, names in keyword_dict.items() if names}
    keyword_count = {k: len(names) for k, names in keyword_dict.items()}
    keyword_dict = sorted(keyword_dict.items(), key=lambda x: -len(x[1]))
    keyword_count = sorted(keyword_count.items(), key=lambda x: -x[1])

    return keyword_dict, keyword_count

def keywordsByManagement(management_paras, keywords=keywords):
    '''
    Counts number of times a keyword is mentioned in the management discussion section
    The text is lower-cased and stripped of punctuation first; keywords are
        matched exactly (no plural forms), multi-word keywords as consecutive words
    Returns list of (keyword, count) tuples sorted from highest to lowest count,
        leaving out keywords that were never mentioned
    '''
    # A keyword listed twice must not be counted (or removed) twice.
    unique_keywords = list(dict.fromkeys(keywords))

    my_list = removePunctuation(' '.join(management_paras).lower().split())

    keyword_count = {}
    for k in unique_keywords:
        len_pair = len(k.split())
        if len_pair == 0:
            continue
        if len_pair == 1:
            hits = my_list.count(k)
        else:
            # Count every position at which the next len_pair words spell the keyword.
            hits = sum(1 for start in range(len(my_list) - len_pair + 1)
                       if ' '.join(my_list[start:start+len_pair]) == k)
        if hits > 0:
            keyword_count[k] = hits

    return sorted(keyword_count.items(), key=lambda x: -x[1])

In [4]:
# Input transcript to analyse. my_dir and filename are joined by plain string
# concatenation, so my_dir has to end with a path separator.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
my_dir = '/home/andy/OneDrive/Python/forMinh/pdfs/'   # directory containing .pdf file
filename = 'AMGN 1Q 2020 EPS Call 4 30 2020.pdf'

In [5]:
# Pull the Q&A paragraphs from the PDF, then find which questioners mention
# which keyword (uses the default keyword list).
qa_paras = pullCleanQA(my_dir, filename)
qa_dict, qa_count = keywordsByQuestioner(qa_paras)

In [6]:
# Keywords with the questioners who mentioned them, most questioners first.
qa_dict

[('covid', ['Matthew Harrison', 'q', 'Geoff Meacham', 'Carter Gould']),
 ('otezla', ['q', 'Geoff Meacham', 'Alethia Young']),
 ('biosimilar', ['q', 'Michael Schmidt', 'Ronny Gal']),
 ('amg510', ['Michael Yee', 'Terence Flynn']),
 ('margin', ['Jay Olson']),
 ('enbrel', ['q']),
 ('telemedicine', ['Robyn Karnauskas']),
 ('prolia', ['q']),
 ('evenity', ['Robyn Karnauskas']),
 ('bd', ['Geoff Meacham']),
 ('aimovig', ['Cory Kasimov']),
 ('drug pricing', ['q']),
 ('m&a', ['q'])]

In [7]:
# Number of distinct questioners per keyword, highest first.
qa_count

[('covid', 4),
 ('otezla', 3),
 ('biosimilar', 3),
 ('amg510', 2),
 ('margin', 1),
 ('enbrel', 1),
 ('telemedicine', 1),
 ('prolia', 1),
 ('evenity', 1),
 ('bd', 1),
 ('aimovig', 1),
 ('drug pricing', 1),
 ('m&a', 1)]

In [8]:
# Pull the management discussion paragraphs from the same PDF and count how
# often each keyword is mentioned (uses the default keyword list).
management_paras = pullCleanManagementDiscussion(my_dir, filename)
management_count = keywordsByManagement(management_paras)

In [9]:
# Mentions per keyword in the management discussion, highest first.
management_count

[('otezla', 12),
 ('guidance', 8),
 ('prolia', 6),
 ('evenity', 5),
 ('expense', 5),
 ('covid', 5),
 ('onpro', 3),
 ('bite', 3),
 ('amg 510', 3),
 ('aimovig', 3),
 ('enbrel', 2),
 ('telemedicine', 2),
 ('parsabiv', 2),
 ('omecamtiv mecarbil', 2),
 ('margin', 1),
 ('inventory', 1),
 ('biosimilar', 1),
 ('neulasta', 1),
 ('tezepelumab', 1),
 ('repatha', 1)]

In [10]:
# Save the Q&A counts, one "keyword,  count" line per keyword.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('qa_count.out', 'w') as file_out:
    for pairs in qa_count:
        file_out.write('%s,  %s\n'%(pairs[0], pairs[1]))

In [11]:
# Save the management discussion counts, one "keyword,  count" line per keyword.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('management_count.out', 'w') as file_out:
    for pairs in management_count:
        file_out.write('%s,  %s\n'%(pairs[0], pairs[1]))

In [None]:
# Switch the input to a second transcript. This overwrites my_dir/filename
# from the earlier cell; the cell has no execution count in the saved notebook.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
my_dir = '/home/andy/OneDrive/Python/forMinh/pdfs/'
filename = 'CORRECTED TRANSCRIPT_ Amgen, Inc.(AMGN-US), Q2 2020 Earnings Call, 28-July-2020 5_00 PM ET.pdf'

In [None]:
# Repeat the Q&A analysis for the file currently named by my_dir/filename.
# Overwrites qa_paras, qa_dict and qa_count from the earlier run.
qa_paras = pullCleanQA(my_dir, filename)
qa_dict, qa_count = keywordsByQuestioner(qa_paras)

In [None]:
# Repeat the management discussion analysis for the file currently named by
# my_dir/filename. Overwrites management_paras and management_count from the earlier run.
management_paras = pullCleanManagementDiscussion(my_dir, filename)
management_count = keywordsByManagement(management_paras)