In [1]:
# || Header || v 1.0.0
import numpy as np
import pandas as pd
import math
from scipy import stats
import linecache
import ipywidgets as widgets
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
import os
from importlib import reload
from rake_nltk import Rake
import keyword_lib as lib

In [2]:
# Search terms, kept in two themed lists and merged into `keywords` below.
financial_keywords = ['investment management', 'investment', 'finance' ]
keywords = ['Biosimilar', 'Otezla', 'Omecamtiv mecarbil', 'Aimovig', 'AMG 510', 'Neulasta', 'Onpro', 'M&A', 'Enbrel', 'Expense', 'Guidance',
           'Payer mix', 'BiTE', 'Parsabiv', 'BD', 'margin', 'COVID Impact', 'Adaptive colab' ,'Inventory', 'Telemedicine', 'PD1', 'Aimovig',
           'IL2', 'Drug pricing', 'Omecamtiv mecarbil', 'Tezepelumab', 'prolia', 'executive orders', 'neulasta onpro', 'Sotorasib', 'COVID',
           'AMG510', 'Repatha', 'EVENITY']
# Lower-case every term (case-insensitive matching), append the financial terms,
# and drop duplicates. dict.fromkeys is used instead of set() because it keeps
# first-seen order: a set of strings is ordered by hash, which changes between
# kernel restarts, so the keyword order would differ from run to run.
keywords = list(dict.fromkeys(term.lower() for term in keywords + financial_keywords))

In [3]:
# Finds all .pdf files in given directory
# NOTE: my_dir is an absolute, machine-specific path; change it here when
# running the notebook elsewhere. The trailing slash is kept as-is because the
# value is passed unchanged to lib.Transcript.
my_dir = '/home/andy/OneDrive/Python/forMinh/pdfs/'
# os.listdir returns entries in arbitrary order, so sort the names to make the
# file list (and everything built from it) identical on every run.
pdfs = sorted(filename for filename in os.listdir(my_dir) if filename.endswith('.pdf'))

In [4]:
# Creates a dataframe of all .pdf files with columns [object, company, quarter]
# Each row holds the lib.Transcript built for one file plus that object's
# `company` and `quarter` attributes.
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call, making the loop quadratic.
records = []
for filename in pdfs:
    f = lib.Transcript(my_dir, filename)
    records.append({'object': f, 'company': f.company, 'quarter': f.quarter})
# Passing `columns` keeps the three expected columns even when no PDFs were found.
df = pd.DataFrame(records, columns=['object', 'company', 'quarter'])

In [None]:
# Widget to choose which file, according to company name and financial quarter, and generate an output
# Company names are de-duplicated and sorted so the dropdown order is stable
# across kernel restarts (iterating a set of strings is hash-ordered).
# key=str keeps the sort from failing if a company value is not a string.
c_select = widgets.Dropdown(options = ['Select Company'] + sorted(set(df.company), key=str), description='Company:')
# Starts with a placeholder only; update_quarter fills it once a company is chosen.
q_select = widgets.Dropdown(options = ['Select Quarter'], description='Quarter:')
# Which of the two keyword reports to produce.
r_select = widgets.Dropdown(options = ['Management', 'Q&A'], description='Report Type:')
button = widgets.Button(description='Run')
# Output area the report is rendered into.
output = widgets.Output()

def update_quarter(*args):
    """Refresh the quarter dropdown to match the currently selected company.

    Registered as an observer on ``c_select``; the change notification passed
    in ``*args`` is ignored because the current value is read from the widget.
    """
    # Select the quarters of the chosen company in one step and hand the widget
    # a plain list rather than a pandas Series.
    quarters = df.loc[df.company == c_select.value, 'quarter'].tolist()
    # When nothing matches (e.g. the 'Select Company' placeholder is chosen),
    # restore the placeholder instead of leaving the dropdown empty.
    q_select.options = quarters if quarters else ['Select Quarter']
c_select.observe(update_quarter, 'value')

def showOutput(btn):
    """Render the keyword report for the transcript picked in the dropdowns.

    Button click handler. Reads the company, quarter and report type from the
    three dropdowns, looks up the matching row of ``df`` and displays the
    report as a two-column (keyword, count) table inside ``output``.

    Parameters
    ----------
    btn : the clicked button, as passed by ``Button.on_click``; unused.

    Raises
    ------
    ValueError
        If the report type is neither 'Management' nor 'Q&A'.
    """
    output.clear_output()
    company, quarter, report_type = c_select.value, q_select.value, r_select.value

    # The mask must span the whole frame: df.index can only be filtered by a
    # boolean mask of its own length, so a mask computed on just the company's
    # rows fails (or misaligns) as soon as more than one company is loaded.
    matches = df.index[(df.company == company) & (df.quarter == quarter)]
    if len(matches) == 0:
        # Placeholder selections or a stale quarter: report it instead of
        # raising IndexError inside the widget callback.
        with output:
            print(f'No transcript found for {company} {quarter}')
        return

    # If several files share the same company and quarter, the first one is used.
    transcript = df.object[matches[0]]
    if report_type == 'Management':
        report = transcript.keywordsByManagement(keywords)
    elif report_type == 'Q&A':
        # keywordsByQuestioner returns an indexable result; element 1 is the
        # part shown here.
        report = transcript.keywordsByQuestioner(keywords)[1]
    else:
        raise ValueError(f'Unknown report type: {report_type!r}')

    with output:
        # f-string so a non-string quarter value does not break the header.
        print(f'{report_type} report for {company} {quarter}')
        display(pd.DataFrame(report, columns=['keyword', 'count']))

# Run showOutput each time the button is clicked.
button.on_click(showOutput)

# Put the three selectors and the Run button in a single row, then show that
# row together with the output area.
ui = widgets.HBox(children=[c_select, q_select, r_select, button])
display(ui, output)