In [None]:
# || Header || v 2.0.0
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math
from scipy import stats
import linecache
import ipywidgets as widgets
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
import os
from importlib import reload
from rake_nltk import Rake
import keyword_lib as lib
from nltk.corpus import words
import string

In [None]:
# Search vocabulary: product / topic terms plus a few generic finance terms.
# NOTE(review): 'biospecific' and 'Adaptive colab' look like possible typos
# ('bispecific', 'collab') - confirm with whoever owns the list before changing them.
financial_keywords = ['investment management', 'investment', 'finance']
keywords = [
    'Biosimilar', 'Otezla', 'Omecamtiv mecarbil', 'Aimovig', 'AMG 510', 'Neulasta', 'Onpro',
    'M&A', 'Enbrel', 'Expense', 'Guidance', 'Payer mix', 'BiTE', 'Parsabiv', 'BD', 'margin',
    'COVID Impact', 'Adaptive colab', 'Inventory', 'Telemedicine', 'PD1', 'Aimovig', 'IL2',
    'Drug pricing', 'Omecamtiv mecarbil', 'Tezepelumab', 'prolia', 'executive orders',
    'neulasta onpro', 'Sotorasib', 'COVID', 'AMG510', 'Repatha', 'EVENITY', 'biospecific',
]
# Lower-case everything (case-insensitive matching), append the finance terms and drop
# duplicates. dict.fromkeys keeps first-seen order, so the list is identical on every kernel
# restart; list(set(...)) would reorder it from session to session.
keywords = list(dict.fromkeys(term.lower() for term in keywords + financial_keywords))

In [None]:
# Finds all .pdf files in given directory
# NOTE(review): absolute path specific to one machine - adjust before running elsewhere.
my_dir = '/home/andy/OneDrive/Python/forMinh/more_pdfs/'
# os.listdir returns entries in arbitrary order; sorting keeps the file list (and the row
# order of the table built from it) the same on every run.
pdfs = sorted(name for name in os.listdir(my_dir) if name.endswith('.pdf'))

In [None]:
# Creates a dataframe of all .pdf files with columns [object, company, quarter]
# The rows are collected first and the frame is built once: DataFrame.append no longer exists
# in pandas 2.0+, and appending inside a loop copies the whole frame on every iteration.
transcripts = [lib.Transcript(my_dir, filename) for filename in pdfs]
df = pd.DataFrame(
    [{'object': t, 'company': t.company, 'quarter': t.quarter} for t in transcripts],
    columns=['object', 'company', 'quarter'],   # keeps the columns even when no PDF was found
)

In [None]:
# Widget to choose which file, according to company name and financial quarter, and generate an output
# Company names are sorted so the dropdown lists them in the same order in every session;
# iterating a set of strings can give a different order after each kernel restart.
company_options = ['Select Company'] + sorted(set(df.company), key=str)
c_select = widgets.Dropdown(options=company_options, description='Company:')
# Starts with a placeholder only; update_quarter fills it once a company is chosen.
q_select = widgets.Dropdown(options=['Select Quarter'], description='Quarter:')
s_select = widgets.Dropdown(options=['Rake', 'Manual'], description='Search Type:')
r_select = widgets.Dropdown(options=['Management', 'Q&A'], description='Report Type:')
button = widgets.Button(description='Run')
# Receives everything showOutput prints or displays.
output = widgets.Output()

def update_quarter(*args):
    """Offer only the quarters that exist for the currently selected company."""
    belongs_to_company = df.company == c_select.value
    q_select.options = df.quarter[belongs_to_company]
# Re-run the filter every time the company dropdown changes.
c_select.observe(update_quarter, names='value')

def update_report(*args):
    """Limit the report-type dropdown to the reports offered for the selected search type.

    'Rake' only offers 'Management'; 'Manual' offers 'Management' and 'Q&A'.
    """
    if s_select.value == 'Rake':
        r_select.options = ['Management']
    elif s_select.value == 'Manual':
        r_select.options = ['Management', 'Q&A']
s_select.observe(update_report, 'value')
# The observer only fires when the search type changes, so apply it once to the initial
# selection as well. Without this the start-up state allows 'Rake' together with 'Q&A',
# a combination showOutput has no branch for.
update_report()

def _split_into_quantiles(report):
    """Cut a ranked list of ``(keyword, rank)`` pairs into four consecutive slices.

    A slice is closed as soon as its running rank total reaches a quarter of the overall
    rank total (rounded with ``np.round``). Only the first three cuts are used; the fourth
    slice is whatever remains. If the report produces fewer than three cuts, the missing
    slices are returned empty instead of raising ``IndexError``.

    Returns a list of four lists, first slice first.
    """
    threshold = np.round(np.sum([entry[1] for entry in report]) / 4)
    cuts = []
    running_total = 0
    for position, entry in enumerate(report):
        running_total += entry[1]
        if running_total >= threshold:
            running_total = 0
            cuts.append(position + 1)   # exclusive end of the slice that just filled up
    end = len(report)
    cuts = (cuts + [end] * 3)[:3]       # pad short reports, ignore cuts beyond the third
    bounds = [0] + cuts + [end]
    return [report[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]


def showOutput(btn):
    """Render the keyword report for the selected company and quarter into ``output``.

    Reads the current values of ``c_select``, ``q_select``, ``s_select`` and ``r_select``,
    asks the matching transcript for its ranked keywords and shows them as four
    side-by-side tables. If no transcript matches the selection, or the search/report
    combination has no report, a short message is printed instead of raising.

    btn: the clicked button (unused; required by the ``on_click`` callback signature).
    """
    output.clear_output()
    company, quarter = c_select.value, q_select.value
    search_type, report_type = s_select.value, r_select.value

    # Look the transcript up by its own row label. The position of the quarter within the
    # company's rows is only a valid label for df when that company's rows come first.
    matches = df.index[(df.company == company) & (df.quarter == quarter)]
    if len(matches) == 0:
        with output:
            print('No transcript found for ' + str(company) + ' ' + str(quarter))
        return
    transcript = df.loc[matches[0], 'object']

    if search_type == 'Manual' and report_type == 'Management':
        report = transcript.keywordsByManagementManual(keywords)
    elif search_type == 'Manual' and report_type == 'Q&A':
        report = transcript.keywordsByQuestionerManual(keywords)[1]
    elif search_type == 'Rake' and report_type == 'Management':
        report = transcript.keywordsByManagementRake()
    else:
        with output:
            print('No ' + str(report_type) + ' report is available for search type ' + str(search_type))
        return

    with output:
        print(report_type+' report '+' ('+search_type+') '+'for '+company+' '+quarter)

        r1, r2, r3, r4 = (pd.DataFrame(chunk, columns=['keyword', 'rank'])
                          for chunk in _split_into_quantiles(report))

        lib.display_side_by_side(r4,r3,r2,r1, titles=['Quantile 4','Quantile 3','Quantile 2','Quantile 1'])

# Hook the Run button up to the report renderer, then show the controls in one row with the
# output area underneath.
button.on_click(showOutput)

controls = (c_select, q_select, s_select, r_select, button)
ui = widgets.HBox(controls)
display(ui, output)

In [None]:
# Widget to track occurrences of keywords through financial quarters
%matplotlib notebook
# NOTE(review): this cell rebinds c_select, r_select, button, output and ui, names the
# report widget above also uses. update_quarter and showOutput read those names as globals
# when they are called, so once this cell has run they see the widgets created here.
# Company names are sorted so the dropdown order does not change between kernel sessions.
company_options = ['Select Company'] + sorted(set(df.company), key=str)
c_select = widgets.Dropdown(options=company_options, description='Company:')
r_select = widgets.Dropdown(options=['Management', 'Q&A'], description='Report Type:')
button = widgets.Button(description='Run')
k_select = widgets.Dropdown(options=['Select Keyword'], description='Keyword:')
plot_button = widgets.Button(description='Plot')
# Never displayed: its `options` carry the per-quarter results from preprocess to plotKeyword.
val_holder = widgets.Dropdown(description='temporary value holder')
output = widgets.Output()

ui = widgets.HBox([c_select, r_select, button, k_select, plot_button])
display(ui)

def preprocess(btn):
    """Scan every transcript of the selected company and publish the results to the widgets.

    Builds ``{quarter: [(keyword, value), ...]}`` for the company chosen in ``c_select``,
    using the management or Q&A keyword search depending on ``r_select``. The mapping is
    stored in ``val_holder.options`` and the distinct keywords found are offered in
    ``k_select``.

    btn: the clicked button (unused; required by the ``on_click`` callback signature).
    """
    c_indices = df.index[df.company == c_select.value].tolist()
    report_type = r_select.value

    my_dict = {}
    for index in c_indices:
        transcript = df.loc[index, 'object']
        if report_type == 'Management':
            pairs = transcript.keywordsByManagementManual(keywords)
        elif report_type == 'Q&A':
            # NOTE(review): showOutput uses element [1] of this call as the list of
            # (keyword, value) pairs; the same element is taken here so both widgets read
            # the result the same way. Confirm against keyword_lib.
            pairs = transcript.keywordsByQuestionerManual(keywords)[1]
        else:
            continue
        my_dict[df.loc[index, 'quarter']] = pairs

    # Walk the values directly (no str(key) re-lookup, which fails for non-string quarters)
    # and sort so the keyword dropdown is stable between runs.
    avail_keywords = sorted({pair[0] for pairs in my_dict.values() for pair in pairs}, key=str)

    val_holder.options = my_dict
    k_select.options = avail_keywords

# Create the figure that plotKeyword draws into, then hook the Run button up to the scan.
fig = plt.figure()
button.on_click(preprocess)

def plotKeyword(btn):
    """Plot the selected keyword's value for each quarter held in ``val_holder``.

    Quarters in which the keyword does not appear are plotted as 0. If a quarter lists
    the keyword more than once, the last entry is used.

    btn: the clicked button (unused; required by the ``on_click`` callback signature).
    """
    # Make this widget's figure the current one first: plt.clf() and plt.plot() act on
    # whichever figure is current, which need not be `fig` if another was created since.
    plt.figure(fig.number)
    plt.clf()

    # dict() also copes with an options value that is not a dict yet (for example an empty
    # sequence before "Run" has been pressed), in which case nothing is plotted.
    my_dict = dict(val_holder.options)
    my_keyword = k_select.value
    keyword_freq = []
    for quarter, pairs in my_dict.items():
        hits = [pair[1] for pair in pairs if pair[0] == my_keyword]
        keyword_freq.append((str(quarter), hits[-1] if hits else 0))

    plt.plot([point[0] for point in keyword_freq], [point[1] for point in keyword_freq], 's')

    plt.xlabel('Financial Quarter')
    plt.title('Mentions of "'+str(k_select.value)+'"')
    plt.grid(False)

# Pressing Plot redraws the figure for the keyword currently selected.
plot_button.on_click(callback=plotKeyword)

#### Scrap section. Working on adding Rake to Q&A section

In [None]:
# Scratch: pull the per-quarter results that preprocess stored on the holder widget.
my_dict = val_holder.options

In [None]:
# Scratch: display the retrieved results.
my_dict

In [None]:
# Scratch copy of the series plotKeyword builds: one (quarter, value) pair per quarter,
# with 0 where the selected keyword is absent and the last entry winning on repeats.
my_keyword = k_select.value
keyword_freq = []
for quarter in my_dict:
    hits = [entry[1] for entry in my_dict[str(quarter)] if entry[0] == my_keyword]
    keyword_freq.append((str(quarter), hits[-1] if hits else 0))

In [None]:
# Scratch: display the (quarter, value) pairs built in the previous cell.
keyword_freq

In [None]:
# Scratch: display the transcript table (object, company, quarter).
df

In [None]:
# Scratch: Rake keyword extractor created with no arguments.
r = Rake()

In [None]:
# Scratch: run the extractor on element 0 of `qa_paras` from the transcript in row 0
# (presumably the first Q&A paragraph - confirm against keyword_lib).
r.extract_keywords_from_text(df.object[0].qa_paras[0])

In [None]:
# Scratch: word-degree scores from the extraction above; a later cell turns this mapping
# into a sorted list of (word, degree) pairs under the same name.
a = r.get_word_degrees()

In [None]:
# Scratch: company of the transcript in row 0; used further down to drop the company's own
# name from the candidate words.
my_company = df.object[0].company

In [None]:
# Scratch: collect the lower-cased leading words of each management paragraph of the
# transcript in row 0 (presumably the speaker's name and title - confirm). Paragraphs that
# start with 'operator' are skipped.
# NOTE(review): exclusion_list is not defined in any cell visible here; it has to exist in
# the kernel before this cell runs.
name_set = set()
for paragraph in df.object[0].management_paras:
    my_words = lib.removePunctuation(paragraph.split()[:20], ref=string.punctuation)
    # The first two words are always taken, so shorter paragraphs are skipped instead of
    # raising IndexError.
    if len(my_words) < 2 or my_words[0].lower() == 'operator':
        continue
    name_set.update([my_words[0].lower(), my_words[1].lower()])
    # Keep adding words until the first one found in exclusion_list. The loop variable has
    # its own name so it no longer overwrites the outer loop's variable.
    for position in range(2, len(my_words)):
        if my_words[position].lower() in exclusion_list:
            break
        name_set.add(my_words[position].lower())
names = sorted(name_set)

In [None]:
# Scratch: rank the words by degree, highest first. dict(a) accepts both the mapping from
# get_word_degrees() and the list of pairs this cell leaves behind, so re-running the cell
# no longer fails on a list that has no .items().
a = sorted(dict(a).items(), key=lambda x: -x[1])

In [None]:
# Scratch: keep the positions of candidate words that pass every filter, then merge plurals.
# NOTE(review): checkForms, combinePlurals and exclusion_list are not defined in any cell
# visible here; they have to exist in the kernel before this cell runs.
indices = []
for position, (word, _degree) in enumerate(a):
    if checkForms(word, exclusion_list):
        continue
    if word.isnumeric():
        continue
    if word in my_company.lower().split():
        continue
    if word in names:
        continue
    # Reject any word containing a punctuation character.
    if any(ch in string.punctuation for ch in word):
        continue
    indices.append(position)
b = combinePlurals([a[position] for position in indices])

In [None]:
# Scratch: display the filtered candidates returned by combinePlurals.
b