# PDF to Questionnaires Pipeline Demo

### Section Cutting functions

In [1]:
from PyPDF2 import PdfReader, PdfWriter
import pandas as pd
import os
import fitz
from re import search
import openai
from transformers import GPT2TokenizerFast
import rapidfuzz

In [2]:
# function for finding "relevant" pages where the questionnaires could occur.
def page_finder(protocol):
    """Return the page numbers of TOC sections that may mention questionnaires.

    The table of contents of the PDF is read with PyMuPDF.  Every entry whose
    lower-cased title contains one of the keywords below contributes its page
    number and the page number directly after it.

    Args:
        protocol: Path of the PDF to inspect.

    Returns:
        A list without duplicates: first the matching TOC page numbers (in the
        order their titles first appear in the TOC), then each of those
        numbers plus one.  ``None`` is returned, after printing *protocol*,
        when the PDF cannot be opened or its TOC cannot be read.
    """
    # list of potential keywords to find relevant sections
    potential_sections = [
        "objective",
        "study endpoints",
        "event",
        "assessments",
        "activities",
        "abbreviations",
        "endpoint",
        "evaluation",
        "measure",
        "design",
        "synopsis",
        "questionnaire",
        "outcome",
        "patient reported",
        "definitions",
        "flow chart",
        "visits",
        "schedule",
    ]
    try:
        # The context manager closes the document even when reading the TOC fails.
        with fitz.open(protocol) as pdf:
            toc = pdf.get_toc()
    except Exception:
        # Best effort: name the file that could not be processed and carry on.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        print(protocol)
        return None

    # title -> page number; a title that occurs more than once keeps its last page
    section_titles = {item[1]: item[2] for item in toc}

    # The keywords are plain text, so a substring test is all that is needed.
    page_nos = [
        page
        for title, page in section_titles.items()
        if any(keyword in title.lower() for keyword in potential_sections)
    ]
    page_nos = list(dict.fromkeys(page_nos))

    # Also take the page after each hit; dict.fromkeys drops duplicates while
    # keeping the order.
    # NOTE(review): PyMuPDF documents TOC page numbers as 1-based, while
    # pdf_writer indexes pages from 0, and ``page + 1`` can point past the last
    # page of the document - confirm the offset and add a bounds check if needed.
    return list(dict.fromkeys(page_nos + [page + 1 for page in page_nos]))

In [3]:
# take page numbers from page_finder and cut those pages out making a new pdf at the specified path
def pdf_writer(inpath, outpath, page_nos):
    """Copy the pages listed in *page_nos* from *inpath* into a new PDF.

    Args:
        inpath: Path of the source PDF.
        outpath: Path the new PDF is written to.
        page_nos: Iterable of indices into ``PdfReader(...).pages``.

    Prints ``ok`` once the output file has been written.
    """
    output_pdf = PdfWriter()
    # The source stays open until the output is written (the writer may still
    # need to read page data from it) and is then closed by the ``with`` block;
    # previously the handle was never closed.
    with open(inpath, 'rb') as source:
        input_pdf = PdfReader(source)
        for i in page_nos:
            output_pdf.add_page(input_pdf.pages[i])
        with open(outpath, 'wb') as f:
            output_pdf.write(f)

    print("ok")

### Section Cutting

Page numbers

In [4]:
# Page numbers of the TOC sections that may mention questionnaires
# (page_finder returns None if the PDF could not be read).
page_nos = page_finder(r"C:\Users\Jakub\Documents\zazu\openai-quickstart-python\protocols\clinical_trial_rank_0005.pdf")

Cut PDF

In [5]:
# Write the selected pages of the protocol to a new, smaller PDF.
# NOTE(review): the PermissionError recorded below names the output path -
# presumably the file was open in another program or not writable; confirm.
pdf_writer(r"C:\Users\Jakub\Documents\zazu\openai-quickstart-python\protocols\clinical_trial_rank_0005.pdf",
            r"C:\Users\Jakub\Documents\zazu\openai-quickstart-python\demo_app\cut_trial_rank_0005.pdf",
              page_nos)

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\Jakub\\Documents\\zazu\\openai-quickstart-python\\demo_app\\cut_trial_rank_0005.pdf'

### Questionnaire Extraction functions

In [None]:
# Credentials come from the environment so that they never end up in the
# notebook or in version control.  Set OPENAI_API_KEY (required) and
# OPENAI_ORGANIZATION (optional) before running this cell.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked.
openai.organization = os.environ.get("OPENAI_ORGANIZATION")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# cut text into chunks small enough for the GPT API
def chunker(text, max_tokens=3500, max_pages=5):
    """Group consecutive pages into chunks small enough for the GPT API.

    Pages are appended to the current chunk until adding the next page would
    push the chunk over *max_tokens* GPT-2 tokens or the chunk already holds
    *max_pages* pages.  A chunk always receives at least one page, so a single
    page that is over the token budget still ends up in a chunk of its own.

    Args:
        text: DataFrame with one row per page; the page text is read from the
            third column (position 2).
        max_tokens: Token budget per chunk.
        max_pages: Maximum number of pages per chunk (at least 1).

    Returns:
        A list of strings, each the concatenation of the pages of one chunk.

    Raises:
        ValueError: If *max_pages* is smaller than 1.
    """
    if max_pages < 1:
        raise ValueError("max_pages must be at least 1")

    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    chunks = []
    page = 0
    n_pages = len(text)
    while page < n_pages:
        parts = []
        chunk_tokens = 0
        while page < n_pages and len(parts) < max_pages:
            page_text = text.iloc[page, 2]
            # Count only the tokens of this page.  Re-tokenizing the whole
            # chunk and adding that to the running total counted earlier
            # pages several times.
            page_tokens = len(tokenizer(page_text)["input_ids"])
            # Check the budget before adding the page, but always accept the
            # first page so the outer loop makes progress.
            if parts and chunk_tokens + page_tokens > max_tokens:
                break
            parts.append(page_text)
            chunk_tokens += page_tokens
            page += 1
        chunks.append("".join(parts))
    return chunks


In [None]:
# takes list of chunks and runs them through the GPT API, returns list of answers, one for each chunk
def gpt(chunks):
    """Send every chunk to the chat completion API and collect the replies.

    Args:
        chunks: Iterable of text chunks; each one is sent as the user message
            of its own request.

    Returns:
        A list holding, for every chunk in order, the message content of the
        first choice of the response.
    """
    # The indentation of the continuation lines is part of the prompt text.
    system_prompt = """You are a helpful assistant that extracts validated clinical questionnaires and puts them into comma separated list with each questionnaire on a new line. Do not ask questions.
                If you can't find any questionnaires say: 'None found'.
                example text: nation', 'EMA', 'European Medicines Agency', 'DLQI', 'Dermatology Life Quality Index', 'EQ-5D', 'European Quality of Life 5-Dimension Questionnaire', 'FCBP', 'Females of childbearing potential', 'FDA', 'Food and Drug Administration'
                example answer: 
                - EQ-5D, European Quality of Life 5-Dimension Questionnaire
                - DLQI, Dermatology Life Quality Index"""

    def ask(chunk):
        # one request per chunk: fixed system prompt plus the chunk as user text
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": chunk},
            ],
        )
        return completion["choices"][0]["message"]["content"]

    return [ask(chunk) for chunk in chunks]

### Questionnaire Extraction

In [None]:
# Load the extracted text of the protocol (one row per page is assumed by chunker).
data = pd.read_csv(r"C:\Users\Jakub\Documents\zazu/openai-quickstart-python/text/clinical_trial_rank_0005/text.csv")
# chunker concatenates cell values, so every column is converted to strings first.
text = data.astype(str)

In [None]:
# Group the pages into chunks that fit into a single API request.
chunks = chunker(text)

Token indices sequence length is longer than the specified maximum sequence length for this model (1214 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# One API request per chunk; answers[i] is the reply for chunks[i].
answers = gpt(chunks)

In [None]:
# Show the raw reply for every chunk.
for answer in answers:
    print(answer)

None found.
None found.
- FACT-G, Functional Assessment of Cancer Therapy-General
None found.
None found.
None found.
None found.
None found.
None found.
None found.
None found.
None found.
None found.
None found.
- FACT-G, FACT-G QUESTIONNAIRE, VERSION 4, ENGLISH AND SWAHILI
None found.


### GPT Answers

- None found.
- FACT-G, Functional Assessment of Cancer Therapy-General
- None found.
- FACT-G, FACT-G QUESTIONNAIRE, VERSION 4, ENGLISH AND SWAHILI

### Fuzzy Matching Functions

In [None]:
# csv of questionnaires
mapi = pd.read_csv(r"C:\Users\Jakub\Documents\zazu\openai-quickstart-python\timings\mapi_list_sn_ln.csv")
# Blank the first column of row 3752.  The cell is addressed in a single .iloc
# call: the chained form ``mapi.iloc[3752][0] = ""`` assigns to an intermediate
# Series that may be a copy, in which case ``mapi`` itself stays unchanged.
# NOTE(review): the row number is hard-coded - presumably a missing short name
# that would otherwise break the matching; confirm against the CSV.
mapi.iloc[3752, 0] = ""

In [None]:
# function for sorting
def key(tup):
    """Return the second element of *tup*; used as sort key for (name, score) tuples."""
    return tup[1]

In [None]:
# clean the answers before trying to match
def answer_cleaning(my_list):
    """Split the raw answers into cleaned single-line strings.

    Answers that are exactly ``'None found.'`` are dropped.  The remaining
    answers are split at newlines, lines containing ``'not a questionnaire'``
    are dropped, and in every remaining line the words ``questionnaire`` and
    ``Questionnaire`` are blanked because they skew the matching.

    Args:
        my_list: Iterable of answer strings.

    Returns:
        A list of cleaned lines in their original order.  Each line is
        re-joined with single spaces; a blanked word leaves an empty slot, so
        the spaces around it are kept.
    """
    skewing_words = ('questionnaire', 'Questionnaire')
    cleaned = []
    for answer in my_list:
        if answer == 'None found.':
            continue
        for line in answer.split('\n'):
            if 'not a questionnaire' in line:
                continue
            words = ['' if word in skewing_words else word for word in line.split()]
            cleaned.append(' '.join(words))
    return cleaned

In [None]:
# my version of rapidfuzz.process.extract_iter() which allows changing the weights of the scorer
# compares the string with all the long names in the csv above
def weighted_iter_long(string):
    """Score *string* against every long name in ``mapi``.

    The score is the normalized Levenshtein similarity with weights
    ``(0.999999, 1, 1)``, computed after ``default_process`` preprocessing.

    Args:
        string: Text to compare with the long names.

    Returns:
        A list of ``(long_name, score)`` tuples with a score of at least 0.7,
        sorted by descending score.  Names containing ``...`` are left out
        because they are incomplete.
    """
    long_names = mapi.long_name
    threshold = 0.7
    similarity = rapidfuzz.distance.Levenshtein.normalized_similarity
    preprocess = rapidfuzz.utils.default_process

    scored = []
    for long_name in long_names:
        value = similarity(string, long_name, processor=preprocess, weights=(0.999999, 1, 1))
        # keep good scores only, and skip truncated names marked with '...'
        keep = value >= threshold and "..." not in long_name
        if keep:
            scored.append((long_name, value))

    # best match first
    scored.sort(key=key, reverse=True)
    return scored

In [None]:
# same as above but for the short names
# splits the string to compare the individual tokens to the short names
def weighted_iter_short(string):
    """Score every word of *string* against every short name in ``mapi``.

    The string is split at whitespace and the word ``questionnaire`` is
    replaced by an empty string.  Each word is compared with each short name
    using the normalized Levenshtein similarity with weights
    ``(1, 0.999999, 1)`` after ``default_process`` preprocessing.

    Args:
        string: Text whose words are compared with the short names.

    Returns:
        A list of ``(short_name, score)`` tuples with a score of at least 0.7,
        sorted by descending score.  Names containing ``...`` are left out.
    """
    words = ['' if word == 'questionnaire' else word for word in string.split()]
    short_names = mapi.short_name
    threshold = 0.7
    similarity = rapidfuzz.distance.Levenshtein.normalized_similarity
    preprocess = rapidfuzz.utils.default_process

    scored = []
    for word in words:
        for short_name in short_names:
            value = similarity(word, short_name, processor=preprocess, weights=(1, 0.999999, 1))
            keep = value >= threshold and "..." not in short_name
            if keep:
                scored.append((short_name, value))

    # best match first
    scored.sort(key=key, reverse=True)
    return scored

In [None]:
# join three algorithms to match long name
# all normalized scores are added together to find the closest match
# cutoffs can be changed to increase or decrease the number of potential answers, but only the top answer is returned
def long_compound(string, long=True):
    """Return the best long-name match for *string*, or ``()`` if none is good enough.

    For every candidate from ``weighted_iter_long`` three normalized scores
    are added: the weighted similarity, ``partial_ratio / 100`` and the plain
    Levenshtein similarity of the unprocessed strings.

    Args:
        string: Text to match against the long names.
        long: Unused; kept so that existing calls keep working.

    Returns:
        The ``(long_name, combined_score)`` tuple with the highest combined
        score if that score is at least 1.6, otherwise an empty tuple.
    """
    cutoff = 1.6
    candidates = []
    for name, weighted_score in weighted_iter_long(string):
        partial = rapidfuzz.fuzz.partial_ratio(string, name) / 100
        plain = rapidfuzz.distance.Levenshtein.normalized_similarity(string, name, weights=(1, 1, 1))
        candidates.append((name, weighted_score + partial + plain))

    if not candidates:
        return ()

    # max() returns the first of several equal scores, like a stable
    # descending sort would.
    best = max(candidates, key=key)
    # The cutoff applies to the best candidate.  It used to be tested against
    # the runner-up whenever there was more than one candidate, which
    # discarded a strong match that had only weak alternatives.
    return best if best[1] >= cutoff else ()

In [None]:
# same as above but for short names
def short_compound(string):
    """Return the best short-name match for *string*, or ``()`` if none is good enough.

    For every candidate from ``weighted_iter_short`` and every word of
    *string* three normalized scores are added: the weighted similarity,
    ``partial_ratio(word, name) / 100`` and the plain Levenshtein similarity
    between the whole *string* and the name.

    Args:
        string: Text whose words are matched against the short names.

    Returns:
        The ``(short_name, combined_score)`` tuple with the highest combined
        score if that score is at least 2, otherwise an empty tuple.
    """
    tokens = string.split()
    cutoff = 2
    similarity = rapidfuzz.distance.Levenshtein.normalized_similarity

    candidates = []
    for name, weighted_score in weighted_iter_short(string):
        # NOTE(review): this compares the whole string, not the single word,
        # with the short name - kept as is because the cutoff of 2 was chosen
        # with this behavior; confirm that it is intended.
        plain = similarity(string, name, weights=(1, 1, 1))
        for token in tokens:
            partial = rapidfuzz.fuzz.partial_ratio(token, name) / 100
            candidates.append((name, weighted_score + partial + plain))

    if not candidates:
        return ()

    # max() returns the first of several equal scores, like a stable
    # descending sort would; removing duplicates first is therefore not needed.
    best = max(candidates, key=key)
    # The cutoff applies to the best candidate.  It used to be tested against
    # the runner-up whenever there was more than one candidate, which
    # discarded a strong match that had only weak alternatives.
    return best if best[1] >= cutoff else ()

In [None]:
# join the short + long name functions, taking the best score 
# s stands for short name and l for long name
def joint(string):
    """Pick the better of the short-name and the long-name match for *string*.

    The short-name match wins when it exists and either no long-name match
    exists or its score is strictly higher; otherwise the long-name result
    is used.

    Args:
        string: Cleaned answer line to match.

    Returns:
        ``(name, 's')`` for a short-name match, ``(name, 'l')`` for a
        long-name match, or ``((), 'l')`` when nothing matched.
    """
    short_match = short_compound(string)
    long_match = long_compound(string)

    prefer_short = bool(short_match) and (not long_match or short_match[1] > long_match[1])
    if prefer_short:
        chosen, kind = short_match, 's'
    else:
        chosen, kind = long_match, 'l'

    if not chosen:
        # nothing matched: hand back the empty match unchanged
        return chosen, kind
    # keep only the name, the score is no longer needed
    return chosen[0], kind

In [None]:
# cleans the questionnaires and matches long and short names, outputting a dictionary
def questionnaire_output(my_list):
    """Look up the matched names in ``mapi`` and return them as a dictionary.

    Args:
        my_list: Iterable of ``(name, kind)`` tuples as produced by ``joint``.
            ``kind`` is ``'s'`` for a short-name match; any other value is
            treated as a long-name match.  Falsy entries and entries with a
            falsy name are ignored.

    Returns:
        A dict that maps the first column of the matching ``mapi`` row to its
        second column.  Short-name matches are processed before long-name
        matches, and the first entry for a key wins.

    Raises:
        IndexError: If a name has no matching row in ``mapi``.
    """
    matches = [match for match in my_list if match]
    short_names = [match[0] for match in matches if match[1] == 's' and match[0]]
    long_names = [match[0] for match in matches if match[1] != 's' and match[0]]

    almost = {}
    for column, names in (('short_name', short_names), ('long_name', long_names)):
        for name in names:
            rows = mapi[mapi[column] == name]
            # Take the first matching row and address both cells by position.
            # ``.iat`` avoids the deprecated positional ``Series[0]`` lookup
            # and does not require ``mapi`` to have a default integer index.
            almost.setdefault(rows.iat[0, 0], rows.iat[0, 1])

    return almost

### Fuzzy Matching

In [None]:
# First pass: drop 'None found.' answers and split the rest into single lines.
cleaned_answers = answer_cleaning(answers)

In [None]:
# Second pass: answer_cleaning tests for 'None found.' before it splits at
# newlines, so such lines inside multi-line answers are only removed when the
# split lines are passed through again.
cleaned_answers = answer_cleaning(cleaned_answers)

In [None]:
# Match every cleaned line; each entry is the (name, 's' or 'l') tuple returned by joint.
questionnaires = []
for answer in cleaned_answers:
    questionnaires.append(joint(answer))

In [None]:
# Display the matches found for the cleaned lines.
questionnaires

[('Functional Assessment of Cancer Therapy - General', 'l'), ('FACT-G', 's')]

In [None]:
# Collapse the matches into a dictionary built from the rows of mapi.
final = questionnaire_output(questionnaires)

In [None]:
# Display the final dictionary of questionnaires.
final

{'FACT-G': 'Functional Assessment of Cancer Therapy - General'}

### Final List
{'FACT-G': 'Functional Assessment of Cancer Therapy - General'}

### Timings - To be done