# Import statements
## We rely on regex, spacy, pandas and pyinflect

In [1]:
import re
import spacy
import pandas as pd
import pyinflect

## Let's load a small model to handle dependency parsing of the text.

In [2]:
# Load the small English spaCy pipeline once at module level; past_tenser()
# below reads this global `nlp` object to parse each note.
nlp = spacy.load('en_core_web_sm')

## Read in the sample data: a CSV of clinical notes segmented into their subjective and objective portions.

In [3]:
# Load the example clinical notes into a DataFrame. The path below is an
# absolute, machine-specific location and must be edited before running.
df = pd.read_csv('/home/karl/PycharmProjects/tenseflection/examples.csv') # update your path
# Bare expression: in a notebook this renders the DataFrame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,Full Note,High Quality Note,Top 100 Note
0,0,"SUBJECTIVE:, This 23-year-old white female pr...",Yes,
1,1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...",Yes,1.0
2,2,"PAST MEDICAL HISTORY:, Significant for hypert...",Yes,
3,3,"CHIEF COMPLAINT:, Urinary retention.,HISTORY ...",Yes,1.0
4,4,"CHIEF COMPLAINT: , Right distal ureteral calcu...",Yes,
...,...,...,...,...
135,135,"CHIEF COMPLAINT: , Severe back pain and sleepi...",Yes,1.0
136,136,"CHIEF COMPLAINT:, Status epilepticus.,HISTORY...",Yes,1.0
137,137,"City, State,Dear Dr. Y:,I had the pleasure of ...",Yes,1.0
138,138,"REASON FOR CONSULTATION: , New-onset seizure.,...",Yes,1.0


## This is the main body of the functions used to convert the notes to past tense.
## This is a portion of the use-case for the start-up, but not the main deep learning NLP task.


In [4]:
def containsnumber(value):
    """
    Helper for the past_tenser function.
    Reports whether the given string holds at least one digit character.
    :param value: the string to inspect
    :return: True if any character of value is a digit, otherwise False
    """
    return any(symbol.isdigit() for symbol in value)

# Penn Treebank tags of the non-past verb forms that are inflected to past tense.
_NON_PAST_VERB_TAGS = ('VB', 'VBP', 'VBZ', 'VBG')


def _token_at(doc, index):
    """Return ``doc[index]``, or None when index falls outside the document."""
    if 0 <= index < len(doc):
        return doc[index]
    return None


def _lowered(token):
    """Return the lower-cased text of token, or an empty string for None."""
    return token.text.lower() if token is not None else ""


def _past_form(doc, index):
    """Return the past-tense replacement for ``doc[index]``, or None to keep it as is."""
    tok = doc[index]
    word = tok.text.lower()

    # Auxiliaries are rewritten directly; they do not depend on neighbouring tokens.
    if tok.pos_ == 'AUX' and word == 'have':
        return 'had'
    if tok.pos_ == 'AUX' and word == 'will':
        return 'would'

    if tok.tag_ not in _NON_PAST_VERB_TAGS:
        return None

    # Neighbours are None at the document edges. Plain indexing would wrap
    # around (negative index) at the start and raise IndexError at the end.
    prev_tok = _token_at(doc, index - 1)
    prev_prev_tok = _token_at(doc, index - 2)
    next_tok = _token_at(doc, index + 1)
    next_next_tok = _token_at(doc, index + 2)

    # A verb governed by a preceding auxiliary ("does not think") or by an
    # infinitival "to" ("used to have") keeps its form.
    if any(t is not None and t.pos_ == 'AUX' for t in (prev_tok, prev_prev_tok)):
        return None
    if prev_tok is not None and prev_tok.tag_ == 'TO':
        return None

    # "This patient is xx year old": do not inflect is -> was, which would
    # imply the patient is deceased.
    if _lowered(prev_tok) == 'patient' or _lowered(prev_prev_tok) == 'the':
        return None

    # "This is a dd-year-old": same reasoning, the verb stays in present tense.
    if (_lowered(next_tok) == 'a' and next_next_tok is not None
            and containsnumber(next_next_tok.text)):
        return None

    # pyinflect yields None when it has no VBD form for the token; the caller
    # then keeps the original word instead of writing "None" into the note.
    return tok._.inflect("VBD")


def past_tenser(txt):
    """
    This is the main function that performs the generation of an all past tense medical note.
    First person subjects and direct objects are also handled, by substituting the first
    person words (I, me, myself, my, mine, we, ourselves) with their respective 3rd person
    counterparts or the noun phrase "the provider".

    Only the individual tokens that the parser marks as needing a change are rewritten;
    all other text, including the original whitespace, is reproduced unchanged.

    :param txt: the note text to convert
    :return: the note text rewritten in the third person and past tense
    """

    # Handling first person. "I" is always capitalised, so it maps to the
    # capitalised phrase; the remaining words keep the case of the original.
    txt = re.sub(r"\b(I|Myself|My self|Me)\b", "The provider", txt)
    txt = re.sub(r"\b(myself|my self|me)\b", "the provider", txt)
    txt = re.sub(r"\b(we)\b", "they", txt)
    txt = re.sub(r"\b(Ourselves|ourselves|Our self|our self)\b", "their", txt)
    txt = re.sub(r"\b(My)\b", "The provider's", txt)
    txt = re.sub(r"\b(my)\b", "the provider's", txt)
    txt = re.sub(r"\b(Mine)\b", "The provider's", txt)
    txt = re.sub(r"\b(mine)\b", "the provider's", txt)

    # The spaCy parser sometimes mistakes verbs that are spelled like nouns
    # (e.g. "presents") for nouns, so that verb is handled manually here.
    # Other exceptions of this kind are beyond the scope of this module.
    txt = re.sub(r"\b(presents)\b", "presented", txt)
    txt = re.sub(r"\b(today)\b", "", txt)

    # Rebuild the note token by token so that a replacement touches exactly
    # the token that was analysed, never other occurrences or substrings.
    doc_dep = nlp(txt)
    pieces = []
    for position, tok in enumerate(doc_dep):
        replacement = _past_form(doc_dep, position)
        if replacement is None:
            replacement = tok.text
        pieces.append(replacement + tok.whitespace_)

    return "".join(pieces)


## Run the function on the notes

In [5]:
# Convert every note; the result is stored next to the original text.
df['Past Tense'] = df['Full Note'].apply(past_tenser)

## Examine the output vs. original

In [7]:
# Print the first nine notes before and after conversion, separated by a rule.
separator = '################################################################################################################'
for row_number, converted_note in enumerate(df['Past Tense'][0:9]):
    original_note = df['Full Note'][row_number]
    print('\nExample Index: ', row_number)
    print('\nExample Pre-processed', original_note)
    print('\nPost-processed combined: ', converted_note)
    print(separator)


Example Index:  0

Example Pre-processed SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were 