# NLP medical transcripts

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)

import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# regex
import re
import string

# nlp clean raw data
# import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

# stemming
from nltk.stem import WordNetLemmatizer

# pickle
import pickle

## 1. EDA

### 1a. Cleaning in Excel
* In Excel, removed categories: autopsy, dentistry, cosmetic/plastic surgery, sleep medicine, sleep language, surgery


In [2]:
df = pd.read_excel("mts_v1.xlsx")

In [3]:
df.head(1)

Unnamed: 0,id,description,medical_specialty,diagnosis,transcription,keywords
0,0,A 23-year-old white female presents with complaint of allergies.,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to h...","allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, eryth..."


In [None]:
# check for duplicates
#df[df['transcription'].duplicated()].head(5)

In [4]:
# Drop the rows where at least one element is missing
df = df.dropna()

In [5]:
# create a column for the length of each transcription, measured as the number
# of pieces produced by splitting the text on a single space character
# NOTE(review): split(' ') yields empty strings for consecutive spaces (the raw
# transcriptions contain double spaces after sentence ends), so this count is
# higher than the true word count; split() with no argument would count
# whitespace-separated words instead -- confirm which is intended
df['len_transcript'] = df.transcription.apply(lambda x: len(x.split(' ')))

In [None]:
# check for duplicates
#df[df['transcription'].duplicated()].head(5)

In [6]:
# drop rows with same duplicated transcription
df = df.drop_duplicates(subset = ['transcription']) 

In [None]:
# double check no duplicated
#df2[df2['transcription'].duplicated()]

In [None]:
# check for transcript < 9
# df[df.len_transcript < 9].sort_values(by='len_transcript', ascending=False)

In [7]:
# create a subset
# keep only rows whose len_transcript is greater than 9,
# i.e. drop rows where len_transcript <= 9 (a length of exactly 9 is dropped too)
df2 = df[df.len_transcript > 9]

In [None]:
# check if removed
# df2[df2.len_transcript < 9]

In [8]:
df2.head(2)

Unnamed: 0,id,description,medical_specialty,diagnosis,transcription,keywords,len_transcript
0,0,A 23-year-old white female presents with complaint of allergies.,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to h...","allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, eryth...",226
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying s...","bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, wei...",375


In [9]:
df2.shape

(1740, 7)

In [None]:
#plt.figure(figsize=[10,8])
#df2.medical_specialty.value_counts().plot.barh()

### 1b. Create corpus

In [10]:
# column of text is transcription
# corpus list of strings 
corpus = list(df2.transcription)

In [11]:
corpus[0]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [12]:
print("total documents :", len(corpus))

total documents : 1740


In [13]:
# per-document length: number of pieces obtained by splitting on a single space
# (same measure as the len_transcript column)
totalLength = [len(doc.split(' ')) for doc in corpus]

print("total length of all documents: ", sum(totalLength))

total length of all documents:  749819


In [14]:
# summary statistics over the per-document lengths collected in totalLength
average = sum(totalLength)/ len(totalLength)
print('average length', int(average))  # int() truncates the mean to a whole number
print('max length', max(totalLength))
print('min length', min(totalLength))

average length 430
max length 2096
min length 10


In [None]:
# save corpus as pickle
#with open("corpus_transcripts.pickle", "wb") as f:
    #pickle.dump(corpus, f)

## Load corpus

In [15]:
# load the previously saved corpus from its pickle file
# NOTE(review): this rebinds `corpus`, replacing the list built from df2 earlier
# in the notebook; the cell that writes corpus_transcripts.pickle is commented
# out, so the file must already exist on disk for this cell to run
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files
with open("corpus_transcripts.pickle", "rb") as f:
    corpus = pickle.load(f)

In [16]:
corpus[92]

"PREOPERATIVE DIAGNOSIS:,  Left inguinal hernia.,POSTOPERATIVE DIAGNOSIS:, Left direct and indirect inguinal hernia.,PROCEDURE PERFORMED:, Repair of left inguinal hernia with Prolene mesh.,ANESTHESIA: , IV sedation with local.,COMPLICATIONS:,  None.,DISPOSITION:  ,The patient tolerated the procedure well and was transferred to Recovery in stable condition.,SPECIMEN: , Hernia sac, as well as turbid fluid with gram stain, which came back with no organisms from the hernia sac.,BRIEF HISTORY:  ,This is a 53-year-old male who presented to Dr. Y's office with a bulge in the left groin and was found to have a left inguinal hernia increasing over the past several months.  The patient has a history of multiple abdominal surgeries and opted for an open left inguinal hernial repair with Prolene mesh.,INTRAOPERATIVE FINDINGS: , The patient was found to have a direct as well as an indirect component to the left inguinal hernia with a large sac.  The patient was also found to have some turbid fluid 

## 2. Preprocess

### 2a. Clean, tokenize, remove stop words

In [17]:
# token filter used while cleaning documents
def isValidToken(token):
    """Return True if `token` should be kept in the cleaned corpus.

    A token is rejected when it is 3 characters or shorter (this drops single
    letters, two-letter tokens such as short roman numerals, and any
    three-letter token) or when it contains at least one digit
    (e.g. ``h8``, ``8``).

    Parameters
    ----------
    token : str
        A single token.

    Returns
    -------
    bool
        False for short or digit-containing tokens, True otherwise.
    """
    # too short: 1-3 characters
    if len(token) <= 3:
        return False

    # raw string so that \d reaches the regex engine as the digit class instead
    # of being parsed as an (invalid) string escape sequence
    if re.search(r'\d', token) is not None:
        return False

    return True

In [18]:
def cleanDoc(corpus):
    """Clean every document in `corpus` and return the cleaned documents.

    For each document: punctuation characters are replaced by spaces, the text
    is tokenized with ``word_tokenize``, each token is lowercased, tokens
    rejected by ``isValidToken`` are discarded, and the remaining tokens are
    lemmatized with ``WordNetLemmatizer`` and re-joined with single spaces.

    NOTE(review): no stop-word removal is performed here even though
    ``stopwords`` is imported at the top of the notebook -- confirm whether
    that step was intended.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order.
    """
    # build the punctuation pattern and the lemmatizer once, not once per token
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    lemmatizer = WordNetLemmatizer()

    newCorpus = []
    for doc in corpus:
        # replace punctuation with spaces so words joined by punctuation are
        # split apart, then tokenize the document into a list of words
        tokens = word_tokenize(punctuation.sub(' ', doc))

        # lowercase first, so validation and lemmatization see lowercase tokens
        lowered = (token.lower() for token in tokens)

        # NOTE(review): lemmatize() is called without a part-of-speech tag;
        # presumably this is why verbs like "does" come out as "doe" in the
        # cleaned output -- confirm whether POS-aware lemmatization is wanted
        kept = [
            lemmatizer.lemmatize(token)
            for token in lowered
            if isValidToken(token)
        ]

        # one space-joined string per document
        newCorpus.append(" ".join(kept))

    return newCorpus

In [19]:
# create a list of strings
cleanCorpus = cleanDoc(corpus)

In [20]:
len(cleanCorpus)

1740

In [21]:
cleanCorpus[0]

'subjective this year white female present with complaint allergy used have allergy when lived seattle think they worse here past tried claritin zyrtec both worked short time then seemed lose effectiveness used allegra also used that last summer began using again week doe appear working very well used over counter spray prescription nasal spray doe have asthma doest require daily medication this doe think flaring medication only medication currently ortho cyclen allegra allergy known medicine allergy objective vitals weight pound blood pressure heent throat mildly erythematous without exudate nasal mucosa erythematous swollen only clear drainage seen were clear neck supple without adenopathy lung clear assessment allergic rhinitis plan will zyrtec instead allegra again another option will loratadine doe think prescription coverage that might cheaper sample nasonex spray each nostril given three week prescription written well'

In [None]:
# save as a pickle 
#with open("cleanCorpus.pickle", "wb") as f:
    #pickle.dump(cleanCorpus, f)