In [1]:
import pandas as pd
import numpy as np

import string
import datetime
todaysdate = datetime.datetime.now().strftime("%Y-%m-%d")
print(todaysdate)
import matplotlib.pyplot as plt
import random
import nltk
import re
from textblob import TextBlob

g_map = {'F': 1, 'M': 2, 'OTHER': 3, '': 0}

def to_string(s):
    """
    makes to string and changes encoding
    
    """
    try:
        return str(s)
    except:
        #Change the encoding type if needed
        return s.encode('utf-8')
    
def flatten(l):
    return [item for sublist in l for item in sublist]

pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def pos_check(x, flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_dic[flag]:
                cnt += 1
    except:
        pass
    return cnt

# df['noun_count'] = df['NOTE_TEXT'].apply(lambda x: pos_check(x, 'noun'))
# df['verb_count'] = df['NOTE_TEXT'].apply(lambda x: pos_check(x, 'verb'))
# df['adj_count'] = df['NOTE_TEXT'].apply(lambda x: pos_check(x, 'adj'))
# df['adv_count'] = df['NOTE_TEXT'].apply(lambda x: pos_check(x, 'adv'))
# df['pron_count'] = df['NOTE_TEXT'].apply(lambda x: pos_check(x, 'pron'))
# print(df[['noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']].head(10))


se_map = {'No Side Effects': 0,
          'Mild Side Effects': 1,
          'Moderate Side Effects':2,
          'Severe Side Effects' :3,
          'Extremely Severe Side Effects': 4}

effect_map = {'Ineffective': 0,
              'Marginally Effective':1,
              'Moderately Effective': 2,
          'Considerably Effective' :3,
          'Highly Effective': 4}


pd.options.display.max_colwidth = 200

train = pd.read_csv("../data/drugLibTrain_raw.tsv", delimiter='\t')
train = train.rename({'Unnamed: 0': 'uid'}, axis=1)
test = pd.read_csv("../data/drugLibTest_raw.tsv", delimiter='\t')
test = test.rename({'Unnamed: 0': 'uid'}, axis=1)
review_text_columns = ['benefitsReview', 'sideEffectsReview', 'commentsReview']

def replace_missing_texts(train, review_text_columns):
    for i in review_text_columns:
        train[i] = train[i].fillna("fillna")
        test[i] = test[i].fillna("fillna")
    
print(list(train.columns))

train[[ 'urlDrugName', 'rating', 'effectiveness', 'sideEffects',
       'condition', 'benefitsReview', 'sideEffectsReview', 'commentsReview']].sample(2)

2019-02-26
['uid', 'urlDrugName', 'rating', 'effectiveness', 'sideEffects', 'condition', 'benefitsReview', 'sideEffectsReview', 'commentsReview']


Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
808,botox,10,Highly Effective,No Side Effects,aging,the treatment (eyes & forehead) was highly effective and lasted 6 months each of the 3 times I was treated. I have a very deep vertical line between my eyes which was all but invisible with each t...,none,"Botox was administered to my forehead and outer eye area (crows feet) via small gauge needle. Totally painless. It took about 3 days for the drug to take effect but once it did, it kept the wrinkl..."
2856,triphasil,10,Highly Effective,No Side Effects,contraception,"general medical benefits: contraception, hormone management. Personal benefits: i controlled my weight better, skin looked more radiant and menstruation pains were alot less than when i was not ...",side effects - when leaving the pill - i started losing my hair - alot of it,"daily dosage, usually taken at night time, managed my weight, and helped with my skin as well as a contraceptive"


In [2]:
from collections import Counter
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
import pprint
articles = list(train['commentsReview'][0:20])#'benefitsReview'])
import unicodedata
import spacy
from nltk.tokenize import RegexpTokenizer
tknzr = RegexpTokenizer()
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    tknzr = TweetTokenizer()
    s = " ".join(tknzr.tokenize(s))
    s = re.sub(r"\d", "d", s)
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?]+)", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?-]+", r" ", s)
    s = to_string(s)
    
    return s

def tokenize_multiple_return_strings(articles):
    art_new = [normalizeString(ar) for ar in articles]
    tokens = flatten([word_tokenize(ar) for ar in articles])
    
# Convert the tokens into lowercase: lower_tokens
    lower_tokens = [t.lower() for t in tokens]

# Create a Counter with the lowercase tokens: bow_simple
    bow_simple = Counter(lower_tokens)

# Print the 10 most common tokens
    print(bow_simple.most_common(10))
    return art_new
    
# tokenize_multiple_return_strings(articles[2:5])
#[re.findall(pattern='depress', string=x) for x in set(pd.DataFrame(train.condition.value_counts()).index)]

In [9]:
# Import the necessary modules
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.stem import PorterStemmer



def preprocessing(text):
    text2 = normalizeString(text)

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text


print("tagger!")
processed_reviews_columns = ["LemmatiZ" + i for i in review_text_columns]
print(processed_reviews_columns)


def preproc_reviews(train, test, colname):
    train_text = train[colname].to_list()
    test_text = test[colname].to_list()
#
    x_train_preprocessed = []
    for i in train_text:
        x_train_preprocessed.append(preprocessing(i))

    x_test_preprocessed = []
    for i in test_text:
        x_test_preprocessed.append(preprocessing(i))
        
    return x_train_preprocessed, x_test_preprocessed



#[preprocessing(article) for article in articles]

tagger!
['LemmatiZbenefitsReview', 'LemmatiZsideEffectsReview', 'LemmatiZcommentsReview']


In [4]:
len(train_text)

#
x_train_preprocessed = []
for i in train_text:
    x_train_preprocessed.append(preprocessing(i))

x_test_preprocessed = []
for i in test_text:
    x_test_preprocessed.append(preprocessing(i))

print("INTRO TO GENSIM")
# what is a word vector/embeddings
# trained from larger corpus
# multidimensional array
# with these vectors, caN see relationships between the words
# e.g., span is to madrid as italy is to rome
# based on how words how near the words are in the text
# LDA statistical analysis of texts
# gensim allows you to build corpora
# documents list of strings

my_documents = list(train['condition'])

tokenize_multiple_return_strings(my_documents)

NameError: name 'train_text' is not defined

In [None]:
# Word vectors are multi-dimensional mathematical representations of words created using deep learning methods. They
# give us insight into relationships between words in a corpus.
tokenized_docs = [word_tokenize(doc.lower()) for doc in my_documents]
print("********tokenized docs******")
print(tokenized_docs)
tokenized_docs = [word_tokenize(doc.lower()) for doc in my_documents]
print("*****Gensim Dictionary*****")
dictionary = Dictionary(tokenized_docs)
print(dictionary)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
print("****Gensim Corpus****")
print(corpus)

In [None]:

# from gensim.corpora.dictionary import Dictionary
articles = tokenized_docs
# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles)

# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")

# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])

doc = corpus[4]

bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
import itertools
from collections import defaultdict
# Create the defaultdict: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# Create a sorted list from the defaultdict: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True)

# Print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)

In [None]:
five_step_rating_vars = ['effectiveness', 'sideEffects']
free_form_text_vars = ['benefitsReview', 'sideEffectsReview', 'commentsReview']
from afinn import Afinn
example_text = "muscle pain, loss of mobility, depresion, headaches i was admitted to hospital with chest pains, thought to be heart problems, it was caused by the tightening of my chest muscles"
afinnSentiScorer = Afinn(emoticons=True)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
print(example_text)
print('**************\nAfinn Sentiment Score:\n', afinnSentiScorer.score(example_text))

analyzer = SentimentIntensityAnalyzer()
print('Vader Sentiment Intensity Analyzer scores:\n', analyzer.polarity_scores(example_text))

In [None]:
print('\nEFFECTIVENESS\n===========================\n')
print(train['effectiveness'].value_counts())
print('===========================\n')
print(train['effectiveness'].describe())
print('\n===========================\n\nSIDE EFFECTS\n===========================\n')
print(train['sideEffects'].value_counts())
print('===========================\n')
print(train['sideEffects'].describe())

In [None]:
import seaborn as sns
moonrise =  ["#F3DF6C", "#CEAB07", "#D5D5D3", "#24281A","#798E87",
             "#C27D38", "#CCC591", "#29211F","#85D4E3", "#F4B5BD",
             "#9C964A", "#CDC08C", "#FAD77B"]

grandBudapest = ["#E6A0C4", "#C6CDF7", "#5B1A18", "#D67236",
                  "#D8A499", "#7294D4","#F1BB7B", "#FD6467"]

sns.set(style="ticks", palette=grandBudapest)#"ch:.25")

plt.figure(figsize=(10, 8))
# Draw a nested boxplot to show bills by day and time
sns.boxplot(x="sideEffects", y="rating",
            data=train)

sns.despine(offset=5, trim=True)


In [None]:
sns.set(style="whitegrid")


# "Melt" the dataset to "long-form" or "tidy" representation
tidy_data = pd.melt(train, "sideEffects", var_name="rating")

# Initialize the figure
f, ax = plt.subplots()
sns.despine(bottom=True, left=True)

# Show each observation with a scatterplot
sns.stripplot(x="value", y="rating", hue="sideEffects",
              data=tidy_data, dodge=True, jitter=True,
              alpha=.25, zorder=1)

# Show the conditional means
sns.pointplot(x="value", y="rating", hue="sideEffects",
              data=tidy_data, dodge=.532, join=False, palette="dark",
              markers="d", scale=.75, ci=None)

# Improve the legend 
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[3:], labels[3:], title="",
          handletextpad=0, columnspacing=1,
          loc="lower right", ncol=3, frameon=True)

In [None]:
import calendar
list(calendar.day_abbr)
# outfile = open(pn + str(todaysdate) + '_N2C2_test_Jun1_2018.txt', 'a')
# counter = 0
# df = pd.read_csv(pn +'N2C2_test_Jun1_2018.csv')
# print(df.info())
# for index, row in df.iterrows():
#     counter += 1
#     new_note = clean_mimic(row['text'])
#     print(counter)
#     outfile.write(new_note + os.linesep)
# outfile.close()
# # #     print(new_note)
# print("total docs written out", str(counter))


In [None]:
df.reindex(list(calendar.day_abbr), level='day')

In [33]:
## izzy metzger Feb 26th. pre-processing texts 
## this file pre-processes text either in a csv, json, or bz2 
# Usage: python3 preprocessing.py <inFile> <csv|bz2|txt> <outFile> <csv|bz2|txt>
from __future__ import print_function
import string
import re
import unicodedata
import sys
import datetime
import ujson
import bz2
import os

# Import the necessary modules
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.stem import PorterStemmer

import numpy
import pandas as pd

import argparse

example_text_pharmacytextbook = """P.J. has indicated that she is injecting insulin to treat her diabetes . What questions might be asked to evaluate P.J.’s use of and response to insulin ?

M.B., a 35-year-old man, presents to the ED with a chief complaint of chest palpitations for 4 hours. He relates a history of many similar self-terminating episodes since he was a teenager. He took an unknown medication 5 years ago that decreased the occurrence of the pal- pitations, but he stopped taking it because of side effects. M.B.’s vital signs are BP, 96/68 mm Hg; pulse, 226 beats/minute, irregular; respiratory rate, 15 breaths/ minute; and temperature, 98.7◦F. A rhythm strip confirms AF, with a QRS width varying from 0.08 to 0.14 seconds. To control the ventricular rate, 10 mg IV verapamil is admin- istered for 2 minutes. Within 2 minutes of completing the infusion, VF is noted on the monitor. M.B. is defibrillated, and normal sinus rhythm is restored. A subsequent ECG demonstrates a P-R interval of 100 ms (normal, 120 to 200 ms) and delta waves, compatible with WPW. What is WPW syndrome? """

second_example_text_mimic = """
In the Emergency Department, he was found to have systolic blood pressure of 85.  He was given 6 liters of intravenous fluids and transiently started on dopamine for a systolic blood pressure in the 80.s

PAST MEDICAL HISTORY: 
1.  Coronary artery disease with diffuse 3-vessel disease; right-dominant, status post proximal left circumflex stent in [**2682-5-27**] with occlusion of the distal left circumflex;
status post right coronary artery stent on [**2682-7-14**] (no percutaneous coronary intervention to 99% diagonal left circumflex, 80% small proximal left anterior descending
artery, or 80% small distal left anterior descending artery)
2.  Congestive heart failure (with an ejection fraction of 15% to 20%)
3.  Type 2 diabetes with neuropathy.
4.  Hypertension.
5.  Diverticulosis
"""

def mkdir_if_not_exist(path):
    """
    function to make directory
    path is a string (e.g., data/text_outputs)
    """
    if not os.path.isdir(path):
        os.mkdir(path)

# plain ASCII, THNX stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

print(sent_tokenize(example_text_pharmacytextbook))
def remove_brackets(s):
    """
    since mimic has lots of brackets
    input:string s
    output s without brackets but yyyy-mm-dd instead except using "d" for digit
    """
    s = re.sub('\[\*\*.*\*\*\]', 'dddd-dd-dd', s).replace('  ', ' ')
    return s

example_mimiciii = "His admission electrocardiogram demonstrated a sinus rhythm,[NEWLINE] nonspecific inferior/lateral T wave changes, low QRS voltages[NEWLINE] in the limb leads, and T wave changes in V5 and V6 when[NEWLINE] compared with an electrocardiogram dated [**2682-8-30**]"

print(remove_brackets(example_mimiciii))

print([word for sent in nltk.sent_tokenize(remove_brackets(second_example_text_mimic)) for word in
              word_tokenize(sent)])
def normalizeString(s):
    #tknzr = TweetTokenizer()
    #s = " ".join(tknzr.tokenize(s))
    #s = re.sub(r"\d", "d", s)
    s = unicodeToAscii(s.lower().strip())
    #s = re.sub(r"([^a-zA-Z.!?-].\d+)", r"\1", s)
    s = re.sub(r"[^a-zA-Z.!?-;]+", r" ", s)
    s = to_string(s).strip()
    
    return s

def preProc(text):
    """
    text is a single string, for example: 
    text = "Prednisone 40 mg once per day times three days; then 20mg once per day x 11 days."
    """
    text2 = normalizeString(text)

    tokens = [word for sent in sent_tokenize(text2) for word in
              word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def pratLemmatiz(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([pratLemmatiz(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text

def tokenize_multiple_return_strings(articles):
    """
    articles: a list of strings e.g.,
    articles = zeroStartTrain['NOTE_TEXT'].to_list()
    """
    articles_new = [preProc(ar) for ar in articles]
    pprint.pprint(articles_new)
    tokens = flatten([word_tokenize(ar) for ar in articles_new])
    
# Convert the tokens into lowercase: lower_tokens
    lower_tokens = [t.lower() for t in tokens if t not in list(string.punctuation)]

# Create a Counter with the lowercase tokens: bow_simple
    bow_simple = Counter(lower_tokens)

# Print the 10 most common tokens
    print(bow_simple.most_common(10))
    #print(list(string.punctuation))
    return articles_new

articles = [example_text_pharmacytextbook, second_example_text_mimic]
tokenize_multiple_return_strings(articles)

['P.J.', 'has indicated that she is injecting insulin to treat her diabetes .', 'What questions might be asked to evaluate P.J.’s use of and response to insulin ?', 'M.B., a 35-year-old man, presents to the ED with a chief complaint of chest palpitations for 4 hours.', 'He relates a history of many similar self-terminating episodes since he was a teenager.', 'He took an unknown medication 5 years ago that decreased the occurrence of the pal- pitations, but he stopped taking it because of side effects.', 'M.B.’s vital signs are BP, 96/68 mm Hg; pulse, 226 beats/minute, irregular; respiratory rate, 15 breaths/ minute; and temperature, 98.7◦F.', 'A rhythm strip confirms AF, with a QRS width varying from 0.08 to 0.14 seconds.', 'To control the ventricular rate, 10 mg IV verapamil is admin- istered for 2 minutes.', 'Within 2 minutes of completing the infusion, VF is noted on the monitor.', 'M.B.', 'is defibrillated, and normal sinus rhythm is restored.', 'A subsequent ECG demonstrates a P-R

error: bad character range ?-; at position 10

In [None]:
def preProc(text):
    """
    text is a single string, for example: 
    text = "Prednisone 40 mg once per day times three days; then 20mg once per day x 11 days."
    """
    text2 = normalizeString(text)

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def pratLemmatiz(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([pratLemmatiz(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text



# Command line options
optargs = [
    ('--inFile', {
        'type': str,
        'default': 'data/training_example.csv',
        'help': 'Path to dataset with csv of text to pre-process',
    }),
    ('--inFileType', {
        'type': str,
        'default': 'csv',
        'help': 'arguments include csv, json, bz2, or plaintext readers',
    }),
	    ('--outFileDir', {
        'type': str,
        'default': 'data/pre_proc_texts/',
        'help': 'path to directory for writing out pre-proc texts',
    }),
    ('--outFileType', {
        'type': str,
        'default': 'txt_doc',
        'help': 'Path to writing out clean notes (for example, for embeddings)',
    }),
   
]

parser = argparse.ArgumentParser()
for opt, config in optargs:
    parser.add_argument(opt, **config)

args = parser.parse_args()
mkdir_if_not_exist(args.outFileDir)
todaysdate = datetime.datetime.now().strftime("%Y-%m-%d")
print(todaysdate)

if __name__ == '__main__':
    '''Convert data and normalize it in different ways.
        For example: if you want to convert a text file to a fully cleaned file, then you run as following:
            python3 preprocessing_text.py MIMIC2TEXT text <input_file> <output_file>
        Input data format is just path to text file.
    '''
    if sys.argv[1].upper() == "MIMIC2TEXT":
        MIMIC2TEXT(sys.argv[2],sys.argv[3])
    elif sys.argv[1].upper() == "ZEROSTART2TEXT":
        ZEROSTART2TEXT(sys.argv[2],sys.argv[3])
    elif sys.argv[1].upper() == "ZEROSTART2CSV":
        ZEROSTART2CSV(sys.argv[2],sys.argv[3])
    elif sys.argv[1].upper() == "TEXT2TEXT":
        TEXT2TEXT(sys.argv[2], sys.argv[3])
    else:
        print("Argument error: sys.argv[1] should belongs to \"MIMIC2TEXT/ZEROSTART2TEXT/ZEROSTART2CSV/TEXT2TEXT\"")

In [None]:
from os import open

outfile = open(args.outFileDir + str(todaysdate) + '_N2C2_test_Jun1_2018.txt', 'a')
counter = 0
df = pd.read_csv(pn +'N2C2_test_Jun1_2018.csv')
print(df.info())
for index, row in df.iterrows():
    counter += 1
    new_note = clean_mimic(row['text'])
    print(counter)
    outfile.write