## NLP_Data_Processing

Prior to running this code, complete these notebooks: 
* NLP_Data_Loading

Please note: open Jupyter Notebooks using Anaconda Prompt: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
You will need this increased data rate limit for NLP Data Processing. See more here: https://stackoverflow.com/questions/43288550/iopub-data-rate-exceeded-in-jupyter-notebook-when-viewing-image

In [1]:
## General Dependencies
import re
import numpy as np
import pandas as pd
from pprint import pprint
import sys, os
import glob
from tika import parser # pip install tika
import inspect
import datetime
import pickle5 as pickle

## Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim import models
#from gensim.models.coherencemodel import CoherenceModel
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
from gensim.models import ldaseqmodel


## Preprocessing
import spacy
import nltk as nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## Plotting
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import ast

## Other Libraries
from operator import itemgetter

## ScikitLearn
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

  from PIL import PILLOW_VERSION
  from PIL import PILLOW_VERSION


### Load data from previous notebooks

In [2]:
## Load data csv as a dataframe
## The file is read from a path relative to the notebook's working directory;
## index_col=0 uses the first CSV column as the DataFrame index.

final_df = pd.read_csv("output/loading/final_df.csv", index_col=0) 
## Preview the first five rows
final_df.head()


Unnamed: 0,metadata,content,status,title,file_name,unique_id
0,"{'Application-Name': 'Microsoft Office Word', ...",R – Allen Pratt CDH Lurie Peds\r\n\r\nR – Alle...,200,,R - Allen Pratt CDH Lurie peds.doc,0
1,"{'Application-Name': 'Microsoft Office Word', ...",R – Arie Habis – CDH Lurie Peds\r\n\r\nR – Ari...,200,,R - Arie Habis - CDH Lurie Peds.doc,1
2,"{'Application-Name': 'Microsoft Office Word', ...",R – Bauer and Karen Lake Forest Grays Lake\r\n...,200,,R - Bauer and Karen Lake Forest Grays Lake.doc,2
3,"{'Application-Name': 'Microsoft Office Word', ...",R – Bonomo – Residency Programs West\r\n\r\n\r...,200,,R - Bonomo - Residency programs west.doc,3
4,"{'Application-Name': 'Microsoft Office Word', ...",R – Chawla – CDH – Lurie Cancer Center\r\n\r\n...,200,,R - Chawla - CDH - Lurie ca center.doc,4


### Pre-process the text to lower case, remove special characters, etc.

In [3]:
## Pre-process the text to lower case, remove special characters, etc. 
## https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X7RHltBKiUn
## Test regex here: https://pythex.org/

## Phrases removed from every document at the end of preprocess().
## The prune list is similar to "stop words" ... just easier to add/remove words on the fly
## https://stackoverflow.com/questions/15435726/remove-all-occurrences-of-words-in-a-string-from-a-python-list
## NOTE(review): the entries containing ':' or '-' ('section:', 'length:', 'keywords:',
## 'load-date') can never match, because all punctuation has already been replaced by
## spaces when the prune regex runs. They are kept unchanged -- confirm what was intended.
PRUNE_LIST = ['right reserved section',
               'reserved section',
               "length word byline", 
               "byline", 
               "word byline",
               "journal code",
               "load date", 
               "english", 
               "dr", 
               "publication type magazine",
               "type magazine",
               "magazine",
               "type newspaper",
               "publication type newspaper",
               'newspaper',
               "group right reserved",
               'section:',
               'copyright',
               'body',
               'length:',
               'keywords:',
               'introduction',
               'page',
               'methodology',
               'table',
               'discussion',
               'conclusions',
               'references',
               'classification',
               'language',
               'industry',
               'geographic',
               'load-date',
               'end of document',
               'mg dl',
               'mg'
              ]

## Built once here instead of being re-joined and re-compiled for every document
PRUNE_REGEX = re.compile(r'\b(' + '|'.join(PRUNE_LIST) + r')\b', flags=re.IGNORECASE)


def preprocess(text):
    """Normalise one raw document into lowercase, space-separated words.

    Steps, in order: lowercase -> drop e-mail addresses -> drop URLs ->
    collapse whitespace -> replace digits/special characters with a space ->
    drop underscores -> remove every phrase in PRUNE_LIST.

    text: the raw document. None and NaN (what pandas produces for an empty
          CSV cell) yield an empty string; any other non-string is passed
          through str() first.

    Returns the cleaned string. Removed phrases leave their surrounding
    spaces behind, so the result may contain runs of spaces.
    """
    ## Missing content has nothing to clean (NaN is the only float unequal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    ## Lowercase words
    text_lower = str(text).lower()

    ## Remove Emails from text
    ## \S matches any non-whitespace character, \s any whitespace character.
    ## Matches any characters, then an @ sign, then more characters, ending at the first white space.
    text_email = re.sub('\\S*@\\S*\\s?', '', text_lower) 

    ## Remove URLS from text
    ## https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/40823105#40823105
    ## https://www.geeksforgeeks.org/python-check-url-string/
    text_urls = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",'', text_email)

    ## Collapse every run of whitespace (spaces, tabs, new lines) into a single space.
    ## No separate \n / \t removal is needed afterwards: none can be left.
    ## https://stackoverflow.com/questions/16355732/how-to-remove-tabs-and-newlines-with-a-regex
    text_spaces = re.sub(r'\s+',' ',text_urls)

    ## Remove special characters and numbers
    ## \W matches any non-alphanumeric character, \d any decimal digit
    text_numbers = re.sub("(\\d|\\W)+"," ",text_spaces)

    ## Remove underscores (they count as word characters, so \W above leaves them in)
    text_lines = re.sub('_|-', "", text_numbers)

    ## Remove the special list of terms
    return PRUNE_REGEX.sub("", text_lines)

## New column "preprocess" holds the cleaned text: preprocess() is applied to every item in the "content" column
final_df['preprocess'] = final_df['content'].apply(preprocess)

print(final_df['preprocess'][0])

#https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

r allen pratt cdh lurie peds r allen pratt cdh lurie peds  q so let me just tell you maybe an  i don t know how much jeff has already told you but it might be helpful for me to give just a quick intro and then ask you some questions would that be alright a yeah he hasn t told me anything just an email that he and you had a good conversation about the program q okay great well the program is reach which stands for the research and research enabling accelerator for chicago hospitals but what it really means is that we ve recognized in northwestern university that research tends to be focused in the central region right downtown in streeterville yet people don t all live downtown in streeterville there s a lot of diversity and a lot of richness that is in our suburban locations and now that northwestern has done a lot of expansion into the suburban locations we d like to be able to support research not just downtown but all over the place so nucats might not be a familiar term to you it s

### Tokenize the data using Gensim Utils Simple Preprocess

In [4]:
## Tokenize the data using Gensim Utils Simple Preprocess

def tokenize(text):
    """Turn one cleaned document into a list of word tokens.

    The value is cast to str and handed to gensim's simple_preprocess.
    deacc=True makes gensim strip accent marks from the tokens.
    """
    return gensim.utils.simple_preprocess(str(text), deacc=True)

## New column "tokens" is formed by applying tokenize() to each item in the "preprocess" column
final_df['tokens'] = final_df['preprocess'].apply(tokenize)


print(final_df['tokens'][0])

['allen', 'pratt', 'cdh', 'lurie', 'peds', 'allen', 'pratt', 'cdh', 'lurie', 'peds', 'so', 'let', 'me', 'just', 'tell', 'you', 'maybe', 'an', 'don', 'know', 'how', 'much', 'jeff', 'has', 'already', 'told', 'you', 'but', 'it', 'might', 'be', 'helpful', 'for', 'me', 'to', 'give', 'just', 'quick', 'intro', 'and', 'then', 'ask', 'you', 'some', 'questions', 'would', 'that', 'be', 'alright', 'yeah', 'he', 'hasn', 'told', 'me', 'anything', 'just', 'an', 'email', 'that', 'he', 'and', 'you', 'had', 'good', 'conversation', 'about', 'the', 'program', 'okay', 'great', 'well', 'the', 'program', 'is', 'reach', 'which', 'stands', 'for', 'the', 'research', 'and', 'research', 'enabling', 'accelerator', 'for', 'chicago', 'hospitals', 'but', 'what', 'it', 'really', 'means', 'is', 'that', 'we', 've', 'recognized', 'in', 'northwestern', 'university', 'that', 'research', 'tends', 'to', 'be', 'focused', 'in', 'the', 'central', 'region', 'right', 'downtown', 'in', 'streeterville', 'yet', 'people', 'don', 'all

### Remove Stopwords using a custom stopword list

In [5]:
## Remove Stopwords using a custom stopword list


## Stop-word sets that have already been read from disk, keyed by file path
_STOP_SET_CACHE = {}


def _load_stop_set(path):
    """Return the stop words stored one per line in `path`, reading the file only once."""
    if path not in _STOP_SET_CACHE:
        with open(path, 'r', encoding="utf-8") as f:
            _STOP_SET_CACHE[path] = set(line.strip() for line in f)
    return _STOP_SET_CACHE[path]


def remove_stopwords(text, stop_words_path="stop_words.txt"):
    """Drop every stop word from a list of tokens.

    text:            list of tokens for one document.
    stop_words_path: text file with one stop word per line. The file is read
                     on the first call and cached, so applying this function
                     to a whole DataFrame column no longer re-opens it per row.

    Returns a new list with the remaining tokens in their original order.
    """
    ## The stopword list comes from the Terrier package with 733 words and another 86 custom terms: 
    ## https://github.com/kavgan/stop-words/blob/master/terrier-stop.txt
    ## https://github.com/kavgan/stop-words/blob/master/minimal-stop.txt

    ## Other stopword list options can be reviewed here:
    ## https://medium.com/towards-artificial-intelligence/stop-the-stopwords-using-different-python-libraries-ffa6df941653
    stop_set = _load_stop_set(stop_words_path)

    ## Remove stop words from token_list
    ## https://stackoverflow.com/questions/29771168/how-to-remove-words-from-a-list-in-python
    return [word for word in text if word not in stop_set]

## New column "no_stop" is formed by applying remove_stopwords() to each token list in the "tokens" column
final_df['no_stop'] = final_df['tokens'].apply(remove_stopwords)


print(final_df['no_stop'][0])

['allen', 'pratt', 'cdh', 'lurie', 'peds', 'allen', 'pratt', 'cdh', 'lurie', 'peds', 'let', 'just', 'tell', 'maybe', 'don', 'know', 'jeff', 'told', 'helpful', 'give', 'just', 'quick', 'intro', 'ask', 'questions', 'alright', 'yeah', 'hasn', 'told', 'just', 'email', 'conversation', 'program', 'okay', 'great', 'well', 'program', 'reach', 'stands', 'research', 'research', 'enabling', 'accelerator', 'chicago', 'hospitals', 'means', 've', 'recognized', 'northwestern', 'university', 'research', 'tends', 'focused', 'central', 'region', 'right', 'downtown', 'streeterville', 'people', 'don', 'live', 'downtown', 'streeterville', 'diversity', 'richness', 'suburban', 'locations', 'now', 'northwestern', 'expansion', 'suburban', 'locations', 'able', 'support', 'research', 'just', 'downtown', 'place', 'nucats', 'familiar', 'term', 'northwestern', 'university', 'clinical', 'translational', 'sciences', 'institute', 'offer', 'research', 'supports', 'example', 'data', 'capture', 'tools', 'useful', 'statis

### Create Bigram and Trigram Tokens from non-stop word data

"Phrase modeling is another approach to learning combinations of tokens that together represent meaningful multi-word concepts. We can develop phrase models by looping over the words in our corpus and looking for words that co-occur (i.e., appear one after another) together much more frequently than you would expect them to by random chance. The formula our phrase models will use to determine whether two tokens $A$ and $B$ constitute a phrase is:


$$\frac{count(A\ B) - count_{min}}{count(A) \times count(B)} \times N > threshold$$

* count(A) is the number of times token $A$ appears in the corpus
* count(B) is the number of times token $B$ appears in the corpus
* count(AB) is the number of times the tokens $A\ B$ appear in the corpus in order
* N is the total size of the corpus vocabulary
* count_{min} is a user-defined parameter to ensure that accepted phrases occur a minimum number of times
* threshold is a user-defined parameter to control how strong of a relationship between two tokens the model requires before accepting them as a phrase

Once our phrase model has been trained on our corpus, we can apply it to new text. When our model encounters two tokens in new text that identifies as a phrase, it will merge the two into a single new token.

Phrase modeling is superficially similar to named entity detection in that you would expect named entities to become phrases in the model (so new york would become new_york). But you would also expect multi-word expressions that represent common concepts, but aren't specifically named entities (such as happy hour) to also become phrases in the model.

We turn to the indispensable gensim library to help us with phrase modeling — the Phrases class in particular."

Text from: <https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb>

In [6]:
# Convert the "no_stop" column in dataframe to a list to use in the build_bigrams_trigrams() function

def convert_nostop_to_list(final_df):
    """Return the "no_stop" column of `final_df` as a plain Python list, one item per row."""
    return final_df['no_stop'].tolist()

## One entry per DataFrame row, taken from the "no_stop" column; passed to build_bigrams_trigrams() below
nostop_list = convert_nostop_to_list(final_df)

In [7]:
## Create Bigram and Trigram Tokens from non-stop word data, and then compare to stopword

def build_bigrams_trigrams(text):
    """Train the bigram and trigram phrase models on the whole corpus.

    text: the corpus, one token list per document.

    Returns (bigram_mod, trigram_mod): the two gensim Phrases models, each
    wrapped in gensim.models.phrases.Phraser.
    """
    phrases_cls = gensim.models.Phrases
    phraser_cls = gensim.models.phrases.Phraser

    ## min_count: Ignore all words and bigrams with total collected count lower than this value.
    ## threshold: Score threshold for forming the phrases (higher means fewer phrases).
    bigram_phrases = phrases_cls(text, min_count=5, threshold=100)

    ## The trigram model is trained on the corpus after the bigram model has been applied to it
    trigram_phrases = phrases_cls(bigram_phrases[text], threshold=100)

    return phraser_cls(bigram_phrases), phraser_cls(trigram_phrases)

## Train both phrase models once on the full stop-word-free corpus; they are applied per document below
bigram_mod, trigram_mod = build_bigrams_trigrams(nostop_list)

In [8]:
def make_bigrams(text, bigram_mod):
    """Apply the bigram model to one document's token list.

    Returns a one-element list whose only item is `bigram_mod[text]`.
    """
    return [bigram_mod[text]]

## New column "bigrams" is formed by applying make_bigrams() to each token list in the "no_stop" column
final_df['bigrams'] = final_df['no_stop'].apply(make_bigrams, args=(bigram_mod,))

print(final_df['bigrams'][0])

[['allen', 'pratt', 'cdh', 'lurie', 'peds', 'allen', 'pratt', 'cdh', 'lurie', 'peds', 'let', 'just', 'tell', 'maybe', 'don', 'know', 'jeff', 'told', 'helpful', 'give', 'just', 'quick', 'intro', 'ask', 'questions', 'alright', 'yeah', 'hasn', 'told', 'just', 'email', 'conversation', 'program', 'okay', 'great', 'well', 'program', 'reach', 'stands', 'research', 'research', 'enabling', 'accelerator', 'chicago', 'hospitals', 'means', 've', 'recognized', 'northwestern', 'university', 'research', 'tends', 'focused', 'central', 'region', 'right', 'downtown', 'streeterville', 'people', 'don', 'live', 'downtown', 'streeterville', 'diversity', 'richness', 'suburban_locations', 'now', 'northwestern', 'expansion', 'suburban_locations', 'able', 'support', 'research', 'just', 'downtown', 'place', 'nucats', 'familiar', 'term', 'northwestern', 'university', 'clinical', 'translational_sciences', 'institute', 'offer', 'research', 'supports', 'example', 'data_capture', 'tools', 'useful', 'statistical', 'su

In [9]:
def make_trigrams(text, trigram_mod, bigram_mod ):
    """Apply the bigram model, then the trigram model, to one document's token list.

    Returns a one-element list whose only item is `trigram_mod[bigram_mod[text]]`.
    """
    with_bigrams = bigram_mod[text]
    return [trigram_mod[with_bigrams]]

## New column "trigrams" is formed by applying make_trigrams() to each token list in the "no_stop" column
final_df['trigrams'] = final_df['no_stop'].apply(make_trigrams, args=(trigram_mod, bigram_mod))
print(final_df['trigrams'][0])

[['allen', 'pratt', 'cdh', 'lurie', 'peds', 'allen', 'pratt', 'cdh', 'lurie', 'peds', 'let', 'just', 'tell', 'maybe', 'don', 'know', 'jeff', 'told', 'helpful', 'give', 'just', 'quick', 'intro', 'ask', 'questions', 'alright', 'yeah', 'hasn', 'told', 'just', 'email', 'conversation', 'program', 'okay', 'great', 'well', 'program', 'reach', 'stands', 'research', 'research', 'enabling', 'accelerator', 'chicago', 'hospitals', 'means', 've', 'recognized', 'northwestern', 'university', 'research', 'tends', 'focused', 'central_region', 'right', 'downtown', 'streeterville', 'people', 'don', 'live', 'downtown', 'streeterville', 'diversity', 'richness', 'suburban_locations', 'now', 'northwestern', 'expansion', 'suburban_locations', 'able', 'support', 'research', 'just', 'downtown', 'place', 'nucats', 'familiar', 'term', 'northwestern', 'university', 'clinical_translational_sciences', 'institute', 'offer', 'research', 'supports', 'example', 'data_capture', 'tools', 'useful', 'statistical', 'support'

In [10]:
## Keep only the token-level columns and write them to CSV for review
## NOTE(review): pandas 1.5 renamed `line_terminator` to `lineterminator` -- adjust when upgrading pandas
review_columns = ['tokens', 'no_stop', 'bigrams', 'trigrams']
test_df = pd.DataFrame(final_df, columns=review_columns)
test_df.to_csv('output/processing/nostop_ngrams.csv', line_terminator='\n')

## Other parameters for to_csv:
## sep = ',', 

### Use Spacy for Parts of Speech Tagging 

Resource(s): 
<https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb>

In [11]:
def parts_of_speech(texts, nlp=None):
    """Map every token in `texts` to its spaCy part-of-speech tag.

    texts: iterable of token lists. Each list is joined with spaces and run
           through the pipeline.
    nlp:   optional, already loaded pipeline: any callable that takes a string
           and yields tokens exposing `.orth_` and `.pos_`. When omitted, the
           en_core_web_sm model is loaded on the first call and reused on
           later calls.

    Returns a dict {token text: part of speech} covering all lists in `texts`.
    A token that occurs more than once keeps the tag of its last occurrence.
    """
    if nlp is None:
        ## Loading the model is slow; keep it on the function object so that
        ## applying this function to a DataFrame column loads it only once.
        nlp = getattr(parts_of_speech, "_nlp", None)
        if nlp is None:
            ## NOTE(review): machine-specific absolute path -- will not resolve on another computer
            nlp = spacy.load(r'C:\Users\keg827\AppData\Local\Continuum\anaconda3\Lib\site-packages\en_core_web_sm\en_core_web_sm-2.3.1')
            nlp.max_length = 2700000 ## or any large value, as long as you don't run out of RAM
            parts_of_speech._nlp = nlp

    parts_dict = {}
    for doc in texts:
        ## Remove the commas between tokens, and apply spacy
        new_doc = nlp(" ".join(doc))
        ## update(), not assignment: earlier lists in `texts` must not be discarded
        parts_dict.update((token.orth_, token.pos_) for token in new_doc)

    return parts_dict

## New column "parts_of_speech": one {token: part of speech} dict per row of the "trigrams" column
final_df['parts_of_speech'] = final_df['trigrams'].apply(parts_of_speech)


## Resource(s)
## https://spacy.io/usage/linguistic-features
## Review Spacy's parts of speech here: https://spacy.io/api/annotation
## E008 Text Length Exceeds Maximum Error: https://datascience.stackexchange.com/questions/38745/increasing-spacy-max-nlp-limit/55725

In [12]:
## Collect the columns built so far and preview the first rows
review_columns = ['tokens', 'no_stop', 'bigrams', 'trigrams', 'parts_of_speech']
test_df = pd.DataFrame(final_df, columns=review_columns)
test_df.head()
#test_df.to_csv('output/processing/nostop_ngrams_partsofspeech.csv')

Unnamed: 0,tokens,no_stop,bigrams,trigrams,parts_of_speech
0,"[allen, pratt, cdh, lurie, peds, allen, pratt,...","[allen, pratt, cdh, lurie, peds, allen, pratt,...","[[allen, pratt, cdh, lurie, peds, allen, pratt...","[[allen, pratt, cdh, lurie, peds, allen, pratt...","{'allen': 'PROPN', 'pratt': 'PROPN', 'cdh': 'P..."
1,"[arie, habis, cdh, lurie, peds, arie, habis, c...","[arie, habis, cdh, lurie, peds, arie, habis, c...","[[arie, habis, cdh, lurie, peds, arie, habis, ...","[[arie, habis, cdh, lurie, peds, arie, habis, ...","{'arie': 'PROPN', 'habis': 'PROPN', 'cdh': 'PR..."
2,"[bauer, and, karen, lake, forest, grays, lake,...","[bauer, karen, lake, forest, grays, lake, baue...","[[bauer, karen, lake_forest, grays, lake, baue...","[[bauer, karen, lake_forest, grays, lake, baue...","{'bauer': 'PROPN', 'karen': 'PROPN', 'lake_for..."
3,"[bonomo, residency, programs, west, bonomo, re...","[bonomo, residency, programs, west, bonomo, re...","[[bonomo, residency, programs, west, bonomo, r...","[[bonomo, residency, programs, west, bonomo, r...","{'bonomo': 'PROPN', 'residency': 'NOUN', 'prog..."
4,"[chawla, cdh, lurie, cancer, center, chawla, c...","[chawla, cdh, lurie, cancer, center, chawla, c...","[[chawla, cdh, lurie, cancer_center, chawla, c...","[[chawla, cdh, lurie, cancer_center, chawla, c...","{'chawla': 'PROPN', 'cdh': 'PROPN', 'lurie': '..."


In [13]:
def name_entities(texts, nlp=None):
    """Map every named entity spaCy finds in `texts` to its entity label.

    texts: iterable of token lists. Each list is joined with spaces and run
           through the pipeline.
    nlp:   optional, already loaded pipeline: any callable that takes a string
           and returns an object whose `.ents` items expose `.text` and
           `.label_`. When omitted, the en_core_web_sm model is loaded on the
           first call and reused on later calls.

    Returns a dict {entity text: entity label} covering all lists in `texts`.
    An entity text that occurs more than once keeps the label of its last occurrence.
    """
    if nlp is None:
        ## Loading the model is slow; keep it on the function object so that
        ## applying this function to a DataFrame column loads it only once.
        nlp = getattr(name_entities, "_nlp", None)
        if nlp is None:
            ## NOTE(review): machine-specific absolute path -- will not resolve on another computer
            nlp = spacy.load(r'C:\Users\keg827\AppData\Local\Continuum\anaconda3\Lib\site-packages\en_core_web_sm\en_core_web_sm-2.3.1')
            nlp.max_length = 2700000 ## or any large value, as long as you don't run out of RAM
            name_entities._nlp = nlp

    entities_dict = {}
    for doc in texts:
        ## Remove the commas between tokens, and apply spacy
        new_doc = nlp(" ".join(doc))
        ## update(), not assignment: earlier lists in `texts` must not be discarded
        entities_dict.update((ent.text, ent.label_) for ent in new_doc.ents)

    return entities_dict

## New column "name_entities": one {entity text: label} dict per row of the "trigrams" column
final_df['name_entities'] = final_df['trigrams'].apply(name_entities)
print(final_df['name_entities'][0])



{'allen pratt cdh lurie peds allen': 'PERSON', 'lurie peds': 'PERSON', 'don': 'PERSON', 'jeff': 'PERSON', 'intro': 'PERSON', 'accelerator chicago hospitals means ve recognized northwestern university': 'ORG', 'central_region': 'ORG', 'suburban_locations': 'ORG', 'northwestern university clinical_translational_sciences institute': 'ORG', 'suburban_hospitals': 'ORG', 'first': 'ORDINAL', 'little_bit': 'ORG', 'lurie_children': 'ORG', 'lurie hospital': 'FAC', 'just half years mm_hmm spent years': 'DATE', 'chicago': 'GPE', 'geneva': 'GPE', 'mondays': 'DATE', 'stanford': 'ORG', 'don excel': 'PERSON', 'people department': 'ORG', 'lurie title': 'PERSON', 'lurie days week day': 'DATE', 'lurie charting': 'PERSON', 'lurie computer': 'PERSON', 'lurie cdh': 'PERSON', 'lurie downtown': 'PERSON', 'day week month year': 'DATE', 'week': 'DATE', 'today': 'DATE', 'system_clinician': 'CARDINAL', 'hours week': 'DATE', 'hours': 'TIME', 'half day day clinic week': 'DATE', 'days': 'DATE', 'may take month': 'DA

In [15]:
## Collect the columns built so far (now including named entities) and preview the first rows
review_columns = ['tokens', 'no_stop', 'bigrams', 'trigrams', 'parts_of_speech', 'name_entities']
test_df = pd.DataFrame(final_df, columns=review_columns)
test_df.head()
#test_df.to_csv('output/processing/nostop_ngrams_partsofspeech_nameentities.csv')

Unnamed: 0,tokens,no_stop,bigrams,trigrams,parts_of_speech,name_entities
0,"[allen, pratt, cdh, lurie, peds, allen, pratt,...","[allen, pratt, cdh, lurie, peds, allen, pratt,...","[[allen, pratt, cdh, lurie, peds, allen, pratt...","[[allen, pratt, cdh, lurie, peds, allen, pratt...","{'allen': 'PROPN', 'pratt': 'PROPN', 'cdh': 'P...","{'allen pratt cdh lurie peds allen': 'PERSON',..."
1,"[arie, habis, cdh, lurie, peds, arie, habis, c...","[arie, habis, cdh, lurie, peds, arie, habis, c...","[[arie, habis, cdh, lurie, peds, arie, habis, ...","[[arie, habis, cdh, lurie, peds, arie, habis, ...","{'arie': 'PROPN', 'habis': 'PROPN', 'cdh': 'PR...","{'arie habis cdh': 'PERSON', 'lurie peds': 'PE..."
2,"[bauer, and, karen, lake, forest, grays, lake,...","[bauer, karen, lake, forest, grays, lake, baue...","[[bauer, karen, lake_forest, grays, lake, baue...","[[bauer, karen, lake_forest, grays, lake, baue...","{'bauer': 'PROPN', 'karen': 'PROPN', 'lake_for...","{'bauer karen': 'PERSON', 'lake_forest grays l..."
3,"[bonomo, residency, programs, west, bonomo, re...","[bonomo, residency, programs, west, bonomo, re...","[[bonomo, residency, programs, west, bonomo, r...","[[bonomo, residency, programs, west, bonomo, r...","{'bonomo': 'PROPN', 'residency': 'NOUN', 'prog...","{'bonomo': 'ORG', 'west bonomo': 'GPE', 'west'..."
4,"[chawla, cdh, lurie, cancer, center, chawla, c...","[chawla, cdh, lurie, cancer, center, chawla, c...","[[chawla, cdh, lurie, cancer_center, chawla, c...","[[chawla, cdh, lurie, cancer_center, chawla, c...","{'chawla': 'PROPN', 'cdh': 'PROPN', 'lurie': '...","{'chawla cdh': 'PERSON', 'lurie cancer_center'..."


## Lemmatize the Data

"...lemmatization is a lot more powerful. It looks beyond word reduction and considers a language’s full vocabulary to apply a morphological analysis to words, aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma."

Resource(s):
* <https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb>
* <https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/>

In [16]:
## Lemmatize the data that contains bigrams and trigrams for Gensim's Bag of Words or Word2Vec
## This step can take some time to complete

def lemmatization(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'), nlp=None):
    """Lemmatize one document, keeping only tokens with an allowed part of speech.

    text:            iterable of token strings for one document; the items are
                     joined with spaces before being parsed.
    allowed_postags: parts of speech to keep; any container supporting `in`.
    nlp:             optional, already loaded pipeline: any callable that takes
                     a string and yields tokens exposing `.lemma_` and `.pos_`.
                     When omitted, the en_core_web_sm model is loaded on the
                     first call and reused on later calls.

    Returns a one-element list holding the list of lemmas, i.e. [[lemma, ...]].
    The outer list is kept because the notebook later flattens this column by
    one level. The previous loop appended that same inner list once per token
    of the document, producing as many identical copies as the document had
    tokens.
    """
    if nlp is None:
        ## Loading the model is slow; keep it on the function object so that
        ## applying this function to a DataFrame column loads it only once.
        nlp = getattr(lemmatization, "_nlp", None)
        if nlp is None:
            ## NOTE(review): machine-specific absolute path -- will not resolve on another computer
            nlp = spacy.load(r'C:\Users\keg827\AppData\Local\Continuum\anaconda3\Lib\site-packages\en_core_web_sm\en_core_web_sm-2.3.1')
            nlp.max_length = 1500000 #or any large value, as long as you don't run out of RAM, previously set as: 2700000
            lemmatization._nlp = nlp

    ## Remove the commas between tokens, and apply spacy
    doc = nlp(" ".join(text))

    ## Keep the lemma of every token whose part of speech is in our allowed list
    lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]

    return [lemmas]

## New column "lemmatize" is formed by applying lemmatization() to each item in the "parts_of_speech" column
## NOTE(review): each item is a dict, so lemmatization() joins its keys -- every distinct token
## appears only once and repeated tokens are lost. Confirm this is intended for topic modeling.
final_df['lemmatize'] = final_df['parts_of_speech'].apply(lemmatization, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'])

print(final_df['lemmatize'][0])

## Resources
## pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
## https://stackoverflow.com/questions/54334304/spacy-cant-find-model-en-core-web-sm-on-windows-10-and-python-3-5-3-anacon

[['allen', 'pratt', 'cdh', 'lurie', 'ped', 'let', 'just', 'tell', 'maybe', 'don', 'know', 'jeff', 'tell', 'helpful', 'give', 'quick', 'ask', 'question', 'hasn', 'email', 'conversation', 'program', 'great', 'reach', 'stand', 'research', 'enable', 'accelerator', 'chicago', 'hospital', 'mean', 've', 'recognize', 'northwestern', 'university', 'tend', 'focus', 'central_region', 'right', 'downtown', 'streeterville', 'people', 'live', 'diversity', 'richness', 'suburban_location', 'now', 'expansion', 'able', 'support', 'place', 'nucat', 'familiar', 'term', 'clinical_translational_science', 'institute', 'offer', 'support', 'example', 'data_capture', 'tool', 'useful', 'statistical', 'limited', 'resource', 'study', 'interested', 'grant', 'writing', 'support', 'kind', 'human', 'base', 'kind', 'stuff', 'essentially', 'want', 'course', 'complexity', 'eligibility', 'work', 'handle', 'try', 'understand', 'need', 'suburban_hospitals', 'term', 'can', 'currently', 'say', 'nee', 'widely', 'need', 'will', 

In [17]:
## Save work to CSV and inspect as needed
## NOTE(review): pandas 1.5 renamed `line_terminator` to `lineterminator` -- adjust when upgrading pandas
review_columns = ['tokens','no_stop','bigrams','trigrams', 'parts_of_speech', 'name_entities','lemmatize']
test_df = pd.DataFrame(final_df, columns = review_columns)

## The with-block closes the file on exit, so no explicit close() is needed
with open(r"output/processing/nostop_ngrams_partsofspeech_nameentities_lemmatize.csv", 'w', encoding='utf-8') as file:
    test_df.to_csv(file, index=True, line_terminator='\n')

In [17]:
# Convert the "lemmatize" column in dataframe to a list to provide to the get_gensim_corpus_dictionary() function

def convert_lemmatize_to_list(final_df):
    """Return the "lemmatize" column of `final_df` as a plain Python list, one item per row."""
    return final_df['lemmatize'].tolist()

## One entry per DataFrame row; each entry is itself a list of lemma lists, flattened in a later cell
texts_out = convert_lemmatize_to_list(final_df)

## Inspect output as needed
# print(type(texts_out))
# print(texts_out)
# print(texts_out[1])
# print(type(texts_out[1]))


In [18]:
## Remove one level of nesting: every row of texts_out is a list of token lists,
## so concatenating the rows gives a plain list of token lists.
## This will be your final text data which will be used in the gensim topic modeling! 

texts_out_2 = []
for row_token_lists in texts_out:
    texts_out_2.extend(row_token_lists)

## Inspect the output as needed
# print(type(texts_out_2))
# print(texts_out_2[1])
# print("----------------------------------------")
# print(texts_out_2[2])

<class 'list'>
['arie', 'habis', 'cdh', 'lurie', 'peds', 'arie', 'habis', 'cdh', 'lurie', 'peds', 'great', 'let', 'sure', 'familiar', 'just', 'go', 'begin', 'little', 'want', 'hear', 've', 'talk', 'colleague', 'know', 'jeff_loughead', 'group', 'chance', 'talk', 'christine', 'name', 'see', 'guy', 'work', 'qi', 'front', 'go', 'come', 'back', 'talk', 'let', 'begin', 'introduce', 'reach', 'think', 'want', 'little', 'introduce', 'nucat', 'part', 'nucat', 'okay', 'nucats', 'stand', 'northwestern', 'university', 'clinical_translational_science', 'institute', 'going', 'show', 'slide', 'talk', 'hear', 'mean', 'just', 'bunch', 'fancy', 'word', 'don', 'know', 'right', 'don', 'want', 've', 'try', 'keep', 'naive', 'meeting', 'go', 'ahead', 'assume', 'useful', 'look', 'will', 'show', 'clinical_translational_sciences', 'institute', 'reach', 'project', 'research', 'enable', 'accelerator', 'community', 'health', 'basically', 'acronym', 'make', 'show', 'want', 'big', 'grant', 've', 'receive', 'take', 's

In [19]:
## Save the list as a .pkl file
## Pickling is a way to convert a python object (list, dict, etc.) into a byte stream. 
## The idea is that this byte stream contains all the information necessary to reconstruct the 
## object in another python script.

file_name = "output/processing/texts_out_2.pkl"

## `with` closes the file even when pickling raises, so the handle cannot leak
with open(file_name, "wb") as open_file:
    pickle.dump(texts_out_2, open_file, protocol=4)

## Resources
## https://www.kite.com/python/answers/how-to-save-and-read-a-list-in-python
## https://stackoverflow.com/questions/25843698/valueerror-unsupported-pickle-protocol-3-python2-pickle-can-not-load-the-file

In [21]:
## Lemmatize the data that contains no_stop words. 
## Use this for scikitlearn's tfidif "countVectorizer"

def lemmatization_nostop(text, nlp=None):
    """Lemmatize every token of one document (no part-of-speech filtering).

    text: iterable of token strings for one document; the items are joined
          with spaces before being parsed.
    nlp:  optional, already loaded pipeline: any callable that takes a string
          and yields tokens exposing `.lemma_`. When omitted, the
          en_core_web_sm model is loaded on the first call and reused on
          later calls.

    Returns a flat list with one lemma per parsed token.
    """
    if nlp is None:
        ## Loading the model is slow; keep it on the function object so that
        ## applying this function to a DataFrame column loads it only once.
        nlp = getattr(lemmatization_nostop, "_nlp", None)
        if nlp is None:
            ## NOTE(review): machine-specific absolute path -- will not resolve on another computer
            nlp = spacy.load(r'C:\Users\keg827\AppData\Local\Continuum\anaconda3\Lib\site-packages\en_core_web_sm\en_core_web_sm-2.3.1')
            nlp.max_length = 2700000 #or any large value, as long as you don't run out of RAM
            lemmatization_nostop._nlp = nlp

    ## Remove the commas between tokens in doc, and apply spacy
    doc = nlp(" ".join(text))

    return [token.lemma_ for token in doc]
    

## New column "lemmatize_nostop" is formed by applying lemmatization_nostop() to each token list in the "no_stop" column
final_df['lemmatize_nostop'] = final_df['no_stop'].apply(lemmatization_nostop)

print(final_df['lemmatize_nostop'][0])

## Resources
## pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
## https://stackoverflow.com/questions/54334304/spacy-cant-find-model-en-core-web-sm-on-windows-10-and-python-3-5-3-anacon

['allen', 'pratt', 'cdh', 'lurie', 'peds', 'allen', 'pratt', 'cdh', 'lurie', 'ped', 'let', 'just', 'tell', 'maybe', 'don', 'know', 'jeff', 'tell', 'helpful', 'give', 'just', 'quick', 'intro', 'ask', 'question', 'alright', 'yeah', 'hasn', 'tell', 'just', 'email', 'conversation', 'program', 'okay', 'great', 'well', 'program', 'reach', 'stand', 'research', 'research', 'enable', 'accelerator', 'chicago', 'hospital', 'mean', 've', 'recognize', 'northwestern', 'university', 'research', 'tend', 'focus', 'central', 'region', 'right', 'downtown', 'streeterville', 'people', 'don', 'live', 'downtown', 'streeterville', 'diversity', 'richness', 'suburban', 'location', 'now', 'northwestern', 'expansion', 'suburban', 'location', 'able', 'support', 'research', 'just', 'downtown', 'place', 'nucat', 'familiar', 'term', 'northwestern', 'university', 'clinical', 'translational', 'sciences', 'institute', 'offer', 'research', 'support', 'example', 'data', 'capture', 'tool', 'useful', 'statistical', 'support

In [22]:
## Save work to CSV and inspect as needed
## NOTE(review): pandas 1.5 renamed `line_terminator` to `lineterminator` -- adjust when upgrading pandas
review_columns = ['tokens','no_stop','bigrams','trigrams', 'parts_of_speech', 'name_entities','lemmatize', 'lemmatize_nostop']
test_df = pd.DataFrame(final_df, columns = review_columns)

## The with-block closes the file on exit, so no explicit close() is needed
with open(r"output/processing/nostop_ngrams_partsofspeech_nameentities_lemmatize_nostop.csv", 'w', encoding='utf-8') as file:
    test_df.to_csv(file, index=True, line_terminator='\n')

In [23]:
# Convert the "lemmatize_nostop" column in dataframe to a list to provide to the get_gensim_corpus_dictionary() function

def convert_lemmatize_nostop_to_list(final_df):
    """Return the "lemmatize_nostop" column of `final_df` as a plain Python list, one item per row."""
    return final_df['lemmatize_nostop'].tolist()

## One lemma list per DataFrame row, taken from the "lemmatize_nostop" column; pickled in a later cell
texts_out_4 = convert_lemmatize_nostop_to_list(final_df)

## Inspect output as needed
# print(type(texts_out))
# print(texts_out)
# print(texts_out[1])
# print(type(texts_out[1]))


In [24]:
## Save the list as a .pkl file
## Pickling is a way to convert a python object (list, dict, etc.) into a byte stream. 
## The idea is that this byte stream contains all the information necessary to reconstruct the 
## object in another python script.

file_name = "output/processing/texts_out_4.pkl"

## `with` closes the file even when pickling raises, so the handle cannot leak
with open(file_name, "wb") as open_file:
    pickle.dump(texts_out_4, open_file, protocol=4)

## Resources
## https://www.kite.com/python/answers/how-to-save-and-read-a-list-in-python
## https://stackoverflow.com/questions/25843698/valueerror-unsupported-pickle-protocol-3-python2-pickle-can-not-load-the-file