In [None]:
# --Import packages

import pandas as pd
import boto3
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models
import nltk
nltk.data.path.append("../xxxxxxxx/nltk_data")
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.collocations import *
import string
import re
import num2words
import gensim
import gensim.corpora as corpora
# from gensim.models import HdpModel
# from gensim.models import Nmf
import warnings
# -- Report each distinct warning once instead of on every occurrence
warnings.filterwarnings('once')

# -- Single seed shared by the NMF / LDA / HDP models further down
rand_seed = 96

In [None]:
# -- Load the working dataframe (wdf) from S3.
# -- Exactly one of the three sources below should be active at a time.

# -- Source: rj
# wdf = pd.read_csv(filepath_or_buffer='s3://xxxxxxxxxxxxxx/rj.csv')

# -- Source: rf (currently active)
wdf = pd.read_csv(filepath_or_buffer='s3://xxxxxxxxxxxxxx/rf.csv')

# -- Source: all_notes
# wdf = pd.read_csv(filepath_or_buffer='s3://xxxxxxxxxxxxxx/combined.csv')

In [None]:
# -- Peek at the first row to check that the CSV loaded
wdf.head(1)

In [None]:
def tokenizer(itext):
    """Split a text string into word tokens with ``nltk.word_tokenize``.

    Parameters
    ----------
    itext : str
        Raw text to split.

    Returns
    -------
    The token sequence produced by ``nltk.word_tokenize``.
    """
    word_tokens = nltk.word_tokenize(itext)
    return word_tokens

def senttokenizer(itext):
    """Split a text string into sentences with ``nltk.sent_tokenize``.

    Parameters
    ----------
    itext : str
        Raw text to split.

    Returns
    -------
    The sentence sequence produced by ``nltk.sent_tokenize``.
    """
    sentences = nltk.sent_tokenize(itext)
    return sentences

def lemmatizer(itokens):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Intended to run after tokenization.

    Parameters
    ----------
    itokens : iterable of str
        Word tokens.

    Returns
    -------
    list
        One lemma per input token, in the same order.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, itokens))

def rem_stopwords(itokens):
    """Remove English and custom stop words from a list of word tokens.

    Matching is case-insensitive. The tokens reaching this function are never
    lower-cased by the pipeline, while NLTK's English stop-word list is lower
    case, so an exact comparison would let capitalised stop words
    ("The", "And", ...) through into the topic models.

    Parameters
    ----------
    itokens : iterable of str
        Word tokens.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order and
        original casing.
    """
    # -- Add your own stop words to custom_stop_wds using '','',''
    custom_stop_wds = ['xxxx','yyyy','zzzz', 'aaaa', 'bbbb', 'cccc', 'dddd', 'eeee', 'ffff', 'gggg']
    # Lower-case both sides of the comparison so custom entries written with
    # capitals still match.
    stop_words = {word.lower() for word in stopwords.words('english') + custom_stop_wds}
    return [token for token in itokens if token.lower() not in stop_words]

def remove_num(itokens):
    """Keep only the tokens made up entirely of alphabetic characters.

    Because the filter is ``str.isalpha``, this drops numbers and also any
    token containing digits, punctuation, hyphens or apostrophes.

    Parameters
    ----------
    itokens : iterable of str
        Word tokens.

    Returns
    -------
    list
        The purely alphabetic tokens, in their original order.
    """
    alpha_only = []
    for tok in itokens:
        if tok.isalpha():
            alpha_only.append(tok)
    return alpha_only

def remove_short_tokens(itokens):
    """Drop short tokens, keeping only those longer than two characters.

    Parameters
    ----------
    itokens : iterable of str
        Word tokens.

    Returns
    -------
    list
        Tokens with three or more characters, in their original order.
    """
    min_chars = 3
    return list(filter(lambda tok: len(tok) >= min_chars, itokens))

def pstem(itokens):
    """Reduce every token to its stem with NLTK's ``PorterStemmer``.

    Parameters
    ----------
    itokens : iterable of str
        Word tokens.

    Returns
    -------
    list
        One stem per input token, in the same order.
    """
    porter = PorterStemmer()
    stems = []
    for tok in itokens:
        stems.append(porter.stem(tok))
    return stems

In [None]:
# -- PROC_NOTES column to str so it can be tokenized.
# -- Missing notes are blanked first: astype(str) alone turns NaN into the
# -- literal text 'nan', which would pass every later token filter and end
# -- up as a word in the topic models.

wdf['PROC_NOTES'] = wdf['PROC_NOTES'].fillna('').astype(str)

In [None]:
# -- unwanted phrases removed from analysis... don't carry meaning on their own / in this context
# -- regex=False: the phrases are literal text. The default for `regex` differs
# -- between pandas versions, so it is set explicitly; a phrase containing
# -- characters such as '.', '(' or '+' is then still removed verbatim.

wdf['PROC_NOTES'] = wdf['PROC_NOTES'].str.replace('xxxxxxxxxxx', '', regex=False)
wdf['PROC_NOTES'] = wdf['PROC_NOTES'].str.replace('yyyyyyyyyyyyy', '', regex=False)

In [None]:
# -- Tokenize: one list of word tokens per note, stored in PROC_TOKENS

wdf['PROC_TOKENS'] = wdf['PROC_NOTES'].map(tokenizer)

In [None]:
# -- Filter stop words (NLTK English list + custom list) out of every token list

wdf['PROC_TOKENS'] = wdf['PROC_TOKENS'].map(rem_stopwords)

In [None]:
# -- Keep alphabetic tokens only (drops numbers and anything else non-alphabetic)

wdf['PROC_TOKENS'] = wdf['PROC_TOKENS'].map(remove_num)

In [None]:
# -- Drop short tokens: only tokens longer than two characters are kept

wdf['PROC_TOKENS'] = wdf['PROC_TOKENS'].map(remove_short_tokens)

In [None]:
# -- Lemmatize the cleaned tokens into TOKENS_LEMM (the input to the gensim models)

wdf['TOKENS_LEMM'] = wdf['PROC_TOKENS'].map(lemmatizer)

In [None]:
# -- Porter-stem the cleaned tokens into TOKENS_STEMS
# -- (built from PROC_TOKENS, not from the lemmas)
wdf['TOKENS_STEMS'] = wdf['PROC_TOKENS'].map(pstem)

In [None]:
# -- Gensim input: the lemmatized token lists, one per note

gensim_tokens = wdf.loc[:, "TOKENS_LEMM"]

In [None]:
# -- Build the gensim dictionary from the token lists
gensim_dict = corpora.Dictionary(documents=gensim_tokens)

# -- Printed so the dictionary summary shows in the cell output
print(gensim_dict)

In [None]:
# -- Bag-of-words corpus: one doc2bow vector per note's token list
gensim_corpus = list(map(gensim_dict.doc2bow, gensim_tokens))

# Running the algorithms

### Gridsearch for params (N/A for Gridsearch models)

In [None]:
# -- Fit the gensim NMF topic model (14 topics, seeded with rand_seed)

# -- GridSearchCV recommended parameters (SKLearn) = {'alpha_H': 0, 'alpha_W': 0.1, 'l1_ratio': 0.1, 'n_components': 8, 'random_state': 1}

NMF = gensim.models.nmf.Nmf(
    corpus=gensim_corpus,
    id2word=gensim_dict,
    num_topics=14,
    random_state=rand_seed,
)


In [None]:
# -- Fit the gensim LDA topic model (22 topics, seeded with rand_seed)

LDA = gensim.models.ldamodel.LdaModel(
    corpus=gensim_corpus,
    id2word=gensim_dict,
    num_topics=22,
    random_state=rand_seed,
)


In [None]:
# -- Fit the gensim HDP model (no num_topics is passed; seeded with rand_seed)

HDP = gensim.models.hdpmodel.HdpModel(
    corpus=gensim_corpus,
    id2word=gensim_dict,
    random_state=rand_seed,
)

In [None]:
# -- Show the NMF topics via top_topics, using the same corpus the model was fitted on
NMF.top_topics(corpus=gensim_corpus)

In [None]:
# -- Show the LDA topics via top_topics, using the same corpus the model was fitted on
LDA.top_topics(corpus=gensim_corpus)

In [None]:
# -- Show HDP topics: request 15 topics with 25 words each
HDP.print_topics(num_topics=15, num_words=25)

In [None]:
# -- Interactive pyLDAvis view of the LDA model

# -- Switch pyLDAvis to notebook mode before preparing the visualisation
pyLDAvis.enable_notebook()

vislda = pyLDAvis.gensim_models.prepare(
    LDA,
    gensim_corpus,
    gensim_dict,
)
# -- Last expression of the cell, so the visualisation is displayed
vislda

In [None]:
# -- Show full cell contents (no truncation) so whole notes are readable.
# -- None is the supported "unlimited" value; -1 was deprecated in pandas 1.0
# -- and is rejected by newer versions.
pd.set_option("display.max_colwidth", None)

In [None]:
# -- Enter search words / phrases and select a sample.
# -- A note matches only when it contains ALL of the non-empty terms
# -- (case-insensitive, in any order). Empty strings are ignored, so the
# -- default ['',''] matches every note.

words = ['','']

# -- One look-ahead per term so the terms may appear in any order.
# -- re.escape makes each term match literally, even when it contains regex
# -- metacharacters such as '(', '+' or '.'.
regex_words = []

for word in words:
    if word:
        regex_words.append("(?=.*" + re.escape(word) + ")")

# -- na=False   : a missing NOTES value counts as "no match" instead of putting
# --              NaN into the boolean mask (which cannot be used for indexing)
# -- re.DOTALL  : lets '.*' cross line breaks, so multi-line notes are searched in full
search_mask = wdf['NOTES'].str.contains(''.join(regex_words), case=False, flags=re.DOTALL, na=False)

searchdf = pd.DataFrame(wdf['NOTES'][search_mask])

# -- Up to 10 matching notes. The size is capped so that fewer than 10 matches
# -- does not raise, and the draw is seeded so a re-run shows the same sample.
sample = searchdf.sample(n=min(10, len(searchdf)), random_state=rand_seed)

# -- Make search terms upper case to highlight them. The substitution is
# -- case-insensitive (like the search itself), so a term is highlighted
# -- whatever casing it has in the note.
for i in range(0,len(sample['NOTES'])):
    for j in words:
        if j:
            sample.iat[i,0] = re.sub(re.escape(j), lambda m: m.group(0).upper(), sample.iat[i,0], flags=re.IGNORECASE)

In [None]:
# -- Matching notes / total notes
print(f"{len(searchdf)} / {len(wdf)}")

In [None]:
# -- Display the sampled notes, with the search terms upper-cased by the search cell
sample