In [3]:
file_name = 'eyecareFAQ.csv'
text = open(file_name).read()

In [4]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [5]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to /home/kam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/kam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [8]:
import random
text_data = []
with open('eyecareFAQ.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['treatment', 'allergy']
['iritis', 'treat']
['prevent', 'corneal', 'opacity']
['water', 'older']
['angle', 'glaucoma', 'treat']
['different', 'form', 'infectious', 'conjunctivitis']
['vision', 'test', 'important']
['advantage', 'lasek', 'surgery']
['optometrist']
['bleeding']
['retinopathy', 'prematurity', 'diagnose']
['ultrasound']
['doctor', 'injury']
['cause']
['factor', 'glaucoma']
['treat']
['visit', 'emergency', 'department', 'subconjunctival', 'hemorrhage']
['test', 'doctor', 'child', 'younger']
['antihistamine', 'cause']
['prevent', 'ocular', 'rosacea', 'getting', 'worse']
['serious', 'disorder', 'know', 'occur', 'floater']
['drop', 'avoid', 'glaucoma']
['infectious', 'conjunctivitis']
['smoking', 'damage']
['happen', 'photorefractive', 'keratectomy', 'surgery']
['eyelid', 'inflammation', 'blepharitis', 'affect']
['nyctalopia']
['legally', 'blind']
['useless', 'exercise']
['someone', 'prepare']
['treat']
['strabismus', 'cross']
['reason', 'doctor', 'freckle']
['conditions', 'n

In [10]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [11]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.045*"treat" + 0.045*"glaucoma" + 0.045*"emergency" + 0.045*"hemorrhage"')
(1, '0.054*"doctor" + 0.029*"serious" + 0.029*"floater" + 0.029*"disorder"')
(2, '0.060*"child" + 0.060*"cross" + 0.060*"cause" + 0.033*"worse"')
(3, '0.045*"test" + 0.045*"prevent" + 0.045*"conditions" + 0.045*"night"')
(4, '0.069*"treat" + 0.069*"conjunctivitis" + 0.069*"infectious" + 0.038*"glaucoma"')


In [12]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]


In [13]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.062*"doctor" + 0.044*"test" + 0.026*"child" + 0.026*"prevent"')
(1, '0.075*"treat" + 0.058*"glaucoma" + 0.040*"cause" + 0.023*"occur"')
(2, '0.044*"surgery" + 0.044*"cross" + 0.044*"infectious" + 0.044*"conjunctivitis"')


In [14]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.057*"getting" + 0.057*"rosacea" + 0.057*"ocular" + 0.057*"worse"')
(1, '0.122*"treat" + 0.043*"know" + 0.043*"serious" + 0.043*"hemorrhage"')
(2, '0.082*"surgery" + 0.082*"photorefractive" + 0.082*"happen" + 0.082*"keratectomy"')
(3, '0.069*"test" + 0.069*"infectious" + 0.069*"conjunctivitis" + 0.036*"different"')
(4, '0.071*"affect" + 0.071*"blepharitis" + 0.071*"inflammation" + 0.071*"eyelid"')
(5, '0.146*"cause" + 0.076*"prevent" + 0.076*"corneal" + 0.076*"opacity"')
(6, '0.076*"doctor" + 0.076*"reason" + 0.076*"freckle" + 0.076*"damage"')
(7, '0.106*"child" + 0.106*"cross" + 0.106*"bleeding" + 0.010*"treat"')
(8, '0.089*"treat" + 0.089*"glaucoma" + 0.089*"angle" + 0.089*"legally"')
(9, '0.014*"treat" + 0.014*"cause" + 0.014*"optometrist" + 0.014*"glaucoma"')


In [17]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# Categorize 3 topics

In [18]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

# Categorize 10 topics

In [19]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)