In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions

plt.style.use('ggplot')
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models, similarities, matutils
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

### Import cleaned datasets

In [3]:
df_train_s = pd.read_csv('/Users/jsong/Documents/durg-recommendation/df_train.csv')
# df_test_s = pd.read_csv('/Users/jsong/Documents/durg-recommendation/df_test.csv')

In [4]:
df_train_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148387 entries, 0 to 148386
Data columns (total 5 columns):
drugName       148387 non-null object
condition      148387 non-null object
rating         148387 non-null float64
usefulCount    148387 non-null int64
review         148386 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 5.7+ MB


In [5]:
#drop Nan value
df_train_s.dropna(inplace=True)

In [6]:
df_train_s.reset_index(inplace=True)

In [7]:
df_train_s['condition']

0                                 ADHD
1                        Birth Control
2                        Birth Control
3         Benign Prostatic Hyperplasia
4              Emergency Contraception
                      ...             
148381                   Birth Control
148382                 Nausea/Vomiting
148383            Rheumatoid Arthritis
148384             Underactive Thyroid
148385           Constipation, Chronic
Name: condition, Length: 148386, dtype: object

In [8]:
condition_list=df_train_s['condition'].tolist()

In [9]:
corpus_train=df_train_s.review
# corpus_test=df_test_s.review

In [10]:
#custom stopwords
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
n = ["aren't","couldn't","didn't","doesn't","don't","hadn't","hasn't","haven't","isn't",
     "mightn't","mustn't","needn't","no","nor","not","shan't","shouldn't","wasn't","weren't","wouldn't"]
for i in n:
    stop.remove(i)

a = ['mg', 'week', 'month', 'day', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 
     'august', 'september','october','november','december', 'iv','oral','pound',]
for j in a:
    stop.add(j)

### CountVectorizer

In [11]:
# Create a CountVectorizer for parsing/counting words
cv = CountVectorizer(ngram_range=(2, 2), min_df=10, max_df=0.8)

cv.fit(corpus_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=10,
                ngram_range=(2, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [12]:
doc_word = cv.transform(corpus_train).transpose()

In [13]:
pd.DataFrame(doc_word.toarray(), cv.get_feature_names()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,148376,148377,148378,148379,148380,148381,148382,148383,148384,148385
abdomen area,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdomen pain,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdominal area,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdominal back,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdominal bloating,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
corpus = matutils.Sparse2Corpus(doc_word)

In [15]:
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [16]:
len(id2word)

81732

### LDA

In [17]:
lda = models.LdaModel(corpus=corpus, num_topics=2, id2word=id2word, passes=10)

KeyboardInterrupt: 

In [None]:
lda.print_topics()

In [None]:
lda_corpus = lda[corpus]
lda_corpus

In [None]:
lda_docs = [doc for doc in lda_corpus]

In [None]:
lda_docs[0:5]

In [None]:
len(lda_docs)

In [None]:
from wordcloud import WordCloud
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda.show_topics(formatted=False)

fig, axes = plt.subplots(1, 2, figsize=(10,20), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

plt.savefig('/Users/jsong/Documents/durg-recommendation/fig/wc_bigram_lda-2.svg')

In [18]:
lda1 = models.LdaModel(corpus=corpus, num_topics=4, id2word=id2word, passes=10)

In [19]:
lda1.print_topics()

[(0,
  '0.004*"blood pressure" + 0.004*"started taking" + 0.003*"lost lb" + 0.003*"feel like" + 0.003*"dry mouth" + 0.003*"weight loss" + 0.002*"lost pound" + 0.002*"year old" + 0.002*"much better" + 0.002*"fall asleep"'),
 (1,
  '0.004*"felt like" + 0.004*"first time" + 0.003*"yeast infection" + 0.003*"took pill" + 0.003*"hour later" + 0.002*"two day" + 0.002*"unprotected sex" + 0.002*"got period" + 0.002*"cold sore" + 0.002*"reading review"'),
 (2,
  '0.038*"side effect" + 0.007*"no side" + 0.004*"year ago" + 0.004*"panic attack" + 0.003*"year old" + 0.003*"feel like" + 0.002*"started taking" + 0.002*"no longer" + 0.002*"month ago" + 0.002*"work well"'),
 (3,
  '0.018*"birth control" + 0.010*"mood swing" + 0.009*"weight gain" + 0.008*"side effect" + 0.007*"sex drive" + 0.002*"taking pill" + 0.002*"gained weight" + 0.002*"first month" + 0.002*"gained pound" + 0.002*"no weight"')]

In [43]:
all_topics = lda1.get_document_topics(corpus)
all_topics

<gensim.interfaces.TransformedCorpus at 0x27497cc210>

In [44]:
num_docs = len(all_topics)

In [48]:
num_topics=4

lda_scores = np.empty([num_docs, num_topics])

for i in range(0, num_docs):
    lda_scores[i] = np.array(all_topics[i]).transpose()[1]

ValueError: could not broadcast input array from shape (3) into shape (4)

In [20]:
lda_corpus1 = lda1[corpus]
lda_corpus1

<gensim.interfaces.TransformedCorpus at 0x273ca257d0>

In [21]:
lda_docs1 = [doc for doc in lda_corpus1]

In [22]:
lda_docs1[0:5]

[[(0, 0.09211747), (1, 0.25642222), (2, 0.64425576)],
 [(1, 0.30345616), (3, 0.6843869)],
 [(1, 0.119722486), (3, 0.86307216)],
 [(0, 0.5926051), (1, 0.047874488), (2, 0.34914845), (3, 0.01037195)],
 [(0, 0.020970924), (1, 0.93699145), (2, 0.02100258), (3, 0.021035058)]]

In [26]:
len(lda_docs1)

148386

In [50]:
def dominant_topic(ldamodel, corpus, texts):
     #Function to find the dominant topic in each review
     sent_topics_df = pd.DataFrame() 
     # Get main topic in each review
     for i, row in enumerate(ldamodel[corpus]):
         row = sorted(row, key=lambda x: (x[1]), reverse=True)
         # Get the Dominant topic, Perc Contribution and Keywords for each review
         for j, (topic_num, prop_topic) in enumerate(row):
             if j == 0:  # => dominant topic
                 wp = ldamodel.show_topic(topic_num,topn=4)
                 topic_keywords = ", ".join([word for word, prop in wp])
                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
             else:
                 break
     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
     contents = pd.Series(texts)
     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
     return(sent_topics_df)

df_dominant_topic = dominant_topic(ldamodel=lda1, corpus=corpus, texts=df_train_s['review']) 
df_dominant_topic.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,review
0,2.0,0.6443,"side effect, no side, year ago, panic attack",son halfway fourth intuniv became concerned be...
1,3.0,0.6844,"birth control, mood swing, weight gain, side e...",used take another contraceptive pill cycle hap...
2,3.0,0.863,"birth control, mood swing, weight gain, side e...",first time using form birth control glad went ...
3,0.0,0.5926,"blood pressure, started taking, lost lb, feel ...",nd started work rock hard erection however exp...
4,1.0,0.937,"felt like, first time, yeast infection, took pill",pulled cummed bit took plan b hour later took ...


In [51]:
#export topic keywords to csv file for later use
export_csv = df_dominant_topic.to_csv(r'/Users/jsong/Documents/durg-recommendation/df_dominant_topic.csv', index = None, header=True)

In [None]:
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics1 = lda1.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words1 = dict(topics1[i][1])
    cloud.generate_from_frequencies(topic_words1, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

plt.savefig('/Users/jsong/Documents/durg-recommendation/fig/wc_bigram_lda-4.svg')

In [None]:
lda2 = models.LdaModel(corpus=corpus, num_topics=6, id2word=id2word, passes=10)

In [None]:
lda2.print_topics()

In [None]:
lda_corpus2 = lda2[corpus]
lda_corpus2

In [None]:
lda_docs2 = [doc for doc in lda_corpus2]

In [None]:
lda_docs2[0:5]

In [None]:
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics2 = lda2.show_topics(formatted=False)

fig, axes = plt.subplots(2, 3, figsize=(12,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words2 = dict(topics2[i][1])
    cloud.generate_from_frequencies(topic_words2, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

plt.savefig('/Users/jsong/Documents/durg-recommendation/fig/wc_bigram_lda-6.svg')