In [2]:
# Importing Libraries
import pandas as pd
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#nltk.download()
import nltk

import re
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv('//content//drive//MyDrive//All CSV//drug.csv')

In [5]:
df.head()

Unnamed: 0,urlDrugName,rating,Review,score
0,enalapril,4,enalapril management of congestive heart failu...,Low
1,ortho-tri-cyclen,1,ortho-tri-cyclen birth prevention - Although t...,Low
2,ponstel,10,ponstel menstrual cramps - I was used to havin...,high
3,prilosec,3,prilosec acid reflux - The acid reflux went aw...,Low
4,lyrica,2,lyrica fibromyalgia - I think that the Lyrica ...,Low


In [6]:
df.shape

(4143, 4)

In [7]:
# Binarise the target: 1 for reviews labelled "high", 0 for everything else.
# The guard makes the cell safe to re-run: once the column is numeric, comparing
# it to "high" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['score']):
    df['score'] = df['score'].eq("high").astype(int)

In [8]:
df['score'].value_counts()

Unnamed: 0_level_0,count
score,Unnamed: 1_level_1
1,3241
0,902


In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
## Text pre-processing

#remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [12]:
#Adding custom stop words
new_words = ["some","one","like","time","br","drug","effect","could","good",'even', 'get', 'would',
             'make', 'really', 'see', 'well', 'much', 'great', 'first', 'people', 'also', 'bad',
             'show', 'way', 'thing', 'made', 'go', 'think', 'know', 'watch','look','many','day']
stop_words = stop_words.union(new_words)


## **Defining functions**

In [13]:
def strip_newline(series):
    """Return the reviews in ``series`` as a list with line breaks turned into spaces.

    Each newline is replaced by a single space rather than removed, so the last
    word of one line and the first word of the next stay separate tokens.

    Parameters
    ----------
    series : iterable of str
        The review texts (for example a pandas Series of strings).

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    return [review.replace('\n', ' ') for review in series]


def sent_to_words(sentences):
    """Lazily tokenise every item of ``sentences``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``. One token list is yielded per
    input item, in input order.
    """
    for raw_text in sentences:
        text = str(raw_text)
        tokens = gensim.utils.simple_preprocess(text, deacc=True)
        yield tokens

def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    A document may be a raw string or an already tokenised list/tuple of
    tokens (such as the output of ``sent_to_words``). Raw input is tokenised
    with ``simple_preprocess``. Token sequences are used as they are, rather
    than being converted back to a string and tokenised a second time.

    Parameters
    ----------
    texts : iterable
        Documents, each either a string or a list/tuple of tokens.

    Returns
    -------
    list of list of str
        For each document, the tokens that are not in the module-level
        ``stop_words`` collection, in their original order.
    """
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenised: filter directly.
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in stop_words])
    return cleaned

In [14]:
def bigrams(words, bi_min=15, tri_min=10):
    """Fit a gensim phrase model on ``words`` and return it wrapped in a ``Phraser``.

    Parameters
    ----------
    words : iterable of list of str
        Tokenised documents used to fit the phrase model.
    bi_min : int
        Passed to ``gensim.models.Phrases`` as ``min_count``.
    tri_min : int
        Accepted but not used by this function.

    Returns
    -------
    The ``gensim.models.phrases.Phraser`` built from the fitted ``Phrases`` model.
    """
    phrase_model = gensim.models.Phrases(words, min_count=bi_min)
    return gensim.models.phrases.Phraser(phrase_model)

In [15]:
def get_corpus(df):
    """Turn the ``Review`` column of ``df`` into a gensim bag-of-words corpus.

    Pipeline: strip newlines, tokenise, remove stop words, apply the phrase
    model from ``bigrams``, build a ``gensim.corpora.Dictionary``, prune it with
    ``filter_extremes(no_below=10, no_above=0.35)``, compact the ids, and
    convert every document with ``doc2bow``.

    Side effect: ``df['Review']`` is overwritten with the newline-stripped text.

    Returns
    -------
    tuple
        ``(corpus, id2word, bigram)``: the bag-of-words corpus, the pruned
        dictionary, and the phrase-merged token lists the corpus was built from.
    """
    # Written back to the caller's frame, not to a copy.
    df['Review'] = strip_newline(df.Review)

    tokenised_docs = remove_stopwords(list(sent_to_words(df.Review)))

    phraser = bigrams(tokenised_docs)
    phrased_docs = [phraser[doc] for doc in tokenised_docs]

    dictionary = gensim.corpora.Dictionary(phrased_docs)
    dictionary.filter_extremes(no_below=10, no_above=0.35)
    dictionary.compactify()

    bow_corpus = [dictionary.doc2bow(doc) for doc in phrased_docs]
    return bow_corpus, dictionary, phrased_docs

## **Apply function to corpus to pre-process and extract bi-grams**

In [16]:
train_corpus, train_id2word, bigram_train = get_corpus(df)

## **Build the Topic Model**

In [17]:
# Train a 10-topic LDA model on the bag-of-words corpus.
lda_train = gensim.models.ldamulticore.LdaMulticore(
                        corpus=train_corpus,
                        num_topics=10,
                        id2word=train_id2word,
                        chunksize=100,
                        workers=None,    # None -> gensim picks cpu_count() - 1 instead of a hard-coded 7
                        passes=50,
                        eval_every=1,
                        random_state=0,  # fixed seed so topic ids and weights are repeatable between runs
                        per_word_topics=True)

In [18]:
lda_train.print_topics(20,num_words=15)[:10]

[(0,
  '0.026*"days" + 0.014*"took" + 0.011*"taking" + 0.011*"infection" + 0.010*"take" + 0.008*"treatment" + 0.008*"doctor" + 0.007*"prescribed" + 0.007*"symptoms" + 0.007*"went" + 0.007*"felt" + 0.007*"two" + 0.007*"rash" + 0.006*"week" + 0.006*"started"'),
 (1,
  '0.066*"mg" + 0.020*"dose" + 0.017*"increased" + 0.016*"dosage" + 0.013*"anxiety" + 0.012*"medication" + 0.012*"daily" + 0.011*"patient" + 0.009*"reduced" + 0.009*"symptoms" + 0.009*"months" + 0.009*"treatment" + 0.009*"started" + 0.008*"severe" + 0.008*"experienced"'),
 (2,
  '0.020*"depression" + 0.012*"feel" + 0.011*"anxiety" + 0.011*"felt" + 0.009*"life" + 0.009*"taking" + 0.008*"medication" + 0.008*"better" + 0.007*"years" + 0.006*"mg" + 0.006*"take" + 0.006*"feeling" + 0.006*"work" + 0.006*"mood" + 0.005*"started"'),
 (3,
  '0.021*"blood_pressure" + 0.020*"pressure" + 0.016*"allergies" + 0.015*"symptoms" + 0.015*"cholesterol" + 0.013*"asthma" + 0.011*"medication" + 0.010*"take" + 0.010*"high_blood" + 0.010*"none" + 0.

## **Topic Visualisation**

In [34]:
pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [35]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualization
lda_display = gensimvis.prepare(lda_train, train_corpus, train_id2word)

# Display the interactive visualization
pyLDAvis.display(lda_display)

In [36]:
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_display)

  and should_run_async(code)


### **Extracting training vectors**

In [19]:
# Build one feature vector per review: the document's probability for every
# topic (ordered by topic id) followed by the review length in characters.
num_topics = lda_train.num_topics
review_lengths = df['Review'].map(len).tolist()

train_vecs = []
for bow, review_len in zip(train_corpus, review_lengths):
    doc_topics = lda_train.get_document_topics(bow, minimum_probability=0.0)
    # get_document_topics returns sparse (topic_id, probability) pairs and may
    # leave out near-zero topics. Filling by topic id keeps every column aligned
    # with its topic instead of relying on list position.
    topic_vec = [0.0] * num_topics
    for topic_id, prob in doc_topics:
        topic_vec[topic_id] = prob
    topic_vec.append(review_len)
    train_vecs.append(topic_vec)

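**Feature vector of the review at index 2: in the output below, topic 5 has the highest probability (≈0.41), followed by topics 4 and 7; the last value is the review length in characters.**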

In [20]:
train_vecs[2]

[0.001786405,
 0.0017864176,
 0.0017863943,
 0.0017863134,
 0.35718778,
 0.4132375,
 0.0017863876,
 0.21707006,
 0.0017864469,
 0.0017862511,
 712]

In [21]:
X = np.array(train_vecs)
y = np.array(df.score)

In [22]:
#Importing libraries for Model
from sklearn import model_selection, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.ensemble import RandomForestClassifier

In [23]:
print(X.shape, y.shape)

(4143, 11) (4143,)


In [24]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.30, random_state = 0)

In [25]:
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(2900, 11) (2900,)
(1243, 11) (1243,)


In [26]:
# Scale Data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [27]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train_scale, y_train)

In [28]:
# Predicting the Test set results
y_pred = classifier.predict(X_test_scale)

In [29]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm

array([[103, 172],
       [155, 813]])

In [30]:
from sklearn.metrics import accuracy_score, recall_score
print(accuracy_score(y_test, y_pred))
#print(recall_score(y_test, y_pred))

0.7369267900241352


Created during MBA coursework.