# Try LDA without "side effect"-Term

## Prepare dataset without "side effects"

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np

In [2]:
data = pd.read_csv('../raw_data/drugsComTrain_raw.csv')
data.head()

train_data = data[100:5100].drop(["uniqueID", "condition", "rating", "date", "usefulCount"], axis =1)
print(train_data.shape)
train_data.head()

(5000, 2)


Unnamed: 0,drugName,review
100,Macrobid,"""Awful medicine, the worst. The side effects o..."
101,Lurasidone,"""I&#039;ve been on Latuda for a little under 2..."
102,Adapalene / benzoyl peroxide,"""I&#039;ve seriously only been using Epiduo fo..."
103,Brimonidine,"""This cream is absolutely horrible. I will adm..."
104,Amlodipine / olmesartan,"""My blood pressure has been around 160/100. Do..."


In [3]:
# data cleaning
import string 

def remove_punct(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    text = text.replace('\n', ' ')
    return text

def to_lower(text):
    return text.lower()

def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(text)
    return text

from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    text = ''.join(lemmatized)
    return text

In [4]:
train_data["clean_review"] = train_data["review"].apply(remove_punct).apply(to_lower).apply(remove_numbers).apply(remove_stopwords).apply(lemmatize_text)

train_data.head()

Unnamed: 0,drugName,review,clean_review
100,Macrobid,"""Awful medicine, the worst. The side effects o...",awful medicine worst side effects outweigh ben...
101,Lurasidone,"""I&#039;ve been on Latuda for a little under 2...",ive latuda little half years almost completely...
102,Adapalene / benzoyl peroxide,"""I&#039;ve seriously only been using Epiduo fo...",ive seriously using epiduo four days seen huge...
103,Brimonidine,"""This cream is absolutely horrible. I will adm...",cream absolutely horrible admit cream initiall...
104,Amlodipine / olmesartan,"""My blood pressure has been around 160/100. Do...",blood pressure around doctor prescribed azor h...


In [41]:
# remove "side" and "effects"
train_data['clean_review'] = train_data['clean_review'].str.replace('side', '')
train_data['clean_review'] = train_data['clean_review'].str.replace('effects', '')
train_data

Unnamed: 0,drugName,review,clean_review
100,Macrobid,"""Awful medicine, the worst. The side effects o...",awful medicine worst outweigh benefit headac...
101,Lurasidone,"""I&#039;ve been on Latuda for a little under 2...",ive latuda little half years almost completely...
102,Adapalene / benzoyl peroxide,"""I&#039;ve seriously only been using Epiduo fo...",ive seriously using epiduo four days seen huge...
103,Brimonidine,"""This cream is absolutely horrible. I will adm...",cream absolutely horrible admit cream initiall...
104,Amlodipine / olmesartan,"""My blood pressure has been around 160/100. Do...",blood pressure around doctor prescribed azor h...
...,...,...,...
5095,Oseltamivir,"""Started feeling ill at lunchtime yesterday, b...",started feeling ill lunchtime yesterday time w...
5096,Losartan,"""I was switched to Losartan from Lisinipril wh...",switched losartan lisinipril caused horrifying...
5097,Celexa,"""Was on Citalopram for 1 month then asked my d...",citalopram month asked doctor change first wee...
5098,Ortho Tri-Cyclen Lo,"""I&#039;ve been on Ortho Tri-Cyclen Lo for 4 y...",ive ortho tricyclen lo years first used contro...


## Simple LDA

In [75]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range = (2,2)).fit(train_data['clean_review'])
data_vectorized = vectorizer.transform(train_data['clean_review'])

lda_model = LatentDirichletAllocation(n_components=2).fit(data_vectorized)

In [76]:
# visualise output
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        
print_topics(lda_model, vectorizer)

Topic 0:
[('side effects', 33.73979535993013), ('birth control', 17.22205603766533), ('works great', 11.560611558392786), ('side effect', 11.452744187490966), ('weight gain', 10.127115060511406), ('mood swings', 8.38747594833456), ('two weeks', 7.328212372035449), ('started taking', 7.243171083474), ('blood pressure', 6.898379075277094), ('stopped taking', 6.734157248412776)]
Topic 1:
[('side effects', 31.843987459515066), ('birth control', 15.544222900897253), ('started taking', 11.274575722450031), ('weight gain', 10.65655688228259), ('mood swings', 10.14500407010599), ('feel like', 9.548231608239165), ('sex drive', 8.566124048533354), ('side effect', 8.286765220007744), ('years ago', 8.01245077417522), ('years old', 7.718880835012835)]


## Interpretation

In [77]:
# dummy test: is 'side effect' a topic?
example = ["side effect"]

example_vectorized = vectorizer.transform(example)
lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1]) 
# nope, makes sense, since it hasn't seen it before, it doesnt know where to put it.

topic 0 : 0.6497193900737439
topic 1 : 0.3502806099262561


In [78]:
# compare to labelled data
test_data = pd.read_csv('../raw_data/manually_labelled_data.csv')
test_data = test_data.drop(["uniqueID", "condition", "rating", "date", "usefulCount"], axis =1)
test_data.head()

Unnamed: 0,drugName,review,sideEffect
0,Valsartan,"""It has no side effect, I take it in combinati...",0
1,Guanfacine,"""My son is halfway through his fourth week of ...",1
2,Lybrel,"""I used to take another oral contraceptive, wh...",1
3,Ortho Evra,"""This is my first time using any form of birth...",1
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",1


In [79]:
test_data["clean_review"] = test_data["review"].apply(remove_punct).apply(to_lower).apply(remove_numbers).apply(remove_stopwords).apply(lemmatize_text)

test_data.head()

Unnamed: 0,drugName,review,sideEffect,clean_review
0,Valsartan,"""It has no side effect, I take it in combinati...",0,side effect take combination bystolic mg fish oil
1,Guanfacine,"""My son is halfway through his fourth week of ...",1,son halfway fourth week intuniv became concern...
2,Lybrel,"""I used to take another oral contraceptive, wh...",1,used take another oral contraceptive pill cycl...
3,Ortho Evra,"""This is my first time using any form of birth...",1,first time using form birth control im glad we...
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",1,suboxone completely turned life around feel he...


In [80]:
# transform test_data
test_data_vectorized = vectorizer.transform(test_data['clean_review'])

lda_vectors = lda_model.transform(test_data_vectorized)

In [81]:
# Compare
predicted_topic = []
for row in range(len(test_data)):
    if (lda_vectors[row][0] > lda_vectors[row][1]):
        predicted_topic.append(0)
    else: predicted_topic.append(1)

(np.array(predicted_topic) == np.array(test_data.sideEffect)).sum()

## 55% --> pretty much randomly assigned

59

# MNBayes

In [69]:
data = pd.read_csv('../raw_data/drugsComTrain_raw.csv')
data.head()

train_data = data[100:5100].drop(["uniqueID", "condition", "rating", "date", "usefulCount"], axis =1)
print(train_data.shape)
train_data.head()

(5000, 2)


Unnamed: 0,drugName,review
100,Macrobid,"""Awful medicine, the worst. The side effects o..."
101,Lurasidone,"""I&#039;ve been on Latuda for a little under 2..."
102,Adapalene / benzoyl peroxide,"""I&#039;ve seriously only been using Epiduo fo..."
103,Brimonidine,"""This cream is absolutely horrible. I will adm..."
104,Amlodipine / olmesartan,"""My blood pressure has been around 160/100. Do..."


In [70]:
train_data["clean_review"] = train_data["review"].apply(remove_punct).apply(to_lower).apply(remove_numbers).apply(remove_stopwords).apply(lemmatize_text)

train_data.head()

Unnamed: 0,drugName,review,clean_review
100,Macrobid,"""Awful medicine, the worst. The side effects o...",awful medicine worst side effects outweigh ben...
101,Lurasidone,"""I&#039;ve been on Latuda for a little under 2...",ive latuda little half years almost completely...
102,Adapalene / benzoyl peroxide,"""I&#039;ve seriously only been using Epiduo fo...",ive seriously using epiduo four days seen huge...
103,Brimonidine,"""This cream is absolutely horrible. I will adm...",cream absolutely horrible admit cream initiall...
104,Amlodipine / olmesartan,"""My blood pressure has been around 160/100. Do...",blood pressure around doctor prescribed azor h...


In [72]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


vectorizer_mnb = TfidfVectorizer()

X = vectorizer_mnb.fit_transform(test_data.clean_review)
y = test_data.sideEffect

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 4)

nb_model = MultinomialNB()
nb_model.fit(X_train,y_train)
nb_model.score(X_test,y_test)

0.6333333333333333

In [74]:
# as crossvalidation
from sklearn.model_selection import cross_validate

X = test_data.clean_review
y = test_data.sideEffect
nb_model = MultinomialNB()

cv_results = cross_validate(nb_model, X, y, cv=5)

cv_results['test_score'].mean()

Traceback (most recent call last):
  File "/Users/hendrike.dahmke/.pyenv/versions/3.7.7/envs/lewagon/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/hendrike.dahmke/.pyenv/versions/3.7.7/envs/lewagon/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 615, in fit
    X, y = self._check_X_y(X, y)
  File "/Users/hendrike.dahmke/.pyenv/versions/3.7.7/envs/lewagon/lib/python3.7/site-packages/sklearn/naive_bayes.py", line 480, in _check_X_y
    return self._validate_data(X, y, accept_sparse='csr')
  File "/Users/hendrike.dahmke/.pyenv/versions/3.7.7/envs/lewagon/lib/python3.7/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/hendrike.dahmke/.pyenv/versions/3.7.7/envs/lewagon/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/Users/hendrike.dahmke/.

nan