# Data preparation

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

In [6]:
data = pd.read_csv('raw_data/drugsComTrain_raw.csv')
data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [62]:
# data cleaning
import string 

def remove_punct(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    text = text.replace('\n', ' ')
    return text

def to_lower(text):
    return text.lower()

def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(text)
    return text

from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    text = ''.join(lemmatized)
    return text

cleaning_functions = [remove_punct, to_lower, remove_numbers, remove_stopwords, lemmatize_text]

In [12]:
# don't do this again, takes way to long... select train_data of data before
data["clean_review"] = data["review"].apply(remove_punct).apply(to_lower).apply(remove_numbers).apply(remove_stopwords).apply(lemmatize_text)

In [67]:
train_data = data[100:50100].drop(["uniqueID", "condition", "rating", "date", "usefulCount"], axis =1)
print(train_data.shape)
train_data.head()

(50000, 3)


Unnamed: 0,drugName,review,clean_review
100,Macrobid,"""Awful medicine, the worst. The side effects o...",awful medicine worst side effects outweigh ben...
101,Lurasidone,"""I&#039;ve been on Latuda for a little under 2...",ive latuda little half years almost completely...
102,Adapalene / benzoyl peroxide,"""I&#039;ve seriously only been using Epiduo fo...",ive seriously using epiduo four days seen huge...
103,Brimonidine,"""This cream is absolutely horrible. I will adm...",cream absolutely horrible admit cream initiall...
104,Amlodipine / olmesartan,"""My blood pressure has been around 160/100. Do...",blood pressure around doctor prescribed azor h...


# Simple LDA

In [55]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 0.1).fit(train_data['clean_review'])
data_vectorized = vectorizer.transform(train_data['clean_review'])

lda_model = LatentDirichletAllocation(n_components=2).fit(data_vectorized)

In [56]:
# visualise output
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        
print_topics(lda_model, vectorizer)

Topic 0:
[('side', 3520.653496325217), ('pain', 3504.1421844324345), ('effects', 3146.7471855162444), ('started', 2536.8704779129175), ('days', 2456.0040434443185), ('years', 2398.7725164074686), ('weeks', 2219.9805313147003), ('weight', 2118.0442715207605), ('back', 2113.2958106488436), ('medicine', 2093.644532442816)]
Topic 1:
[('im', 2967.731449509674), ('take', 2875.403981680937), ('mg', 2612.0065312617035), ('taking', 2588.1027056674857), ('day', 2580.921273505461), ('like', 2467.281850765035), ('ive', 2459.8838630779633), ('medication', 2351.1539107618446), ('feel', 2315.552081894911), ('get', 2143.928426523724)]


In [57]:
# dummy test: is 'side effect' a topic?
example = ["side effect"]

example_vectorized = vectorizer.transform(example)
lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.7488614365972491
topic 1 : 0.2511385634027508


# Compare to labelled data

## Get Data

In [68]:
test_data = pd.read_csv('raw_data/manually_labelled_data.csv')
test_data = test_data.drop(["uniqueID", "condition", "rating", "date", "usefulCount"], axis =1)
test_data.head()

Unnamed: 0,drugName,review,sideEffect
0,Valsartan,"""It has no side effect, I take it in combinati...",0
1,Guanfacine,"""My son is halfway through his fourth week of ...",1
2,Lybrel,"""I used to take another oral contraceptive, wh...",1
3,Ortho Evra,"""This is my first time using any form of birth...",1
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",1


In [69]:
test_data["clean_review"] = test_data["review"].apply(remove_punct).apply(to_lower).apply(remove_numbers).apply(remove_stopwords).apply(lemmatize_text)

In [70]:
test_data.head()

Unnamed: 0,drugName,review,sideEffect,clean_review
0,Valsartan,"""It has no side effect, I take it in combinati...",0,side effect take combination bystolic mg fish oil
1,Guanfacine,"""My son is halfway through his fourth week of ...",1,son halfway fourth week intuniv became concern...
2,Lybrel,"""I used to take another oral contraceptive, wh...",1,used take another oral contraceptive pill cycl...
3,Ortho Evra,"""This is my first time using any form of birth...",1,first time using form birth control im glad we...
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",1,suboxone completely turned life around feel he...


## Transform

In [59]:
test_data_vectorized = vectorizer.transform(test_data['clean_review'])

lda_vectors = lda_model.transform(test_data_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.4089268026379905
topic 1 : 0.5910731973620095


## Compare

In [60]:
predicted_topic = []
for row in range(len(test_data)):
    if (lda_vectors[row][0] > lda_vectors[row][1]):
        predicted_topic.append(0)
    else: predicted_topic.append(1)

np.array(predicted_topic)

array([1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1])

In [51]:
np.array(test_data.sideEffect)

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0])

In [61]:
(np.array(predicted_topic) == np.array(test_data.sideEffect)).sum()

55