# Data preparation

In [2]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv('../raw_data/drugsComTrain_raw.csv')
data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [5]:
# data cleaning
import string 

def remove_punct(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    text = text.replace('\n', ' ')
    return text

def to_lower(text):
    return text.lower()

def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(text)
    return text

from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    text = ''.join(lemmatized)
    return text

cleaning_functions = [remove_punct, to_lower, remove_numbers, remove_stopwords, lemmatize_text]

In [6]:
train_data = data[300:5300].drop(["uniqueID", "condition", "rating", "date", "usefulCount"], axis =1)
print(train_data.shape)
train_data.head()

(5000, 2)


Unnamed: 0,drugName,review
300,Nitrofurantoin,"""Although I think I might have kidney stones i..."
301,Metronidazole,"""I&#039;ve been taking metronidazoles for 4 da..."
302,Ziprasidone,"""This was the worst experience I&#039;ve had w..."
303,Acetaminophen / butalbital / caffeine,"""I have been suffering from terrible allergies..."
304,Naproxen,"""One thing I&#039;ve learned is to avoid dairy..."


In [7]:
# don't do this on full dataset, takes way to long... select train_data of data before
train_data["clean_review"] = train_data["review"].apply(remove_punct).apply(to_lower).apply(remove_numbers).apply(remove_stopwords).apply(lemmatize_text)

In [8]:
train_data.head()

Unnamed: 0,drugName,review,clean_review
300,Nitrofurantoin,"""Although I think I might have kidney stones i...",although think might kidney stones instead uti...
301,Metronidazole,"""I&#039;ve been taking metronidazoles for 4 da...",ive taking metronidazoles days day im still ag...
302,Ziprasidone,"""This was the worst experience I&#039;ve had w...",worst experience ive medication go lithium gav...
303,Acetaminophen / butalbital / caffeine,"""I have been suffering from terrible allergies...",suffering terrible allergies due hay fever all...
304,Naproxen,"""One thing I&#039;ve learned is to avoid dairy...",one thing ive learned avoid dairy products muc...


# Simple LDA

In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 0.1).fit(train_data['clean_review'])
data_vectorized = vectorizer.transform(train_data['clean_review'])

lda_model = LatentDirichletAllocation(n_components=2).fit(data_vectorized)

In [10]:
# visualise output
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        
print_topics(lda_model, vectorizer)

Topic 0:
[('im', 312.00345654454736), ('get', 278.1689537624207), ('months', 239.60880745015868), ('ive', 235.2408149365387), ('pill', 229.59984060502342), ('weight', 216.39681018413108), ('medication', 215.75956271226573), ('life', 210.30839868814436), ('feel', 207.67400560746853), ('time', 206.125357806003)]
Topic 1:
[('side', 349.8587153572554), ('pain', 340.0695590961187), ('effects', 316.56059962655246), ('day', 288.05886313261897), ('days', 262.30916973504213), ('took', 214.79450651685906), ('mg', 213.94392912977523), ('back', 205.0051380815705), ('medicine', 197.62171954946504), ('great', 195.99406473798885)]


In [11]:
# dummy test: is 'side effect' a topic?
example = ["side effect"]

example_vectorized = vectorizer.transform(example)
lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.2506184620082339
topic 1 : 0.7493815379917661


# Compare to labelled data

## Get Data

In [23]:
test_data = pd.read_csv('../raw_data/adr_labelled_data.csv', nrows=300)
test_data = test_data.drop(["uniqueID", "condition", "rating", "date", "usefulCount", "Unnamed: 8"], axis =1)
print(test_data.shape)
test_data.head()

(300, 3)


Unnamed: 0,drugName,review,sideEffect
0,Valsartan,"""It has no side effect, I take it in combinati...",0
1,Guanfacine,"""My son is halfway through his fourth week of ...",1
2,Lybrel,"""I used to take another oral contraceptive, wh...",1
3,Ortho Evra,"""This is my first time using any form of birth...",1
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",1


In [24]:
test_data["clean_review"] = test_data["review"].apply(remove_punct).apply(to_lower).apply(remove_numbers).apply(remove_stopwords).apply(lemmatize_text)

In [25]:
test_data.head()

Unnamed: 0,drugName,review,sideEffect,clean_review
0,Valsartan,"""It has no side effect, I take it in combinati...",0,side effect take combination bystolic mg fish oil
1,Guanfacine,"""My son is halfway through his fourth week of ...",1,son halfway fourth week intuniv became concern...
2,Lybrel,"""I used to take another oral contraceptive, wh...",1,used take another oral contraceptive pill cycl...
3,Ortho Evra,"""This is my first time using any form of birth...",1,first time using form birth control im glad we...
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",1,suboxone completely turned life around feel he...


## Transform

In [26]:
test_data_vectorized = vectorizer.transform(test_data['clean_review'])

lda_vectors = lda_model.transform(test_data_vectorized)

# display results for first row
print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.21581096661966642
topic 1 : 0.7841890333803335


## Compare

In [30]:
predicted_topic = []
for row in range(len(test_data)):
    if (lda_vectors[row][0] < lda_vectors[row][1]):
        predicted_topic.append(0)
    else: predicted_topic.append(1)

np.array(predicted_topic).shape

(300,)

In [28]:
np.array(test_data.sideEffect).shape

(300,)

In [31]:
#accurate predictions
((np.array(predicted_topic) == np.array(test_data.sideEffect)).sum())/len(test_data)

0.5233333333333333

## reserve count

In [32]:
data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [33]:
my_data = data[['drugName', 'review']]

In [34]:
my_data

Unnamed: 0,drugName,review
0,Valsartan,"""It has no side effect, I take it in combinati..."
1,Guanfacine,"""My son is halfway through his fourth week of ..."
2,Lybrel,"""I used to take another oral contraceptive, wh..."
3,Ortho Evra,"""This is my first time using any form of birth..."
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around..."
...,...,...
161292,Campral,"""I wrote my first report in Mid-October of 201..."
161293,Metoclopramide,"""I was given this in IV before surgey. I immed..."
161294,Orencia,"""Limited improvement after 4 months, developed..."
161295,Thyroid desiccated,"""I&#039;ve been on thyroid medication 49 years..."


In [35]:
my_data.review.split()

AttributeError: 'Series' object has no attribute 'split'