# Imports

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize 
from nltk.sentiment.util import mark_negation

In [2]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jack/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data

In [3]:
data = pd.read_csv('../raw_data/drugsComTrain_raw.csv')

print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

#data["drugName"].unique()

The shape of the data is 161297 rows and 7 columns


The amount of unique ID is 161297


The number of unique drugs reviewed is 3436


The number of unique conditions is 885




In [26]:
trained_data = pd.read_csv('../raw_data/manually_labelled_data.csv')

print(f"The shape of the data is {trained_data.shape[0]} rows and {trained_data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(trained_data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(trained_data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(trained_data['condition'].unique())}")
print("\n")

The shape of the data is 100 rows and 8 columns


The amount of unique ID is 100


The number of unique drugs reviewed is 84


The number of unique conditions is 54




In [10]:
trained_data.tail()
#trained_data['labeled_SEff'] = trained_data['sideEffect']
trained_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sideEffect
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,0
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,1
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1


In [5]:
word_counts = []
for i in data["review"]:
    word_counts.apply(pd.DataFrame(data["review"].value_counts()).head(10).T)

AttributeError: 'list' object has no attribute 'apply'

In [28]:
## define target

#y = trained_data["sideEffect"].head(100)

#trained_data = data.drop(["uniqueID", "date", "rating", "usefulCount"], axis = 1).head(100)

trained_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sideEffect
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,0
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,1
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1


In [12]:
## drop columns

data = data.drop(["uniqueID", "date", "rating", "usefulCount"], axis = 1).head(1000)

### Part of speech taging

In [None]:
def pos (x):
    tokenized = word_tokenize(x)
    pos_words = nltk.pos_tag(tokenized)
    return " ".join(pos_words)

# Functions

In [29]:
stop_words = set(stopwords.words('english')) 

def remove_stopwords (x):
    tokenized = mark_negation(word_tokenize(x))
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return " ".join(without_stopwords)

trained_data["review_neg"] = data.review.apply(remove_stopwords)



In [30]:
trained_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sideEffect,review_neg
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,0,"`` It side_NEG effect_NEG ,_NEG I_NEG take_NEG..."
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1,`` My son halfway fourth week Intuniv . We bec...
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,1,"`` I used take another oral contraceptive , 21..."
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1,`` This first time using form birth control . ...
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1,`` Suboxone completely turned life around . I ...


In [31]:
#create side effect feature

trained_data["contains_SE_mention"] = trained_data['review_neg'].str.contains('side effect', '')
trained_data["contains_NEG_SE_mention"] = trained_data['review_neg'].str.contains('side_NEG effect_NEG')

trained_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sideEffect,review_neg,contains_SE_mention,contains_NEG_SE_mention
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,0,"`` It side_NEG effect_NEG ,_NEG I_NEG take_NEG...",False,True
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1,`` My son halfway fourth week Intuniv . We bec...,False,False
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,1,"`` I used take another oral contraceptive , 21...",True,False
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1,`` This first time using form birth control . ...,False,False
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1,`` Suboxone completely turned life around . I ...,False,False


In [32]:
def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

trained_data["clean_review"] = trained_data["review_neg"].apply(punctuation)

In [33]:
trained_data.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sideEffect,review_neg,contains_SE_mention,contains_NEG_SE_mention,clean_review
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,0,"`` It side_NEG effect_NEG ,_NEG I_NEG take_NEG...",False,True,it sideneg effectneg neg ineg takeneg itneg i...
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1,`` My son halfway fourth week Intuniv . We bec...,False,False,my son halfway fourth week intuniv we became...
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,1,"`` I used take another oral contraceptive , 21...",True,False,i used take another oral contraceptive 21 pi...
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1,`` This first time using form birth control . ...,False,False,this first time using form birth control i ...
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1,`` Suboxone completely turned life around . I ...,False,False,suboxone completely turned life around i fee...


In [34]:
def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

trained_data['clean_review'] = data.clean_review.apply(remove_numbers)

In [None]:
data["review_words"]

In [None]:
# data["clean_review"]

In [20]:
data

Unnamed: 0,drugName,condition,review,review_neg,contains_SE_mention,contains_NEG_SE_mention,clean_review
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...","`` It side_NEG effect_NEG ,_NEG I_NEG take_NEG...",False,True,it sideneg effectneg neg ineg takeneg itneg i...
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",`` My son halfway fourth week Intuniv . We bec...,False,False,my son halfway fourth week intuniv we became...
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...","`` I used take another oral contraceptive , 21...",True,False,i used take another oral contraceptive pill...
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",`` This first time using form birth control . ...,False,False,this first time using form birth control i ...
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",`` Suboxone completely turned life around . I ...,False,False,suboxone completely turned life around i fee...
...,...,...,...,...,...,...,...
995,Duac,Acne,"""The product is working so well for my athleti...",`` The product working well athletic pre-teen ...,False,False,the product working well athletic preteen dau...
996,Liothyronine,Underactive Thyroid,"""as a supplement to levothyroxine it has been ...",`` supplement levothyroxine night day differen...,False,False,supplement levothyroxine night day difference...
997,Dextromethorphan,Cough,"""This worked great for my husband until he gav...",`` This worked great husband gave cold . I use...,False,False,this worked great husband gave cold i used m...
998,Dapsone,Acne,"""I have tried almost everything under the sun ...",`` I tried almost everything sun related acne ...,False,False,i tried almost everything sun related acne me...


# run baseline classifcation model with mention features

In [35]:
from sklearn.linear_model import SGDClassifier

X = trained_data[['contains_SE_mention', 'contains_NEG_SE_mention']]
y = trained_data['sideEffect']

svc = SGDClassifier(loss='hinge', penalty='l2', alpha=0.1)

svc.fit(X,y)



SGDClassifier(alpha=0.1)

In [36]:
svc.score(X,y)

0.55

# TF-IDF features | Latent Dirichlet allocation

In [None]:
import itertools

laplace = lidstone = range(1, 4)

lap_lid = list(itertools.product(laplace, lidstone))

lap_lid

In [None]:
vectorizer = TfidfVectorizer(min_df = 0.05, 
                             max_df = 0.40, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (2, 2)).fit(data["clean_review"])

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(data["clean_review"])

lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
# vectorizer.vocabulary_

In [None]:
lda_model.components_ # Vectors?

In [None]:
print_topics(lda_model, vectorizer)

In [None]:
example = ["side effect"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

# TF-IDF features | Pipeline

In [None]:
# Create Pipeline
pipe = Pipeline([('tfidf', TfidfVectorizer()),
                 ('nb', MultinomialNB())
                ])

# Set parameters to search (model and vectorizer)
parameters = {
    'tfidf__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
    'tfidf__min_df': (np.linspace(0.01, 0.49, num = 10)),
    'tfidf__max_df': (np.linspace(0.50, 0.99, num = 10)),
    'nb__alpha': (np.linspace(0.01, 0.99, num = 10)) # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
    }

# Perform grid search
grid_search = GridSearchCV(pipe, parameters,
                           n_jobs=-1, 
                           verbose=1,
                           scoring = "accuracy", 
                           refit=True,
                           cv=5)

grid_search.fit(data["clean_review"], y)

In [None]:
grid_search.best_params_

In [None]:
best_model = grid_search.best_estimator_

# best_model.transform(data["review"]) # Predict?!?!

In [None]:
# best_model.LatentDirichletAllocation(n_components = 2,
#                                       verbose = 0,
#                                       max_iter = 50).fit(data_vectorized)

# def print_topics(model, vectorizer):
#     for idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (idx))
#         print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
#                         for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
# best_model.components_

In [None]:
# print_topics(best_model, vectorizer)

# CountVectorizer | Latent Dirichlet allocation

In [None]:
vectorizer = CountVectorizer(min_df = 0.05, 
                             max_df = 0.40, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (2, 2)).fit(data["review"])

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(data["review"])

lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
# vectorizer.vocabulary_

In [None]:
lda_model.components_ # Vectors?

In [None]:
print_topics(lda_model, vectorizer)

In [None]:
example = ["side effect"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

# CountVectorizer | Pipeline

In [None]:
# Create Pipeline
pipe = Pipeline([('Count', CountVectorizer()),
                 ('nb', MultinomialNB())
                ])

# Set parameters to search (model and vectorizer)
parameters = {
    'Count__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
    'Count__min_df': (np.linspace(0.01, 0.49, num = 10)),
    'Count__max_df': (np.linspace(0.50, 0.99, num = 10)),
    'Count__max_features' : ([1 , 2, 3, 4, 5]),
    'nb__alpha': (np.linspace(0.01, 0.99, num = 10)), # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
    }

# Perform grid search
grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data["clean_review"], y)

In [None]:
grid_search.best_params_

In [None]:
best_model = grid_search.best_estimator_