# Imports

In [12]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize 
from nltk.sentiment.util import mark_negation

In [3]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jack/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data

In [46]:
data = pd.read_csv('../raw_data/drugsComTrain_raw.csv')

print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

#data["drugName"].unique()

The shape of the data is 161297 rows and 7 columns


The amount of unique ID is 161297


The number of unique drugs reviewed is 3436


The number of unique conditions is 885




In [51]:
trained_data = pd.read_csv('../raw_data/manually_labelled_data.csv')
trained_data

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sideEffect
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,0
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,1
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1
...,...,...,...,...,...,...,...,...
95,45237,Fluoxetine,Major Depressive Disorde,"""I started Prozac as one of my first anti depr...",2,12-Jan-16,18,0
96,102810,Aripiprazole,Depression,"""Intake Effexor XR 375 mg, and lorazepam for d...",4,17-Aug-12,33,1
97,60280,NuvaRing,Birth Control,"""I am torn by the Nuvaring. The convenience is...",5,31-Oct-11,0,1
98,10677,Spironolactone,Acne,"""I&#039;m 30 years old. I started having real...",9,21-Aug-13,31,0


In [None]:
correlation = data["rating"].corr(data["usefulCount"], method='pearson')

correlation

In [None]:
plt.matshow(data[["rating", "usefulCount"]].corr())
plt.show()

In [39]:
pd.DataFrame(data["review"].value_counts()).head(10).T

Unnamed: 0,"""Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone and spent years abusing oxycontin. My paycheck was already spent by the time I got it and I started resorting to scheming and stealing to fund my addiction. All that is history. If you&#039;re ready to stop, there&#039;s a good chance that suboxone will put you on the path of great life again. I have found the side-effects to be minimal compared to oxycontin. I&#039;m actually sleeping better. Slight constipation is about it for me. It truly is amazing. The cost pales in comparison to what I spent on oxycontin.""","""It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil""","""This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is that it made my periods longer (5-6 days to be exact) I used to only have periods for 3-4 days max also made my cramps intense for the first two days of my period, I never had cramps before using birth control. Other than that in happy with the patch""","""My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective.""","""I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."""
review,1,1,1,1,1


In [None]:
pd.DataFrame(data["condition"].value_counts()).head(14).T

In [52]:
#NUMERO_SAMPLES = 500

y = trained_data["sideEffect"].head(100)

trained_data = data.drop(["uniqueID", "date", "rating", "usefulCount"], axis = 1).head(100) # SACAR ESTOOOOOO!!

KeyError: "['uniqueID' 'date' 'rating' 'usefulCount'] not found in axis"

In [49]:
data = data.drop(["uniqueID", "date", "rating", "usefulCount"], axis = 1).head(1000)

# Functions

In [33]:
stop_words = set(stopwords.words('english')) 

def remove_stopwords (x):
    tokenized = mark_negation(word_tokenize(x))
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return " ".join(without_stopwords)

data["review_neg"] = data.review.apply(remove_stopwords)



In [34]:
data.head()

Unnamed: 0,drugName,condition,review,review_neg
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...","`` It side_NEG effect_NEG ,_NEG I_NEG take_NEG..."
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",`` My son halfway fourth week Intuniv . We bec...
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...","`` I used take another oral contraceptive , 21..."
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",`` This first time using form birth control . ...
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",`` Suboxone completely turned life around . I ...


In [35]:
def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

data["clean_review"] = data["review_neg"].apply(punctuation)

In [36]:
data.head()

Unnamed: 0,drugName,condition,review,review_neg,clean_review
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...","`` It side_NEG effect_NEG ,_NEG I_NEG take_NEG...",it sideneg effectneg neg ineg takeneg itneg i...
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",`` My son halfway fourth week Intuniv . We bec...,my son halfway fourth week intuniv we became...
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...","`` I used take another oral contraceptive , 21...",i used take another oral contraceptive 21 pi...
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",`` This first time using form birth control . ...,this first time using form birth control i ...
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",`` Suboxone completely turned life around . I ...,suboxone completely turned life around i fee...


In [37]:
def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

data['clean_review'] = data.clean_review.apply(remove_numbers)

In [None]:
data["review_words"]

In [None]:
# data["clean_review"]

In [38]:
data.head(15)

Unnamed: 0,drugName,condition,review,review_neg,clean_review
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...","`` It side_NEG effect_NEG ,_NEG I_NEG take_NEG...",it sideneg effectneg neg ineg takeneg itneg i...
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",`` My son halfway fourth week Intuniv . We bec...,my son halfway fourth week intuniv we became...
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...","`` I used take another oral contraceptive , 21...",i used take another oral contraceptive pill...
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",`` This first time using form birth control . ...,this first time using form birth control i ...
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",`` Suboxone completely turned life around . I ...,suboxone completely turned life around i fee...


# TF-IDF features | Latent Dirichlet allocation

In [None]:
import itertools

laplace = lidstone = range(1, 4)

lap_lid = list(itertools.product(laplace, lidstone))

lap_lid

In [None]:
vectorizer = TfidfVectorizer(min_df = 0.05, 
                             max_df = 0.40, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (2, 2)).fit(data["clean_review"])

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(data["clean_review"])

lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
# vectorizer.vocabulary_

In [None]:
lda_model.components_ # Vectors?

In [None]:
print_topics(lda_model, vectorizer)

In [None]:
example = ["side effect"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

# TF-IDF features | Pipeline

In [None]:
# Create Pipeline
pipe = Pipeline([('tfidf', TfidfVectorizer()),
                 ('nb', MultinomialNB())
                ])

# Set parameters to search (model and vectorizer)
parameters = {
    'tfidf__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
    'tfidf__min_df': (np.linspace(0.01, 0.49, num = 10)),
    'tfidf__max_df': (np.linspace(0.50, 0.99, num = 10)),
    'nb__alpha': (np.linspace(0.01, 0.99, num = 10)) # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
    }

# Perform grid search
grid_search = GridSearchCV(pipe, parameters,
                           n_jobs=-1, 
                           verbose=1,
                           scoring = "accuracy", 
                           refit=True,
                           cv=5)

grid_search.fit(data["clean_review"], y)

In [None]:
grid_search.best_params_

In [None]:
best_model = grid_search.best_estimator_

# best_model.transform(data["review"]) # Predict?!?!

In [None]:
# best_model.LatentDirichletAllocation(n_components = 2,
#                                       verbose = 0,
#                                       max_iter = 50).fit(data_vectorized)

# def print_topics(model, vectorizer):
#     for idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (idx))
#         print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
#                         for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
# best_model.components_

In [None]:
# print_topics(best_model, vectorizer)

# CountVectorizer | Latent Dirichlet allocation

In [None]:
vectorizer = CountVectorizer(min_df = 0.05, 
                             max_df = 0.40, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (2, 2)).fit(data["review"])

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(data["review"])

lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
# vectorizer.vocabulary_

In [None]:
lda_model.components_ # Vectors?

In [None]:
print_topics(lda_model, vectorizer)

In [None]:
example = ["side effect"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

# CountVectorizer | Pipeline

In [None]:
# Create Pipeline
pipe = Pipeline([('Count', CountVectorizer()),
                 ('nb', MultinomialNB())
                ])

# Set parameters to search (model and vectorizer)
parameters = {
    'Count__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
    'Count__min_df': (np.linspace(0.01, 0.49, num = 10)),
    'Count__max_df': (np.linspace(0.50, 0.99, num = 10)),
    'Count__max_features' : ([1 , 2, 3, 4, 5]),
    'nb__alpha': (np.linspace(0.01, 0.99, num = 10)), # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
    }

# Perform grid search
grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(data["clean_review"], y)

In [None]:
grid_search.best_params_

In [None]:
best_model = grid_search.best_estimator_