# Imports

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize

from nltk.sentiment.util import mark_negation



In [2]:
!pip install corextopic



In [3]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [4]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Functions

In [5]:
stop_words = set(stopwords.words('english')) 

NEGATIONS = ["no", "not", "shouldn't", "aren't", "couldn't", "didn't", "doesn't", "don't", "wasn't", "weren't", "wouldn't", "nor"]

NEW_WORDS = ['mg', "month", "year", "day", "week", "time", "im", "ive", "hour"]

for negation in NEGATIONS:
    stop_words.remove(negation)

for new_word in NEW_WORDS:
    stop_words.add(new_word)

# stop_words

In [6]:
def to_list(x):
    list_words = x.split(' ')
    return list_words

def to_string(x):
    string = " ".join(x)
    return string

#===============================================================

def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

def m_negation(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def remove_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

def lemmatize_review(x):
    lemma = WordNetLemmatizer()
    lista = []
    for w in x:
       lista.append(lemma.lemmatize(w))
    return lista

#===============================================================

def count_words(x):
    wordfreq = []
    for w in x:
        wordfreq.append(x.count(w))
    return dict(zip(x, wordfreq))

def total_count(x):
    total_count = {}
    for row in x:
        for key in row.keys():
          if key in total_count:
              total_count[key] += 1
          else:
              total_count[key] = 1
    return pd.DataFrame(sorted(total_count.items(), key=lambda x: x[1], reverse=True)).head(30).T

def round_two(x):
    return str(int(round(x, 2) * 100)) + "%"

def one_or_zero(x):
    # Makes the prediction a binary outpur
    if x > 0.5:
        x = 1
    else:
        x = 0
    return x

#===============================================================

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

# Manual

In [7]:
#manual = pd.read_csv('manually_labelled_data.csv')

manual = pd.read_csv("/content/drive/MyDrive/Data/manually_labelled_data.csv")

manual = manual.drop(["uniqueID", "drugName", "condition", "date", "rating", "usefulCount"], axis = 1)

manual["clean_review"] = manual["review"].apply(punctuation)
manual['clean_review'] = manual.clean_review.apply(remove_numbers)
manual['clean_review_lst'] = manual.clean_review.apply(to_list)

manual["NonStopwords_review_lst"] = manual.clean_review.apply(remove_stopwords)
manual["NonStopwords_review_str"] = manual.NonStopwords_review_lst.apply(to_string)

manual["NonStopwords_review_lst_MN"] = manual.clean_review.apply(m_negation)
manual["NonStopwords_review_str_MN"] = manual.NonStopwords_review_lst_MN.apply(to_string)

manual["Lemmatized_review_lst"] = manual.NonStopwords_review_lst_MN.apply(lemmatize_review)
manual["Lemmatized_review_str"] = manual.Lemmatized_review_lst.apply(to_string)

manual["Lemmatized_review_list"] = manual.NonStopwords_review_lst.apply(lemmatize_review)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)
manual["Lemmatized_review_list"] = manual.Lemmatized_review.apply(remove_stopwords)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)

manual["words_count"] = manual.Lemmatized_review_list.apply(count_words)

X = manual["Lemmatized_review_str"]

y = manual["sideEffect"]

manual = manual.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str"], axis = 1)

manual.head(3)

Unnamed: 0,review,sideEffect,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review_list,Lemmatized_review,words_count
0,"""It has no side effect, I take it in combinati...",0,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic fish oil,"{'no': 1, 'side': 1, 'effect': 1, 'take': 1, '..."
1,"""My son is halfway through his fourth week of ...",1,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'intuniv..."
2,"""I used to take another oral contraceptive, wh...",1,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1..."


In [8]:
total_count(manual["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,not,effect,side,taking,started,take,no,first,like,work,get,one,also,would,doctor,life,help,pain,much,still,went,feel,took,really,well,two,good,every,better,medication
1,39,38,34,32,31,29,24,23,23,22,21,20,18,18,17,17,17,17,16,16,16,16,16,16,16,15,15,15,14,14


# Data

In [9]:
#data = pd.read_csv('drugsComTrain_raw.csv')

data = pd.read_csv('/content/drive/MyDrive/Data/drugsComTrain_raw.csv')

print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

The shape of the data is 161297 rows and 7 columns


The amount of unique ID is 161297


The number of unique drugs reviewed is 3436


The number of unique conditions is 885




In [10]:
pd.DataFrame(data["drugName"].value_counts()).head(14).T

Unnamed: 0,Levonorgestrel,Etonogestrel,Ethinyl estradiol / norethindrone,Nexplanon,Ethinyl estradiol / norgestimate,Ethinyl estradiol / levonorgestrel,Phentermine,Sertraline,Escitalopram,Mirena,Implanon,Gabapentin,Bupropion,Venlafaxine
drugName,3657,3336,2850,2156,2117,1888,1543,1360,1292,1242,1102,1047,1022,1016


In [11]:
pd.DataFrame(data["condition"].value_counts()).head(14).T

Unnamed: 0,Birth Control,Depression,Pain,Anxiety,Acne,Bipolar Disorde,Insomnia,Weight Loss,Obesity,ADHD,"Diabetes, Type 2",Emergency Contraception,High Blood Pressure,Vaginal Yeast Infection
condition,28788,9069,6145,5904,5588,4224,3673,3609,3568,3383,2554,2463,2321,2274


In [12]:
NUMERO_SAMPLES = 150

data = data.drop(["uniqueID", "drugName", "condition", "date", "rating", "usefulCount"], axis = 1).head(NUMERO_SAMPLES) # SACAR ESTOOOOOO!!

In [13]:
data["clean_review"] = data["review"].apply(punctuation)
data['clean_review'] = data.clean_review.apply(remove_numbers)
data['clean_review_lst'] = data.clean_review.apply(to_list)

data["NonStopwords_review_lst"] = data.clean_review.apply(remove_stopwords)
data["NonStopwords_review_str"] = data.NonStopwords_review_lst.apply(to_string)

data["NonStopwords_review_lst_MN"] = data.clean_review.apply(m_negation)
data["NonStopwords_review_str_MN"] = data.NonStopwords_review_lst_MN.apply(to_string)

data["Lemmatized_review_lst"] = data.NonStopwords_review_lst_MN.apply(lemmatize_review)
data["Lemmatized_review_str"] = data.Lemmatized_review_lst.apply(to_string)

data["Lemmatized_review_list"] = data.NonStopwords_review_lst.apply(lemmatize_review)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)
data["Lemmatized_review_list"] = data.Lemmatized_review.apply(remove_stopwords)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)

data["words_count"] = data.Lemmatized_review_list.apply(count_words)

data = data.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str"], axis = 1)

data.head(3)

Unnamed: 0,review,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review_list,Lemmatized_review,words_count
0,"""It has no side effect, I take it in combinati...","[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic fish oil,"{'no': 1, 'side': 1, 'effect': 1, 'take': 1, '..."
1,"""My son is halfway through his fourth week of ...","[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'intuniv..."
2,"""I used to take another oral contraceptive, wh...","[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1..."


In [14]:
total_count(data["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,not,effect,side,no,started,take,taking,first,get,one,work,doctor,like,pain,feel,would,went,help,much,medication,life,well,still,really,also,since,two,better,period,took
1,55,54,52,43,42,40,40,38,34,32,31,29,29,29,26,26,25,25,24,24,24,24,23,23,22,22,21,21,21,21


In [15]:
# correlation = data["rating"].corr(data["usefulCount"], method='pearson')

# plt.matshow(data[["rating", "usefulCount"]].corr())

# plt.show()

# CountVectorizer | Latent Dirichlet allocation

In [16]:
vectorizer = CountVectorizer(min_df = 0.01, 
                             max_df = 0.30, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (1, 2)).fit(manual["Lemmatized_review_str"]) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(manual["Lemmatized_review_str"]) #

lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

In [17]:
"""A model with higher log-likelihood and lower perplexity
(exp(-1. * log-likelihood per word)) is considered to be good. Let’s check for our model."""

# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
print(lda_model.get_params())

Log Likelihood:  -76295.22021866274
Perplexity:  7053.177298183489
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'batch', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 50, 'mean_change_tol': 0.001, 'n_components': 2, 'n_jobs': None, 'perp_tol': 0.1, 'random_state': None, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [18]:
# Define Search Param
search_params = {'n_components' : [2, 3, 4], 'learning_decay' : [.2, .3, .4], "max_iter" : [50, 100], "learning_offset" : [5, 10]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.3, 'learning_offset': 5, 'max_iter': 50, 'n_components': 2}
Best Log Likelihood Score:  -20037.774186950497
Model Perplexity:  7024.364956008521


In [19]:
# Define Search Param
search_params = {'ngram_range' : [(1, 2), (2, 2)]}

# Init the Model
vectorizer = CountVectorizer()

# Init Grid Search Class
model = GridSearchCV(vectorizer, param_grid = search_params)

# Do the Grid Search
model.fit(data_vectorized)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

TypeError: ignored

In [None]:
# vectorizer.vocabulary_

In [None]:
# lda_model.components_ # Vectors

In [None]:
print_topics(lda_model, vectorizer)

In [None]:
example = ["pain back"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

In [None]:
lda_vectors

In [None]:
manual["Lemmatized_review_lst"]

In [None]:
predictions = []

for review in manual["Lemmatized_review"]:
    vectorized = vectorizer.transform([review])
    lda_vectors = predictions.append(lda_model.transform(vectorized))

predictions = np.concatenate(predictions, axis=0)

predictions.shape

In [None]:
compare_data = pd.DataFrame(predictions, columns = ["Side_Effect", "No_Side_Effect"])

compare_data["Manually_Labelled"] = manual["sideEffect"] # Brings a column from the other DataFrame

compare_data["Prediction"] = compare_data.Side_Effect.apply(one_or_zero) # Applies the binary output
compare_data["No_Side_Effect_%"] = compare_data.No_Side_Effect.apply(round_two) # Applies a format
compare_data["Side_Effect_%"] = compare_data.Side_Effect.apply(round_two) # Applies a format

compare_data["bool"] = np.where(compare_data["Manually_Labelled"] == compare_data["Prediction"], True, False) # Compares betwen the precdiction and the label

compare_data.drop(["Side_Effect", "No_Side_Effect"], axis = 1)

In [None]:
compare_data["bool"].value_counts()

In [None]:
compare_data.to_csv('data.csv')

!cp data.csv "drive/My Drive/"

In [None]:


# correct predictions / 100
# recall = correctly predicted side effects / total side effects
# precision = correctly predicted side effects / total predicted side effects

# predicted_topic = []



# (np.array(predicted_topic) == np.array(test_data.sideEffect)).sum()

In [None]:
# for row in range(len(test_data)):
#     if (lda_vectors[row][0] > lda_vectors[row][1]):
#         predicted_topic.append(0)
#     else: predicted_topic.append(1)

# CountVectorizer | Pipeline

In [None]:
import itertools

laplace = lidstone = range(1, 4)

lap_lid = list(itertools.product(laplace, lidstone))

lap_lid

In [None]:
# # Create Pipeline
# pipe = Pipeline([('Count', CountVectorizer()),
#                  ('nb', MultinomialNB())
#                 ])

# # Set parameters to search (model and vectorizer)
# parameters = {
#     'Count__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
#     'Count__min_df': (np.linspace(0.01, 0.49, num = 10)),
#     'Count__max_df': (np.linspace(0.50, 0.99, num = 10)),
#     'Count__max_features' : ([1 , 2, 3, 4, 5]),
#     'nb__alpha': (np.linspace(0.01, 0.99, num = 10)), # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
#     }

# # Perform grid search
# grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, 
#                            verbose=1, scoring = "accuracy", 
#                            refit=True, cv=5)

# grid_search.fit(X, y)

In [None]:
# grid_search.best_params_

In [None]:
# best_model = grid_search.best_estimator_