# Imports

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize

from nltk.sentiment.util import mark_negation



In [2]:
!pip install corextopic



In [3]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [4]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Functions

In [5]:
stop_words = set(stopwords.words('english')) 

NEGATIONS = ["no", "not", "shouldn't", "aren't", "couldn't", "didn't", "doesn't", "don't", "wasn't", "weren't", "wouldn't", "nor"]

NEW_WORDS = ['mg', "month", "year", "day", "week", "time", "im", "ive", "hour"]

for negation in NEGATIONS:
    stop_words.remove(negation)

for new_word in NEW_WORDS:
    stop_words.add(new_word)

# stop_words

In [6]:
def to_list(x):
    list_words = x.split(' ')
    return list_words

def to_string(x):
    string = " ".join(x)
    return string

#===============================================================

def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

def m_negation(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def remove_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

def lemmatize_review(x):
    lemma = WordNetLemmatizer()
    lista = []
    for w in x:
       lista.append(lemma.lemmatize(w))
    return lista

#===============================================================

def count_words(x):
    wordfreq = []
    for w in x:
        wordfreq.append(x.count(w))
    return dict(zip(x, wordfreq))

def total_count(x):
    total_count = {}
    for row in x:
        for key in row.keys():
          if key in total_count:
              total_count[key] += 1
          else:
              total_count[key] = 1
    return pd.DataFrame(sorted(total_count.items(), key=lambda x: x[1], reverse=True)).head(30).T

def round_two(x):
    return str(int(round(x, 2) * 100)) + "%"
#===============================================================

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

# Manual

In [7]:
# manual = pd.read_csv('manually_labelled_data.csv')

manual = pd.read_csv("/content/drive/MyDrive/Data/manually_labelled_data.csv")

manual = manual.drop(["uniqueID", "drugName", "condition", "date", "rating", "usefulCount"], axis = 1)

manual["clean_review"] = manual["review"].apply(punctuation)
manual['clean_review'] = manual.clean_review.apply(remove_numbers)
manual['clean_review_lst'] = manual.clean_review.apply(to_list)

manual["NonStopwords_review_lst"] = manual.clean_review.apply(remove_stopwords)
manual["NonStopwords_review_str"] = manual.NonStopwords_review_lst.apply(to_string)

manual["NonStopwords_review_lst_MN"] = manual.clean_review.apply(m_negation)
manual["NonStopwords_review_str_MN"] = manual.NonStopwords_review_lst_MN.apply(to_string)

manual["Lemmatized_review_lst"] = manual.NonStopwords_review_lst_MN.apply(lemmatize_review)
manual["Lemmatized_review_str"] = manual.Lemmatized_review_lst.apply(to_string)

manual["Lemmatized_review_list"] = manual.NonStopwords_review_lst.apply(lemmatize_review)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)
manual["Lemmatized_review_list"] = manual.Lemmatized_review.apply(remove_stopwords)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)

manual["words_count"] = manual.Lemmatized_review_list.apply(count_words)

X = manual["Lemmatized_review_str"]

y = manual["sideEffect"]

manual = manual.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str"], axis = 1)

manual.head(3)

Unnamed: 0,review,sideEffect,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review_list,Lemmatized_review,words_count
0,"""It has no side effect, I take it in combinati...",0,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic fish oil,"{'no': 1, 'side': 1, 'effect': 1, 'take': 1, '..."
1,"""My son is halfway through his fourth week of ...",1,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'intuniv..."
2,"""I used to take another oral contraceptive, wh...",1,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1..."


In [8]:
total_count(manual["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,not,effect,side,taking,started,take,no,first,like,work,get,one,also,would,doctor,life,help,pain,much,still,went,feel,took,really,well,two,good,every,better,medication
1,39,38,34,32,31,29,24,23,23,22,21,20,18,18,17,17,17,17,16,16,16,16,16,16,16,15,15,15,14,14


# Data

In [9]:
# data = pd.read_csv('drugsComTrain_raw.csv')

data = pd.read_csv('/content/drive/MyDrive/Data/drugsComTrain_raw.csv')

print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

The shape of the data is 161297 rows and 7 columns


The amount of unique ID is 161297


The number of unique drugs reviewed is 3436


The number of unique conditions is 885




In [10]:
pd.DataFrame(data["drugName"].value_counts()).head(14).T

Unnamed: 0,Levonorgestrel,Etonogestrel,Ethinyl estradiol / norethindrone,Nexplanon,Ethinyl estradiol / norgestimate,Ethinyl estradiol / levonorgestrel,Phentermine,Sertraline,Escitalopram,Mirena,Implanon,Gabapentin,Bupropion,Venlafaxine
drugName,3657,3336,2850,2156,2117,1888,1543,1360,1292,1242,1102,1047,1022,1016


In [11]:
pd.DataFrame(data["condition"].value_counts()).head(14).T

Unnamed: 0,Birth Control,Depression,Pain,Anxiety,Acne,Bipolar Disorde,Insomnia,Weight Loss,Obesity,ADHD,"Diabetes, Type 2",Emergency Contraception,High Blood Pressure,Vaginal Yeast Infection
condition,28788,9069,6145,5904,5588,4224,3673,3609,3568,3383,2554,2463,2321,2274


In [12]:
NUMERO_SAMPLES = 150

data = data.drop(["uniqueID", "drugName", "condition", "date", "rating", "usefulCount"], axis = 1).head(NUMERO_SAMPLES) # SACAR ESTOOOOOO!!

In [13]:
data["clean_review"] = data["review"].apply(punctuation)
data['clean_review'] = data.clean_review.apply(remove_numbers)
data['clean_review_lst'] = data.clean_review.apply(to_list)

data["NonStopwords_review_lst"] = data.clean_review.apply(remove_stopwords)
data["NonStopwords_review_str"] = data.NonStopwords_review_lst.apply(to_string)

data["NonStopwords_review_lst_MN"] = data.clean_review.apply(m_negation)
data["NonStopwords_review_str_MN"] = data.NonStopwords_review_lst_MN.apply(to_string)

data["Lemmatized_review_lst"] = data.NonStopwords_review_lst_MN.apply(lemmatize_review)
data["Lemmatized_review_str"] = data.Lemmatized_review_lst.apply(to_string)

data["Lemmatized_review_list"] = data.NonStopwords_review_lst.apply(lemmatize_review)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)
data["Lemmatized_review_list"] = data.Lemmatized_review.apply(remove_stopwords)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)

data["words_count"] = data.Lemmatized_review_list.apply(count_words)

data = data.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str"], axis = 1)

data.head(3)

Unnamed: 0,review,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review_list,Lemmatized_review,words_count
0,"""It has no side effect, I take it in combinati...","[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic fish oil,"{'no': 1, 'side': 1, 'effect': 1, 'take': 1, '..."
1,"""My son is halfway through his fourth week of ...","[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'intuniv..."
2,"""I used to take another oral contraceptive, wh...","[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1..."


In [14]:
total_count(data["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,not,effect,side,no,started,take,taking,first,get,one,work,doctor,like,pain,feel,would,went,help,much,medication,life,well,still,really,also,since,two,better,period,took
1,55,54,52,43,42,40,40,38,34,32,31,29,29,29,26,26,25,25,24,24,24,24,23,23,22,22,21,21,21,21


In [15]:
# correlation = data["rating"].corr(data["usefulCount"], method='pearson')

# plt.matshow(data[["rating", "usefulCount"]].corr())

# plt.show()

# corex

In [16]:
vectorizer = CountVectorizer(min_df = 0.01, 
                             max_df = 0.30, 
                             max_features = None,
                             vocabulary = None,
                             ngram_range = (1, 3)).fit(manual["Lemmatized_review_str"]) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(manual["Lemmatized_review_str"]) #

lda_model = LatentDirichletAllocation(n_components = 2,
                                      verbose = 0,
                                      max_iter = 50).fit(data_vectorized)

In [17]:
vectorizer = vectorizer.fit(manual["Lemmatized_review"])

tfidf = vectorizer.transform(manual["Lemmatized_review"])

vocab = vectorizer.get_feature_names()

print(len(vocab))

9409


In [18]:
from corextopic import corextopic as ct

anchors = []
model = ct.Corex(n_hidden = 7, seed=42)
model = model.fit(
    tfidf,
    words = vocab
)

In [19]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: finally, little, mild, get, home, experiencing, really, pain nausea, school, got worse
Topic #2: lb, easily, along, lost lb, horrible, try, lb far, twice daily, office, cramp terrible
Topic #3: show, cleared, remain, wonder, wondering, not take, sometimes, finding, worked well, care
Topic #4: cured, let, serious, exactly, heart palpitation, level, system, one side, normal life, initial
Topic #5: high, gp, drive completely, gp started, died, sex drive completely, minimal, review, chance
Topic #6: hr, changed, changed life, considering, needed, drug, much better, upper, age, person
Topic #7: sleep, drink, start, issue, wouldnt, thank, st, honestly, become, couple


In [20]:
# Anchors designed to nudge the model towards measuring specific genres
anchors = [
    ["headache", "problems"],
    ["diarrhea", "problems"],
    ["acne", "problems", "worst"],
    ["constipation", "problems"],
    ["insomnia",  "problems"],
    ["gain pound", "caused gain", "problems"],
    ["good", "response", "improvement", "has no side effect", "normal person"]
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

model = ct.Corex(n_hidden=7, seed=42)
model = model.fit(
    tfidf,
    words = vocab,
    anchors = anchors, # Pass the anchors in here
    anchor_strength = 3 # Tell the model how much it should rely on the anchors
)

In [21]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: changed, upper, begin, changed life, male, age, needed, small, good luck, taking low
Topic #2: symptom, broke, mental, already, hopefully, told, plan, seem, high, around
Topic #3: worst, acne, next, med, breakthrough, exactly, hit, one side, around pm, level
Topic #4: worse, recently, even worse, form birth control, form birth, remission, didnt work, clear skin, discharge, period max
Topic #5: home, use, head, yet, someone, really, doctor, noticeable, really help, mild
Topic #6: bed, new, remain, seems manage, school, became, last started, dec, not take, one first
Topic #7: close, improvement, normal person, go, note, way better, lose, best thing, terrible side effect, terrible side


In [22]:
topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(7)]).astype(float)

topic_df.index = manual.index

df = pd.concat([manual, topic_df], axis=1)

In [23]:
pd.reset_option

<pandas._config.config.CallableDynamicDoc at 0x7f4c22a6ae48>

In [24]:
df.drop(["NonStopwords_review_lst_MN",
         "NonStopwords_review_str_MN",
         "Lemmatized_review_lst",
         "Lemmatized_review_str",
         "words_count",
         "review", "Lemmatized_review_list"], axis = 1).head(95)

Unnamed: 0,sideEffect,Lemmatized_review,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
0,0,no side effect take combination bystolic fish oil,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,son halfway fourth intuniv became concerned be...,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1,used take another oral contraceptive pill cycl...,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,first using form birth control glad went patch...,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,suboxone completely turned life around feel he...,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
90,1,medicine absolutely terrible three using hair ...,0.0,0.0,0.0,0.0,0.0,0.0,1.0
91,0,cat bite handfull set teeth sharp hand swelled...,1.0,1.0,0.0,0.0,0.0,0.0,0.0
92,1,yes first use experience raw constant itching ...,0.0,0.0,0.0,1.0,1.0,0.0,0.0
93,1,sex th august took plan b intercourse havent g...,0.0,1.0,1.0,0.0,1.0,0.0,0.0
