# Imports

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

from nltk.sentiment.util import extract_bigram_feats

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize

from nltk.sentiment.util import mark_negation

In [2]:
!pip install corextopic



In [7]:
# from google.colab import drive
# drive.mount('drive')

ModuleNotFoundError: No module named 'google'

In [2]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pefle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pefle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pefle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pefle\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

# Functions

In [3]:
stop_words = set(stopwords.words('english')) 

NEGATIONS = ["no", "not", "shouldn't", "aren't", "couldn't", "didn't", "doesn't", "don't", "wasn't", "weren't", "wouldn't", "nor"]

NEW_WORDS = ['mg', "month", "year", "day", "week", "time", "im", "ive", "hour"]

for negation in NEGATIONS:
    stop_words.remove(negation)

for new_word in NEW_WORDS:
    stop_words.add(new_word)

# stop_words

In [4]:
def to_list(x):
    list_words = x.split(' ')
    return list_words

def to_string(x):
    string = " ".join(x)
    return string

#===============================================================

def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

def m_negation(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def remove_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

def remove_our_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in OUR_STOPWORDS]
    return without_stopwords

def lemmatize_review(x):
    lemma = WordNetLemmatizer()
    lista = []
    for w in x:
       lista.append(lemma.lemmatize(w))
    return lista

#===============================================================

def count_words(x):
    wordfreq = []
    for w in x:
        wordfreq.append(x.count(w))
    return dict(zip(x, wordfreq))

def total_count(x):
    total_count = {}
    for row in x:
        for key in row.keys():
          if key in total_count:
              total_count[key] += 1
          else:
              total_count[key] = 1
    return pd.DataFrame(sorted(total_count.items(), key=lambda x: x[1], reverse=True)).head(30).T

def round_two(x):
    return str(int(round(x, 2) * 100)) + "%"

def one_or_zero(x):
    # Makes the prediction a binary outpur
    if x > 0.5:
        x = 1
    else:
        x = 0
    return x

#===============================================================

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

#===============================================================

def side_effects_lst(x):
    lista = []
    for i in x:
        y = remove_our_stopwords(i)
        if i not in y and i not in lista:
            lista.append(i)
    return lista

def se_encoder(x, y):
    if y in x:
        return 1
    else:
        return 0

# Side Effects

In [5]:
real_se_domains: ['pain', 'skin', 'libido', 'depression', 'anxiety', 'gastro-intestinal', 'discomfort', 'gynecological', 'weight gain', 'eyes']

real_se_dict = {
    'headache' : 'pain', 
    'migrane' : 'pain', 
    'migranes' : 'pain', 
    'headaches' : 'pain',
    'pain' : 'pain', 
    'leg pain' : 'pain', 
    'back pain' : 'pain', 
    'arm pain' : 'pain', 
    'cramps' : 'pain', 
    'tendonitis' : 'pain', 
    'pain in joints' : 'pain',
    
    'sensitive skin' : 'skin', 
    'dry skin' : 'skin', 
    'burned' : 'skin', 
    'red' : 'skin', 
    'dryness' : 'skin', 
    'swollen' : 'skin', 
    'itchy' : 'skin', 
    'irritated' : 'skin', 
    'bumpy spots' : 'skin', 
    'peeling' : 'skin', 
    'chapped lips' : 'skin', 
    'acne' : 'skin', 
    'burning' : 'skin', 
    'itching' : 'skin', 
    'lost sensation' : 'skin', 
    'swollen' : 'skin',
    
    'sex drive decreased': 'libido', 
    'lower sex drive' : 'libido', 
    'decreased libido' : 'libido',
    'loss of libido' : 'libido',
    
    'anhedonia' : 'depression', 
    'depression' : 'depression', 
    'mood swings' : 'depression', 
    'moody' : 'depression', 
    'weepy' : 'depression', 
    'no feelings' : 'depression', 
    'feeling' : 'depression', 
    'crying' : 'depression', 
    'cry constantly' : 'depression',
    
    'jaw clenching' : 'anxiety', 
    'inability to sit still' : 'anxiety', 
    'hear internal voices' : 'anxiety', 
    'paranoid' : 'anxiety', 
    'anxious' : 'anxiety', 
    'scared to be alone' : 'anxiety', 
    'manic state' : 'anxiety', 
    'anxiety' : 'anxiety', 
    'little confused' : 'anxiety', 
    'cognitive problems' : 'anxiety', 
    'irritable' : 'anxiety', 
    'angry' : 'anxiety', 
    'unreasonable' : 'anxiety', 
    'can not think straight' : 'anxiety', 
    'aggressive' : 'anxiety',
    'nightmare' : 'anxiety', 
    'nightmares' : 'anxiety', 
    'yawning' : 'anxiety', 
    'bad dreams' : 'anxiety', 
    'inability to sleep' : 'anxiety', 
    'insomnia' : 'anxiety', 
    
    'gastro-intestinal' : 'gastro-intestinal',
    'diarrhoea' : 'gastro-intestinal', 
    'stomach cramps' : 'gastro-intestinal', 
    'heartburn' : 'gastro-intestinal', 
    'nauseated' : 'gastro-intestinal', 
    'no appetite' : 'gastro-intestinal', 
    'nausea' : 'gastro-intestinal', 
    'appetite suppressed' : 'gastro-intestinal', 
    'bowel movements' : 'gastro-intestinal', 
    'constipation' : 'gastro-intestinal', 
    'constipated' : 'gastro-intestinal', 
    'upset stomach' : 'gastro-intestinal', 
    'bloating' : 'gastro-intestinal', 
    'passing winds' : 'gastro-intestinal', 
    'acidy burbs' : 'gastro-intestinal', 
    'stomach aches' : 'gastro-intestinal', 
    'nauseous' : 'gastro-intestinal', 
    'sour stomach' : 'gastro-intestinal',
    
    'discomfort' : 'discomfort',
    'shakiness' : 'discomfort', 
    'shaky' : 'discomfort', 
    'tired' : 'discomfort', 
    'fatigue' : 'discomfort', 
    'breathlessness' : 'discomfort', 
    'dry mouth' : 'discomfort', 
    'no energy' : 'discomfort', 
    'cold' : 'discomfort', 
    'hot flashes' : 'discomfort',  
    'night sweats' : 'discomfort', 
    'shaking' : 'discomfort', 
    'dizziness' : 'discomfort', 
    'knocks me out' : 'discomfort', 
    'weakness' : 'discomfort', 
    'wheezy' : 'discomfort', 
    'tiredness' : 'discomfort', 
    'uncomfortable' : 'discomfort', 
    'sweating' : 'discomfort',
    
    'brown periods' : 'gynecological', 
    'spot' : 'gynecological', 
    'spotting' : 'gynecological', 
    'bleeding lasted' : 'gynecological', 
    'increased discharge' : 'gynecological', 
    'longer periods' : 'gynecological', 
    'cramping' : 'gynecological', 
    'burning' : 'gynecological', 
    'itching' : 'gynecological', 
    'heavy periods' : 'gynecological', 
    'swelling chest' : 'gynecological', 
    'tenderness in chest' : 'gynecological',
    
    'weight gain' : 'weight gain', 
    'gaining weight' : 'weight gain', 
    'hungry' : 'weight gain', 
    'appetite increased' : 'weight gain',

    'burning eyes' : 'eyes', 
    'blurred vision' : 'eyes', 
    'blurry vision' : 'eyes', 
    'light sensitivity' : 'eyes',
    
    'side' : 'bonus', 
    'effect' : 'bonus', 
    'symptom' : 'bonus',
    'no' : 'bonus',
    'side_NEG' : 'bonus',
    'effect_NEG' : 'bonus',
    'better_NEG' : 'bonus'

}

keys, values = list(real_se_dict.keys()), list(real_se_dict.values())

In [10]:
giga_word = pd.read_csv('../raw_data/gigaword_sideeffects.csv')

no_giga_word = pd.read_csv('../raw_data/gigaword_no_sideeffects.csv')

giga_word = list(no_giga_word["0"]) + list(giga_word["0"])

len(giga_word)

256

In [9]:
# OUR_STOPWORDS = keys

# OUR_STOPWORDS = giga_word

# len(OUR_STOPWORDS)

In [11]:
OUR_STOPWORDS = giga_word + keys

len(OUR_STOPWORDS)

# OUR_STOPWORDS

378

# Manual

In [14]:
#manual = pd.read_csv('manually_labelled_data.csv')

manual = pd.read_csv("../raw_data/adr_labelled_data.csv")

manual = manual.drop(["Unnamed: 8"], axis = 1).head(299)

manual = manual.drop(["uniqueID", "condition", "date", "rating", "usefulCount"], axis = 1)

manual["clean_review"] = manual["review"].apply(punctuation)
manual['clean_review'] = manual.clean_review.apply(remove_numbers)
manual['clean_review_lst'] = manual.clean_review.apply(to_list)

manual["NonStopwords_review_lst"] = manual.clean_review.apply(remove_stopwords)
manual["NonStopwords_review_str"] = manual.NonStopwords_review_lst.apply(to_string)

manual["NonStopwords_review_lst_MN"] = manual.clean_review.apply(m_negation)
manual["NonStopwords_review_str_MN"] = manual.NonStopwords_review_lst_MN.apply(to_string)

manual["Lemmatized_review_lst"] = manual.NonStopwords_review_lst_MN.apply(lemmatize_review)
manual["Lemmatized_review_str"] = manual.Lemmatized_review_lst.apply(to_string)

manual["Lemmatized_review_list"] = manual.NonStopwords_review_lst.apply(lemmatize_review)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)
manual["Lemmatized_review_list"] = manual.Lemmatized_review.apply(remove_stopwords)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)

manual["words_count"] = manual.Lemmatized_review_list.apply(count_words)

X = manual["Lemmatized_review"]

y = manual["sideEffect"]

manual = manual.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str", "NonStopwords_review_lst_MN", 
                      "NonStopwords_review_str_MN", "Lemmatized_review_str"], axis = 1)

manual["Side_Effects_mention"] = manual.Lemmatized_review_lst.apply(side_effects_lst)

manual.head(3)

Unnamed: 0,drugName,review,sideEffect,Lemmatized_review_lst,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention
0,Valsartan,"""It has no side effect, I take it in combinati...",0.0,"[no, side_NEG, effect_NEG, take_NEG, combinati...","[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic fish oil,"{'no': 1, 'side': 1, 'effect': 1, 'take': 1, '...","[no, side_NEG, effect_NEG]"
1,Guanfacine,"""My son is halfway through his fourth week of ...",1.0,"[son, halfway, fourth, intuniv, became, concer...","[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'intuniv...","[problem, much, emotional, better, medication]"
2,Lybrel,"""I used to take another oral contraceptive, wh...",1.0,"[used, take, another, oral, contraceptive, pil...","[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1...","[take, no, side_NEG]"


In [15]:
manual.review[9]

'"I had been on the pill for many years. When my doctor changed my RX to chateal, it was as effective. It really did help me by completely clearing my acne, this takes about 6 months though. I did not gain extra weight, or develop any emotional health issues. I stopped taking it bc I started using a more natural method of birth control, but started to take it bc I hate that my acne came back at age 28. I really hope symptoms like depression, or weight gain do not begin to affect me as I am older now. I&#039;m also naturally moody, so this may worsen things. I was in a negative mental rut today. Also I hope this doesn&#039;t push me over the edge, as I believe I am depressed. Hopefully it&#039;ll be just like when I was younger."'

In [16]:
total_count(manual["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,not,effect,no,side,taking,started,first,take,like,one,...,bad,back,help,period,went,never,since,good,life,two
1,105,99,93,93,77,76,72,67,66,65,...,45,44,44,43,43,43,43,41,41,40


In [17]:
flat_list = [item for sublist in manual["Side_Effects_mention"] for item in sublist] # Makes the side effects mention lists on the column one flatted list

flat_list = list(set(flat_list)) # makes the set a list

side_effect_encoder = pd.DataFrame(columns = list(flat_list), index = None) # creates a dataframe with the flatted list as columns

side_effect_data = pd.concat([manual, side_effect_encoder], axis = 1) # concatenates the two dataframes

for drug in flat_list:
    side_effect_data[drug] = side_effect_data["Side_Effects_mention"].apply(lambda effect: se_encoder(effect, drug)) # applies the method to the dataframe to encode the words

side_effect_data.head(166)

Unnamed: 0,drugName,review,sideEffect,Lemmatized_review_lst,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention,illness,arthritis,...,swollen,diabetes,pain,difference,uncomfortable,tiredness,infection,itching,anxiety,suffered
0,Valsartan,"""It has no side effect, I take it in combinati...",0.0,"[no, side_NEG, effect_NEG, take_NEG, combinati...","[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic fish oil,"{'no': 1, 'side': 1, 'effect': 1, 'take': 1, '...","[no, side_NEG, effect_NEG]",0,0,...,0,0,0,0,0,0,0,0,0,0
1,Guanfacine,"""My son is halfway through his fourth week of ...",1.0,"[son, halfway, fourth, intuniv, became, concer...","[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'intuniv...","[problem, much, emotional, better, medication]",0,0,...,0,0,0,0,0,0,0,0,0,0
2,Lybrel,"""I used to take another oral contraceptive, wh...",1.0,"[used, take, another, oral, contraceptive, pil...","[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1...","[take, no, side_NEG]",0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ortho Evra,"""This is my first time using any form of birth...",1.0,"[first, using, form, birth, control, glad, wen...","[first, using, form, birth, control, glad, wen...",first using form birth control glad went patch...,"{'first': 3, 'using': 2, 'form': 1, 'birth': 2...",[],0,0,...,0,0,0,0,0,0,0,0,0,0
4,Buprenorphine / naloxone,"""Suboxone has completely turned my life around...",1.0,"[suboxone, completely, turned, life, around, f...","[suboxone, completely, turned, life, around, f...",suboxone completely turned life around feel he...,"{'suboxone': 3, 'completely': 1, 'turned': 1, ...",[better_NEG],0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,Viibryd,"""I just want to say that after taking a lot of...",1.0,"[want, say, taking, lot, antidepressant, med, ...","[want, say, taking, lot, antidepressant, med, ...",want say taking lot antidepressant med one bes...,"{'want': 1, 'say': 1, 'taking': 1, 'lot': 1, '...","[diarrhea, really]",0,0,...,0,0,0,0,0,0,0,0,0,0
162,Mirena,"""I am 29 and just had my second Mirena inserte...",0.0,"[second, mirena, inserted, day, ago, painful, ...","[second, mirena, inserted, ago, painful, pain,...",second mirena inserted ago painful pain last a...,"{'second': 2, 'mirena': 3, 'inserted': 2, 'ago...","[painful, pain]",0,0,...,0,0,1,0,0,0,0,0,0,0
163,Ethinyl estradiol / norelgestromin,"""This is absolutely the best birth control I h...",0.0,"[absolutely, best, birth, control, ever, used,...","[absolutely, best, birth, control, ever, used,...",absolutely best birth control ever used switch...,"{'absolutely': 1, 'best': 1, 'birth': 3, 'cont...","[weight, loss, might, no]",0,0,...,0,0,0,0,0,0,0,0,0,0
164,Propofol,"""I&#039;ve received spinal injections because ...",0.0,"[received, spinal, injection, serious, car, ac...","[received, spinal, injection, serious, car, ac...",received spinal injection serious car accident...,"{'received': 1, 'spinal': 1, 'injection': 3, '...","[serious, severe, pain, relieve, able, no, sid...",0,0,...,0,0,1,0,0,0,0,0,0,0


In [18]:
manual["Lemmatized_review_list"].shape[0]

299

In [96]:
manual_full = pd.read_csv("../raw_data/adr_labelled_data.csv")

In [138]:
def side_or_no(series):
    if 'no side effect' in series:
        return 0
    if 'side effect' in series:
        return 1
    else:
        return 2

In [159]:
manual['side_or_no'] = manual['Lemmatized_review'].apply(side_or_no)

In [179]:
s_o_n = manual[['drugName', 'sideEffect', 'side_or_no']]

In [180]:
s_o_n['rating'] = manual_full['rating']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [181]:
s_o_n = s_o_n.loc[s_o_n['side_or_no'] != 2]

In [182]:
# s_o_n = s_o_n.loc[s_o_n['rating'] < 10]

In [183]:
len(s_o_n.loc[s_o_n['side_or_no'] == s_o_n['sideEffect']])/len(s_o_n)

0.8314606741573034

In [184]:
len(s_o_n)

89

In [186]:
index_list = list(s_o_n.index.values)

In [187]:
new_train_X = manual.iloc[index_list]['Lemmatized_review']

In [188]:
new_train_X

0      no side effect take combination bystolic fish oil
2      used take another oral contraceptive pill cycl...
5      nd started work rock hard erection however exp...
7      abilify changed life hope zoloft clonidine fir...
10     medication almost two started working way curr...
                             ...                        
285    stated taking contrave ago lost lb far donrsqu...
288    type diabetic yr tried victoza may today sept ...
294    really helped rls life rls worse yr almost bel...
295    first started taking birth control started reg...
298                                          side effect
Name: Lemmatized_review, Length: 89, dtype: object

In [61]:
def word_count_se(series):
    results = {}
    for word in series:
        results['word'] += 1
    return {k: v for k, v in sorted(results.items(), key=lambda item: item[1])}

In [77]:
no_effect_words = manual.loc[manual['sideEffect'] == 0]['Lemmatized_review_lst']

In [78]:
results = {}
for value in no_effect_words:
    for item in value:
        if item in results:
            results[item] += 1
        else:
            results[item] = 1

In [79]:
neg_word_list = list({k: v for k, v in sorted(results.items(), key=lambda item: item[1], reverse = True)}.keys())

In [81]:
neg_word_list[0:10]

['not_NEG',
 'year',
 'no_NEG',
 'pain',
 'no',
 'not',
 'effects_NEG',
 'day',
 'side_NEG',
 'started']

In [82]:
effect_words = manual.loc[manual['sideEffect'] == 1]['Lemmatized_review_lst']
results = {}
for value in effect_words:
    for item in value:
        if item in results:
            results[item] += 1
        else:
            results[item] = 1
pos_word_list = list({k: v for k, v in sorted(results.items(), key=lambda item: item[1], reverse = True)}.keys())

In [83]:
print(neg_word_list[0:10])
print(pos_word_list[0:10])

['not_NEG', 'year', 'no_NEG', 'pain', 'no', 'not', 'effects_NEG', 'day', 'side_NEG', 'started']
['not_NEG', 'side_NEG', 'no_NEG', 'started', 'not', 'get_NEG', 'day', 'effect', 'taking', 'effects_NEG']


In [19]:
bigrams = [('no', 'side'), ('side', 'effect')]

for review, number in zip(manual["Lemmatized_review_list"], range(0, manual["Lemmatized_review_list"].shape[0])):
    print(f"review n {number}: {extract_bigram_feats(review, bigrams)}")

review n 0: {'contains(no - side)': True, 'contains(side - effect)': True}
review n 1: {'contains(no - side)': False, 'contains(side - effect)': False}
review n 2: {'contains(no - side)': True, 'contains(side - effect)': True}
review n 3: {'contains(no - side)': False, 'contains(side - effect)': False}
review n 4: {'contains(no - side)': False, 'contains(side - effect)': False}
review n 5: {'contains(no - side)': False, 'contains(side - effect)': True}
review n 6: {'contains(no - side)': False, 'contains(side - effect)': False}
review n 7: {'contains(no - side)': False, 'contains(side - effect)': True}
review n 8: {'contains(no - side)': False, 'contains(side - effect)': False}
review n 9: {'contains(no - side)': False, 'contains(side - effect)': False}
review n 10: {'contains(no - side)': False, 'contains(side - effect)': True}
review n 11: {'contains(no - side)': True, 'contains(side - effect)': True}
review n 12: {'contains(no - side)': False, 'contains(side - effect)': False}
revie

In [20]:
for review in manual["Lemmatized_review"]:
    text = word_tokenize(review)
    print(nltk.pos_tag(text))

[('no', 'DT'), ('side', 'NN'), ('effect', 'NN'), ('take', 'VB'), ('combination', 'NN'), ('bystolic', 'JJ'), ('fish', 'JJ'), ('oil', 'NN')]
[('son', 'NN'), ('halfway', 'RB'), ('fourth', 'JJ'), ('intuniv', 'NN'), ('became', 'VBD'), ('concerned', 'JJ'), ('began', 'VBD'), ('last', 'JJ'), ('started', 'VBD'), ('taking', 'VBG'), ('highest', 'JJS'), ('dose', 'JJ'), ('two', 'CD'), ('could', 'MD'), ('hardly', 'RB'), ('get', 'VB'), ('bed', 'JJ'), ('cranky', 'NN'), ('slept', 'VBD'), ('nearly', 'RB'), ('drive', 'JJ'), ('home', 'NN'), ('school', 'NN'), ('vacation', 'NN'), ('unusual', 'JJ'), ('called', 'VBD'), ('doctor', 'NN'), ('monday', 'NN'), ('morning', 'NN'), ('said', 'VBD'), ('stick', 'JJ'), ('see', 'NN'), ('school', 'NN'), ('getting', 'VBG'), ('morning', 'NN'), ('last', 'JJ'), ('two', 'CD'), ('problem', 'NN'), ('free', 'JJ'), ('much', 'RB'), ('agreeable', 'JJ'), ('ever', 'RB'), ('le', 'JJ'), ('emotional', 'JJ'), ('good', 'JJ'), ('thing', 'NN'), ('le', 'NN'), ('cranky', 'NN'), ('remembering', '

[('intuniv', 'NN'), ('not', 'RB'), ('work', 'VB'), ('son', 'NN'), ('bouncing', 'VBG'), ('wall', 'NN'), ('taking', 'VBG'), ('major', 'JJ'), ('issue', 'NN'), ('class', 'NN'), ('seems', 'VBZ'), ('work', 'VBP'), ('opposite', 'RB')]
[('pain', 'NN'), ('management', 'NN'), ('doctor', 'NN'), ('put', 'VBD'), ('butrans', 'NNS'), ('patch', 'VBP'), ('ago', 'RB'), ('dose', 'JJ'), ('first', 'JJ'), ('box', 'NN'), ('four', 'CD'), ('lifesaver', 'NN'), ('no', 'DT'), ('agony', 'NN'), ('work', 'NN'), ('able', 'JJ'), ('sleep', 'JJ'), ('two', 'CD'), ('weekend', 'NN'), ('two', 'CD'), ('hoping', 'VBG'), ('bump', 'NN'), ('ten', 'NNS'), ('dose', 'VBP'), ('soon', 'RB'), ('cut', 'VBN'), ('norco', 'JJ'), ('chronic', 'JJ'), ('pain', 'NN'), ('many', 'JJ'), ('many', 'JJ'), ('medicine', 'NN'), ('including', 'VBG'), ('oxycontin', 'JJ'), ('patch', 'NN'), ('best', 'JJS'), ('far', 'RB')]
[('got', 'VBD'), ('heart', 'NN'), ('palpitation', 'NN'), ('really', 'RB'), ('bad', 'JJ'), ('like', 'IN'), ('almost', 'RB'), ('constant',

[('torn', 'JJ'), ('nuvaring', 'VBG'), ('convenience', 'NN'), ('great', 'JJ'), ('no', 'DT'), ('daily', 'JJ'), ('take', 'VB'), ('pill', 'NN'), ('easily', 'RB'), ('marked', 'VBD'), ('calender', 'NN'), ('put', 'VBD'), ('phone', 'NN'), ('never', 'RB'), ('fall', 'VBP'), ('never', 'RB'), ('feel', 'VBP'), ('changing', 'VBG'), ('simple', 'NN'), ('would', 'MD'), ('give', 'VB'), ('nuvaring', 'NN'), ('based', 'VBN'), ('upon', 'IN'), ('simplicity', 'NN'), ('price', 'NN'), ('really', 'RB'), ('reasonable', 'JJ'), ('downside', 'NN'), ('personally', 'RB'), ('first', 'RB'), ('not', 'RB'), ('notice', 'JJ'), ('side', 'NN'), ('effect', 'NN'), ('although', 'IN'), ('low', 'JJ'), ('tolerance', 'NN'), ('alcohol', 'NN'), ('nuvaring', 'VBG'), ('emotional', 'JJ'), ('tired', 'JJ'), ('strung', 'NN'), ('pill', 'NN'), ('none', 'NN'), ('happened', 'VBD'), ('also', 'RB'), ('longer', 'JJR'), ('period', 'NN'), ('worth', 'IN'), ('try', 'NN'), ('every', 'DT'), ('girl', 'NN'), ('seems', 'VBZ'), ('different', 'JJ'), ('experi

[('accurate', 'JJ'), ('information', 'NN')]
[('diagnosed', 'VBN'), ('adult', 'NN'), ('onset', 'VBN'), ('diabetes', 'NNS'), ('last', 'JJ'), ('dec', 'NN'), ('started', 'VBD'), ('metformin', 'RB'), ('almost', 'RB'), ('immediately', 'RB'), ('fasting', 'VBG'), ('glucose', 'NN'), ('dropped', 'VBD'), ('since', 'IN'), ('bothered', 'VBN'), ('neuropathy', 'JJ'), ('tried', 'VBN'), ('lyrica', 'NN'), ('absolutely', 'RB'), ('nothing', 'NN'), ('pain', 'NN'), ('presently', 'RB'), ('taking', 'VBG'), ('gabapentin', 'NN'), ('bedtime', 'NN'), ('help', 'NN'), ('slightly', 'RB'), ('cost', 'VBN'), ('lyrica', 'JJ'), ('money', 'NN'), ('window', 'NN')]
[('genius', 'NN'), ('psychiatrist', 'NN'), ('started', 'VBD'), ('drug', 'NN'), ('fall', 'NN'), ('first', 'RB'), ('felt', 'VBD'), ('even', 'RB'), ('worse', 'JJR'), ('started', 'VBD'), ('taking', 'VBG'), ('fluoxetine', 'JJ'), ('many', 'JJ'), ('drug', 'NN'), ('eventually', 'RB'), ('taking', 'VBG'), ('manic', 'JJ'), ('thought', 'NN'), ('amp', 'NN'), ('episode', 'JJ')

[('lexapro', 'JJ'), ('mild', 'JJ'), ('side', 'NN'), ('effect', 'NN'), ('like', 'IN'), ('yawning', 'VBG'), ('insomnia', 'NN'), ('really', 'RB'), ('bad', 'JJ'), ('point', 'NN'), ('sleeping', 'VBG'), ('five', 'CD'), ('soi', 'NNS'), ('say', 'VBP'), ('calm', 'JJ'), ('havent', 'NN'), ('got', 'VBD'), ('full', 'JJ'), ('effect', 'NN'), ('getting', 'VBG'), ('back', 'RB'), ('old', 'JJ'), ('self', 'NN')]
[('dulera', 'NN'), ('work', 'NN'), ('pretty', 'RB'), ('well', 'RB'), ('not', 'RB'), ('need', 'VB'), ('use', 'NN'), ('rescue', 'NN'), ('inhaler', 'NN'), ('often', 'RB'), ('one', 'CD'), ('main', 'JJ'), ('downside', 'NN'), ('continuously', 'RB'), ('cause', 'VBZ'), ('sinuitis', 'NN'), ('cause', 'NN'), ('migraine', 'NN'), ('always', 'RB'), ('bad', 'JJ'), ('problem', 'NN'), ('sinus', 'NN'), ('definately', 'RB'), ('reconsidering', 'VBG'), ('continuing', 'VBG'), ('using', 'VBG'), ('product', 'NN')]
[('metoprolol', 'NN'), ('er', 'NN'), ('since', 'IN'), ('september', 'NN'), ('th', 'NN'), ('first', 'RB'), ('

[('wonderful', 'JJ'), ('experience', 'NN'), ('ziana', 'NNP'), ('acne', 'VBZ'), ('medication', 'NN'), ('didnt', 'NN'), ('dry', 'JJ'), ('skin', 'NN'), ('using', 'VBG'), ('thought', 'VBN'), ('would', 'MD'), ('never', 'RB'), ('good', 'JJ'), ('experience', 'NN'), ('acne', 'JJ'), ('medication', 'NN'), ('ziana', 'NNP'), ('smooth', 'CC'), ('moisturizing', 'VBG'), ('acne', 'JJ'), ('medication', 'NN'), ('leaf', 'NN'), ('skin', 'NN'), ('feeling', 'NN'), ('great', 'JJ'), ('couple', 'NN'), ('redness', 'NN'), ('swelling', 'VBG'), ('acne', 'JJ'), ('went', 'VBD'), ('significantly', 'RB'), ('dont', 'JJ'), ('see', 'VBP'), ('ever', 'RB'), ('using', 'VBG'), ('another', 'DT'), ('acne', 'NN'), ('medication', 'NN')]
[('taking', 'VBG'), ('pristiq', 'NN'), ('started', 'VBD'), ('taking', 'VBG'), ('couple', 'NN'), ('ago', 'RB'), ('yr', 'RB'), ('old', 'JJ'), ('antidepressant', 'JJ'), ('yr', 'NN'), ('tried', 'VBD'), ('many', 'JJ'), ('ssri', 'JJ'), ('lexapro', 'JJ'), ('several', 'JJ'), ('yr', 'NNS'), ('never', 'RB'

In [21]:
review_x_side_effect = side_effect_data.drop(["Lemmatized_review", "Lemmatized_review_list", "sideEffect", "drugName", "review", "words_count", "Lemmatized_review_lst", "Side_Effects_mention"], axis = 1)

drug_x_side_effect = side_effect_data.drop(["sideEffect"], axis = 1).groupby(["drugName"]).sum()

print(review_x_side_effect.shape)

review_x_side_effect.head(3)

(299, 146)


Unnamed: 0,illness,arthritis,nervous,helped,take,traumatic,nothing,problem,no,symptom,...,swollen,diabetes,pain,difference,uncomfortable,tiredness,infection,itching,anxiety,suffered
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
print(drug_x_side_effect.shape)

drug_x_side_effect.head(3)

(214, 146)


Unnamed: 0_level_0,illness,arthritis,nervous,helped,take,traumatic,nothing,problem,no,symptom,...,swollen,diabetes,pain,difference,uncomfortable,tiredness,infection,itching,anxiety,suffered
drugName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accutane,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Acetaminophen / hydrocodone,0,1,0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Acetaminophen / oxycodone,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [191]:
lda_model = LatentDirichletAllocation(n_components = 2,
                                      learning_method = 'online',   
                                      random_state = 29,
                                      batch_size = 128,
                                      learning_decay = 0.5,
                                      learning_offset = 5,
                                      evaluate_every = -1,
                                      verbose = 0,
                                      max_iter = 500).fit(data_vectorized)


# nmf_model = NMF(n_components = 2,
#                 init = 'random',
#                 random_state = 0,
#                 verbose = 0,
#                 max_iter = 500,
#                 l1_ratio = 0,
#                 alpha = 0).fit(drug_x_side_effect)

# svd_model = TruncatedSVD(n_components = 2,
#                          n_iter = 500,
#                          random_state = 42).fit(drug_x_side_effect)

In [197]:
predictions = lda_model.transform(data_vectorized_full)

In [205]:
manual['predictions'] = predictions[:, 0]

In [206]:
def predict(series):
    if series > .5:
        return 1
    else:
        return 0

In [207]:
manual['predictions'] = manual['predictions'].apply(predict)

In [208]:
len(manual.loc[manual['predictions'] == manual['sideEffect']])/len(manual)

0.43478260869565216

In [209]:
print(manual[['predictions', 'sideEffect']])

     predictions  sideEffect
0              1         0.0
1              0         1.0
2              0         1.0
3              0         1.0
4              0         1.0
..           ...         ...
294            1         1.0
295            0         1.0
296            0         1.0
297            0         1.0
298            0         1.0

[299 rows x 2 columns]


In [24]:
side_effects = list(drug_x_side_effect.columns)

In [192]:
lda_components = lda_model.components_

for t in range(len(lda_components)):
    dic = dict(zip(side_effects, np.round(lda_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:20])
    #print(f"topic {t}: {dict(zip(side_effects, lda_components[t]))}")

[('exertion', 3.208), ('shaking', 2.416), ('cause', 2.297), ('reaction', 2.275), ('painful', 2.249), ('disease', 2.234), ('done', 2.18), ('whether', 2.031), ('brain', 1.975), ('stomachache', 1.969), ('clear', 1.9), ('sweating', 1.797), ('suffering', 1.759), ('difficult', 1.727), ('frustration', 1.504), ('burning', 1.482), ('migranes', 1.457), ('experiencing', 1.438), ('breathlessness', 1.397), ('vomiting', 1.305)]
[('indeed', 5.946), ('moody', 4.949), ('yawning', 4.846), ('painful', 4.823), ('irritated', 4.644), ('relieve', 4.192), ('pneumonia', 4.168), ('hypothermia', 4.067), ('stress', 3.992), ('taken', 3.915), ('difficult', 3.911), ('nervousness', 3.91), ('sudden', 3.733), ('constipation', 3.634), ('nervous', 3.597), ('know', 3.558), ('euphoria', 3.47), ('no', 3.384), ('lethargy', 3.375), ('vomiting', 3.328)]


In [26]:
nmf_components = nmf_model.components_

for t in range(len(nmf_components)):
    dic = dict(zip(side_effects, np.round(nmf_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:20])

[('side', 2.178), ('effect', 2.044), ('medication', 0.527), ('better', 0.502), ('really', 0.434), ('much', 0.399), ('feeling', 0.391), ('depression', 0.345), ('take', 0.331), ('nothing', 0.315), ('treatment', 0.308), ('anxiety', 0.286), ('however', 0.261), ('mild', 0.242), ('nausea', 0.221), ('headache', 0.221), ('weight', 0.22), ('stomach', 0.195), ('going', 0.194), ('experience', 0.19)]
[('no', 2.308), ('side_NEG', 2.192), ('better_NEG', 0.673), ('pain', 0.634), ('take', 0.596), ('effect_NEG', 0.447), ('depression', 0.418), ('weight', 0.417), ('medication', 0.333), ('really', 0.286), ('feeling', 0.286), ('better', 0.278), ('bleeding', 0.275), ('sleep', 0.259), ('severe', 0.248), ('anxiety', 0.226), ('much', 0.225), ('acne', 0.211), ('experienced', 0.2), ('diagnosed', 0.199)]


In [27]:
svd_components = svd_model.components_

for t in range(len(svd_components)):
    dic = dict(zip(side_effects, np.round(svd_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:20])

[('no', 0.423), ('side', 0.406), ('side_NEG', 0.402), ('effect', 0.38), ('take', 0.181), ('medication', 0.169), ('pain', 0.161), ('better', 0.155), ('depression', 0.15), ('really', 0.141), ('feeling', 0.132), ('better_NEG', 0.129), ('weight', 0.123), ('much', 0.123), ('effect_NEG', 0.107), ('anxiety', 0.102), ('treatment', 0.079), ('nausea', 0.078), ('severe', 0.073), ('nothing', 0.071)]
[('side', 0.503), ('effect', 0.474), ('nothing', 0.057), ('mild', 0.056), ('however', 0.055), ('better', 0.054), ('medication', 0.049), ('treatment', 0.048), ('insomnia', 0.042), ('much', 0.042), ('really', 0.039), ('tired', 0.036), ('feeling', 0.034), ('yawning', 0.033), ('slight', 0.031), ('wheezy', 0.031), ('stomach', 0.03), ('fatigue', 0.029), ('constipation', 0.029), ('dizziness', 0.028)]


# Data

In [28]:
#data = pd.read_csv('drugsComTrain_raw.csv')

data = pd.read_csv('../raw_data/drugsComTrain_raw.csv')

data = data[data['rating'] < 10]

data = data.reset_index(drop = True)

data = data.sort_values(by=['drugName'])

data = data.reset_index(drop = True)

print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

The shape of the data is 110308 rows and 7 columns


The amount of unique ID is 110308


The number of unique drugs reviewed is 2946


The number of unique conditions is 791




In [29]:
pd.DataFrame(data["drugName"].value_counts()).head(14).T

Unnamed: 0,Etonogestrel,Ethinyl estradiol / norethindrone,Levonorgestrel,Ethinyl estradiol / norgestimate,Nexplanon,Ethinyl estradiol / levonorgestrel,Sertraline,Miconazole,Mirena,Implanon,Medroxyprogesterone,Escitalopram,Venlafaxine,Depo-Provera
drugName,2690,2380,2262,1776,1760,1565,948,942,914,864,838,838,760,757


In [211]:
pd.DataFrame(data["drugName"].value_counts()).to_csv('../raw_data/drugentrycount.csv')

In [30]:
pd.DataFrame(data["condition"].value_counts()).head(14).T

Unnamed: 0,Birth Control,Depression,Pain,Acne,Anxiety,Bipolar Disorde,Insomnia,ADHD,Obesity,Weight Loss,Vaginal Yeast Infection,Abnormal Uterine Bleeding,"Diabetes, Type 2",High Blood Pressure
condition,23080,6544,4037,3828,3628,2948,2611,2421,2153,1981,1974,1856,1834,1803


In [31]:
NUMERO_SAMPLES = 150000

data = data.drop(["uniqueID", "condition", "date", "rating", "usefulCount"], axis = 1) # .head(NUMERO_SAMPLES) # SACAR ESTOOOOOO!!

In [32]:
data["clean_review"] = data["review"].apply(punctuation)
data['clean_review'] = data.clean_review.apply(remove_numbers)
data['clean_review_lst'] = data.clean_review.apply(to_list)

data["NonStopwords_review_lst"] = data.clean_review.apply(remove_stopwords)
data["NonStopwords_review_str"] = data.NonStopwords_review_lst.apply(to_string)

data["NonStopwords_review_lst_MN"] = data.clean_review.apply(m_negation)
data["NonStopwords_review_str_MN"] = data.NonStopwords_review_lst_MN.apply(to_string)

data["Lemmatized_review_lst"] = data.NonStopwords_review_lst_MN.apply(lemmatize_review)
data["Lemmatized_review_str"] = data.Lemmatized_review_lst.apply(to_string)

data["Lemmatized_review_list"] = data.NonStopwords_review_lst.apply(lemmatize_review)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)
data["Lemmatized_review_list"] = data.Lemmatized_review.apply(remove_stopwords)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)

data["words_count"] = data.Lemmatized_review_list.apply(count_words)

data = data.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str", "NonStopwords_review_lst_MN", "NonStopwords_review_str_MN",
                  "Lemmatized_review_lst", "Lemmatized_review_str"], axis = 1)

data["Side_Effects_mention"] = data.Lemmatized_review_list.apply(side_effects_lst)

In [33]:
data.head(15)

Unnamed: 0,drugName,review,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention
0,Abacavir / dolutegravir / lamivudine,"""I started on triumeq august 1,2016....on the ...","[started, triumeq, august, second, bad, headac...",started triumeq august second bad headache amp...,"{'started': 1, 'triumeq': 1, 'august': 1, 'sec...","[headache, stomach, ache, better, side, effect..."
1,Abacavir / dolutegravir / lamivudine,"""I have had nausea and I threw up too. I hope ...","[nausea, threw, hope, side, effect, stop, pas,...",nausea threw hope side effect stop pas also fe...,"{'nausea': 1, 'threw': 1, 'hope': 1, 'side': 1...","[nausea, hope, side, effect, feeling, tired]"
2,Abacavir / dolutegravir / lamivudine,"""I have been taking Triumeq for eight months. ...","[taking, triumeq, eight, one, taken, far, hiv,...",taking triumeq eight one taken far hiv cant re...,"{'taking': 1, 'triumeq': 1, 'eight': 1, 'one':...","[taken, really, treatment, tired, depression, ..."
3,Abacavir / dolutegravir / lamivudine,"""Started on Triumeq exactly 1 month. Trying to...","[started, triumeq, exactly, trying, understand...",started triumeq exactly trying understand vira...,"{'started': 1, 'triumeq': 1, 'exactly': 1, 'tr...","[side, effect, experience, no, headache, feeli..."
4,Abacavir / dolutegravir / lamivudine,"""I was born with HIV my birth mother transfer...","[born, hiv, birth, mother, transfer, amp, life...",born hiv birth mother transfer amp life never ...,"{'born': 1, 'hiv': 1, 'birth': 1, 'mother': 1,...","[taken, really, nausea, appetite, better]"
5,Abacavir / dolutegravir / lamivudine,"""I have been taking Triumeq for about 9years, ...","[taking, triumeq, taking, complera, doctor, de...",taking triumeq taking complera doctor decided ...,"{'taking': 2, 'triumeq': 2, 'complera': 3, 'do...","[serious, no, side, effect, medication, treatm..."
6,Abacavir / dolutegravir / lamivudine,"""I was diagnosed in January 2011. My own immun...","[diagnosed, january, immune, system, able, man...",diagnosed january immune system able manage hi...,"{'diagnosed': 1, 'january': 1, 'immune': 1, 's...","[diagnosed, able, nightmare, experienced, no, ..."
7,Abacavir / dolutegravir / lamivudine,"""Diagnosed 5 years ago been on truvada and ise...","[diagnosed, ago, truvada, isentress, no, probl...",diagnosed ago truvada isentress no problem doc...,"{'diagnosed': 1, 'ago': 1, 'truvada': 1, 'isen...","[diagnosed, no, problem, side, effect, sleep, ..."
8,Abacavir / dolutegravir / lamivudine,"""Triumeq is working as an HIV medication - but...","[triumeq, working, hiv, medication, gaining, w...",triumeq working hiv medication gaining weight,"{'triumeq': 1, 'working': 1, 'hiv': 1, 'medica...","[medication, weight]"
9,Abacavir / dolutegravir / lamivudine,"""My big concern with this drug has been a dras...","[big, concern, drug, drastic, decrease, level,...",big concern drug drastic decrease level energy...,"{'big': 1, 'concern': 1, 'drug': 1, 'drastic':...","[nausea, no]"


In [34]:
total_count(data["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,not,effect,side,taking,no,first,get,take,started,like,...,took,still,medication,got,bad,weight,dont,since,really,much
1,45085,30341,29682,29083,28358,26127,24921,24655,24148,23978,...,16442,16336,16023,15745,15488,15395,14916,14638,14349,14236


In [35]:
flat_list = [item for sublist in data["Side_Effects_mention"] for item in sublist]

flat_list = list(set(flat_list))

side_effect_encoder = pd.DataFrame(columns = list(flat_list), index = None)

side_effect_data = pd.concat([data, side_effect_encoder], axis = 1)

for drug in flat_list:
    side_effect_data[drug] = side_effect_data["Side_Effects_mention"].apply(lambda effect: se_encoder(effect, drug))

side_effect_data.head(3)

Unnamed: 0,drugName,review,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention,breath,illness,arthritis,nervous,...,uncomfortable,exhaustion,tiredness,boredom,infection,delirium,itching,discomfort,anxiety,bloating
0,Abacavir / dolutegravir / lamivudine,"""I started on triumeq august 1,2016....on the ...","[started, triumeq, august, second, bad, headac...",started triumeq august second bad headache amp...,"{'started': 1, 'triumeq': 1, 'august': 1, 'sec...","[headache, stomach, ache, better, side, effect...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abacavir / dolutegravir / lamivudine,"""I have had nausea and I threw up too. I hope ...","[nausea, threw, hope, side, effect, stop, pas,...",nausea threw hope side effect stop pas also fe...,"{'nausea': 1, 'threw': 1, 'hope': 1, 'side': 1...","[nausea, hope, side, effect, feeling, tired]",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abacavir / dolutegravir / lamivudine,"""I have been taking Triumeq for eight months. ...","[taking, triumeq, eight, one, taken, far, hiv,...",taking triumeq eight one taken far hiv cant re...,"{'taking': 1, 'triumeq': 1, 'eight': 1, 'one':...","[taken, really, treatment, tired, depression, ...",1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
review_x_side_effect = side_effect_data.drop(["Lemmatized_review", "drugName", "Lemmatized_review_list", "review", "words_count", "Side_Effects_mention"], axis = 1)

drug_x_side_effect = side_effect_data.groupby(["drugName"]).sum()

print(review_x_side_effect.shape)

review_x_side_effect.head(3)

(110308, 248)


Unnamed: 0,breath,illness,arthritis,nervous,helped,anhedonia,take,step,traumatic,hangover,...,uncomfortable,exhaustion,tiredness,boredom,infection,delirium,itching,discomfort,anxiety,bloating
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
print(drug_x_side_effect.shape)

drug_x_side_effect.head(3)

(2946, 248)


Unnamed: 0_level_0,breath,illness,arthritis,nervous,helped,anhedonia,take,step,traumatic,hangover,...,uncomfortable,exhaustion,tiredness,boredom,infection,delirium,itching,discomfort,anxiety,bloating
drugName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abacavir / dolutegravir / lamivudine,1,1,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Abacavir / lamivudine / zidovudine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Abatacept,1,0,3,0,1,0,2,0,0,0,...,0,0,2,0,0,0,0,0,0,0


# CountVectorizer | Latent Dirichlet allocation

In [38]:
side_effects = list(drug_x_side_effect.columns)

In [194]:
# vectorizer = TfidfVectorizer(min_df = 0.05, 
#                              max_df = 0.55, 
#                              max_features = None,
#                              vocabulary = None,
#                              binary = False,
#                              ngram_range = (1, 2)).fit(manual.iloc[index_list]['Lemmatized_review']) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

# stop = vectorizer.get_stop_words()

data_vectorized_full = vectorizer.transform(manual['Lemmatized_review']) #

# lda_model = LatentDirichletAllocation(n_components = 25,
#                                       learning_method = 'online',   
#                                       random_state = 29,
#                                       batch_size = 128,
#                                       learning_decay = 0.5,
#                                       learning_offset = 5,
#                                       evaluate_every = -1,
#                                       verbose = 0,
#                                       max_iter = 100).fit(drug_x_side_effect) # Fitting

# nmf_model = NMF(n_components = 25,
#                 init = 'random',
#                 random_state = 0,
#                 verbose = 0,
#                 max_iter = 50,
#                 l1_ratio = 0,
#                 alpha = 0).fit(drug_x_side_effect)

# svd_model = TruncatedSVD(n_components = 25,
#                          n_iter = 10,
#                          random_state = 42).fit(drug_x_side_effect)

In [40]:
lda_components = lda_model.components_

for t in range(len(lda_components)):
    dic = dict(zip(side_effects, np.round(lda_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])
    #print(f"topic {t}: {dict(zip(side_effects, lda_components[t]))}")

[('burning', 1847.788), ('itching', 1541.885), ('pain', 584.549), ('no', 451.087), ('sleep', 377.942), ('much', 376.466), ('experience', 341.669), ('know', 326.052)]
[('swelling', 549.383), ('diagnosed', 478.251), ('arthritis', 396.808), ('bruising', 248.063), ('inflammation', 213.941), ('no', 190.25), ('severe', 162.203), ('ache', 143.879)]
[('bipolar', 1137.901), ('medication', 405.405), ('diagnosed', 364.178), ('stiffness', 10.11), ('breath', 0.04), ('illness', 0.04), ('arthritis', 0.04), ('nervous', 0.04)]
[('side', 9512.422), ('effect', 9162.133), ('no', 3388.063), ('medication', 1301.773), ('problem', 750.028), ('result', 640.333), ('however', 248.144), ('much', 226.72)]
[('no', 3827.563), ('take', 3823.139), ('medication', 3098.068), ('effect', 2991.211), ('much', 2801.722), ('better', 2653.594), ('side', 2531.38), ('really', 2441.546)]
[('heart', 1302.327), ('chest', 714.136), ('symptom', 664.198), ('breath', 559.213), ('breathing', 500.934), ('severe', 500.036), ('problem', 46

In [41]:
nmf_components = nmf_model.components_

for t in range(len(nmf_components)):
    dic = dict(zip(side_effects, np.round(nmf_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('no', 2.754), ('side', 2.224), ('effect', 2.014), ('really', 1.185), ('take', 1.079), ('going', 0.861), ('weight', 0.787), ('headache', 0.731)]
[('weight', 2.676), ('bleeding', 2.613), ('spotting', 1.266), ('pain', 0.953), ('acne', 0.942), ('taken', 0.807), ('depression', 0.737), ('take', 0.707)]
[('infection', 2.995), ('no', 0.886), ('side', 0.533), ('effect', 0.445), ('take', 0.35), ('pain', 0.301), ('taken', 0.296), ('severe', 0.293)]
[('bipolar', 4.938), ('medication', 4.121), ('depression', 3.72), ('effect', 3.178), ('take', 3.009), ('side', 3.0), ('disorder', 2.341), ('thought', 2.162)]
[('pain', 3.774), ('cramping', 2.801), ('painful', 2.305), ('take', 1.003), ('uncomfortable', 0.96), ('really', 0.873), ('experience', 0.83), ('nothing', 0.811)]
[('acne', 8.313), ('nausea', 2.71), ('emotional', 2.295), ('clear', 2.282), ('nauseous', 1.76), ('problem', 1.742), ('moody', 1.738), ('appetite', 1.668)]
[('burning', 12.706), ('itching', 9.791), ('pain', 4.991), ('treatment', 3.596), 

In [42]:
svd_components = svd_model.components_

for t in range(len(svd_components)):
    dic = dict(zip(side_effects, np.round(svd_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('no', 0.386), ('side', 0.328), ('effect', 0.324), ('weight', 0.292), ('take', 0.241), ('bleeding', 0.193), ('pain', 0.192), ('really', 0.186)]
[('bleeding', 0.391), ('spotting', 0.281), ('acne', 0.27), ('weight', 0.249), ('no', 0.187), ('cramping', 0.169), ('painful', 0.157), ('experience', 0.08)]
[('pain', 0.696), ('cramping', 0.224), ('painful', 0.2), ('take', 0.174), ('burning', 0.17), ('infection', 0.159), ('itching', 0.125), ('uncomfortable', 0.09)]
[('anxiety', 0.477), ('depression', 0.308), ('bleeding', 0.296), ('spotting', 0.223), ('cramping', 0.195), ('panic', 0.175), ('painful', 0.164), ('disorder', 0.09)]
[('burning', 0.508), ('itching', 0.391), ('infection', 0.311), ('acne', 0.228), ('anxiety', 0.18), ('treatment', 0.144), ('better', 0.127), ('thought', 0.114)]
[('weight', 0.406), ('burning', 0.245), ('itching', 0.202), ('bleeding', 0.192), ('sleep', 0.154), ('loss', 0.125), ('infection', 0.107), ('no', 0.066)]
[('cramping', 0.361), ('nausea', 0.276), ('painful', 0.237), 

In [43]:
# vectorizer = TfidfVectorizer(min_df = 0.05, 
#                              max_df = 0.55, 
#                              max_features = None,
#                              vocabulary = None,
#                              binary = False,
#                              ngram_range = (1, 2)).fit(data["Lemmatized_review"]) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

# stop = vectorizer.get_stop_words()

# data_vectorized = vectorizer.transform(data["Lemmatized_review"]) #

lda_model = LatentDirichletAllocation(n_components = 25,
                                      learning_method = 'online',   
                                      random_state = 29,
                                      batch_size = 128,
                                      learning_decay = 0.5,
                                      learning_offset = 5,
                                      evaluate_every = -1,
                                      verbose = 0,
                                      max_iter = 100).fit(review_x_side_effect) # Fitting

nmf_model = NMF(n_components = 25,
                init = 'random',
                random_state = 0,
                verbose = 0,
                max_iter = 50,
                l1_ratio = 0,
                alpha = 0).fit(review_x_side_effect)

svd_model = TruncatedSVD(n_components = 25,
                         n_iter = 10,
                         random_state = 42).fit(review_x_side_effect)



In [44]:
lda_components = lda_model.components_

for t in range(len(lda_components)):
    dic = dict(zip(side_effects, np.round(lda_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])
    #print(f"topic {t}: {dict(zip(side_effects, lda_components[t]))}")

[('come', 4434.083), ('infection', 3569.056), ('treatment', 3006.937), ('itching', 2261.644), ('sore', 1972.842), ('move', 1137.756), ('itchy', 1102.217), ('swollen', 1085.787)]
[('pain', 19957.759), ('nausea', 8567.587), ('severe', 3008.49), ('medication', 1341.986), ('rash', 1223.698), ('abdominal', 882.138), ('breathing', 791.014), ('relieve', 584.534)]
[('really', 14251.336), ('nightmare', 1601.744), ('irritability', 663.175), ('feeling', 528.885), ('irritation', 407.086), ('medication', 400.231), ('soreness', 247.73), ('breath', 0.04)]
[('effect', 31111.876), ('side', 30213.059), ('shaking', 554.322), ('dysfunction', 317.681), ('sleeplessness', 213.544), ('dehydration', 179.579), ('create', 62.645), ('stomachache', 36.756)]
[('no', 17634.252), ('tired', 5039.101), ('diagnosed', 3585.765), ('disorder', 2577.93), ('probably', 2096.821), ('vomiting', 2060.703), ('feeling', 1579.364), ('ptsd', 445.824)]
[('experienced', 6490.353), ('cause', 3842.859), ('muscle', 2764.662), ('step', 37

In [45]:
nmf_components = nmf_model.components_

for t in range(len(nmf_components)):
    dic = dict(zip(side_effects, np.round(nmf_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('headache', 0.66), ('migraine', 0.083), ('tired', 0.035), ('slight', 0.024), ('fatigue', 0.021), ('pressure', 0.021), ('ache', 0.021), ('dizziness', 0.02)]
[('effect', 0.988), ('side', 0.979), ('experienced', 0.035), ('loss', 0.022), ('pressure', 0.017), ('mild', 0.016), ('insomnia', 0.015), ('dizziness', 0.015)]
[('feeling', 1.037), ('tired', 0.098), ('nauseous', 0.049), ('anxious', 0.035), ('experience', 0.027), ('chest', 0.026), ('heart', 0.025), ('stomach', 0.02)]
[('really', 1.523), ('hope', 0.057), ('come', 0.035), ('tired', 0.034), ('difference', 0.033), ('something', 0.032), ('everything', 0.031), ('nauseous', 0.022)]
[('sleep', 1.159), ('able', 0.142), ('insomnia', 0.135), ('tired', 0.119), ('taken', 0.058), ('appetite', 0.032), ('nightmare', 0.031), ('trouble', 0.024)]
[('better', 1.79), ('hope', 0.095), ('result', 0.044), ('able', 0.036), ('infection', 0.03), ('something', 0.027), ('treatment', 0.021), ('find', 0.02)]
[('pain', 2.198), ('stomach', 0.192), ('painful', 0.14)

In [46]:
svd_components = svd_model.components_

for t in range(len(svd_components)):
    dic = dict(zip(side_effects, np.round(svd_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('effect', 0.473), ('side', 0.466), ('no', 0.338), ('take', 0.268), ('pain', 0.201), ('medication', 0.172), ('weight', 0.162), ('much', 0.15)]
[('take', 0.376), ('pain', 0.274), ('no', 0.256), ('much', 0.149), ('better', 0.135), ('really', 0.131), ('going', 0.108), ('medication', 0.1)]
[('no', 0.742), ('weight', 0.27), ('acne', 0.133), ('bleeding', 0.094), ('spotting', 0.077), ('problem', 0.055), ('cramping', 0.037), ('painful', 0.028)]
[('anxiety', 0.285), ('weight', 0.217), ('depression', 0.211), ('really', 0.159), ('better', 0.151), ('take', 0.149), ('much', 0.147), ('medication', 0.112)]
[('much', 0.291), ('better', 0.263), ('pain', 0.228), ('medication', 0.162), ('anxiety', 0.15), ('really', 0.148), ('weight', 0.144), ('depression', 0.135)]
[('medication', 0.533), ('no', 0.322), ('anxiety', 0.274), ('depression', 0.168), ('sleep', 0.117), ('severe', 0.087), ('feeling', 0.076), ('panic', 0.071)]
[('medication', 0.441), ('weight', 0.412), ('acne', 0.107), ('severe', 0.104), ('heada

In [44]:
# """A model with higher log-likelihood and lower perplexity
# (exp(-1. * log-likelihood per word)) is considered to be good. Let’s check for our model."""

# # Log Likelyhood: Higher the better
# print("Log Likelihood: ", lda_model.score(data_vectorized))

# # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
# print("Perplexity: ", lda_model.perplexity(data_vectorized))

# # See model parameters
# print(lda_model.get_params())

## Plot

In [45]:
# from mpl_toolkits.mplot3d import Axes3D

# fig = pyplot.figure()
# ax = Axes3D(fig)

# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
# ])    

# X = pipeline.fit_transform(manual["Lemmatized_review"]).todense()

# pca = PCA(n_components=3).fit(X)
# data2D = pca.transform(X)
# plt.figure(figsize=(20,7))
# ax.scatter(data2D[:,0], data2D[:,1], data2D[:,2], c = y)
# # plt.show()

# kmeans = KMeans(n_clusters=2).fit(X)
# centers2D = pca.transform(kmeans.cluster_centers_)

# plt.scatter(centers2D[:,0], centers2D[:,1], 
#             marker='x', s=200, linewidths=3)
# plt.show() 

In [46]:
# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
# ])    

# X = pipeline.fit_transform(data["Lemmatized_review"]).todense()

# pca = PCA(n_components=2).fit(X)
# data2D = pca.transform(X)
# plt.figure(figsize=(20,7))
# plt.scatter(data2D[:,0], data2D[:,1])
# #plt.show()

# kmeans = KMeans(n_clusters=2).fit(X)
# centers2D = pca.transform(kmeans.cluster_centers_)

# plt.scatter(centers2D[:,0], centers2D[:,1], 
#             marker='x', s=200, linewidths=3)
# plt.show() 

In [47]:
# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
# ])    

# X = pipeline.fit_transform(bad_data["Lemmatized_review"]).todense()

# pca = PCA(n_components=2).fit(X)
# data2D = pca.transform(X)
# plt.figure(figsize=(20,7))
# plt.scatter(data2D[:,0], data2D[:,1])
# #plt.show()

# kmeans = KMeans(n_clusters=2).fit(X)
# centers2D = pca.transform(kmeans.cluster_centers_)

# plt.scatter(centers2D[:,0], centers2D[:,1], 
#             marker='x', s=200, linewidths=3)
# plt.show() 

## Grid

In [48]:
# # Define Search Param
# search_params = {'n_components' : [2, 3, 4], 'learning_decay' : [.2, .3, .4], "max_iter" : [50, 100], "learning_offset" : [5, 10]}

# # Init the Model
# lda = LatentDirichletAllocation()

# # Init Grid Search Class
# model = GridSearchCV(lda, param_grid = search_params)

# # Do the Grid Search
# model.fit(data_vectorized)

# # Best Model
# best_lda_model = model.best_estimator_

# # Model Parameters
# print("Best Model's Params: ", model.best_params_)

# # Log Likelihood Score
# print("Best Log Likelihood Score: ", model.best_score_)

# # Perplexity
# print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

In [49]:
# len(vectorizer.vocabulary_)

## Example

In [50]:
example = ["pain back"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

for topic in range(lda_vectors.shape[1]):
    print(f"topic {topic} :", lda_vectors[0][topic])

NameError: ignored

# Prediction

In [None]:
predictions = []

for review in manual["Lemmatized_review"]:
    vectorized = vectorizer.transform([review])
    lda_vectors = predictions.append(lda_model.transform(vectorized))

predictions = np.concatenate(predictions, axis=0)

predictions.shape

## Compare

In [None]:
compare_data = pd.DataFrame(predictions, columns = ["Side_Effect", "No_Side_Effect"])

compare_data["Manually_Labelled"] = manual["sideEffect"] # Brings a column from the other DataFrame

compare_data["Prediction"] = compare_data.Side_Effect.apply(one_or_zero) # Applies the binary output
compare_data["No_Side_Effect_%"] = compare_data.No_Side_Effect.apply(round_two) # Applies a format
compare_data["Side_Effect_%"] = compare_data.Side_Effect.apply(round_two) # Applies a format

compare_data["bool"] = np.where(compare_data["Manually_Labelled"] == compare_data["Prediction"], True, False) # Compares betwen the precdiction and the label

compare_data.drop(["Side_Effect", "No_Side_Effect"], axis = 1)

In [None]:
compare_data["bool"].value_counts()

In [None]:
compare_data.to_csv('data.csv')

!cp data.csv "drive/My Drive/"

In [None]:


# correct predictions / 100
# recall = correctly predicted side effects / total side effects
# precision = correctly predicted side effects / total predicted side effects

# predicted_topic = []



# (np.array(predicted_topic) == np.array(test_data.sideEffect)).sum()

In [None]:
# for row in range(len(test_data)):
#     if (lda_vectors[row][0] > lda_vectors[row][1]):
#         predicted_topic.append(0)
#     else: predicted_topic.append(1)

# CountVectorizer | Pipeline

In [None]:
import itertools

laplace = lidstone = range(1, 4)

lap_lid = list(itertools.product(laplace, lidstone))

lap_lid

In [None]:
# # Create Pipeline
# pipe = Pipeline([('Count', CountVectorizer()),
#                  ('nb', MultinomialNB())
#                 ])

# # Set parameters to search (model and vectorizer)
# parameters = {
#     'Count__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
#     'Count__min_df': (np.linspace(0.01, 0.49, num = 10)),
#     'Count__max_df': (np.linspace(0.50, 0.99, num = 10)),
#     'Count__max_features' : ([1 , 2, 3, 4, 5]),
#     'nb__alpha': (np.linspace(0.01, 0.99, num = 10)), # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
#     }

# # Perform grid search
# grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, 
#                            verbose=1, scoring = "accuracy", 
#                            refit=True, cv=5)

# grid_search.fit(X, y)

In [None]:
# grid_search.best_params_

In [None]:
# best_model = grid_search.best_estimator_