# Imports

In [1]:
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize

from nltk.sentiment.util import mark_negation



In [2]:
!pip install corextopic



In [3]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [4]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Functions

In [5]:
stop_words = set(stopwords.words('english')) 

NEGATIONS = ["no", "not", "shouldn't", "aren't", "couldn't", "didn't", "doesn't", "don't", "wasn't", "weren't", "wouldn't", "nor"]

NEW_WORDS = ['mg', "month", "year", "day", "week", "time", "im", "ive", "hour"]

for negation in NEGATIONS:
    stop_words.remove(negation)

for new_word in NEW_WORDS:
    stop_words.add(new_word)

# stop_words

In [6]:
def to_list(x):
    list_words = x.split(' ')
    return list_words

def to_string(x):
    string = " ".join(x)
    return string

#===============================================================

def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

def m_negation(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def remove_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

def remove_our_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in OUR_STOPWORDS]
    return without_stopwords

def lemmatize_review(x):
    lemma = WordNetLemmatizer()
    lista = []
    for w in x:
       lista.append(lemma.lemmatize(w))
    return lista

#===============================================================

def count_words(x):
    wordfreq = []
    for w in x:
        wordfreq.append(x.count(w))
    return dict(zip(x, wordfreq))

def total_count(x):
    total_count = {}
    for row in x:
        for key in row.keys():
          if key in total_count:
              total_count[key] += 1
          else:
              total_count[key] = 1
    return pd.DataFrame(sorted(total_count.items(), key=lambda x: x[1], reverse=True)).head(30).T

def round_two(x):
    return str(int(round(x, 2) * 100)) + "%"

def one_or_zero(x):
    # Makes the prediction a binary outpur
    if x > 0.5:
        x = 1
    else:
        x = 0
    return x

#===============================================================

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

#===============================================================

def side_effects_lst(x):
    lista = []
    for i in x:
        y = remove_our_stopwords(i)
        if i not in y and i not in lista:
            lista.append(i)
    return lista

def se_encoder(x, y):
    if y in x:
        return 1
    else:
        return 0

# Side Effects

In [7]:
real_se_domains: ['pain', 'skin', 'libido', 'depression', 'anxiety', 'gastro-intestinal', 'discomfort', 'gynecological', 'weight gain', 'eyes']

real_se_dict = {
    'headache' : 'pain', 
    'migrane' : 'pain', 
    'migranes' : 'pain', 
    'headaches' : 'pain',
    'pain' : 'pain', 
    'leg pain' : 'pain', 
    'back pain' : 'pain', 
    'arm pain' : 'pain', 
    'cramps' : 'pain', 
    'tendonitis' : 'pain', 
    'pain in joints' : 'pain',
    
    'sensitive skin' : 'skin', 
    'dry skin' : 'skin', 
    'burned' : 'skin', 
    'red' : 'skin', 
    'dryness' : 'skin', 
    'swollen' : 'skin', 
    'itchy' : 'skin', 
    'irritated' : 'skin', 
    'bumpy spots' : 'skin', 
    'peeling' : 'skin', 
    'chapped lips' : 'skin', 
    'acne' : 'skin', 
    'burning' : 'skin', 
    'itching' : 'skin', 
    'lost sensation' : 'skin', 
    'swollen' : 'skin',
    
    'sex drive decreased': 'libido', 
    'lower sex drive' : 'libido', 
    'decreased libido' : 'libido',
    'loss of libido' : 'libido',
    
    'anhedonia' : 'depression', 
    'depression' : 'depression', 
    'mood swings' : 'depression', 
    'moody' : 'depression', 
    'weepy' : 'depression', 
    'no feelings' : 'depression', 
    'feeling' : 'depression', 
    'crying' : 'depression', 
    'cry constantly' : 'depression',
    
    'jaw clenching' : 'anxiety', 
    'inability to sit still' : 'anxiety', 
    'hear internal voices' : 'anxiety', 
    'paranoid' : 'anxiety', 
    'anxious' : 'anxiety', 
    'scared to be alone' : 'anxiety', 
    'manic state' : 'anxiety', 
    'anxiety' : 'anxiety', 
    'little confused' : 'anxiety', 
    'cognitive problems' : 'anxiety', 
    'irritable' : 'anxiety', 
    'angry' : 'anxiety', 
    'unreasonable' : 'anxiety', 
    'can not think straight' : 'anxiety', 
    'aggressive' : 'anxiety',
    'nightmare' : 'anxiety', 
    'nightmares' : 'anxiety', 
    'yawning' : 'anxiety', 
    'bad dreams' : 'anxiety', 
    'inability to sleep' : 'anxiety', 
    'insomnia' : 'anxiety', 
    
    'gastro-intestinal' : 'gastro-intestinal',
    'diarrhoea' : 'gastro-intestinal', 
    'stomach cramps' : 'gastro-intestinal', 
    'heartburn' : 'gastro-intestinal', 
    'nauseated' : 'gastro-intestinal', 
    'no appetite' : 'gastro-intestinal', 
    'nausea' : 'gastro-intestinal', 
    'appetite suppressed' : 'gastro-intestinal', 
    'bowel movements' : 'gastro-intestinal', 
    'constipation' : 'gastro-intestinal', 
    'constipated' : 'gastro-intestinal', 
    'upset stomach' : 'gastro-intestinal', 
    'bloating' : 'gastro-intestinal', 
    'passing winds' : 'gastro-intestinal', 
    'acidy burbs' : 'gastro-intestinal', 
    'stomach aches' : 'gastro-intestinal', 
    'nauseous' : 'gastro-intestinal', 
    'sour stomach' : 'gastro-intestinal',
    
    'discomfort' : 'discomfort',
    'shakiness' : 'discomfort', 
    'shaky' : 'discomfort', 
    'tired' : 'discomfort', 
    'fatigue' : 'discomfort', 
    'breathlessness' : 'discomfort', 
    'dry mouth' : 'discomfort', 
    'no energy' : 'discomfort', 
    'cold' : 'discomfort', 
    'hot flashes' : 'discomfort',  
    'night sweats' : 'discomfort', 
    'shaking' : 'discomfort', 
    'dizziness' : 'discomfort', 
    'knocks me out' : 'discomfort', 
    'weakness' : 'discomfort', 
    'wheezy' : 'discomfort', 
    'tiredness' : 'discomfort', 
    'uncomfortable' : 'discomfort', 
    'sweating' : 'discomfort',
    
    'brown periods' : 'gynecological', 
    'spot' : 'gynecological', 
    'spotting' : 'gynecological', 
    'bleeding lasted' : 'gynecological', 
    'increased discharge' : 'gynecological', 
    'longer periods' : 'gynecological', 
    'cramping' : 'gynecological', 
    'burning' : 'gynecological', 
    'itching' : 'gynecological', 
    'heavy periods' : 'gynecological', 
    'swelling chest' : 'gynecological', 
    'tenderness in chest' : 'gynecological',
    
    'weight gain' : 'weight gain', 
    'gaining weight' : 'weight gain', 
    'hungry' : 'weight gain', 
    'appetite increased' : 'weight gain',

    'burning eyes' : 'eyes', 
    'blurred vision' : 'eyes', 
    'blurry vision' : 'eyes', 
    'light sensitivity' : 'eyes',
    
    'side effect' : 'bonus', 
    'no side effect' : 'bonus', 
    'symptom' : 'bonus', 
    
    }

keys, values = list(real_se_dict.keys()), list(real_se_dict.values())

In [8]:
giga_word = pd.read_csv('/content/drive/MyDrive/Data/gigaword_sideeffects.csv')

no_giga_word = pd.read_csv('/content/drive/MyDrive/Data/gigaword_no_sideeffects.csv')

giga_word = list(no_giga_word["0"]) + list(giga_word["0"])

len(giga_word)

256

In [9]:
# OUR_STOPWORDS = keys

# OUR_STOPWORDS = giga_word

# len(OUR_STOPWORDS)

In [10]:
OUR_STOPWORDS = giga_word + keys

len(OUR_STOPWORDS)

374

# Manual

In [11]:
#manual = pd.read_csv('manually_labelled_data.csv')

manual = pd.read_csv("/content/drive/MyDrive/Data/adr_labelled_data.csv")

manual = manual.drop(["Unnamed: 8"], axis = 1).head(299)

manual = manual.drop(["uniqueID", "drugName", "condition", "date", "rating", "usefulCount"], axis = 1)

manual["clean_review"] = manual["review"].apply(punctuation)
manual['clean_review'] = manual.clean_review.apply(remove_numbers)
manual['clean_review_lst'] = manual.clean_review.apply(to_list)

manual["NonStopwords_review_lst"] = manual.clean_review.apply(remove_stopwords)
manual["NonStopwords_review_str"] = manual.NonStopwords_review_lst.apply(to_string)

manual["NonStopwords_review_lst_MN"] = manual.clean_review.apply(m_negation)
manual["NonStopwords_review_str_MN"] = manual.NonStopwords_review_lst_MN.apply(to_string)

manual["Lemmatized_review_lst"] = manual.NonStopwords_review_lst_MN.apply(lemmatize_review)
manual["Lemmatized_review_str"] = manual.Lemmatized_review_lst.apply(to_string)

manual["Lemmatized_review_list"] = manual.NonStopwords_review_lst.apply(lemmatize_review)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)
manual["Lemmatized_review_list"] = manual.Lemmatized_review.apply(remove_stopwords)
manual["Lemmatized_review"] = manual.Lemmatized_review_list.apply(to_string)

manual["words_count"] = manual.Lemmatized_review_list.apply(count_words)

X = manual["Lemmatized_review"]

y = manual["sideEffect"]

manual = manual.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str"], axis = 1)

manual.head(3)

Unnamed: 0,review,sideEffect,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review_list,Lemmatized_review,words_count
0,"""It has no side effect, I take it in combinati...",0.0,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic fish oil,"{'no': 1, 'side': 1, 'effect': 1, 'take': 1, '..."
1,"""My son is halfway through his fourth week of ...",1.0,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"[son, halfway, fourth, intuniv, became, concer...",son halfway fourth intuniv became concerned be...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'intuniv..."
2,"""I used to take another oral contraceptive, wh...",1.0,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1..."


In [12]:
total_count(manual["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,not,effect,no,side,taking,started,first,take,like,one,get,pain,feel,work,would,doctor,much,really,medication,took,bad,back,help,period,went,never,since,good,life,two
1,105,99,93,93,77,76,72,67,66,65,61,54,53,52,51,49,48,48,46,46,45,44,44,43,43,43,43,41,41,40


# Data

In [13]:
#data = pd.read_csv('drugsComTrain_raw.csv')

data = pd.read_csv('/content/drive/MyDrive/Data/drugsComTrain_raw.csv')

data = data[data['rating'] < 10]

data = data.reset_index(drop = True)

data = data.sort_values(by=['drugName'])

data = data.reset_index(drop = True)

print(f"The shape of the data is {data.shape[0]} rows and {data.shape[1]} columns")
print("\n")
print(f"The amount of unique ID is {len(data['uniqueID'].unique())}") # check if any of the uniqueID repeat
print("\n")
print(f"The number of unique drugs reviewed is {len(data['drugName'].unique())}")
print("\n")
print(f"The number of unique conditions is {len(data['condition'].unique())}")
print("\n")

The shape of the data is 110308 rows and 7 columns


The amount of unique ID is 110308


The number of unique drugs reviewed is 2946


The number of unique conditions is 791




In [14]:
pd.DataFrame(data["drugName"].value_counts()).head(14).T

Unnamed: 0,Etonogestrel,Ethinyl estradiol / norethindrone,Levonorgestrel,Ethinyl estradiol / norgestimate,Nexplanon,Ethinyl estradiol / levonorgestrel,Sertraline,Miconazole,Mirena,Implanon,Escitalopram,Medroxyprogesterone,Venlafaxine,Depo-Provera
drugName,2690,2380,2262,1776,1760,1565,948,942,914,864,838,838,760,757


In [15]:
pd.DataFrame(data["condition"].value_counts()).head(14).T

Unnamed: 0,Birth Control,Depression,Pain,Acne,Anxiety,Bipolar Disorde,Insomnia,ADHD,Obesity,Weight Loss,Vaginal Yeast Infection,Abnormal Uterine Bleeding,"Diabetes, Type 2",High Blood Pressure
condition,23080,6544,4037,3828,3628,2948,2611,2421,2153,1981,1974,1856,1834,1803


In [16]:
NUMERO_SAMPLES = 150000

data = data.drop(["uniqueID", "condition", "date", "rating", "usefulCount"], axis = 1) # .head(NUMERO_SAMPLES) # SACAR ESTOOOOOO!!

In [17]:
data["clean_review"] = data["review"].apply(punctuation)
data['clean_review'] = data.clean_review.apply(remove_numbers)
data['clean_review_lst'] = data.clean_review.apply(to_list)

data["NonStopwords_review_lst"] = data.clean_review.apply(remove_stopwords)
data["NonStopwords_review_str"] = data.NonStopwords_review_lst.apply(to_string)

data["NonStopwords_review_lst_MN"] = data.clean_review.apply(m_negation)
data["NonStopwords_review_str_MN"] = data.NonStopwords_review_lst_MN.apply(to_string)

data["Lemmatized_review_lst"] = data.NonStopwords_review_lst_MN.apply(lemmatize_review)
data["Lemmatized_review_str"] = data.Lemmatized_review_lst.apply(to_string)

data["Lemmatized_review_list"] = data.NonStopwords_review_lst.apply(lemmatize_review)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)
data["Lemmatized_review_list"] = data.Lemmatized_review.apply(remove_stopwords)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)

data["words_count"] = data.Lemmatized_review_list.apply(count_words)

data = data.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "NonStopwords_review_str", "NonStopwords_review_lst_MN", "NonStopwords_review_str_MN",
                  "Lemmatized_review_lst", "Lemmatized_review_str"], axis = 1)

data["Side_Effects_mention"] = data.Lemmatized_review_list.apply(side_effects_lst)

In [18]:
data.head(15)

Unnamed: 0,drugName,review,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention
0,Abacavir / dolutegravir / lamivudine,"""I started on triumeq august 1,2016....on the ...","[started, triumeq, august, second, bad, headac...",started triumeq august second bad headache amp...,"{'started': 1, 'triumeq': 1, 'august': 1, 'sec...","[headache, stomach, ache, better, loss, appeti..."
1,Abacavir / dolutegravir / lamivudine,"""I have had nausea and I threw up too. I hope ...","[nausea, threw, hope, side, effect, stop, pas,...",nausea threw hope side effect stop pas also fe...,"{'nausea': 1, 'threw': 1, 'hope': 1, 'side': 1...","[nausea, hope, feeling, tired]"
2,Abacavir / dolutegravir / lamivudine,"""I have been taking Triumeq for eight months. ...","[taking, triumeq, eight, one, taken, far, hiv,...",taking triumeq eight one taken far hiv cant re...,"{'taking': 1, 'triumeq': 1, 'eight': 1, 'one':...","[taken, really, treatment, tired, depression, ..."
3,Abacavir / dolutegravir / lamivudine,"""Started on Triumeq exactly 1 month. Trying to...","[started, triumeq, exactly, trying, understand...",started triumeq exactly trying understand vira...,"{'started': 1, 'triumeq': 1, 'exactly': 1, 'tr...","[experience, headache, feeling, indeed, result..."
4,Abacavir / dolutegravir / lamivudine,"""I was born with HIV my birth mother transfer...","[born, hiv, birth, mother, transfer, amp, life...",born hiv birth mother transfer amp life never ...,"{'born': 1, 'hiv': 1, 'birth': 1, 'mother': 1,...","[taken, really, nausea, appetite, better]"
5,Abacavir / dolutegravir / lamivudine,"""I have been taking Triumeq for about 9years, ...","[taking, triumeq, taking, complera, doctor, de...",taking triumeq taking complera doctor decided ...,"{'taking': 2, 'triumeq': 2, 'complera': 3, 'do...","[serious, medication, treatment]"
6,Abacavir / dolutegravir / lamivudine,"""I was diagnosed in January 2011. My own immun...","[diagnosed, january, immune, system, able, man...",diagnosed january immune system able manage hi...,"{'diagnosed': 1, 'january': 1, 'immune': 1, 's...","[diagnosed, able, nightmare, experienced, stom..."
7,Abacavir / dolutegravir / lamivudine,"""Diagnosed 5 years ago been on truvada and ise...","[diagnosed, ago, truvada, isentress, no, probl...",diagnosed ago truvada isentress no problem doc...,"{'diagnosed': 1, 'ago': 1, 'truvada': 1, 'isen...","[diagnosed, problem, sleep, nauseous, pain, me..."
8,Abacavir / dolutegravir / lamivudine,"""Triumeq is working as an HIV medication - but...","[triumeq, working, hiv, medication, gaining, w...",triumeq working hiv medication gaining weight,"{'triumeq': 1, 'working': 1, 'hiv': 1, 'medica...","[medication, weight]"
9,Abacavir / dolutegravir / lamivudine,"""My big concern with this drug has been a dras...","[big, concern, drug, drastic, decrease, level,...",big concern drug drastic decrease level energy...,"{'big': 1, 'concern': 1, 'drug': 1, 'drastic':...",[nausea]


In [19]:
total_count(data["words_count"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,not,effect,side,taking,no,first,get,take,started,like,feel,pain,work,pill,would,doctor,also,back,period,one,took,still,medication,got,bad,weight,dont,since,really,much
1,45085,30341,29682,29083,28358,26127,24921,24655,24148,23978,20642,20493,20232,19049,18423,18065,18047,17988,17884,17728,16442,16336,16023,15745,15488,15395,14916,14638,14349,14236


In [20]:
flat_list = [item for sublist in data["Side_Effects_mention"] for item in sublist]

flat_list = list(set(flat_list))

side_effect_encoder = pd.DataFrame(columns = list(flat_list), index = None)

side_effect_data = pd.concat([data, side_effect_encoder], axis = 1)

for drug in flat_list:
    side_effect_data[drug] = side_effect_data["Side_Effects_mention"].apply(lambda effect: se_encoder(effect, drug))

side_effect_data.head(3)

Unnamed: 0,drugName,review,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention,patient,breathlessness,difficulty,indeed,weight,bipolar,psychosis,exertion,heartburn,insomnia,breath,able,acne,noting,inducing,loss,chronic,anxiety,stomachache,soreness,spot,given,syndrome,anorexia,problem,dryness,drowsiness,numbness,create,agony,come,taken,experienced,pneumonia,...,paralysis,sort,weakness,tendonitis,treatment,fibromyalgia,step,feeling,itching,sleeplessness,constipation,thought,anemia,wheezy,breathing,anhedonia,angry,schizophrenia,chemotherapy,helped,irritation,resulting,experience,bronchitis,serious,probably,hypothermia,paranoia,persistent,reaction,clearly,going,sleepiness,headache,sore,really,hungry,restlessness,lethargy,treating
0,Abacavir / dolutegravir / lamivudine,"""I started on triumeq august 1,2016....on the ...","[started, triumeq, august, second, bad, headac...",started triumeq august second bad headache amp...,"{'started': 1, 'triumeq': 1, 'august': 1, 'sec...","[headache, stomach, ache, better, loss, appeti...",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,Abacavir / dolutegravir / lamivudine,"""I have had nausea and I threw up too. I hope ...","[nausea, threw, hope, side, effect, stop, pas,...",nausea threw hope side effect stop pas also fe...,"{'nausea': 1, 'threw': 1, 'hope': 1, 'side': 1...","[nausea, hope, feeling, tired]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Abacavir / dolutegravir / lamivudine,"""I have been taking Triumeq for eight months. ...","[taking, triumeq, eight, one, taken, far, hiv,...",taking triumeq eight one taken far hiv cant re...,"{'taking': 1, 'triumeq': 1, 'eight': 1, 'one':...","[taken, really, treatment, tired, depression, ...",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [21]:
review_x_side_effect = side_effect_data.drop(["Lemmatized_review", "drugName", "Lemmatized_review_list", "review", "words_count", "Side_Effects_mention"], axis = 1)

drug_x_side_effect = side_effect_data.groupby(["drugName"]).sum()

print(review_x_side_effect.shape)

review_x_side_effect.head(3)

(110308, 245)


Unnamed: 0,patient,breathlessness,difficulty,indeed,weight,bipolar,psychosis,exertion,heartburn,insomnia,breath,able,acne,noting,inducing,loss,chronic,anxiety,stomachache,soreness,spot,given,syndrome,anorexia,problem,dryness,drowsiness,numbness,create,agony,come,taken,experienced,pneumonia,hospitalized,cold,nightmare,infection,exhaustion,loneliness,...,paralysis,sort,weakness,tendonitis,treatment,fibromyalgia,step,feeling,itching,sleeplessness,constipation,thought,anemia,wheezy,breathing,anhedonia,angry,schizophrenia,chemotherapy,helped,irritation,resulting,experience,bronchitis,serious,probably,hypothermia,paranoia,persistent,reaction,clearly,going,sleepiness,headache,sore,really,hungry,restlessness,lethargy,treating
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [22]:
print(drug_x_side_effect.shape)

drug_x_side_effect.head(3)

(2946, 245)


Unnamed: 0_level_0,patient,breathlessness,difficulty,indeed,weight,bipolar,psychosis,exertion,heartburn,insomnia,breath,able,acne,noting,inducing,loss,chronic,anxiety,stomachache,soreness,spot,given,syndrome,anorexia,problem,dryness,drowsiness,numbness,create,agony,come,taken,experienced,pneumonia,hospitalized,cold,nightmare,infection,exhaustion,loneliness,...,paralysis,sort,weakness,tendonitis,treatment,fibromyalgia,step,feeling,itching,sleeplessness,constipation,thought,anemia,wheezy,breathing,anhedonia,angry,schizophrenia,chemotherapy,helped,irritation,resulting,experience,bronchitis,serious,probably,hypothermia,paranoia,persistent,reaction,clearly,going,sleepiness,headache,sore,really,hungry,restlessness,lethargy,treating
drugName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Abacavir / dolutegravir / lamivudine,0,0,0,1,3,0,0,0,1,2,1,1,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,4,2,0,0,0,2,0,0,0,...,0,0,1,0,6,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,0,0,0,0,0,2,0,7,1,2,0,0,0,0
Abacavir / lamivudine / zidovudine,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Abatacept,0,0,0,0,1,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,0,0,0,0,0,3,0,0,0,1,1,0,0,0,0,0


# CountVectorizer | Latent Dirichlet allocation

In [23]:
side_effects = list(drug_x_side_effect.columns)

In [38]:
# vectorizer = TfidfVectorizer(min_df = 0.05, 
#                              max_df = 0.55, 
#                              max_features = None,
#                              vocabulary = None,
#                              binary = False,
#                              ngram_range = (1, 2)).fit(data["Lemmatized_review"]) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

# stop = vectorizer.get_stop_words()

# data_vectorized = vectorizer.transform(data["Lemmatized_review"]) #

lda_model = LatentDirichletAllocation(n_components = 25,
                                      learning_method = 'online',   
                                      random_state = 29,
                                      batch_size = 128,
                                      learning_decay = 0.5,
                                      learning_offset = 5,
                                      evaluate_every = -1,
                                      verbose = 0,
                                      max_iter = 100).fit(drug_x_side_effect) # Fitting

nmf_model = NMF(n_components = 25,
                init = 'random',
                random_state = 0,
                verbose = 0,
                max_iter = 50,
                l1_ratio = 0,
                alpha = 0).fit(drug_x_side_effect)

svd_model = TruncatedSVD(n_components = 25,
                         n_iter = 10,
                         random_state = 42).fit(drug_x_side_effect)

In [25]:
lda_components = lda_model.components_

for t in range(len(lda_components)):
    dic = dict(zip(side_effects, np.round(lda_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])
    #print(f"topic {t}: {dict(zip(side_effects, lda_components[t]))}")

[('weight', 5960.45), ('bleeding', 5762.416), ('spotting', 3847.922), ('going', 1932.549), ('acne', 1753.92), ('problem', 1637.684), ('moody', 1628.943), ('emotional', 1506.145)]
[('bipolar', 1206.901), ('weight', 1201.423), ('medication', 1002.294), ('depression', 702.162), ('disorder', 637.623), ('take', 540.074), ('thought', 536.682), ('diagnosed', 510.344)]
[('cough', 1200.44), ('cold', 828.632), ('better', 555.084), ('much', 548.999), ('sore', 526.743), ('coughing', 489.7), ('take', 437.342), ('asthma', 429.115)]
[('symptom', 2889.05), ('pain', 2577.041), ('take', 2147.507), ('medication', 2039.757), ('headache', 2004.302), ('severe', 1705.229), ('diagnosed', 1456.633), ('better', 1260.387)]
[('experience', 3625.774), ('cramping', 3424.665), ('really', 2970.481), ('painful', 2902.219), ('experienced', 2548.994), ('however', 2494.679), ('symptom', 2112.457), ('pain', 2062.441)]
[('burning', 1991.526), ('itching', 1663.299), ('pain', 930.798), ('itchy', 590.894), ('going', 568.264),

In [26]:
nmf_components = nmf_model.components_

for t in range(len(nmf_components)):
    dic = dict(zip(side_effects, np.round(nmf_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('weight', 2.499), ('take', 1.504), ('acne', 1.397), ('really', 1.156), ('problem', 1.131), ('experience', 1.054), ('however', 0.84), ('experienced', 0.838)]
[('much', 0.894), ('better', 0.817), ('really', 0.806), ('going', 0.471), ('able', 0.418), ('medication', 0.402), ('thought', 0.387), ('take', 0.385)]
[('bleeding', 7.574), ('spotting', 4.185), ('weight', 2.886), ('pain', 1.618), ('going', 1.486), ('take', 1.356), ('really', 1.346), ('nothing', 1.284)]
[('take', 9.686), ('medication', 6.225), ('symptom', 2.208), ('diagnosed', 2.102), ('taken', 1.831), ('severe', 1.52), ('pain', 1.373), ('really', 1.232)]
[('infection', 13.935), ('symptom', 5.824), ('stomach', 5.789), ('take', 5.373), ('headache', 4.863), ('pain', 4.78), ('diarrhea', 4.415), ('better', 4.18)]
[('burning', 30.756), ('itching', 23.807), ('infection', 18.638), ('pain', 13.079), ('treatment', 8.696), ('better', 7.721), ('much', 7.369), ('going', 7.144)]
[('acne', 21.111), ('clear', 7.237), ('really', 4.072), ('better'

In [27]:
svd_components = svd_model.components_

for t in range(len(svd_components)):
    dic = dict(zip(side_effects, np.round(svd_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('weight', 0.363), ('take', 0.3), ('pain', 0.247), ('bleeding', 0.247), ('really', 0.233), ('acne', 0.221), ('much', 0.202), ('going', 0.178)]
[('bleeding', 0.39), ('acne', 0.278), ('spotting', 0.278), ('weight', 0.262), ('cramping', 0.154), ('painful', 0.143), ('moody', 0.078), ('experience', 0.067)]
[('pain', 0.689), ('cramping', 0.225), ('painful', 0.2), ('burning', 0.161), ('infection', 0.158), ('take', 0.129), ('itching', 0.119), ('uncomfortable', 0.089)]
[('anxiety', 0.388), ('bleeding', 0.3), ('depression', 0.254), ('spotting', 0.229), ('cramping', 0.22), ('painful', 0.177), ('panic', 0.143), ('pain', 0.108)]
[('burning', 0.522), ('itching', 0.405), ('infection', 0.299), ('anxiety', 0.189), ('bleeding', 0.154), ('treatment', 0.138), ('depression', 0.12), ('thought', 0.119)]
[('weight', 0.473), ('burning', 0.179), ('sleep', 0.175), ('bleeding', 0.172), ('itching', 0.149), ('loss', 0.145), ('take', 0.09), ('appetite', 0.086)]
[('cramping', 0.366), ('nausea', 0.303), ('painful', 0

In [28]:
# vectorizer = TfidfVectorizer(min_df = 0.05, 
#                              max_df = 0.55, 
#                              max_features = None,
#                              vocabulary = None,
#                              binary = False,
#                              ngram_range = (1, 2)).fit(data["Lemmatized_review"]) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

# stop = vectorizer.get_stop_words()

# data_vectorized = vectorizer.transform(data["Lemmatized_review"]) #

lda_model = LatentDirichletAllocation(n_components = 25,
                                      learning_method = 'online',   
                                      random_state = 29,
                                      batch_size = 128,
                                      learning_decay = 0.5,
                                      learning_offset = 5,
                                      evaluate_every = -1,
                                      verbose = 0,
                                      max_iter = 100).fit(review_x_side_effect) # Fitting

nmf_model = NMF(n_components = 25,
                init = 'random',
                random_state = 0,
                verbose = 0,
                max_iter = 50,
                l1_ratio = 0,
                alpha = 0).fit(review_x_side_effect)

svd_model = TruncatedSVD(n_components = 25,
                         n_iter = 10,
                         random_state = 42).fit(review_x_side_effect)

In [29]:
lda_components = lda_model.components_

for t in range(len(lda_components)):
    dic = dict(zip(side_effects, np.round(lda_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])
    #print(f"topic {t}: {dict(zip(side_effects, lda_components[t]))}")

[('everything', 4361.135), ('caused', 3956.385), ('clear', 3053.723), ('burning', 2693.267), ('nightmare', 1601.767), ('patient', 1076.007), ('irritated', 489.921), ('irritation', 407.109)]
[('anxiety', 6133.131), ('given', 3577.971), ('panic', 3356.716), ('heart', 2893.311), ('probably', 2096.844), ('medication', 1539.346), ('red', 1405.549), ('serious', 1298.52)]
[('experience', 8518.369), ('something', 5512.555), ('extreme', 3170.967), ('difference', 2940.187), ('really', 2323.648), ('fatigue', 2268.301), ('chest', 1942.208), ('take', 1484.292)]
[('experienced', 6490.375), ('cramping', 3324.931), ('pressure', 2908.148), ('itching', 2261.667), ('cold', 2191.313), ('reaction', 2116.472), ('suffer', 1867.704), ('severe', 1514.209)]
[('sleep', 11361.871), ('depression', 10186.517), ('anxiety', 5617.23), ('take', 4340.085), ('medication', 2852.284), ('disorder', 2577.952), ('chronic', 2326.944), ('suffered', 2179.605)]
[('cause', 3842.881), ('done', 3418.138), ('emotional', 2055.119), ('

In [30]:
nmf_components = nmf_model.components_

for t in range(len(nmf_components)):
    dic = dict(zip(side_effects, np.round(nmf_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('pain', 1.07), ('severe', 0.161), ('stomach', 0.099), ('muscle', 0.072), ('chronic', 0.064), ('painful', 0.064), ('chest', 0.043), ('helped', 0.042)]
[('really', 1.128), ('helped', 0.105), ('hope', 0.044), ('come', 0.027), ('something', 0.027), ('difference', 0.026), ('tired', 0.026), ('everything', 0.025)]
[('acne', 1.005), ('problem', 0.562), ('bleeding', 0.452), ('spotting', 0.306), ('nothing', 0.286), ('clear', 0.209), ('painful', 0.203), ('cramping', 0.159)]
[('headache', 1.202), ('nausea', 0.964), ('severe', 0.333), ('stomach', 0.24), ('experienced', 0.161), ('appetite', 0.16), ('migraine', 0.159), ('dizziness', 0.149)]
[('feeling', 1.664), ('thought', 0.452), ('better', 0.151), ('tired', 0.15), ('nauseous', 0.078), ('anxious', 0.065), ('stomach', 0.061), ('heart', 0.056)]
[('taken', 1.964), ('something', 0.075), ('infection', 0.072), ('bleeding', 0.061), ('given', 0.059), ('symptom', 0.053), ('caused', 0.045), ('migraine', 0.042)]
[('sleep', 2.114), ('problem', 0.285), ('able'

In [31]:
svd_components = svd_model.components_

for t in range(len(svd_components)):
    dic = dict(zip(side_effects, np.round(svd_components[t], decimals = 3)))
    print(sorted(dic.items(), key = lambda x: x[1], reverse = True)[0:8])

[('take', 0.442), ('pain', 0.323), ('medication', 0.255), ('much', 0.233), ('weight', 0.218), ('really', 0.218), ('better', 0.213), ('anxiety', 0.182)]
[('weight', 0.415), ('anxiety', 0.217), ('depression', 0.192), ('really', 0.187), ('much', 0.167), ('acne', 0.148), ('better', 0.139), ('appetite', 0.071)]
[('pain', 0.536), ('weight', 0.187), ('severe', 0.113), ('acne', 0.099), ('much', 0.095), ('headache', 0.082), ('bleeding', 0.076), ('painful', 0.07)]
[('medication', 0.508), ('anxiety', 0.326), ('depression', 0.207), ('better', 0.188), ('feeling', 0.135), ('sleep', 0.125), ('severe', 0.102), ('panic', 0.081)]
[('much', 0.571), ('better', 0.529), ('really', 0.087), ('feeling', 0.076), ('sleep', 0.039), ('take', 0.022), ('hope', 0.021), ('pain', 0.015)]
[('really', 0.789), ('going', 0.145), ('anxiety', 0.141), ('thought', 0.088), ('helped', 0.081), ('depression', 0.072), ('know', 0.051), ('sleep', 0.046)]
[('anxiety', 0.439), ('depression', 0.34), ('severe', 0.179), ('feeling', 0.127)

In [32]:
# """A model with higher log-likelihood and lower perplexity
# (exp(-1. * log-likelihood per word)) is considered to be good. Let’s check for our model."""

# # Log Likelyhood: Higher the better
# print("Log Likelihood: ", lda_model.score(data_vectorized))

# # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
# print("Perplexity: ", lda_model.perplexity(data_vectorized))

# # See model parameters
# print(lda_model.get_params())

## Plot

In [33]:
# from mpl_toolkits.mplot3d import Axes3D

# fig = pyplot.figure()
# ax = Axes3D(fig)

# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
# ])    

# X = pipeline.fit_transform(manual["Lemmatized_review"]).todense()

# pca = PCA(n_components=3).fit(X)
# data2D = pca.transform(X)
# plt.figure(figsize=(20,7))
# ax.scatter(data2D[:,0], data2D[:,1], data2D[:,2], c = y)
# # plt.show()

# kmeans = KMeans(n_clusters=2).fit(X)
# centers2D = pca.transform(kmeans.cluster_centers_)

# plt.scatter(centers2D[:,0], centers2D[:,1], 
#             marker='x', s=200, linewidths=3)
# plt.show() 

In [34]:
# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
# ])    

# X = pipeline.fit_transform(data["Lemmatized_review"]).todense()

# pca = PCA(n_components=2).fit(X)
# data2D = pca.transform(X)
# plt.figure(figsize=(20,7))
# plt.scatter(data2D[:,0], data2D[:,1])
# #plt.show()

# kmeans = KMeans(n_clusters=2).fit(X)
# centers2D = pca.transform(kmeans.cluster_centers_)

# plt.scatter(centers2D[:,0], centers2D[:,1], 
#             marker='x', s=200, linewidths=3)
# plt.show() 

In [35]:
# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
# ])    

# X = pipeline.fit_transform(bad_data["Lemmatized_review"]).todense()

# pca = PCA(n_components=2).fit(X)
# data2D = pca.transform(X)
# plt.figure(figsize=(20,7))
# plt.scatter(data2D[:,0], data2D[:,1])
# #plt.show()

# kmeans = KMeans(n_clusters=2).fit(X)
# centers2D = pca.transform(kmeans.cluster_centers_)

# plt.scatter(centers2D[:,0], centers2D[:,1], 
#             marker='x', s=200, linewidths=3)
# plt.show() 

## Grid

In [36]:
# # Define Search Param
# search_params = {'n_components' : [2, 3, 4], 'learning_decay' : [.2, .3, .4], "max_iter" : [50, 100], "learning_offset" : [5, 10]}

# # Init the Model
# lda = LatentDirichletAllocation()

# # Init Grid Search Class
# model = GridSearchCV(lda, param_grid = search_params)

# # Do the Grid Search
# model.fit(data_vectorized)

# # Best Model
# best_lda_model = model.best_estimator_

# # Model Parameters
# print("Best Model's Params: ", model.best_params_)

# # Log Likelihood Score
# print("Best Log Likelihood Score: ", model.best_score_)

# # Perplexity
# print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

In [37]:
# len(vectorizer.vocabulary_)

## Example

In [38]:
example = ["pain back"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

for topic in range(lda_vectors.shape[1]):
    print(f"topic {topic} :", lda_vectors[0][topic])

NameError: ignored

# Prediction

In [None]:
predictions = []

for review in manual["Lemmatized_review"]:
    vectorized = vectorizer.transform([review])
    lda_vectors = predictions.append(lda_model.transform(vectorized))

predictions = np.concatenate(predictions, axis=0)

predictions.shape

## Compare

In [None]:
compare_data = pd.DataFrame(predictions, columns = ["Side_Effect", "No_Side_Effect"])

compare_data["Manually_Labelled"] = manual["sideEffect"] # Brings a column from the other DataFrame

compare_data["Prediction"] = compare_data.Side_Effect.apply(one_or_zero) # Applies the binary output
compare_data["No_Side_Effect_%"] = compare_data.No_Side_Effect.apply(round_two) # Applies a format
compare_data["Side_Effect_%"] = compare_data.Side_Effect.apply(round_two) # Applies a format

compare_data["bool"] = np.where(compare_data["Manually_Labelled"] == compare_data["Prediction"], True, False) # Compares betwen the precdiction and the label

compare_data.drop(["Side_Effect", "No_Side_Effect"], axis = 1)

In [None]:
compare_data["bool"].value_counts()

In [None]:
compare_data.to_csv('data.csv')

!cp data.csv "drive/My Drive/"

In [None]:


# correct predictions / 100
# recall = correctly predicted side effects / total side effects
# precision = correctly predicted side effects / total predicted side effects

# predicted_topic = []



# (np.array(predicted_topic) == np.array(test_data.sideEffect)).sum()

In [None]:
# for row in range(len(test_data)):
#     if (lda_vectors[row][0] > lda_vectors[row][1]):
#         predicted_topic.append(0)
#     else: predicted_topic.append(1)

# CountVectorizer | Pipeline

In [None]:
import itertools

laplace = lidstone = range(1, 4)

lap_lid = list(itertools.product(laplace, lidstone))

lap_lid

In [None]:
# # Create Pipeline
# pipe = Pipeline([('Count', CountVectorizer()),
#                  ('nb', MultinomialNB())
#                 ])

# # Set parameters to search (model and vectorizer)
# parameters = {
#     'Count__ngram_range': (lap_lid), # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted
#     'Count__min_df': (np.linspace(0.01, 0.49, num = 10)),
#     'Count__max_df': (np.linspace(0.50, 0.99, num = 10)),
#     'Count__max_features' : ([1 , 2, 3, 4, 5]),
#     'nb__alpha': (np.linspace(0.01, 0.99, num = 10)), # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
#     }

# # Perform grid search
# grid_search = GridSearchCV(pipe, parameters, n_jobs=-1, 
#                            verbose=1, scoring = "accuracy", 
#                            refit=True, cv=5)

# grid_search.fit(X, y)

In [None]:
# grid_search.best_params_

In [None]:
# best_model = grid_search.best_estimator_