In [2]:
### imports


import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize

from nltk.sentiment.util import mark_negation


In [3]:
### import stopwords

import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/jack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jack/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jack/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#### define preprocessing functions

def to_list(x):
    list_words = x.split(' ')
    return list_words

def to_string(x):
    string = " ".join(x)
    return string

#===============================================================

def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

def m_negation(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def remove_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

def remove_our_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in OUR_STOPWORDS]
    return without_stopwords

def lemmatize_review(x):
    lemma = WordNetLemmatizer()
    lista = []
    for w in x:
       lista.append(lemma.lemmatize(w))
    return lista

#===============================================================

def count_words(x):
    wordfreq = []
    for w in x:
        wordfreq.append(x.count(w))
    return dict(zip(x, wordfreq))

def total_count(x):
    total_count = {}
    for row in x:
        for key in row.keys():
          if key in total_count:
              total_count[key] += 1
          else:
              total_count[key] = 1
    return pd.DataFrame(sorted(total_count.items(), key=lambda x: x[1], reverse=True)).head(30).T

def round_two(x):
    return str(int(round(x, 2) * 100)) + "%"

def one_or_zero(x):
    # Makes the prediction a binary outpur
    if x > 0.5:
        x = 1
    else:
        x = 0
    return x

#===============================================================

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

In [5]:
side_effects = pd.read_csv('../raw_data/frequent_adr.csv')

side_effects.head(3)

Unnamed: 0,0
0,Abdominal pain
1,Gastrointestinal pain
2,Amblyopia


In [6]:
### read list into DF and preprocess side effects

stop_words = set(stopwords.words('english')) 

side_effects = pd.read_csv('../raw_data/frequent_adr.csv')

side_effects["Side_Effect"] = pd.DataFrame(side_effects)

side_effects["Side_Effect"] = side_effects.Side_Effect.apply(punctuation)
side_effects["Side_Effect"] = side_effects.Side_Effect.apply(remove_numbers)
side_effects["Side_Effect"] = side_effects.Side_Effect.apply(to_list)
side_effects["Side_Effect"] = side_effects.Side_Effect.apply(lemmatize_review)
side_effects["Side_Effect"] = side_effects.Side_Effect.apply(to_string)
side_effects["Side_Effect"] = side_effects.Side_Effect.apply(remove_stopwords)
side_effects = side_effects["Side_Effect"].tolist()

SE_LIST = [] 

for sublist in side_effects: 
    for item in sublist:
        if item not in SE_LIST:
            SE_LIST.append(item)

# SE_LIST

In [7]:
OUR_STOPWORDS = set(stopwords.words('english'))

for effect in SE_LIST: 
    OUR_STOPWORDS.add(effect)

for word in stop_words:
    OUR_STOPWORDS.remove(word)

In [8]:
### manually define side effect stop words

OUR_STOPWORDS = ['abdominal', 'constipation', 'diarrhea', 'skin', 'rash', 'vertigo', 'dizziness', 'drowsiness',
              'headache', 'mood disorders', 'insomnia', 'mood swings', "no side effect", "good response", "improvement"]

In [9]:
### test side effect selection function

list_merge_SE = ['juan', 'jack', 'peter', 'hendrike', 'headache']

def side_effects_lst(x):
    lista = []
    for i in x:
        y = remove_our_stopwords(i)
        if i not in y:
            lista.append(i)
    return lista

side_effects_lst(list_merge_SE)

['headache']

In [15]:
### read in all data

data = pd.read_csv('../raw_data/drugsComTrain_raw.csv')

data = data[data['rating'] < 5]
data = data.reset_index(drop = True)
data = data.sort_values(by=['drugName'])
data = data.reset_index(drop = True)
data = data.drop(["uniqueID", "date", "usefulCount"], axis = 1)
# print(pd.DataFrame(data["drugName"].value_counts()))

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uniqueID     161297 non-null  int64 
 1   drugName     161297 non-null  object
 2   condition    160398 non-null  object
 3   review       161297 non-null  object
 4   rating       161297 non-null  int64 
 5   date         161297 non-null  object
 6   usefulCount  161297 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 8.6+ MB


In [16]:
#data = pd.read_csv('drugsComTrain_raw.csv')

data["clean_review"] = data["review"].apply(punctuation)
data['clean_review'] = data.clean_review.apply(remove_numbers)
data['clean_review_lst'] = data.clean_review.apply(to_list)

data["NonStopwords_review_lst"] = data.clean_review.apply(remove_stopwords)
data["NonStopwords_review_str"] = data.NonStopwords_review_lst.apply(to_string)

data["NonStopwords_review_lst_MN"] = data.clean_review.apply(m_negation)
data["NonStopwords_review_str_MN"] = data.NonStopwords_review_lst_MN.apply(to_string)

data["Lemmatized_review_lst"] = data.NonStopwords_review_lst_MN.apply(lemmatize_review)
data["Lemmatized_review_str"] = data.Lemmatized_review_lst.apply(to_string)

data["Lemmatized_review_list"] = data.NonStopwords_review_lst.apply(lemmatize_review)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)
data["Lemmatized_review_list"] = data.Lemmatized_review.apply(remove_stopwords)
data["Lemmatized_review"] = data.Lemmatized_review_list.apply(to_string)
# data["our"] = data.Lemmatized_review.apply(remove_our_stopwords)
# data["our_str"] = data.our.apply(to_string)

data["words_count"] = data.Lemmatized_review_list.apply(count_words)

data = data.drop(["clean_review", "clean_review_lst", "NonStopwords_review_lst", "date", "NonStopwords_review_str"], axis = 1)

data["Side_Effects_mention"] = data.Lemmatized_review_list.apply(side_effects_lst)



Unnamed: 0,uniqueID,drugName,condition,review,rating,usefulCount,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,27,"[side, effect, take, combination, bystolic, mg...",side effect take combination bystolic mg fish oil,"[side, effect, take, combination, bystolic, mg...",side effect take combination bystolic mg fish oil,"[side, effect, take, combination, bystolic, mg...",side effect take combination bystolic mg fish oil,"{'side': 1, 'effect': 1, 'take': 1, 'combinati...",[]
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,192,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'week': ...",[]
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,17,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1...",[]
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,10,"[first, time, using, form, birth, control, im,...",first time using form birth control im glad we...,"[first, time, using, form, birth, control, im,...",first time using form birth control im glad we...,"[first, time, using, form, birth, control, im,...",first time using form birth control im glad we...,"{'first': 3, 'time': 1, 'using': 2, 'form': 1,...",[]
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,37,"[suboxone, completely, turned, life, around, f...",suboxone completely turned life around feel he...,"[suboxone, completely, turned, life, around, f...",suboxone completely turned life around feel he...,"[suboxone, completely, turned, life, around, f...",suboxone completely turned life around feel he...,"{'suboxone': 3, 'completely': 1, 'turned': 1, ...",[constipation]
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,43,"[nd, day, mg, started, work, rock, hard, erect...",nd day mg started work rock hard erections how...,"[nd, day, mg, started, work, rock, hard, erect...",nd day mg started work rock hard erection howe...,"[nd, day, mg, started, work, rock, hard, erect...",nd day mg started work rock hard erection howe...,"{'nd': 1, 'day': 2, 'mg': 1, 'started': 1, 'wo...",[headache]
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1,5,"[pulled, cummed, bit, took, plan, b, hours, la...",pulled cummed bit took plan b hours later took...,"[pulled, cummed, bit, took, plan, b, hour, lat...",pulled cummed bit took plan b hour later took ...,"[pulled, cummed, bit, took, plan, b, hour, lat...",pulled cummed bit took plan b hour later took ...,"{'pulled': 1, 'cummed': 1, 'bit': 1, 'took': 2...",[]
7,102654,Aripiprazole,Bipolar Disorde,"""Abilify changed my life. There is hope. I was...",10,32,"[abilify, changed, life, hope, zoloft, clonidi...",abilify changed life hope zoloft clonidine fir...,"[abilify, changed, life, hope, zoloft, clonidi...",abilify changed life hope zoloft clonidine fir...,"[abilify, changed, life, hope, zoloft, clonidi...",abilify changed life hope zoloft clonidine fir...,"{'abilify': 4, 'changed': 1, 'life': 1, 'hope'...",[]
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1,11,"[nothing, problems_NEG, keppera_NEG, constant_...",nothing problems_NEG keppera_NEG constant_NEG ...,"[nothing, problems_NEG, keppera_NEG, constant_...",nothing problems_NEG keppera_NEG constant_NEG ...,"[nothing, problem, keppera, constant, shaking,...",nothing problem keppera constant shaking arm a...,"{'nothing': 1, 'problem': 1, 'keppera': 1, 'co...",[]
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,1,"[pill, many, years, doctor, changed, rx, chate...",pill many years doctor changed rx chateal effe...,"[pill, many, year, doctor, changed, rx, chatea...",pill many year doctor changed rx chateal effec...,"[pill, many, year, doctor, changed, rx, chatea...",pill many year doctor changed rx chateal effec...,"{'pill': 1, 'many': 1, 'year': 1, 'doctor': 1,...",[]


In [17]:
data.head(50)

Unnamed: 0,uniqueID,drugName,condition,review,rating,usefulCount,NonStopwords_review_lst_MN,NonStopwords_review_str_MN,Lemmatized_review_lst,Lemmatized_review_str,Lemmatized_review_list,Lemmatized_review,words_count,Side_Effects_mention
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,27,"[side, effect, take, combination, bystolic, mg...",side effect take combination bystolic mg fish oil,"[side, effect, take, combination, bystolic, mg...",side effect take combination bystolic mg fish oil,"[side, effect, take, combination, bystolic, mg...",side effect take combination bystolic mg fish oil,"{'side': 1, 'effect': 1, 'take': 1, 'combinati...",[]
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,192,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'week': ...",[]
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,17,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1...",[]
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,10,"[first, time, using, form, birth, control, im,...",first time using form birth control im glad we...,"[first, time, using, form, birth, control, im,...",first time using form birth control im glad we...,"[first, time, using, form, birth, control, im,...",first time using form birth control im glad we...,"{'first': 3, 'time': 1, 'using': 2, 'form': 1,...",[]
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,37,"[suboxone, completely, turned, life, around, f...",suboxone completely turned life around feel he...,"[suboxone, completely, turned, life, around, f...",suboxone completely turned life around feel he...,"[suboxone, completely, turned, life, around, f...",suboxone completely turned life around feel he...,"{'suboxone': 3, 'completely': 1, 'turned': 1, ...",[constipation]
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,43,"[nd, day, mg, started, work, rock, hard, erect...",nd day mg started work rock hard erections how...,"[nd, day, mg, started, work, rock, hard, erect...",nd day mg started work rock hard erection howe...,"[nd, day, mg, started, work, rock, hard, erect...",nd day mg started work rock hard erection howe...,"{'nd': 1, 'day': 2, 'mg': 1, 'started': 1, 'wo...",[headache]
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1,5,"[pulled, cummed, bit, took, plan, b, hours, la...",pulled cummed bit took plan b hours later took...,"[pulled, cummed, bit, took, plan, b, hour, lat...",pulled cummed bit took plan b hour later took ...,"[pulled, cummed, bit, took, plan, b, hour, lat...",pulled cummed bit took plan b hour later took ...,"{'pulled': 1, 'cummed': 1, 'bit': 1, 'took': 2...",[]
7,102654,Aripiprazole,Bipolar Disorde,"""Abilify changed my life. There is hope. I was...",10,32,"[abilify, changed, life, hope, zoloft, clonidi...",abilify changed life hope zoloft clonidine fir...,"[abilify, changed, life, hope, zoloft, clonidi...",abilify changed life hope zoloft clonidine fir...,"[abilify, changed, life, hope, zoloft, clonidi...",abilify changed life hope zoloft clonidine fir...,"{'abilify': 4, 'changed': 1, 'life': 1, 'hope'...",[]
8,74811,Keppra,Epilepsy,""" I Ve had nothing but problems with the Kepp...",1,11,"[nothing, problems_NEG, keppera_NEG, constant_...",nothing problems_NEG keppera_NEG constant_NEG ...,"[nothing, problems_NEG, keppera_NEG, constant_...",nothing problems_NEG keppera_NEG constant_NEG ...,"[nothing, problem, keppera, constant, shaking,...",nothing problem keppera constant shaking arm a...,"{'nothing': 1, 'problem': 1, 'keppera': 1, 'co...",[]
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,"""I had been on the pill for many years. When m...",8,1,"[pill, many, years, doctor, changed, rx, chate...",pill many years doctor changed rx chateal effe...,"[pill, many, year, doctor, changed, rx, chatea...",pill many year doctor changed rx chateal effec...,"[pill, many, year, doctor, changed, rx, chatea...",pill many year doctor changed rx chateal effec...,"{'pill': 1, 'many': 1, 'year': 1, 'doctor': 1,...",[]


In [None]:
new_data = data[[]]

In [12]:
countSE_drug_df = data[['drugName', 'Side_Effects_mention']]
countSE_drug_df.groupby(by='drugName')['Side_Effects_mention'].value_counts().unstack()


TypeError: unhashable type: 'list'

In [46]:
countSE_drug_df = data[['drugName', 'Side_Effects_mention']]
#countSE_drug_df = countSE_drug_df.reset_index()
countSE_drug_df.groupby(by='drugName')
#SE_count_drug.to_csv('SE_count_per_drug.csv')
countSE_drug_df.head(50)

Unnamed: 0,drugName,Side_Effects_mention
0,Valsartan,constipation
1,Guanfacine,headache
2,Lybrel,headache
3,Ortho Evra,improvement
4,Buprenorphine / naloxone,improvement
5,Cialis,diarrhea
6,Levonorgestrel,constipation
7,Aripiprazole,insomnia
8,Keppra,insomnia
9,Ethinyl estradiol / levonorgestrel,insomnia


In [30]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(data['Side_Effects_mention'])

X.toarray()

AttributeError: 'list' object has no attribute 'lower'