## Preprocessing - make features/columns 

### Imports and read in data

In [30]:
### imports
import pandas as pd # DataFrame Manipulation Package
import numpy as np
import matplotlib.pyplot as plt
import string

from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.decomposition import LatentDirichletAllocation # Latent Dirichlet Allocation is a topic model that is used for discovering abstract topics from a collection of documents (variational Bayes algorithm)
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

import string # Collection of string operations
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer #Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.
from nltk import word_tokenize

from nltk.sentiment.util import mark_negation

In [22]:
### read in unlabed data

unlabeled_df =  pd.read_csv('/home/jack/code/jackoutthebox/adverse_drug_reactions/raw_data/drugsComTrain_raw.csv', nrows=100)
unlabeled_df.tail()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
95,45237,Fluoxetine,Major Depressive Disorde,"""I started Prozac as one of my first anti depr...",2,12-Jan-16,18
96,102810,Aripiprazole,Depression,"""Intake Effexor XR 375 mg, and lorazepam for d...",4,17-Aug-12,33
97,60280,NuvaRing,Birth Control,"""I am torn by the Nuvaring. The convenience is...",5,31-Oct-11,0
98,10677,Spironolactone,Acne,"""I&#039;m 30 years old. I started having real...",9,21-Aug-13,31
99,196244,Fluvoxamine,Anxiety and Stress,"""I&#039;ve suffered from panic attacks and anx...",9,3-Jan-11,44


In [3]:
### read in labeled data from Hendrike

labeled_df =  pd.read_csv('/home/jack/code/jackoutthebox/adverse_drug_reactions/raw_data/manually_labelled_data.csv')
labeled_df.head()


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sideEffect
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,0
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,1
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1


In [59]:
### import list of side effects

from csv import reader
# read csv file as a list of lists
with open('../raw_data/frequent_adr.csv', 'r') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    # Pass reader object to list() to get a list of lists
    side_effects = list(csv_reader)

In [60]:
side_effects.pop(0)

['0']

In [61]:
print(type(side_effects))

<class 'list'>


In [52]:
side_effect_df = pd.DataFrame(side_effects)
side_effect_df.head()

Unnamed: 0,0
0,Abdominal pain
1,Gastrointestinal pain
2,Amblyopia
3,Anaemia
4,Anorexia


In [68]:
si_list = [] 
for sublist in side_effect: 
    for item in sublist: si_list.append(item)

print(type(si_list))

<class 'list'>


In [62]:
df_sideeffects = side_effect_df.Side_Effect.str.split(expand=True).stack().value_counts()
df_sideeffects.head()

AttributeError: 'DataFrame' object has no attribute 'Side_Effect'

### Preprocessing


In [76]:
## functions

#stop_words
stop_words = set(stopwords.words('english')) 

for negation in ["no", "not", "shouldn't", "aren't", "couldn't", "didn't", "doesn't", "don't", "wasn't", "weren't", "wouldn't"]:
    stop_words.remove(negation)

#stop_words_with side effects

stop_words_se = set(stopwords.words('english')) 

for negation in ["no", "not", "shouldn't", "aren't", "couldn't", "didn't", "doesn't", "don't", "wasn't", "weren't", "wouldn't"]:
    stop_words_se.remove(negation)

for effect in si_list: 
    stop_words_se.add(side_effect) 
    
    

def to_list(x):
    list_words = x.split(' ')
    return list_words

def to_string(x):
    string = " ".join(x)
    return string

#===============================================================

def punctuation(x):
    for punctuation in string.punctuation:
        x =  x.replace(punctuation, '')
    return x.lower()

def remove_numbers (x):
    words_only = ''.join([i for i in x if not i.isdigit()])
    return words_only

def m_negation(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def remove_stopwords(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

def remove_stopwords_se(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words_se]
    return without_stopwords

def m_negation_se(x):
    tokenized = word_tokenize(x)
    without_stopwords = [word for word in tokenized if not word in stop_words_se]
    tokenized_neg = mark_negation(without_stopwords)
    return tokenized_neg

def lemmatize_review(x):
    lemma = WordNetLemmatizer()
    lista = []
    for w in x:
       lista.append(lemma.lemmatize(w))
    return lista

#===============================================================

def count_words(x):
    wordfreq = []
    for w in x:
        wordfreq.append(x.count(w))
    return dict(zip(x, wordfreq))

def total_count(x):
    total_count = {}
    for row in x:
        for key in row.keys():
          if key in total_count:
              total_count[key] += 1
          else:
              total_count[key] = 1
    return pd.DataFrame(sorted(total_count.items(), key=lambda x: x[1], reverse=True)).head(30).T

#===============================================================

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], round(topic[i], 2))
                        for i in topic.argsort()[:-10 - 1:-1]])

In [75]:
### enter in juan prepreprocessing - stemming, lemmitization, etc.

unlabeled_df["clean_review"] = unlabeled_df["review"].apply(punctuation)
unlabeled_df['clean_review'] = unlabeled_df.clean_review.apply(remove_numbers)
unlabeled_df['clean_review_lst'] = unlabeled_df.clean_review.apply(to_list)

unlabeled_df["nonStopwords_review_lst"] = unlabeled_df.clean_review.apply(remove_stopwords)
unlabeled_df["nonStopwords_review_str"] = unlabeled_df.NonStopwords_review_lst.apply(to_string)

unlabeled_df["nonStopwords_review_lst_MN"] = unlabeled_df.clean_review.apply(m_negation)
unlabeled_df["nonStopwords_review_str_MN"] = unlabeled_df.NonStopwords_review_lst_MN.apply(to_string)

unlabeled_df["lemmatized_review_lst"] = unlabeled_df.NonStopwords_review_lst_MN.apply(lemmatize_review)
unlabeled_df["lemmatized_review_str"] = unlabeled_df.Lemmatized_review_lst.apply(to_string)

unlabeled_df["lemmatized_review"] = unlabeled_df.NonStopwords_review_lst.apply(lemmatize_review)
unlabeled_df["lemmatized_review"] = unlabeled_df.Lemmatized_review.apply(to_string)

unlabeled_df["words_count"] = unlabeled_df.Lemmatized_review_lst.apply(count_words)

unlabeled_df.head(3)



Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,clean_review,clean_review_lst,NonStopwords_review_lst,...,Lemmatized_review_str,Lemmatized_review,words_count,nonStopwords_review_lst,nonStopwords_review_str,nonStopwords_review_lst_MN,nonStopwords_review_str_MN,lemmatized_review_lst,lemmatized_review_str,lemmatized_review
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,it has no side effect i take it in combination...,"[it, has, no, side, effect, i, take, it, in, c...","[no, side, effect, take, combination, bystolic...",...,no side_NEG effect_NEG take_NEG combination_NE...,no side effect take combination bystolic mg fi...,"{'no': 1, 'side_NEG': 1, 'effect_NEG': 1, 'tak...","[no, side, effect, take, combination, bystolic...",no side effect take combination bystolic mg fi...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,"[no, side_NEG, effect_NEG, take_NEG, combinati...",no side_NEG effect_NEG take_NEG combination_NE...,n o s i d e e f f e c t t a k e c o m ...
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,my son is halfway through his fourth week of i...,"[my, son, is, halfway, through, his, fourth, w...","[son, halfway, fourth, week, intuniv, became, ...",...,son halfway fourth week intuniv became concern...,son halfway fourth week intuniv became concern...,"{'son': 1, 'halfway': 1, 'fourth': 1, 'week': ...","[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,"[son, halfway, fourth, week, intuniv, became, ...",son halfway fourth week intuniv became concern...,s o n h a l f w a y f o u r t h w e e k ...
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,i used to take another oral contraceptive whic...,"[i, used, to, take, another, oral, contracepti...","[used, take, another, oral, contraceptive, pil...",...,used take another oral contraceptive pill cycl...,used take another oral contraceptive pill cycl...,"{'used': 1, 'take': 1, 'another': 1, 'oral': 1...","[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,"[used, take, another, oral, contraceptive, pil...",used take another oral contraceptive pill cycl...,u s e d t a k e a n o t h e r o r a l ...


## create vectorized dataframe without side effects (include side effects as stopwords)

In [53]:
### read in unlabed data

unlabeled_no_sideseffects_df =  pd.read_csv('/home/jack/code/jackoutthebox/adverse_drug_reactions/raw_data/drugsComTrain_raw.csv', nrows=100)
unlabeled_no_sideseffects_df.tail()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
95,45237,Fluoxetine,Major Depressive Disorde,"""I started Prozac as one of my first anti depr...",2,12-Jan-16,18
96,102810,Aripiprazole,Depression,"""Intake Effexor XR 375 mg, and lorazepam for d...",4,17-Aug-12,33
97,60280,NuvaRing,Birth Control,"""I am torn by the Nuvaring. The convenience is...",5,31-Oct-11,0
98,10677,Spironolactone,Acne,"""I&#039;m 30 years old. I started having real...",9,21-Aug-13,31
99,196244,Fluvoxamine,Anxiety and Stress,"""I&#039;ve suffered from panic attacks and anx...",9,3-Jan-11,44


In [77]:
unlabeled_no_sideseffects_df["clean_review"] = unlabeled_no_sideseffects_df["review"].apply(punctuation)
unlabeled_no_sideseffects_df['clean_review'] = unlabeled_no_sideseffects_df.clean_review.apply(remove_numbers)
unlabeled_no_sideseffects_df['clean_review_lst'] = unlabeled_no_sideseffects_df.clean_review.apply(to_list)

unlabeled_no_sideseffects_df["nonStopwords_review_lst"] = unlabeled_no_sideseffects_df.clean_review.apply(remove_stopwords_se)
unlabeled_no_sideseffects_df["nonStopwords_review_str"] = unlabeled_no_sideseffects_df.NonStopwords_review_lst.apply(to_string)

unlabeled_no_sideseffects_df["nonStopwords_review_lst_MN"] = unlabeled_no_sideseffects_df.clean_review.apply(m_negation_se)
unlabeled_no_sideseffects_df["nonStopwords_review_str_MN"] = unlabeled_no_sideseffects_df.NonStopwords_review_lst_MN.apply(to_string)

unlabeled_no_sideseffects_df["lemmatized_review_lst"] = unlabeled_no_sideseffects_df.NonStopwords_review_lst_MN.apply(lemmatize_review)
unlabeled_no_sideseffects_df["lemmatized_review_str"] = unlabeled_no_sideseffects_df.Lemmatized_review_lst.apply(to_string)

unlabeled_no_sideseffects_df["lemmatized_review"] = unlabeled_no_sideseffects_df.NonStopwords_review_lst.apply(lemmatize_review)
unlabeled_no_sideseffects_df["lemmatized_review"] = unlabeled_no_sideseffects_df.Lemmatized_review.apply(to_string)

unlabeled_no_sideseffects_df["words_count"] = unlabeled_no_sideseffects_df.Lemmatized_review_lst.apply(count_words)

unlabeled_no_sideseffects_df.head(3)

AttributeError: 'DataFrame' object has no attribute 'NonStopwords_review_lst'

## create vectorized side effects dataframe - orginal df minus df without side effects

In [None]:
side_effects_df = unlabeled_df - unlabeled_no_sideeffects_df

### Make columns for whether side effects are mentioned in a review

In [17]:
### take sides effects from dictionary and create column in dataframe

Dummy_side_effects= {
    'abdominal': ['constipation', 'diarrhea'],
    'skin' : ['rash'],
    'vertigo' : ['dizziness', 'drowsiness'],
    'headache' : ['headache'],
    'mood disorders' : ['insomnia', 'mood swings']}

def make_columns(SideEffectdict):
    for condition, words in SideEffectdict.items():
        #f"unlabed_df['{key}'] = unlabed_df['clean_review'].str.contains('{value}')"
        for word in words:
            contains_key = unlabed_df['clean_review'].str.contains()
            df[key] = 
    return unlabed_df
    

make_columns(Dummy_side_effects)

vectorized_df = 


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37
...,...,...,...,...,...,...,...
4995,45298,Fluoxetine,Hot Flashes,"""I was having hot flashes every night from bei...",10,25-Jan-16,26
4996,190504,Butorphanol,Pain,"""I was initially prescribed this for frequent ...",9,21-Feb-15,1
4997,183187,Cymbalta,Anxiety,"""I have suffered from anxiety problems my whol...",8,13-Jul-13,81
4998,114593,Codeine / promethazine,Cough,"""This is the only thing will stop my cough. I ...",10,16-Oct-15,21


In [27]:
Dummy_side_effects= {
    #'constipation': 'abdominal',
    'diarrhea' : 'abdominal',
    'rash' : 'skin',
    #'dizziness' : 'vertigo',
    'drowsiness' : 'vertigo',
    'headache' : 'headache',
    #'insomnia' : 'mood disorders',
    'mood swings' : 'mood disorders'}

vectorizer = TfidfVectorizer(min_df = 0.01, 
                             max_df = 0.50, 
                             max_features = None,
                             vocabulary = Dummy_side_effects,
                             ngram_range = (2, 2)).fit(unlabeled_df["review"]) #

# MINDF Ignore terms that have a document frequency strictly higher than the given threshold
# MAXDF When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold

data_vectorized = vectorizer.transform(unlabeled_df["review"])


ValueError: Vocabulary of size 5 doesn't contain index 0.