## Drug view data exploration

### Import subsample (5000 rows out of 53,000 rows)

In [4]:
## import data
import pandas as pd

raw_df = pd.read_csv('/home/jack/code/jackoutthebox/adverse_drug_reactions/raw_data/drugsComTrain_raw.csv', nrows=5000)
raw_df.head()


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [5]:
## look at individual reviews

raw_df.loc[14].at['review']

'"Started Nexplanon 2 months ago because I have a minimal amount of contraception&#039;s I can take due to my inability to take the hormone that is used in most birth controls. I&#039;m trying to give it time because it is one of my only options right now. But honestly if I had options I&#039;d get it removed.\r\nI&#039;ve never had acne problems in my life, and immediately broke out after getting it implanted. Sex drive is completely gone, and I used to have sex with my boyfriend a few days a week, now its completely forced and not even fun for me anymore. I mean I&#039;m on birth control because I like having sex but don&#039;t want to get pregnant, why take a birth control that takes away sex? Very unhappy and hope that I get it back with time or I&#039;m getting it removed."'

### Preprocess subsample - lowercase, get rid of numbers, tokens, etc.

In [7]:
## make reviews lower_case
import nltk as nl

raw_df['review'] = raw_df['review'].str.lower()
raw_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""it has no side effect, i take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""my son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""i used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""this is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""suboxone has completely turned my life around...",9,27-Nov-16,37


In [None]:
## remove numbers 
raw_df['review'] = raw_df['review'].apply(lambda x:''.join(x for x in raw_df['review'] if not x.isdigit()))
raw_df.head()

In [8]:
### remove punctuation
import string

for punctuation in string.punctuation:
    raw_df['review'] = raw_df['review'].replace(punctuation, '') 

raw_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""it has no side effect, i take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""my son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""i used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""this is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""suboxone has completely turned my life around...",9,27-Nov-16,37


In [13]:
### remove stop words

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) 

raw_df['review'] = word_tokenize(raw_df['review']) 
  
#raw_df['review'] = raw_df['review'].apply(lambda x: x for x in raw_df['review'] if not x in stop_words]   
  
raw_df['review'] = [w for w in word_tokens if not w in stop_words] 
      
raw_df.head()

TypeError: expected string or bytes-like object

In [10]:
## vectorize based on ngrams of subsample
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(ngram_range = (2,2))

df_ngram2 = tf_idf_vectorizer.fit_transform(raw_df['review'])

df_ngram2.toarray()

pd.DataFrame(df_ngram2.toarray(),columns = tf_idf_vectorizer.get_feature_names())


Unnamed: 0,00 039,00 am,00 and,00 before,00 co,00 feel,00 for,00 had,00 in,00 month,...,zyrtec amp,zyrtec and,zyrtec antihistamine,zyrtec in,zyrtec now,zyrtec started,zyvox for,zzzquil and,zzzquil my,zzzquil or
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
## first LDA

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer


lda_model = LatentDirichletAllocation(n_components=3).fit(df_ngram2)

In [15]:
def print_topics(model, tf_idf_vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(tf_idf_vectorizer.get_feature_names()[i], topic[i])
                       for i in topic.argsort()[:-20 - 1:-1]])
        

print_topics(lda_model, tf_idf_vectorizer)

Topic 0:
[('039 ve', 19.391216797062803), ('side effects', 17.18178195898352), ('for me', 17.107108548710652), ('have been', 16.39320068237749), ('it 039', 13.800116301435766), ('been on', 13.633559321386562), ('in the', 12.826455702830941), ('and it', 12.46656882343594), ('it was', 11.110240775705536), ('don 039', 10.69337563563473), ('and have', 10.632381477764996), ('of the', 10.624223735803485), ('the first', 10.395497911199687), ('for the', 10.07661280384195), ('ve been', 9.999104261074702), ('my life', 9.40175601155198), ('to the', 9.308677321309485), ('birth control', 9.060706130685608), ('didn 039', 8.9889982846919), ('the only', 8.914290200496607)]
Topic 1:
[('side effects', 19.44504794788125), ('039 ve', 19.092829436593338), ('it 039', 14.948261771615167), ('have been', 13.32247729056592), ('for me', 13.048237991201615), ('and it', 12.834009316386938), ('it was', 12.205742869621986), ('in the', 12.049770219653746), ('don 039', 11.921125190550915), ('been on', 11.3366693521262