In [18]:
import pandas as pd
import tensorflow as tf
import preprocess_kgptalkie as ps
import nltk

Importing Data + Filtering
* total data count: 7978

In [312]:
# training data
train_dataset = pd.read_csv('drugsComTrain_raw.csv')
train_dataset.dropna(inplace=True)

antidepressants_list = ['Escitalopram', 'Vilazodone', 'Lexapro', 'Trintellix', 'Paroxetine', 'Sertraline', 
                        'Citalopram', 'Fluoxetine', 'Zoloft', 'Celexa', 'Prozac', 'Venlafaxine', 'Pristiq', 
                        'Desvenlafaxine', 'Duloxetine', 'Cymbalta', 'Viibryd', 'Effexor', 'Effexor XR', 
                        'Amitriptyline', 'Amoxapine', 'Desipramine', 'Doxepin', 'Imipramine', 'Nortriptyline', 
                        'Protriptyline', 'Trimipramine', 'Isocarboxazid', 'Phenelzine', 'Selegiline', 
                        'Tranylcypromine']

train_dataset = train_dataset[train_dataset.drugName.isin(antidepressants_list)]
train_dataset = train_dataset[train_dataset.condition.isin(['Depression'])]

test_dataset = pd.read_csv('drugsComTest_raw.csv')
test_dataset.dropna(inplace=True)
test_dataset = test_dataset[test_dataset.condition.isin(['Depression'])]
test_dataset = test_dataset[test_dataset.drugName.isin(antidepressants_list)]

In [313]:
frames = [train_dataset, test_dataset]
dataset = pd.concat(frames)

In [314]:
dataset

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
31,96233,Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8,7-May-11,3
44,121333,Venlafaxine,Depression,"""my gp started me on Venlafaxine yesterday to ...",4,27-Apr-16,3
67,131909,Effexor XR,Depression,"""This medicine saved my life. I was at my wits...",10,20-Jun-13,166
143,186257,Desvenlafaxine,Depression,"""This medication is amazing! After 3 days of b...",10,10-Mar-13,101
150,45238,Fluoxetine,Depression,"""My genius psychiatrist started me on this dru...",1,11-Jan-16,19
...,...,...,...,...,...,...,...
53580,96067,Sertraline,Depression,"""I have been taking this medicine for years af...",10,23-Jul-12,10
53582,205800,Pristiq,Depression,"""I have been on Pristiq for almost 2 weeks. I...",10,28-Feb-11,15
53600,28837,Lexapro,Depression,"""Lexapro quickly lifted me out of depression a...",8,27-Oct-09,12
53691,61639,Citalopram,Depression,"""Best medication ever I&#039;m on 20 mg .its v...",10,22-Jul-16,43


Preprocessing
* lowercase, punctuation removal, stopword removal

In [315]:
import gensim

In [316]:
dataset['review'] = dataset['review'].apply(lambda x: str(x).lower())
dataset['review'] = dataset['review'].apply(gensim.parsing.preprocessing.strip_punctuation)

In [317]:
from gensim.parsing import remove_stopwords
dataset['review'] = dataset['review'].apply(gensim.parsing.preprocessing.remove_stopwords)

In [318]:
dataset

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
31,96233,Sertraline,Depression,1 week zoloft anxiety mood swings 50mg morning...,8,7-May-11,3
44,121333,Venlafaxine,Depression,gp started venlafaxine yesterday help depressi...,4,27-Apr-16,3
67,131909,Effexor XR,Depression,medicine saved life wits end anti depressants ...,10,20-Jun-13,166
143,186257,Desvenlafaxine,Depression,medication amazing 3 days extremely sick start...,10,10-Mar-13,101
150,45238,Fluoxetine,Depression,genius psychiatrist started drug fall 2008 wee...,1,11-Jan-16,19
...,...,...,...,...,...,...,...
53580,96067,Sertraline,Depression,taking medicine years spending years trying me...,10,23-Jul-12,10
53582,205800,Pristiq,Depression,pristiq 2 weeks medicine past 10 yrs started z...,10,28-Feb-11,15
53600,28837,Lexapro,Depression,lexapro quickly lifted depression kept way imp...,8,27-Oct-09,12
53691,61639,Citalopram,Depression,best medication 039 m 20 mg important medicati...,10,22-Jul-16,43


In [319]:
dataset['tokens'] = dataset.apply(lambda row: nltk.word_tokenize(row['review']), axis=1)

In [320]:
sentences = dataset['tokens'].tolist()

Word2Vec

In [321]:
from gensim.models import Word2Vec
w2v = Word2Vec(window=4, workers=4)
w2v.build_vocab(dataset.tokens)
w2v.train(dataset.tokens, total_examples=w2v.corpus_count, epochs = w2v.epochs)
w2v.save("word2vec.model")

In [322]:
from gensim.models import KeyedVectors
word_vectors = w2v.wv
word_vectors.save("word2vec.wordvectors")

# Load back with memory-mapping = read-only, shared across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [323]:
import numpy as np
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [324]:
dataset['vec'] = dataset['tokens'].apply(sent_vec)

In [329]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
dataset['sequences']=tokenizer.fit_on_texts(dataset['tokens'])

In [330]:
dataset

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,tokens,vec,sequences
31,96233,Sertraline,Depression,1 week zoloft anxiety mood swings 50mg morning...,8,7-May-11,3,"[1, week, zoloft, anxiety, mood, swings, 50mg,...","[0.29022203732281926, 0.21574973179027437, 0.0...",
44,121333,Venlafaxine,Depression,gp started venlafaxine yesterday help depressi...,4,27-Apr-16,3,"[gp, started, venlafaxine, yesterday, help, de...","[0.014146054091317386, 0.4669385936496587, 0.2...",
67,131909,Effexor XR,Depression,medicine saved life wits end anti depressants ...,10,20-Jun-13,166,"[medicine, saved, life, wits, end, anti, depre...","[-0.021505487991905794, 0.17619656871368244, 0...",
143,186257,Desvenlafaxine,Depression,medication amazing 3 days extremely sick start...,10,10-Mar-13,101,"[medication, amazing, 3, days, extremely, sick...","[0.18076271643041678, 0.26242222388585407, 0.1...",
150,45238,Fluoxetine,Depression,genius psychiatrist started drug fall 2008 wee...,1,11-Jan-16,19,"[genius, psychiatrist, started, drug, fall, 20...","[0.18187484412691465, 0.18738767708076493, 0.1...",
...,...,...,...,...,...,...,...,...,...,...
53580,96067,Sertraline,Depression,taking medicine years spending years trying me...,10,23-Jul-12,10,"[taking, medicine, years, spending, years, try...","[0.11876317418108766, 0.25776778992552024, 0.0...",
53582,205800,Pristiq,Depression,pristiq 2 weeks medicine past 10 yrs started z...,10,28-Feb-11,15,"[pristiq, 2, weeks, medicine, past, 10, yrs, s...","[0.14592836408923238, 0.24186878998653363, 0.1...",
53600,28837,Lexapro,Depression,lexapro quickly lifted depression kept way imp...,8,27-Oct-09,12,"[lexapro, quickly, lifted, depression, kept, w...","[0.016067516942577506, 0.3021940365778627, 0.1...",
53691,61639,Citalopram,Depression,best medication 039 m 20 mg important medicati...,10,22-Jul-16,43,"[best, medication, 039, m, 20, mg, important, ...","[0.10715766549110413, 0.3542056029041608, 0.38...",


In [345]:
X = dataset['vec'].to_list()
y = dataset['rating'].to_list()

In [350]:
dataset['sequences']=tokenizer.fit_on_texts(dataset['tokens'])

In [351]:
dataset['sequences']=tokenizer.texts_to_sequences(dataset['sequences'])

AttributeError: 'NoneType' object has no attribute 'lower'

In [357]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(data)

In [358]:
data = tokenizer.texts_to_sequences(data)

In [359]:
dataset['sequences'] = data

In [361]:
dataset['sequences'].astype(float)

ValueError: setting an array element with a sequence.

In [362]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()

model.fit(X_train,y_train)

DecisionTreeRegressor()

In [365]:
from sklearn import metrics
predicted = model.predict(X_test)
##print("Accuracy: ", metrics.accuracy_score(y_test, predicted))
# print("Precision micro: ", metrics.precision_score(y_test, predicted, average = 'micro'))
# print("Precision macro: ", metrics.precision_score(y_test, predicted, average = 'macro'))
# print("Precision weighted: ",metrics.precision_score(y_test, predicted, average = 'weighted'))
# print("Recall micro: ", metrics.recall_score(y_test, predicted, average = 'micro'))
# print("Recall macro: ", metrics.recall_score(y_test, predicted, average = 'macro'))
# print("Recall weighted: ", metrics.recall_score(y_test, predicted, average = 'weighted'))
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
print("R2 score: ",metrics.r2_score(y_test, predicted, multioutput='variance_weighted'))
print("R2 score: ",metrics.r2_score(y_test, predicted, multioutput='uniform_average'))
print("R2 score: ",metrics.r2_score(y_test, predicted, multioutput='raw_values'))
print("Mean absolute error: ", metrics.mean_absolute_error(y_test, predicted, multioutput='uniform_average'))
print("Mean absolute error: ", metrics.mean_absolute_error(y_test, predicted, multioutput='raw_values'))
print("Mean squared error: ", metrics.mean_squared_error(y_test, predicted, multioutput='uniform_average'))
print("Mean squared error: ", metrics.mean_squared_error(y_test, predicted, multioutput='raw_values'))

R2 score:  0.33347600498692087
R2 score:  0.33347600498692087
R2 score:  [0.333476]
Mean absolute error:  1.2167919799498748
Mean absolute error:  [1.21679198]
Mean squared error:  6.468295739348371
Mean squared error:  [6.46829574]
