In [22]:
%reload_ext autoreload
%autoreload 2
%matplotlib notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk import pos_tag, pos_tag_sents
from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from math import *

import nltk
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Importing Data and Inital Preprocessing

In [34]:
drugs = pd.read_csv("train.tsv",delimiter="\t")
drugs_test = pd.read_csv("test.tsv",delimiter="\t")
split = len(drugs)
drugs = pd.concat([drugs, drugs_test])
len(drugs)

215063

In [35]:
# change first column name to "id"
col = list(drugs.columns)
col[0] = "id"
drugs.columns = col

# create positive, neutral, and negative categories for rating. 1-4 -> negative, 5-7 -> netural, 8-10 -> positive

drugs["rating_categorized"] = drugs.rating.apply(lambda x : 2 if x > 7 else (1 if (x > 4 and x <= 7) else 0))

In [36]:
drugs.sample(5)

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,rating_categorized
14311,57701,Acetaminophen / hydrocodone,Pain,"""I suffered a .45 caliber GSW to the head in 2...",9.0,"June 26, 2015",10,2
72301,168695,Vilazodone,Major Depressive Disorde,"""I started on Viibryd after having a DNA test ...",1.0,"December 27, 2015",1,0
23041,1607,Eluxadoline,Irritable Bowel Syndrome,"""Not covered by insurance""",6.0,"December 19, 2016",0,1
36424,17559,Ethinyl estradiol / etonogestrel,Birth Control,"""I previously did a review on Nuva Ring about ...",8.0,"November 1, 2016",8,2
133612,158793,Methylphenidate,ADHD,"""My son was prescribed the drug just before su...",6.0,"September 3, 2009",17,1


In [37]:
np.random.seed(42)

# remove garbage words
drugs["review"] = list(map(lambda x : x.replace("&#039;", "").replace("&quot",""), drugs.review))

# use a smaller subset of the data for quicker processing
data = drugs[:].copy()

In [38]:
# drop rows with na values
data.dropna(inplace=True)

In [39]:
# change all review to lowercase
data["review"] = [review.lower() for review in data.review]

In [40]:
# tokenize each word
data["tk"] = data.review.apply(lambda x : word_tokenize(x))

In [41]:
data.tail()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,rating_categorized,tk
53761,159999,Tamoxifen,"Breast Cancer, Prevention","""i have taken tamoxifen for 5 years. side effe...",10.0,"September 13, 2014",43,2,"[``, i, have, taken, tamoxifen, for, 5, years,..."
53762,140714,Escitalopram,Anxiety,"""ive been taking lexapro (escitaploprgram) sin...",9.0,"October 8, 2016",11,2,"[``, ive, been, taking, lexapro, (, escitaplop..."
53763,130945,Levonorgestrel,Birth Control,"""im married, 34 years old and i have no kids. ...",8.0,"November 15, 2010",7,2,"[``, im, married, ,, 34, years, old, and, i, h..."
53764,47656,Tapentadol,Pain,"""i was prescribed nucynta for severe neck/shou...",1.0,"November 28, 2011",20,0,"[``, i, was, prescribed, nucynta, for, severe,..."
53765,113712,Arthrotec,Sciatica,"""it works!!!""",9.0,"September 13, 2009",46,2,"[``, it, works, !, !, !, '']"


In [42]:
# WordNet POS tags are: NOUN = 'n', ADJ = 's', VERB = 'v', ADV = 'r', ADJ_SAT = 'a'
# convert from wordnet tags to lemmatizer tags
tag_map = defaultdict(lambda: wn.NOUN)
mappings = {
        'CD':wn.NOUN, # cardinal number (one, two)                          
        'EX':wn.ADV, # existential ‘there’ (there)                      
        'IN':wn.ADV, # preposition/sub-conj (of, in, by)   
        'JJ':wn.ADJ, # adjective (yellow)                  
        'JJR':wn.ADJ, # adj., comparative (bigger)          
        'JJS':wn.ADJ, # adj., superlative (wildest)                    
        'NN':wn.NOUN, # noun, sing. or mass (llama)          
        'NNS':wn.NOUN, # noun, plural (llamas)                  
        'NNP':wn.NOUN, # proper noun, sing. (IBM)              
        'NNPS':wn.NOUN, # proper noun, plural (Carolinas)
        'PDT':wn.ADJ, # predeterminer (all, both)            
        'RB':wn.ADV, # adverb (quickly, never)            
        'RBR':wn.ADV, # adverb, comparative (faster)        
        'RBS':wn.ADV, # adverb, superlative (fastest)     
        'RP':wn.ADJ, # particle (up, off)
        'VB':wn.VERB, # verb base form (eat)
        'VBD':wn.VERB, # verb past tense (ate)
        'VBG':wn.VERB, # verb gerund (eating)
        'VBN':wn.VERB, # verb past participle (eaten)
        'VBP':wn.VERB, # verb non-3sg pres (eat)
        'VBZ':wn.VERB, # verb 3sg pres (eats)
    }

for key in mappings:
    tag_map[key] = mappings[key]

In [43]:
%%time

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# tag part of speech to each word in every review.
#tagger = PerceptronTagger()
tagged = pos_tag_sents(data.tk)

# remove stopwords and non-alpha words
cachedStopWords = stopwords.words("english")
tagged = [[word_tag for word_tag in review if (word_tag[0] not in cachedStopWords and word_tag[0].isalpha())] for review in tagged]

# lemmatize words
lemmatized = [[lemmatizer.lemmatize(word_tag[0], tag_map[word_tag[1]]) for word_tag in review] for review in tagged]

# Save lemmatized words in "text_final" column
data["text_final"] = lemmatized

CPU times: user 14min 37s, sys: 3.34 s, total: 14min 41s
Wall time: 14min 41s


In [44]:
# save processed data
data.to_csv("data_processed.csv")
data = pd.read_csv("data_processed.csv")
data

Unnamed: 0.1,Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,rating_categorized,tk,text_final
0,0,206461,Valsartan,Left Ventricular Dysfunction,"""it has no side effect, i take it in combinati...",9.0,"May 20, 2012",27,2,"['``', 'it', 'has', 'no', 'side', 'effect', ',...","['side', 'effect', 'take', 'combination', 'bys..."
1,1,95260,Guanfacine,ADHD,"""my son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,2,"['``', 'my', 'son', 'is', 'halfway', 'through'...","['son', 'halfway', 'fourth', 'week', 'intuniv'..."
2,2,92703,Lybrel,Birth Control,"""i used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,1,"['``', 'i', 'used', 'to', 'take', 'another', '...","['use', 'take', 'another', 'oral', 'contracept..."
3,3,138000,Ortho Evra,Birth Control,"""this is my first time using any form of birth...",8.0,"November 3, 2015",10,2,"['``', 'this', 'is', 'my', 'first', 'time', 'u...","['first', 'time', 'use', 'form', 'birth', 'con..."
4,4,35696,Buprenorphine / naloxone,Opiate Dependence,"""suboxone has completely turned my life around...",9.0,"November 27, 2016",37,2,"['``', 'suboxone', 'has', 'completely', 'turne...","['suboxone', 'completely', 'turn', 'life', 'ar..."
...,...,...,...,...,...,...,...,...,...,...,...
213864,53761,159999,Tamoxifen,"Breast Cancer, Prevention","""i have taken tamoxifen for 5 years. side effe...",10.0,"September 13, 2014",43,2,"['``', 'i', 'have', 'taken', 'tamoxifen', 'for...","['take', 'tamoxifen', 'year', 'side', 'effect'..."
213865,53762,140714,Escitalopram,Anxiety,"""ive been taking lexapro (escitaploprgram) sin...",9.0,"October 8, 2016",11,2,"['``', 'ive', 'been', 'taking', 'lexapro', '('...","['ive', 'take', 'lexapro', 'escitaploprgram', ..."
213866,53763,130945,Levonorgestrel,Birth Control,"""im married, 34 years old and i have no kids. ...",8.0,"November 15, 2010",7,2,"['``', 'im', 'married', ',', '34', 'years', 'o...","['im', 'marry', 'year', 'old', 'kid', 'take', ..."
213867,53764,47656,Tapentadol,Pain,"""i was prescribed nucynta for severe neck/shou...",1.0,"November 28, 2011",20,0,"['``', 'i', 'was', 'prescribed', 'nucynta', 'f...","['prescribe', 'nucynta', 'severe', 'pain', 'ta..."


In [6]:
# Split data into train and test sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['text_final'],data['rating_categorized'],test_size=0.2)

In [7]:
# Ecode labels to integer values. Use this if labels are not already integers (ex. 'positive', ''negative', etc.)

#Encoder = LabelEncoder()
#Train_Y = Encoder.fit_transform(Train_Y)
#Test_Y = Encoder.fit_transform(Test_Y)

In [8]:
# Vectorize word list

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data["text_final"])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [9]:
#print(Tfidf_vect.vocabulary_)

In [10]:
#print(Train_X_Tfidf)

In [61]:
names = [#"Nearest Neighbors", 
         "Linear SVM"] 
         #"RBF SVM", 
         #"Gaussian Process",
         #"Decision Tree", 
         #"Random Forest"] 
         #"Neural Net", 
         #"AdaBoost",
         #"Multinomial Naive Bayes"]#, 
         #"QDA"]
classifiers = [
    #KNeighborsClassifier(30, n_jobs=-1),
    BaggingClassifier(SVC(kernel='linear', cache_size=10000, probability=False, class_weight='balanced'), max_samples=1.0 / 10 , n_estimators=10, n_jobs=8, verbose=3)] #SVC(kernel="linear", C=0.025),
    #SVC(gamma='scale', C=10),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    #DecisionTreeClassifier(max_depth=5),
    #RandomForestClassifier(max_depth=None, n_estimators=40, max_features="auto", n_jobs=8, verbose=2, warm_start=True)]
    #MLPClassifier(alpha=1, max_iter=1000),
    #AdaBoostClassifier(),
    #naive_bayes.MultinomialNB(alpha=1e-10)]#,
    #QuadraticDiscriminantAnalysis()]

In [62]:
for name, clf in zip(names, classifiers):
    
    # fit the training dataset on the classifier
    clf.fit(Train_X_Tfidf, Train_Y)
    
    # predict the labels on validation dataset
    predictions = clf.predict(Test_X_Tfidf)
    
    # Use accuracy_score function to get the accuracy
    print(name,": Accuracy Score -> ",round(accuracy_score(predictions, Test_Y)*100,3))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:  1.7min remaining:  5.0min
[Parallel(n_jobs=8)]: Done   5 out of   8 | elapsed:  1.7min remaining:  1.0min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  2.9min finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:  2.5min remaining:  7.4min
[Parallel(n_jobs=8)]: Done   5 out of   8 | elapsed:  2.5min remaining:  1.5min


Linear SVM : Accuracy Score ->  66.693


[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  4.3min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  4.3min finished


In [35]:
# parameter tuning

parameters = {'alpha':np.arange(9,20,1e-1)}
svc = naive_bayes.MultinomialNB()
clf = model_selection.GridSearchCV(svc, parameters)
clf.fit(Train_X_Tfidf, Train_Y)
clf.best_params_, clf.best_score_

({'alpha': 9.0}, 0.6446484647959069)

Here we start deep learning classification

In [None]:
from fastai.vision import *
from fastai.metrics import *
from fastai.callbacks.hooks import *
from fastai.utils.mem import *
from fastai.text import *
path = "/root/"
torch.cuda.empty_cache()
!/opt/bin/nvidia-smi

In [None]:
# Import data. There are two clases of data, one for determining the language model (how to read the words), and the other for classification (into ratings 1-10)

data_lm = TextLMDataBunch.from_csv(path, "train.tsv", delimiter="\t", text_cols="review", label_cols="rating", bs=96)
data_clas = TextClasDataBunch.from_csv(path, "train.tsv", delimiter="\t", text_cols="review", label_cols="rating", vocab=data_lm.train_ds.vocab, bs=96)   

In [None]:
# Save databunch
data_lm.save('data_lm_export.pkl')
data_clas.save('data_clas_export.pkl')

In [None]:
# Load Databunch

bs = 48
data_lm = load_data(path, 'data_lm_export.pkl', bs=bs)
data_clas = load_data(path, 'data_clas_export.pkl', bs=bs)

In [None]:
# Setup language model learner using the AWD LSTM model.

learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=.05)

In [None]:
#fit the model to the data using the one cycle policy

learn.fit_one_cycle(1, 1e-3)

In [None]:
# find the losses at different learning rates and plot the curve

learn.lr_find();
learn.recorder.plot()

In [None]:
# example prediction using the fitted language model

learn.predict("it could have been", n_words=10)

In [None]:
# save the trained weights of the model

learn.save("fit_head")
learn.load("fit_head");

In [None]:
# unfreeze the entire model and train again
learn.unfreeze()
learn.fit_one_cycle(2, max_lr=slice(1e-2, 1e-1))

In [None]:
# save the encoder from the fitted language model
learn.save_encoder("ft_enc")

In [None]:
# Setup the classification learning model and load the language model encoder

learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics = [accuracy, rmse])
learn.load_encoder("ft_enc");

In [None]:
# Preview the classification batch. It contains one column for the tokenized reviews and another column for the rating labels

data_clas.show_batch()

In [None]:
# Train model using the fit one cycle policy

learn.fit_one_cycle(2)

In [None]:
# save these initally trained weight

learn.save("first_clas")
learn.load("first_clas");

In [None]:
# freeeze the model excep the last two layers for retraining

learn.freeze_to(-2)

In [None]:
# Plot the learning rate 

learn.lr_find()
learn.recorder.plot()

In [None]:
# Fit partially unfrozen model

learn.fit_one_cycle(1, slice(1e-4, 1e-3))

In [None]:
# fit fully unfrozen model

learn.unfreeze()
learn.fit_one_cycle(3, slice(1e-4, 1e-3))

In [None]:
# Save the final classification weights.

learn.save("final_clas")
learn.load("final_clas");