In [46]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# NLP Processing
import nltk
#nltk.download("averaged_perceptron_tagger")
#nltk.download("punkt")
#nltk.download("wordnet")
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk import pos_tag_sents
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import *
from sklearn.model_selection import ParameterGrid

# Classification Models
from sklearn.linear_model import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.gaussian_process import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.naive_bayes import *
from sklearn.discriminant_analysis import *
from sklearn.multiclass import *

from math import *
# set a random seed for validation split 
np.random.seed(42)

# parallel fit for hyper-parameter optimization
import parfit.parfit as pf
from parfit import *

# model explanation libraries
import shap
import lime
from sklearn.pipeline import make_pipeline

# Importing Data and Inital Preprocessing

In [2]:
drugs = pd.concat([pd.read_csv("train.tsv",delimiter="\t"), pd.read_csv("test.tsv",delimiter="\t")])

In [3]:
# change first column name to "id"
col = list(drugs.columns)
col[0] = "id"
drugs.columns = col

# feature engineer positive, neutral, and negative categories for rating.
# 1-4 -> negative (0)
# 5-7 -> netural (1)
# 8-10 -> positive (2)
drugs["rating_categorized"] = drugs.rating.apply(lambda x : 2 if x > 7 else (1 if (x > 4 and x <= 7) else 0))

In [4]:
drugs.sample(5)

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,rating_categorized
14311,57701,Acetaminophen / hydrocodone,Pain,"""I suffered a .45 caliber GSW to the head in 2...",9.0,"June 26, 2015",10,2
72301,168695,Vilazodone,Major Depressive Disorde,"""I started on Viibryd after having a DNA test ...",1.0,"December 27, 2015",1,0
23041,1607,Eluxadoline,Irritable Bowel Syndrome,"""Not covered by insurance""",6.0,"December 19, 2016",0,1
36424,17559,Ethinyl estradiol / etonogestrel,Birth Control,"""I previously did a review on Nuva Ring about ...",8.0,"November 1, 2016",8,2
133612,158793,Methylphenidate,ADHD,"""My son was prescribed the drug just before su...",6.0,"September 3, 2009",17,1


In [5]:
# remove HTML numeric codes
drugs["review"] = list(map(lambda x : x.replace("&#039;", "").replace("&quot",""), drugs.review))

# use a smaller subset of the data for quicker processing
data = drugs[:10000].copy()

In [6]:
# drop rows with NaN values
data.dropna(inplace=True)

In [7]:
# change all reviews to lowercase
data["review"] = [review.lower() for review in data.review]

In [8]:
# tokenize each word
data["tk"] = data.review.apply(lambda x : word_tokenize(x))

In [9]:
data.sample(5)

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,rating_categorized,tk
7554,2855,Acetaminophen / pamabrom,Back Pain,"""i found this while looking for acetaminophen ...",10.0,"February 13, 2017",13,2,"[``, i, found, this, while, looking, for, acet..."
4679,116091,OnabotulinumtoxinA,Overactive Bladde,"""had the botox for overactive bladder november...",10.0,"December 23, 2015",48,2,"[``, had, the, botox, for, overactive, bladder..."
4632,44044,Ethinyl estradiol / norgestimate,Birth Control,"""i switched from loestrin 24 fe to ortho tri-c...",8.0,"January 11, 2012",0,2,"[``, i, switched, from, loestrin, 24, fe, to, ..."
1450,169167,Zelapar,Parkinson's Disease,"""i had to stop taking zelapar because i was ha...",9.0,"January 19, 2009",30,2,"[``, i, had, to, stop, taking, zelapar, becaus..."
7633,39201,Contrave,Weight Loss,"""61 adult male- adhd, 5 10;-215 lbs. heart dis...",9.0,"April 16, 2017",8,2,"[``, 61, adult, male-, adhd, ,, 5, 10, ;, -215..."


In [10]:
# Set mappings to convert from the Treebank tag set to WordNet tag set

# WordNet POS tags are: NOUN = 'n', ADJ = 's', VERB = 'v', ADV = 'r', ADJ_SAT = 'a'
tag_map = defaultdict(lambda: wn.NOUN)
mappings = {
        'CD':wn.NOUN, # cardinal number (one, two)                          
        'EX':wn.ADV, # existential ‘there’ (there)                      
        'IN':wn.ADV, # preposition/sub-conj (of, in, by)   
        'JJ':wn.ADJ, # adjective (yellow)                  
        'JJR':wn.ADJ, # adj., comparative (bigger)          
        'JJS':wn.ADJ, # adj., superlative (wildest)                    
        'NN':wn.NOUN, # noun, sing. or mass (llama)          
        'NNS':wn.NOUN, # noun, plural (llamas)                  
        'NNP':wn.NOUN, # proper noun, sing. (IBM)              
        'NNPS':wn.NOUN, # proper noun, plural (Carolinas)
        'PDT':wn.ADJ, # predeterminer (all, both)            
        'RB':wn.ADV, # adverb (quickly, never)            
        'RBR':wn.ADV, # adverb, comparative (faster)        
        'RBS':wn.ADV, # adverb, superlative (fastest)     
        'RP':wn.ADJ, # particle (up, off)
        'VB':wn.VERB, # verb base form (eat)
        'VBD':wn.VERB, # verb past tense (ate)
        'VBG':wn.VERB, # verb gerund (eating)
        'VBN':wn.VERB, # verb past participle (eaten)
        'VBP':wn.VERB, # verb non-3sg pres (eat)
        'VBZ':wn.VERB, # verb 3sg pres (eats)
    }

for key in mappings:
    tag_map[key] = mappings[key]

In [11]:
%%time

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# tag part of speech to each word in every review.
tagged = pos_tag_sents(data.tk)

# remove stopwords and non-alpha words
cachedStopWords = stopwords.words("english")
tagged = [[word_tag for word_tag in review if (word_tag[0] not in cachedStopWords and word_tag[0].isalpha())] for review in tagged]

# lemmatize words
lemmatized = [[lemmatizer.lemmatize(word_tag[0], tag_map[word_tag[1]]) for word_tag in review] for review in tagged]

# Save lemmatized words in "text_final" column
data["text_final"] = lemmatized

CPU times: user 39.3 s, sys: 148 ms, total: 39.4 s
Wall time: 39.4 s


In [None]:
# save processed data
#data.to_csv("data_processed_10000.csv")
data = pd.read_csv("data_processed_10000.csv")
data

In [3]:
# Split data into train and test sets with a 20% test set percentage
x_train, x_test, y_train, y_test = model_selection.train_test_split(data['text_final'],data['rating_categorized'],test_size=0.2)

In [4]:
# Encode labels to integer values. Use this if labels are not already integers (ex. 'positive', ''negative', etc.)

#Encoder = LabelEncoder()
#Train_Y = Encoder.fit_transform(Train_Y)
#Test_Y = Encoder.fit_transform(Test_Y)

In [28]:
# Vectorize word list using Term Frequency-Inverse Document Frequency statistic with a maximum of 5000 features.

Tfidf_vect = TfidfVectorizer(max_features=5000, ngram_range=(2,4))
train_vectors = Tfidf_vect.fit_transform(x_train)
test_vectors = Tfidf_vect.transform(x_test)

# Model Training

In [8]:
# Define and initailize list of classifiers
names = ["Nearest Neighbors", 
         "SGDClassifier",
         "Linear SVM", 
         "Random Forest",
         "AdaBoost",
         "Multinomial Naive Bayes"]
classifiers = [
    KNeighborsClassifier(30, n_jobs=-1),
    SGDClassifier(n_jobs=8, penalty="l2"),
    SVC(kernel="linear", C=0.025),
    RandomForestClassifier(max_depth=None, n_estimators=40, max_features="auto", n_jobs=-1, verbose=0, warm_start=True),
    AdaBoostClassifier(),
    naive_bayes.MultinomialNB(alpha=1e-10)]

In [29]:
%%time
# Fit all classifiers to training data and get accuracy score from test data

for name, clf in zip(names, classifiers):
    
    # fit the training dataset on the classifier
    clf.fit(train_vectors, y_train)
    
    # predict the labels on validation dataset
    predictions = clf.predict(test_vectors)
    
    # Use accuracy_score function to get the accuracy
    print(name,": Accuracy Score -> ", round(accuracy_score(predictions, y_test)*100,3), " F1 Score -> ",round(f1_score(y_test, predictions, average='weighted'),3))

Nearest Neighbors : Accuracy Score ->  59.527  F1 Score ->  0.444
SGDClassifier : Accuracy Score ->  66.918  F1 Score ->  0.63
Linear SVM : Accuracy Score ->  59.527  F1 Score ->  0.444
Random Forest : Accuracy Score ->  57.416  F1 Score ->  0.519
AdaBoost : Accuracy Score ->  60.332  F1 Score ->  0.517
Multinomial Naive Bayes : Accuracy Score ->  66.013  F1 Score ->  0.603
CPU times: user 13.7 s, sys: 516 ms, total: 14.2 s
Wall time: 13.8 s


In [None]:
# Use parfit library to optimize hyper-paraeters with parallelization

paramGrid = ParameterGrid({
    'min_samples_leaf': [1],#,3,5,10,15,25,50,100,125,150,175,200],
    'max_features': ['sqrt'],#['sqrt', 'log2', 0.4, 0.5, 0.6, 0.7],
    'n_estimators': [45],
    'n_jobs': [-1],
    'random_state': [42]
})
best_model, best_score, all_models, all_scores = pf.bestFit(RandomForestClassifier, paramGrid, 
     Train_X_Tfidf, Train_Y, Test_X_Tfidf, Test_Y, 
     metric=accuracy_score, scoreLabel='accuracy_score')
print(best_model, best_score)
#plotScores(all_scores, paramGrid)

# Classifier Explainer

In [47]:
clf = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1), n_jobs=-1)
clf.fit(train_vectors, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)

In [48]:
from lime.lime_text import LimeTextExplainer

c = make_pipeline(Tfidf_vect, clf)
print(c.predict_proba([data.review[1]]).round(3))

[[0.503 0.189 0.307]]


In [None]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=data.rating_categorized.unique())
idx = 2
class_names = [0,1,2]
exp = explainer.explain_instance(data.review[idx], c.predict_proba, labels=[0, 1, 2])
print('Document id: %d' % idx)
print('Predicted class =', class_names[clf.predict(test_vectors[idx]).reshape(1,-1)[0,0]])
print('True class: %s' % class_names[data.rating_categorized[idx]])

In [None]:
exp = explainer.explain_instance(data.review[idx], c.predict_proba, top_labels=3)
print(exp.available_labels())

In [None]:
exp.show_in_notebook(text=True)