In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("drugsComTest_raw.csv")
df

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4
...,...,...,...,...,...,...,...
53761,159999,Tamoxifen,"Breast Cancer, Prevention","""I have taken Tamoxifen for 5 years. Side effe...",10,13-Sep-14,43
53762,140714,Escitalopram,Anxiety,"""I&#039;ve been taking Lexapro (escitaploprgra...",9,8-Oct-16,11
53763,130945,Levonorgestrel,Birth Control,"""I&#039;m married, 34 years old and I have no ...",8,15-Nov-10,7
53764,47656,Tapentadol,Pain,"""I was prescribed Nucynta for severe neck/shou...",1,28-Nov-11,20


In [4]:
#DATA PREPROCESSING#
df.isna().sum()
print("Summary statistics of numerical features : \n", df.describe())

Summary statistics of numerical features : 
             uniqueID        rating   usefulCount
count   53766.000000  53766.000000  53766.000000
mean   116386.701187      6.976900     27.989752
std     67017.739881      3.285207     36.172833
min         0.000000      1.000000      0.000000
25%     58272.500000      4.000000      6.000000
50%    116248.500000      8.000000     16.000000
75%    174586.750000     10.000000     36.000000
max    232284.000000     10.000000    949.000000


In [5]:
print("\nTotal number of reviews: ", len(df))


Total number of reviews:  53766


In [6]:
print("\nTotal number of brands: ", len(list(set(df['drugName']))))


Total number of brands:  2637


In [7]:
print("\nTotal number of unique products: ", len(list(set(df['condition']))))


Total number of unique products:  709


In [9]:
print("\nPercentage of reviews with neutral sentiment : {:.2f}%".format(
    df[df['rating'] == 3]["review"].count() / len(df) * 100))


Percentage of reviews with neutral sentiment : 4.10%


In [10]:
print("\nPercentage of reviews with positive sentiment : {:.2f}%".format(
    df[df['rating'] > 3]["review"].count() / len(df) * 100))


Percentage of reviews with positive sentiment : 77.98%


In [11]:
print("\nPercentage of reviews with negative sentiment : {:.2f}%".format(
    df[df['rating'] < 3]["review"].count() / len(df) * 100))


Percentage of reviews with negative sentiment : 17.92%


In [14]:
import plotly.express as px

fig = px.bar(df['rating'].value_counts().sort_index(), x='rating', title='Distribution of Rating')
fig.show()

drug = df["drugName"].value_counts()


In [13]:
import plotly.express as px

fig = px.bar(drug[:20], x='drugName', title='Number of Reviews for Top 20 Drugs')
fig.show()

conditions = df["condition"].value_counts()


In [16]:
import plotly.express as px

fig = px.bar(conditions[:30], x='condition', title='Number of Reviews for Top 30 conditions')
fig.show()

In [17]:
review_length = df["review"].dropna().map(lambda x: len(x))
review_length = review_length.loc[review_length < 1500]
review_length = pd.DataFrame(review_length)
fig = px.histogram(review_length, x="review")
fig.show()

In [18]:
#DATA PREPARATION#
# uncomment to use full set of data
df = df.sample(frac=1, random_state=0)  
# Drop missing values
df.dropna(inplace=True)
# Encode 4s and 5s as 1 (positive sentiment) and 1s and 2s as 0 (negative sentiment)
# df['Sentiment'] = np.where(df['rating'] > 6, 1, 0)
df['Sentiment'] = np.where(df['rating'] > 3, 1, 0)
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,Sentiment
50287,131487,Generess Fe,Birth Control,"""Was on various birth control pills, last one ...",9,24-Dec-12,27,1
43081,209341,Zomig,Migraine,"""I have suffered with migraines for over 40 ye...",10,20-Dec-08,6,1
23595,229685,Lurasidone,Bipolar Disorde,"""Latuda has changed my life. I&#039;ve been t...",10,27-Nov-13,178,1
24767,16314,Ethinyl estradiol / norethindrone,Menstrual Disorders,"""I got my tubes tied about 8 months ago and it...",9,29-Feb-16,14,1
27888,206815,Duac,Acne,"""For the first 3-4 days of use my face was so ...",9,28-Jun-16,5,1


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['review'], df['Sentiment'], \
                                                    test_size=0.1, random_state=0)

print('Load %d training examples and %d validation examples. \n' % (X_train.shape[0], X_test.shape[0]))
print('Show a review in the training set : \n', X_train.iloc[10])
X_train, y_train

Load 48123 training examples and 5348 validation examples. 

Show a review in the training set : 
 "Works well.  Watch for diaper rash (yeast type) in infants.  Apply cream to diaper area liberally at every diaper change and give probiotics."


(31853          "Nicobate is rubbish.\r\r\nNicotill works."
 43232    "Very effective to me. Considering, greatly su...
 5951     "The only good thing about this pill is pregna...
 27167    "Originally I started taking trazodone for ins...
 7643     "No good for depression or anxiety. For me jus...
                                ...                        
 42846    "Had severe urinary tract infection. It was ve...
 26885    "Have been taking this off and on for about 3 ...
 42057    "I&rsquo;ve been coughing for days (unable to ...
 5928          "Cymbalta did nothing to lessen my anxiety."
 23719    "I&#039;ve been on Lamictal for a few years an...
 Name: review, Length: 48123, dtype: object,
 31853    0
 43232    1
 5951     0
 27167    1
 7643     0
         ..
 42846    1
 26885    0
 42057    1
 5928     0
 23719    1
 Name: Sentiment, Length: 48123, dtype: int32)

In [22]:
'''Bag of Words
<br>
**Step 1 : Preprocess raw reviews to cleaned reviews**
**Step 2 : Create BoW using CountVectorizer / Tfidfvectorizer in sklearn**
**Step 3 : Transform review text to numerical representations (feature vectors)**
**Step 4 : Fit feature vectors to supervised learning algorithm (eg. Naive Bayes, Logistic regression, etc.)**
**Step 5 : Improve the model performance by GridSearch**
# Text Preprocessing
<br>
**Step 1 : remove html tags using BeautifulSoup**
**Step 2 : remove non-character such as digits and symbols**
**Step 3 : convert to lower case**
**Step 4 : remove stop words such as "the" and "and" if needed**
**Step 5 : convert to root words by stemming if needed**'''

'Bag of Words\n<br>\n**Step 1 : Preprocess raw reviews to cleaned reviews**\n**Step 2 : Create BoW using CountVectorizer / Tfidfvectorizer in sklearn**\n**Step 3 : Transform review text to numerical representations (feature vectors)**\n**Step 4 : Fit feature vectors to supervised learning algorithm (eg. Naive Bayes, Logistic regression, etc.)**\n**Step 5 : Improve the model performance by GridSearch**\n# Text Preprocessing\n<br>\n**Step 1 : remove html tags using BeautifulSoup**\n**Step 2 : remove non-character such as digits and symbols**\n**Step 3 : convert to lower case**\n**Step 4 : remove stop words such as "the" and "and" if needed**\n**Step 5 : convert to root words by stemming if needed**'

In [23]:
def cleanData(raw_data, remove_stopwords=False, stemming=False, split_text=False):
    text = BeautifulSoup(raw_data, 'html.parser').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    if stemming == True:
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in words]

    if split_text == True:
        return (words)

    return (" ".join(words))

In [24]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag
from bs4 import BeautifulSoup


In [25]:
X_train_cleaned = []
X_test_cleaned = []

for d in X_train:
    X_train_cleaned.append(cleanData(d))
print('Show a cleaned review in the training set : \n', X_train_cleaned[10])

for d in X_test:
    X_test_cleaned.append(cleanData(d))


Show a cleaned review in the training set : 
 works well watch for diaper rash yeast type in infants apply cream to diaper area liberally at every diaper change and give probiotics


In [26]:
#CountVectorizer with Mulinomial Naive Bayes #

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

countVect = CountVectorizer()
X_train_countVect = countVect.fit_transform(X_train_cleaned)
print("Number of features : %d \n" % len(countVect.get_feature_names()))  # 6378
print("Show some feature names : \n", countVect.get_feature_names()[::1000])

Number of features : 29778 

Show some feature names : 
 ['aa', 'amitriiptyline', 'avitan', 'boats', 'cavities', 'column', 'crossing', 'dexaphen', 'drunken', 'ethin', 'flags', 'glided', 'hfs', 'inconvenient', 'julia', 'living', 'metasomethingoranother', 'naperstacks', 'oily', 'peanutbutter', 'poz', 'quasi', 'resemble', 'scolded', 'slumped', 'stories', 'tbey', 'trapper', 'urinary', 'whala']


In [27]:
mnb = MultinomialNB()
mnb.fit(X_train_countVect, y_train)


def modelEvaluation(predictions):
    '''
    Print model evaluation to predicted result
    '''
    print("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)*2.0))
    print("\nAUC score : {:.4f}".format(roc_auc_score(y_test, predictions)*2.0))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [28]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score

predictions = mnb.predict(countVect.transform(X_test_cleaned))
modelEvaluation(predictions)



Accuracy on validation set: 1.5550

AUC score : 1.4993

Classification report : 
               precision    recall  f1-score   support

           0       0.69      0.66      0.67      1880
           1       0.82      0.84      0.83      3468

    accuracy                           0.78      5348
   macro avg       0.76      0.75      0.75      5348
weighted avg       0.78      0.78      0.78      5348


Confusion Matrix : 
 [[1233  647]
 [ 543 2925]]


In [29]:
from  sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf = TfidfVectorizer(min_df=5)  # minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)


In [31]:
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)
predictions = lr.predict(tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)



Accuracy on validation set: 1.6260

AUC score : 1.5421

Classification report : 
               precision    recall  f1-score   support

           0       0.80      0.63      0.70      1880
           1       0.82      0.91      0.86      3468

    accuracy                           0.81      5348
   macro avg       0.81      0.77      0.78      5348
weighted avg       0.81      0.81      0.81      5348


Confusion Matrix : 
 [[1184  696]
 [ 304 3164]]
