In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

import src.helper as dp

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Jessie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
data = pd.read_csv('data/drugsComTrain_raw.tsv', sep='\t')

In [5]:
data=data.rename(columns={'Unnamed: 0':'id'})

In [6]:
# drop rows with na for condition, because only 0.5% of dataset is missing 
data = data.dropna(how = 'any', axis = 0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160398 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           160398 non-null  int64  
 1   drugName     160398 non-null  object 
 2   condition    160398 non-null  object 
 3   review       160398 non-null  object 
 4   rating       160398 non-null  float64
 5   date         160398 non-null  object 
 6   usefulCount  160398 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 9.8+ MB


In [7]:
data['condition'].unique()

array(['Left Ventricular Dysfunction', 'ADHD', 'Birth Control',
       'Opiate Dependence', 'Benign Prostatic Hyperplasia',
       'Emergency Contraception', 'Bipolar Disorde', 'Epilepsy',
       'Migraine Prevention', 'Depression', "Crohn's Disease", 'Cough',
       'Obesity', 'Urinary Tract Infection', 'ibromyalgia',
       'Chronic Myelogenous Leukemia', 'HIV Infection', 'Insomnia',
       'Rheumatoid Arthritis', 'Vaginal Yeast Infection',
       'Chlamydia Infection', 'Hirsutism', 'Panic Disorde', 'Migraine',
       'Pain', 'Irritable Bowel Syndrome', 'Osteoarthritis',
       'Constipation', 'Bowel Preparation', 'Psychosis', 'Muscle Spasm',
       'Hepatitis C', 'Overactive Bladde', 'Diabetes, Type 2',
       'Asthma, Maintenance', 'Non-Small Cell Lung Cance',
       'Schizophrenia', 'Dysuria', 'Smoking Cessation', 'Anxiety', 'Acne',
       'emale Infertility', 'Constipation, Acute',
       'Constipation, Drug Induced', 'Erectile Dysfunction',
       'Trigeminal Neuralgia', 'Undera

In [8]:
data[(data['condition']=='146</span> users found this comment helpful.') & (data['drugName']=='Efudex')]

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
26338,184684,Efudex,146</span> users found this comment helpful.,"""I am currently using Efudex for the second ti...",10.0,"December 15, 2009",146


In [9]:
pd.options.display.max_rows = 62

In [10]:
data[(data['usefulCount']==146)]

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
3113,204394,BuSpar,Anxiety,"""BEST medication I&#039;ve ever taken for anxi...",10.0,"February 18, 2016",146
4269,40879,Leuprolide,Prostate Cance,"""Have been on Lupron for about 2 years and my ...",9.0,"October 2, 2013",146
7191,160452,Buspirone,Anxiety,"""BEST medication I&#039;ve ever taken for anxi...",10.0,"February 18, 2016",146
8792,190851,Lupron,Prostate Cance,"""Have been on Lupron for about 2 years and my ...",9.0,"October 2, 2013",146
12955,162200,Denosumab,Osteoporosis,"""I had shot 3 months ago and have pain in leg,...",2.0,"November 14, 2015",146
17401,102752,Aripiprazole,Depression,"""I have been taking different depression medic...",10.0,"July 22, 2013",146
22669,24889,Deplin,Depression,"""I&#039;ve been on so many different medicatio...",10.0,"December 17, 2011",146
26338,184684,Efudex,146</span> users found this comment helpful.,"""I am currently using Efudex for the second ti...",10.0,"December 15, 2009",146
28250,77430,Prolia,Osteoporosis,"""After two shots, had swelling in both elbows,...",1.0,"July 28, 2016",146
30579,108279,Bupropion / naltrexone,Obesity,"""I am a 35 year old woman who has been fightin...",10.0,"July 26, 2015",146


In [14]:
data = data[data['condition'].str.contains('</span>')==False]

### Splitting into Slightly Useful (<10), Useful (10<=x<60), Very Useful (>=60)

In [15]:
data_three = data.copy()

In [16]:
data_three['useful_class'] = data_three['usefulCount'].apply(lambda x: 0 if x < 10 else(1 if 10<=x<60 else 2))

In [17]:
data_three.tail()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,useful_class
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10.0,"May 31, 2015",125,2
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1.0,"November 1, 2011",34,1
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2.0,"March 15, 2014",35,1
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10.0,"September 19, 2015",79,2
161296,215220,Lubiprostone,"Constipation, Chronic","""I&#039;ve had chronic constipation all my adu...",9.0,"December 13, 2014",116,2


In [18]:
data_three.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159498 entries, 0 to 161296
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            159498 non-null  int64  
 1   drugName      159498 non-null  object 
 2   condition     159498 non-null  object 
 3   review        159498 non-null  object 
 4   rating        159498 non-null  float64
 5   date          159498 non-null  object 
 6   usefulCount   159498 non-null  int64  
 7   useful_class  159498 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 11.0+ MB


### Tokenize, Stopwords, Stemming 

In [19]:
review1 = data_three.review[1]
review1

'"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."'

In [20]:
review2 = data_three.review[5]
review2

'"2nd day on 5mg started to work with rock hard erections however experianced headache, lower bowel preassure. 3rd day erections would wake me up &amp; hurt! Leg/ankles aches   severe lower bowel preassure like you need to go #2 but can&#039;t! Enjoyed the initial rockhard erections but not at these side effects or $230 for months supply! I&#039;m 50 &amp; work out 3Xs a week. Not worth side effects!"'

In [21]:
# check list of stopwords that are included
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
stop_list = stopwords.words('english')
len(stop_list)

179

In [23]:
# creating a list of words that should not be included in stopwords
not_stop = ["not", "don't","aren't","couldn't","didn't","doesn't","hadn't","hasn't","haven't","isn't","mightn't","needn't",
           "shan't","shouldn't","wasn't","weren't","won't","wouldn't","nor", "no"]
for word in not_stop:
    stop_list.remove(word)

len(not_stop)
#len(stop_list)

20

In [24]:
# html tag removal
review1 = review1.replace('"', '')
review2 = review2.replace('"', '')
soup = BeautifulSoup (review1, 'html.parser')
stripped_text = soup.get_text(separator=" ")
stripped_text

'My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective.'

In [26]:
# lowercase, stemming, remove stopwrods, remove numbers & punctuation 
stemmer = PorterStemmer()

dp.words_cleaned(review1, stemmer)

'son halfway fourth week intuniv becam concern began last week start take highest dose two day could hardli get bed cranki slept nearli hour drive home school vacat unusu call doctor monday morn said stick day see school get morn last two day problem free much agreeabl ever less emot good thing less cranki rememb thing overal behavior better tri mani differ medic far effect'

In [28]:
data_three['review_clean'] = data_three['review'].apply(lambda x : dp.words_cleaned(x, stemmer))

In [29]:
data_three.head()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,useful_class,review_clean
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,1,no side effect take combin bystol mg fish oil
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,2,son halfway fourth week intuniv becam concern ...
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,1,use take anoth oral contracept pill cycl happi...
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,1,first time use form birth control glad went pa...
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,1,suboxon complet turn life around feel healthie...


### Modeling Undersampling

In [30]:
X = data_three['review_clean']
y = data_three['useful_class']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [32]:
# tfidf 
tfidf = TfidfVectorizer(analyzer = 'word',
                       tokenizer = None,
                       preprocessor = None,
                       stop_words = None,
                       ngram_range=(1,2),
                       max_features=20000)


tfidf_xtrain = tfidf.fit_transform(X_train)
tfidf_xtest = tfidf.transform(X_test)

In [33]:
nm = NearMiss()
X_train_under, y_train_under = nm.fit_resample(tfidf_xtrain, y_train)

In [34]:
y_train_under.value_counts()

2    16365
1    16365
0    16365
Name: useful_class, dtype: int64

In [51]:
# decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_under, y_train_under)
y_pred = dt.predict(tfidf_xtest)

In [52]:
accuracy_score(y_test, y_pred)

0.5056109725685786

In [53]:
target_names = ['class 0 - slightly', 'class 1 - useful', 'class 2 - very']
print(classification_report(y_test, y_pred, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.55      0.63      0.59     11413
  class 1 - useful       0.70      0.36      0.48     16613
    class 2 - very       0.29      0.74      0.42      4054

          accuracy                           0.51     32080
         macro avg       0.51      0.58      0.49     32080
      weighted avg       0.60      0.51      0.51     32080



In [62]:
# random forest classifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_under,y_train_under)
y_pred_rf = rf.predict(tfidf_xtest)

In [63]:
accuracy_score(y_test, y_pred_rf)

0.5222880299251871

In [64]:
print(classification_report(y_test, y_pred_rf, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.61      0.75      0.67     11413
  class 1 - useful       0.78      0.29      0.42     16613
    class 2 - very       0.29      0.84      0.43      4054

          accuracy                           0.52     32080
         macro avg       0.56      0.63      0.51     32080
      weighted avg       0.65      0.52      0.51     32080



In [79]:
# gradient boosting
gbc = GradientBoostingClassifier()
gbc.fit(X_train_under,y_train_under)
y_pred_gbc = gbc.predict(tfidf_xtest)

In [80]:
accuracy_score(y_test, y_pred_gbc)

0.45517456359102243

In [81]:
print(classification_report(y_test, y_pred_gbc, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.56      0.67      0.61     11413
  class 1 - useful       0.62      0.27      0.38     16613
    class 2 - very       0.23      0.63      0.33      4054

          accuracy                           0.46     32080
         macro avg       0.47      0.52      0.44     32080
      weighted avg       0.55      0.46      0.45     32080



In [68]:
from sklearn.naive_bayes import MultinomialNB

In [69]:
mnb = MultinomialNB()
mnb.fit(X_train_under,y_train_under)
y_pred_mnb = mnb.predict(tfidf_xtest)

In [70]:
accuracy_score(y_test, y_pred_mnb)

0.49242518703241894

In [71]:
print(classification_report(y_test, y_pred_mnb, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.58      0.73      0.65     11413
  class 1 - useful       0.66      0.28      0.40     16613
    class 2 - very       0.26      0.68      0.37      4054

          accuracy                           0.49     32080
         macro avg       0.50      0.56      0.47     32080
      weighted avg       0.58      0.49      0.48     32080



In [40]:
from xgboost import XGBClassifier

In [76]:
xgb = XGBClassifier()
xgb.fit(X_train_under,y_train_under)
y_pred_xgb = xgb.predict(tfidf_xtest)

In [77]:
accuracy_score(y_test, y_pred_xgb)

0.45236907730673315

In [78]:
print(classification_report(y_test, y_pred_xgb, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.56      0.65      0.60     11413
  class 1 - useful       0.62      0.27      0.38     16613
    class 2 - very       0.22      0.62      0.33      4054

          accuracy                           0.45     32080
         macro avg       0.47      0.52      0.44     32080
      weighted avg       0.55      0.45      0.45     32080



### Modeling Oversampling

In [35]:
from imblearn.over_sampling import SMOTE

In [36]:
sm = SMOTE(random_state=42)
X_train_over, y_train_over = sm.fit_resample(tfidf_xtrain, y_train)

In [37]:
y_train_under.value_counts()

2    16365
1    16365
0    16365
Name: useful_class, dtype: int64

In [85]:
# random forest classifier
rf_over = RandomForestClassifier(n_estimators=100)
rf_over.fit(X_train_over, y_train_over)
y_pred_rf_over = rf_over.predict(tfidf_xtest)

In [86]:
accuracy_score(y_test, y_pred_rf_over)

0.7943266832917706

In [88]:
print(classification_report(y_test, y_pred_rf_over, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.86      0.70      0.77     11413
  class 1 - useful       0.75      0.91      0.82     16613
    class 2 - very       0.85      0.59      0.69      4054

          accuracy                           0.79     32080
         macro avg       0.82      0.73      0.76     32080
      weighted avg       0.81      0.79      0.79     32080



In [41]:
xgb = XGBClassifier()
xgb.fit(X_train_over, y_train_over)
y_pred_xgb_over = xgb.predict(tfidf_xtest)

In [42]:
accuracy_score(y_test, y_pred_xgb_over)

0.6139498432601881

In [43]:
target_names = ['class 0 - slightly', 'class 1 - useful', 'class 2 - very']
print(classification_report(y_test, y_pred_xgb_over, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.70      0.47      0.56     11266
  class 1 - useful       0.60      0.83      0.70     16647
    class 2 - very       0.37      0.14      0.20      3987

          accuracy                           0.61     31900
         macro avg       0.56      0.48      0.49     31900
      weighted avg       0.61      0.61      0.59     31900



### Adding other features Classification

In [44]:
from scipy.sparse import hstack
from scipy.sparse import coo_matrix
from sklearn.metrics import mean_squared_error

In [46]:
data

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
...,...,...,...,...,...,...,...
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10.0,"May 31, 2015",125
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1.0,"November 1, 2011",34
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2.0,"March 15, 2014",35
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10.0,"September 19, 2015",79


In [48]:
data_train = dp.features_classification(data)

In [49]:
data_train

Unnamed: 0,rating,review_clean,review_len,count_unique_word,useful_class,compound
0,9.0,no side effect take combin bystol mg fish oil,17,9,1,-0.2960
1,8.0,son halfway fourth week intuniv becam concern ...,141,54,2,0.8603
2,5.0,use take anoth oral contracept pill cycl happi...,133,50,1,0.7645
3,8.0,first time use form birth control glad went pa...,89,26,1,0.7184
4,9.0,suboxon complet turn life around feel healthie...,134,51,1,0.9403
...,...,...,...,...,...,...
161292,10.0,wrote first report midoctob not alcohol sinc p...,134,57,2,0.9561
161293,1.0,given iv surgey immedi becam anxiou could not ...,50,24,1,-0.4767
161294,2.0,limit improv month develop bad rash md refus c...,14,10,1,-0.7430
161295,10.0,thyroid medic year spent first synthroid vario...,137,60,2,0.6197


In [51]:
X_train_f, X_test_f, y_train_f, y_test_f = dp.split_classification(data_train)

In [52]:
X_train_f

Unnamed: 0,rating,review_clean,review_len,count_unique_word,compound
160029,7.0,found medic help get sleep worst part opiat wi...,61,21,0.8243
119242,10.0,ok got skyla hour ago far great never children...,106,42,-0.3400
101809,9.0,oh product work alright took around right morn...,38,18,-0.7964
59952,1.0,not help sleep regular basi twice help get sle...,27,11,-0.3089
59446,8.0,alway sleep problem even kid sometim bad somet...,140,58,0.7939
...,...,...,...,...,...
121251,9.0,nuvigil work great insur not pay diagnosi not ...,72,26,0.1681
104879,10.0,first week lost lb doctor said could anomali n...,122,47,-0.9154
133434,10.0,take donnat reglan minut meal bedtim help inte...,51,20,0.9347
148542,8.0,great pain relief complic kidney stone iv hosp...,23,13,0.5994


In [53]:
# tfidf 
tfidf = TfidfVectorizer(analyzer = 'word',
                       tokenizer = None,
                       preprocessor = None,
                       stop_words = None,
                       ngram_range=(1,2),
                       max_features=20000)


tfidf_xtrain = tfidf.fit_transform(X_train_f['review_clean'])
tfidf_xtest = tfidf.transform(X_test_f['review_clean'])

In [54]:
X_train_f = X_train_f.drop(columns=['review_clean'])
X_test_f = X_test_f.drop(columns=['review_clean'])

In [55]:
X_train_feat = hstack([tfidf_xtrain, coo_matrix(X_train_f)])

In [56]:
X_test_feat = hstack([tfidf_xtest, coo_matrix(X_test_f)])

In [57]:
# decision tree classifier
dt_feat = DecisionTreeClassifier()
dt_feat.fit(X_train_feat, y_train_f)
y_pred_dt_feat = dt_feat.predict(X_test_feat)

In [58]:
accuracy_score(y_test_f, y_pred_dt_feat)

0.7510031347962383

In [59]:
print(classification_report(y_test, y_pred_dt_feat, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.76      0.72      0.74     11266
  class 1 - useful       0.77      0.80      0.79     16647
    class 2 - very       0.63      0.63      0.63      3987

          accuracy                           0.75     31900
         macro avg       0.72      0.72      0.72     31900
      weighted avg       0.75      0.75      0.75     31900



In [60]:
# random forest classifier
rf_feat = RandomForestClassifier(n_estimators=100)
rf_feat.fit(X_train_feat, y_train_f)
y_pred_rf_feat = rf_feat.predict(X_test_feat)

In [61]:
accuracy_score(y_test_f, y_pred_rf_feat)

0.8031347962382445

In [62]:
print(classification_report(y_test_f, y_pred_rf_feat, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.89      0.70      0.78     11266
  class 1 - useful       0.75      0.94      0.83     16647
    class 2 - very       1.00      0.52      0.69      3987

          accuracy                           0.80     31900
         macro avg       0.88      0.72      0.77     31900
      weighted avg       0.83      0.80      0.80     31900



In [63]:
# xgbclassifier
xgb.fit(X_train_feat, y_train_f)
y_pred_xgb_feat = xgb.predict(X_test_feat)

In [64]:
accuracy_score(y_test_f, y_pred_xgb_feat)

0.6292163009404389

In [65]:
print(classification_report(y_test_f, y_pred_xgb_feat, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.73      0.47      0.57     11266
  class 1 - useful       0.60      0.88      0.71     16647
    class 2 - very       0.56      0.02      0.04      3987

          accuracy                           0.63     31900
         macro avg       0.63      0.46      0.44     31900
      weighted avg       0.64      0.63      0.58     31900



In [66]:
from sklearn.naive_bayes import MultinomialNB

In [70]:
# gbc = GradientBoostingClassifier()
# gbc.fit(X_train_feat, y_train_f)
# y_pred_gbc_feat = gbc.predict(X_test_feat)

In [None]:
accuracy_score(y_test_f, y_pred_gbc_feat)

In [None]:
print(classification_report(y_test_f, y_pred_gbc_feat, target_names=target_names))

In [35]:
from sklearn.metrics import confusion_matrix

In [37]:
confusion_matrix(y_test_f, y_pred_rf_feat, labels=[0,1,2])

array([[ 7888,  3523,     2],
       [  943, 15664,     6],
       [   54,  1826,  2174]])

### Save Tfidfvectorizer and model, load

In [71]:
# save model to disk
import pickle

filename = 'finalized_model_cla.sav'
pickle.dump(rf_feat, open(filename, 'wb'))

In [73]:
pickle.dump(tfidf, open('tfidf.pickle','wb'))
tfidf = pickle.load(open('tfidf.pickle','rb'))
load_model = pickle.load(open(filename, 'rb'))

### Testing Models on Actual Test Data

In [74]:
from sklearn.metrics import plot_confusion_matrix


In [75]:
data_test = pd.read_csv('data/drugsComTest_raw.tsv', sep='\t')

In [76]:
data_test.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [77]:
data_test = dp.features_classification(data_test)

In [79]:
X_t = data_test.drop(columns=['useful_class'])
y_t = data_test['useful_class']

In [80]:
tfidf_test = tfidf.transform(X_t['review_clean'])

In [82]:
X_t = X_t.drop(columns=['review_clean'])
X_t_stacked = hstack([tfidf_test, coo_matrix(X_t)])

In [83]:
y_pred_rf_test = load_model.predict(X_t_stacked)

In [84]:
accuracy_score(y_t, y_pred_rf_test)

0.7988133764832794

In [85]:
print(classification_report(y_t, y_pred_rf_test, target_names=target_names))

                    precision    recall  f1-score   support

class 0 - slightly       0.88      0.69      0.77     19005
  class 1 - useful       0.74      0.94      0.83     28028
    class 2 - very       1.00      0.52      0.69      6733

          accuracy                           0.80     53766
         macro avg       0.87      0.72      0.76     53766
      weighted avg       0.82      0.80      0.79     53766



In [86]:
plot_confusion_matrix(load_model, X_t_stacked, y_test)

NameError: name 'confusion_matrix' is not defined