In [41]:
#### mcahine learning on text data attempt #2 ############
## link to tutorial: https://ai.plainenglish.io/sentiment-classification-using-xgboost-7abdaf4771f9
# importing the relevant packages
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd 
import itertools
import nltk


### import df 
train  = pd.read_csv("coding/text_cleaning_data/drugsComTrain_raw.tsv", sep = '\t') 

In [42]:
## since I’m only interested in the text and rating
## subset the data on columns 
train = train.loc[:, ['review', 'rating']] 
## only accepting ratings that're >= 7 and <= 4
train = train.iloc[[x[0] or x[1] for x in list(zip((train['rating'] <= 4), (train['rating'] >= 7)))], :]

# the same is done with the test set 
test = pd.read_csv('coding/text_cleaning_data/drugsComTest_raw.tsv', sep = '\t') 
test = test.loc[:, ['review', 'rating']]  
test = test.iloc[[x[0] or x[1] for x in list(zip((test['rating'] <= 4), (test['rating'] >= 7)))], :]

### take a look at the data 
train

Unnamed: 0,review,rating
0,"""It has no side effect, I take it in combinati...",9.0
1,"""My son is halfway through his fourth week of ...",8.0
3,"""This is my first time using any form of birth...",8.0
4,"""Suboxone has completely turned my life around...",9.0
5,"""2nd day on 5mg started to work with rock hard...",2.0
...,...,...
161292,"""I wrote my first report in Mid-October of 201...",10.0
161293,"""I was given this in IV before surgey. I immed...",1.0
161294,"""Limited improvement after 4 months, developed...",2.0
161295,"""I&#039;ve been on thyroid medication 49 years...",10.0


In [45]:
### all we want to do is clean the text for these columns of interest. We should be able to do this with a basic loop 
# Pre Processing
stop_words = stopwords.words('english') # creates a list of English stop words
wnl = WordNetLemmatizer() # I used lemmatizing instead of stemming

train['review'] = re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+",' ',str(train['review']).lower()).strip()
train['review'] =  re.sub("\d+", " ", str(train['review']))

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]



## lematize here 
train['text_lemmatized'] = train.review.apply(lemmatize_text)

#df = pd.DataFrame(['this was cheesy', 'she likes these books', 'wow this is great'], columns=['text'])
#df['text_lemmatized'] = df.text.apply(lemmatize_text)
#for i, element in enumerate(train):
#train['review'][i] = [wnl.lemmatize(i) for i in train['review'].split(' ') if i not in stop_words]
train

Unnamed: 0,review,rating,text_lemmatized
0,it has no side effect i take it ...,9.0,"[it, ha, no, side, effect, i, take, it, in, it..."
1,it has no side effect i take it ...,8.0,"[it, ha, no, side, effect, i, take, it, in, it..."
3,it has no side effect i take it ...,8.0,"[it, ha, no, side, effect, i, take, it, in, it..."
4,it has no side effect i take it ...,9.0,"[it, ha, no, side, effect, i, take, it, in, it..."
5,it has no side effect i take it ...,2.0,"[it, ha, no, side, effect, i, take, it, in, it..."
...,...,...,...
161292,it has no side effect i take it ...,10.0,"[it, ha, no, side, effect, i, take, it, in, it..."
161293,it has no side effect i take it ...,1.0,"[it, ha, no, side, effect, i, take, it, in, it..."
161294,it has no side effect i take it ...,2.0,"[it, ha, no, side, effect, i, take, it, in, it..."
161295,it has no side effect i take it ...,10.0,"[it, ha, no, side, effect, i, take, it, in, it..."


In [None]:
### try again to get rid of zeroes 


In [35]:
### now let's try to vectorize the data 
# importing the relevant modules
from sklearn.feature_extraction.text import CountVectorizer
# vectorizing the sentences
cv = CountVectorizer(binary = True) # implies that it indicates whether the word is present or not.
cv.fit(train['review']) # find all the unique words from the training set
train_x = cv.transform(train['review'])
test_x = cv.transform(test['review'])

In [37]:
### now take a look at the text 
train_x ## it is a sparse matrix 
train['text_lemmatized'].describe()

count                                                146941
unique                                                    1
top       [0, it, ha, no, side, effect, i, take, it, in,...
freq                                                 146941
Name: text_lemmatized, dtype: object

In [38]:
## get description of the review column 
train['review'].describe() ## appears to be the same length as above. 

count                                                146941
unique                                                    1
top       0 it has no side effect i take it in combinati...
freq                                                 146941
Name: review, dtype: object

In [56]:
### try it on the lemmatized text 
### we need to use the multilabel binarizer to get this to work on lematized text 
from sklearn.preprocessing import MultiLabelBinarizer

count_vec = MultiLabelBinarizer()
mlb = count_vec.fit(train["text_lemmatized"])
rr =pd.DataFrame(mlb.transform(train["text_lemmatized"]), columns=[mlb.classes_])

#train_y = cv.transform(train['text_lemmatized'])

In [57]:
rr

Unnamed: 0,",",...,Length:,Name:,dtype:,effect,ha,i,in,it,no,object,"review,",side,take
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146936,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
146937,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
146938,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
146939,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [50]:
### now let's see what we can do with the data in train_x

# importing the relevant modules
import xgboost as xgb
import numpy as np
xgb_train_labels = []
for rating in train['rating']:
    if rating >= 7:
        xgb_train_labels.append(1)
    elif rating <= 4:
        xgb_train_labels.append(0)
    else:
        xgb_train_labels.append(None)

xgb_test_labels = []
for rating in test['rating']:
    if rating >= 7:
        xgb_test_labels.append(1)
    elif rating <= 4:
        xgb_test_labels.append(0)
    else:
        xgb_test_labels.append(None)
# creating a variable for the new train and test sets
xgb_train = xgb.DMatrix(train_x, xgb_train_labels)
xgb_test = xgb.DMatrix(test_x, xgb_test_labels)

#text = train["text_lemmatized"].map(' '.join)
#count_vec = CountVectorizer()
#cv = count_vec.fit(text)

#pd.DataFrame(cv.transform(text).toarray(), columns=[mlb.classes_])

In [53]:
# Setting the Parameters of the Model
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

param = {'eta': 0.75,
         'max_depth': 50,
         'objective': 'binary:logitraw'}
# Training the Model
xgb_model = xgb.train(param, xgb_train, num_boost_round = 30)
# Predicting using the Model
y_pred = xgb_model.predict(xgb_test)
y_pred = np.where(np.array(y_pred) > 0.5, 1, 0) # converting them to 1/0’s
# Evaluation of Model
accuracy_score(xgb_test_labels, y_pred)     #
f1_score(xgb_test_labels, y_pred) # supposed to be  94.83%

0.8400393472154734

In [55]:
accuracy_score(xgb_test_labels, y_pred) # supposed to be  92.47%; problematic that it is not 

0.724196415799906

<xgboost.core.Booster at 0x282ae469580>

NameError: name 'rmp_df' is not defined

In [63]:
xgb_test_labels.type()

AttributeError: 'list' object has no attribute 'type'