In [2]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from random_forest.random_forest_model import RandomForest

from textblob import TextBlob
from nltk.corpus import stopwords
import warnings; warnings.simplefilter('ignore')
import nltk
import string
from nltk import ngrams
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import LabelEncoder

In [3]:
# local path of the data source
testing_data = 'data/testing.csv'
training_data = 'data/training.csv'
validation_data = 'data/validation.csv'

In [4]:
def review_clean(review): 
    # changing to lower case
    lower = review.str.lower()
    # Replacing the repeating pattern of &#039;
    pattern_remove = lower.str.replace("&#039;", "")
    # Removing all the special Characters
    special_remove = pattern_remove.str.replace(r'[^\w\d\s]',' ')
    # Removing all the non ASCII characters
    ascii_remove = special_remove.str.replace(r'[^\x00-\x7F]+',' ')
    # Removing the leading and trailing Whitespaces
    whitespace_remove = ascii_remove.str.replace(r'^\s+|\s+?$','')
    # Replacing multiple Spaces with Single Space
    multiw_remove = whitespace_remove.str.replace(r'\s+',' ')
    # Replacing Two or more dots with one
    dataframe = multiw_remove.str.replace(r'\.{2,}', ' ')
    return dataframe

In [5]:
def sentiment(review):
    # Sentiment polarity of the reviews
    pol = []
    for i in review:
        analysis = TextBlob(i)
        pol.append(analysis.sentiment.polarity)
    return pol

In [6]:
# three data given by the COMP5434 project
train = pd.read_csv(training_data)
validation = pd.read_csv(validation_data)
test = pd.read_csv(testing_data)

data = pd.concat([train, validation])
data['review_clean'] = review_clean(data['reviewComment'])
# Removing the stopwords
stop_words = set(stopwords.words('english'))
data['review_clean'] = data['review_clean'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
# Removing the word stems using the Snowball Stemmer
Snow_ball = SnowballStemmer("english")
data['review_clean'] = data['review_clean'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

data['sentiment'] = sentiment(data['reviewComment'])
data['sentiment_clean'] = sentiment(data['review_clean'])
# Cleaning the reviews without removing the stop words and using snowball stemmer
data['review_clean_ss'] = review_clean(data['reviewComment'])
data['sentiment_clean_ss'] = sentiment(data['review_clean_ss'])
data = data.dropna(how="any", axis=0)
#Word count in each review
data['count_word']=data["review_clean_ss"].apply(lambda x: len(str(x).split()))
#Unique word count 
data['count_unique_word']=data["review_clean_ss"].apply(lambda x: len(set(str(x).split())))
#Letter count
data['count_letters']=data["review_clean_ss"].apply(lambda x: len(str(x)))
#punctuation count
data["count_punctuations"] = data["reviewComment"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
#upper case words count
data["count_words_upper"] = data["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
#title case words count
data["count_words_title"] = data["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
#Number of stopwords
data["count_stopwords"] = data["reviewComment"].apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]))
#Average length of the words
data["mean_word_len"] = data["review_clean_ss"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# Label Encoding Drugname and Conditions
label_encoder_feat = {}
for feature in ['drugName', 'condition']:
    label_encoder_feat[feature] = LabelEncoder()
    data[feature] = label_encoder_feat[feature].fit_transform(data[feature])

# converting the date into datetime format
data['date'] = pd.to_datetime(data['date'], errors = 'coerce')

# now extracting year from date
data['Year'] = data['date'].dt.year

# extracting the month from the date
data['month'] = data['date'].dt.month

# extracting the days from the date
data['day'] = data['date'].dt.day

data.loc[(data['rating'] >= 5), 'Review_Sentiment'] = 1
data.loc[(data['rating'] < 5), 'Review_Sentiment'] = 0

data['Review_Sentiment'].value_counts()



0.0    4155
1.0    4004
Name: Review_Sentiment, dtype: int64

In [7]:
data.head()

Unnamed: 0,recordId,drugName,condition,reviewComment,date,usefulCount,sideEffects,rating,review_clean,sentiment,...,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len,Year,month,day,Review_Sentiment
0,163740,833,123,"""I&#039;ve tried a few antidepressants over th...",2012-02-28,22,Mild Side Effects,5,ive tri antidepress year citalopram fluoxetin ...,0.0,...,410,22,2,4,27,5.134328,2012,2,28,1.0
1,206473,785,116,"""My son has Crohn&#039;s disease and has done ...",2009-05-17,17,Severe Side Effects,4,son crohn diseas done well asacol complaint sh...,0.566667,...,246,13,0,4,22,4.145833,2009,5,17,0.0
2,159672,153,402,"""Quick reduction of symptoms""",2017-09-29,3,No Side Effects,5,quick reduct symptom,0.333333,...,27,2,0,1,1,6.0,2017,9,29,1.0
3,39293,300,417,"""Contrave combines drugs that were used for al...",2017-03-05,35,Mild Side Effects,5,contrav combin drug use alcohol smoke opioid c...,0.139063,...,724,42,10,14,71,4.0,2017,3,5,1.0
4,97768,315,73,"""I have been on this birth control for one cyc...",2015-10-22,4,Severe Side Effects,5,birth control one cycl read review type simila...,0.260926,...,739,17,10,17,80,3.966443,2015,10,22,1.0


In [8]:
features = data[['condition', 'usefulCount', 'sentiment', 'day', 'month', 'Year',
                'sentiment_clean_ss', 'count_word', 'count_unique_word', 'count_letters',
                'count_punctuations', 'count_words_upper', 'count_words_title',
                'count_stopwords', 'mean_word_len']]

target = data['rating']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 20)
print("X_train.shape:", X_train.shape)
print("Y_train.shape:", y_train.shape)

# clf = RandomForest(n_estimators=100)
# clf.fit(X_train, y_train)


X_train.shape: (6527, 15)
Y_train.shape: (6527,)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X_train, y_train)

RandomForestClassifier(oob_score=True, random_state=10)

In [13]:
y_pred = rf0.predict(X_test)

In [14]:
y_pred

array([5, 5, 1, ..., 5, 5, 1])

In [15]:
from utils import train_test_split, accuracy_score, Plot
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5263480392156863


In [152]:
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [20]:
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return numpy.array(dataX), numpy.array(dataY)

# fix random seed for reproducibility
numpy.random.seed(7)

In [27]:
trainX, trainY = create_dataset(train, look_back)

Unnamed: 0,condition,usefulCount,sentiment,day,month,Year,sentiment_clean_ss,count_word,count_unique_word,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len
527,81,9,-0.060000,21,1,2016,-0.060000,58,44,293,9,6,9,23,4.068966
1270,359,54,0.213420,17,8,2014,0.213420,132,97,713,20,4,9,55,4.409091
6442,371,60,0.076894,21,7,2014,0.028819,132,94,746,29,4,10,57,4.659091
72,23,13,0.292000,11,8,2015,0.292000,153,99,718,23,17,21,73,3.699346
866,163,38,0.051369,26,3,2016,0.053452,143,106,712,41,6,12,64,3.986014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,355,5,0.458333,3,9,2011,0.458333,45,35,212,8,3,5,25,3.733333
428,408,9,0.033333,7,6,2017,0.033333,54,45,278,13,3,4,27,4.166667
4390,279,1,0.248571,5,3,2008,0.248571,34,30,180,5,2,2,17,4.323529
2537,364,12,0.045000,23,3,2016,0.005000,102,74,564,15,4,13,44,4.539216


In [45]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1,
              solver='lbfgs')

In [46]:
clf.predict(X_test)

array([5, 5, 5, ..., 5, 5, 5])

In [47]:
from utils import train_test_split, accuracy_score, Plot
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5263480392156863


### test data predict

In [37]:
# three data given by the COMP5434 project

test = pd.read_csv(testing_data)
test['review_clean'] = review_clean(test['reviewComment'])
# Removing the stopwords
stop_words = set(stopwords.words('english'))
test['review_clean'] = test['review_clean'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
# Removing the word stems using the Snowball Stemmer
Snow_ball = SnowballStemmer("english")
test['review_clean'] = test['review_clean'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

test['sentiment'] = sentiment(test['reviewComment'])
test['sentiment_clean'] = sentiment(test['review_clean'])
# Cleaning the reviews without removing the stop words and using snowball stemmer
test['review_clean_ss'] = review_clean(test['reviewComment'])
test['sentiment_clean_ss'] = sentiment(test['review_clean_ss'])

#Word count in each review
test['count_word']=test["review_clean_ss"].apply(lambda x: len(str(x).split()))
#Unique word count 
test['count_unique_word']=test["review_clean_ss"].apply(lambda x: len(set(str(x).split())))
#Letter count
test['count_letters']=test["review_clean_ss"].apply(lambda x: len(str(x)))
#punctuation count
test["count_punctuations"] = test["reviewComment"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
#upper case words count
test["count_words_upper"] = test["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
#title case words count
test["count_words_title"] = test["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
#Number of stopwords
test["count_stopwords"] = test["reviewComment"].apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]))
#Average length of the words
test["mean_word_len"] = test["review_clean_ss"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# Label Encoding Drugname and Conditions
label_encoder_feat = {}
for feature in ['drugName', 'condition']:
    label_encoder_feat[feature] = LabelEncoder()
    test[feature] = label_encoder_feat[feature].fit_transform(test[feature])

# converting the date into datetime format
test['date'] = pd.to_datetime(test['date'], errors = 'coerce')

# now extracting year from date
test['Year'] = test['date'].dt.year

# extracting the month from the date
test['month'] = test['date'].dt.month

# extracting the days from the date
test['day'] = test['date'].dt.day




In [41]:
testXXX = test[['condition', 'usefulCount', 'sentiment', 'day', 'month', 'Year',
                'sentiment_clean_ss', 'count_word', 'count_unique_word', 'count_letters',
                'count_punctuations', 'count_words_upper', 'count_words_title',
                'count_stopwords', 'mean_word_len']]

In [29]:
test = pd.read_csv(testing_data)

In [42]:
testXXX

Unnamed: 0,condition,usefulCount,sentiment,day,month,Year,sentiment_clean_ss,count_word,count_unique_word,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len
0,42,1,0.012500,10,9,2017,0.012500,105,73,539,23,8,15,45,4.142857
1,64,0,-0.102381,29,8,2017,-0.102381,78,68,439,16,2,7,31,4.641026
2,90,13,0.020455,6,7,2015,0.020455,70,54,359,31,5,7,23,4.142857
3,221,7,-0.137753,7,7,2011,-0.137753,112,79,574,19,7,13,54,4.133929
4,150,11,-0.017045,1,10,2009,-0.017045,143,86,708,32,16,25,69,3.958042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793,128,176,0.151937,9,8,2014,0.104650,141,89,682,35,9,13,68,3.843972
1794,234,8,0.021693,26,3,2016,0.021693,58,44,293,13,6,8,23,4.068966
1795,90,2,0.240774,12,8,2015,0.216369,72,53,330,14,9,11,36,3.597222
1796,177,58,-0.066667,2,2,2010,-0.049167,132,88,721,23,6,19,59,4.469697


In [43]:
temp_array = rf0.predict(testXXX)

In [44]:
len(temp_array)

1798

In [45]:
import csv
test = pd.read_csv(testing_data)
test['rating'] = temp_array

In [47]:
test.to_csv('data/testing2.csv')