In [6]:
from __future__ import division, print_function
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from random_forest.random_forest_model import RandomForest

from textblob import TextBlob
from nltk.corpus import stopwords
import warnings; warnings.simplefilter('ignore')
import nltk
import string
from nltk import ngrams
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import LabelEncoder

In [7]:
# local path of the data source
testing_data = 'data/testing.csv'
training_data = 'data/training.csv'
validation_data = 'data/validation.csv'

In [8]:
def review_clean(review): 
    # changing to lower case
    lower = review.str.lower()
    # Replacing the repeating pattern of &#039;
    pattern_remove = lower.str.replace("&#039;", "")
    # Removing all the special Characters
    special_remove = pattern_remove.str.replace(r'[^\w\d\s]',' ')
    # Removing all the non ASCII characters
    ascii_remove = special_remove.str.replace(r'[^\x00-\x7F]+',' ')
    # Removing the leading and trailing Whitespaces
    whitespace_remove = ascii_remove.str.replace(r'^\s+|\s+?$','')
    # Replacing multiple Spaces with Single Space
    multiw_remove = whitespace_remove.str.replace(r'\s+',' ')
    # Replacing Two or more dots with one
    dataframe = multiw_remove.str.replace(r'\.{2,}', ' ')
    return dataframe

In [9]:
def sentiment(review):
    # Sentiment polarity of the reviews
    pol = []
    for i in review:
        analysis = TextBlob(i)
        pol.append(analysis.sentiment.polarity)
    return pol

### train data 

In [10]:
# three data given by the COMP5434 project
train = pd.read_csv(training_data)
validation = pd.read_csv(validation_data)
test = pd.read_csv(training_data)

data = pd.concat([train, validation])
data['review_clean'] = review_clean(data['reviewComment'])
# Removing the stopwords
stop_words = set(stopwords.words('english'))
data['review_clean'] = data['review_clean'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
# Removing the word stems using the Snowball Stemmer
Snow_ball = SnowballStemmer("english")
data['review_clean'] = data['review_clean'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

data['sentiment'] = sentiment(data['reviewComment'])
data['sentiment_clean'] = sentiment(data['review_clean'])
# Cleaning the reviews without removing the stop words and using snowball stemmer
data['review_clean_ss'] = review_clean(data['reviewComment'])
data['sentiment_clean_ss'] = sentiment(data['review_clean_ss'])
data = data.dropna(how="any", axis=0)
#Word count in each review
data['count_word']=data["review_clean_ss"].apply(lambda x: len(str(x).split()))
#Unique word count 
data['count_unique_word']=data["review_clean_ss"].apply(lambda x: len(set(str(x).split())))
#Letter count
data['count_letters']=data["review_clean_ss"].apply(lambda x: len(str(x)))
#punctuation count
data["count_punctuations"] = data["reviewComment"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
#upper case words count
data["count_words_upper"] = data["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
#title case words count
data["count_words_title"] = data["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
#Number of stopwords
data["count_stopwords"] = data["reviewComment"].apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]))
#Average length of the words
data["mean_word_len"] = data["review_clean_ss"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# Label Encoding Drugname and Conditions
label_encoder_feat = {}
for feature in ['drugName', 'condition']:
    label_encoder_feat[feature] = LabelEncoder()
    data[feature] = label_encoder_feat[feature].fit_transform(data[feature])

# converting the date into datetime format
data['date'] = pd.to_datetime(data['date'], errors = 'coerce')

# now extracting year from date
data['Year'] = data['date'].dt.year

# extracting the month from the date
data['month'] = data['date'].dt.month

# extracting the days from the date
data['day'] = data['date'].dt.day

data.loc[(data['rating'] >= 5), 'Review_Sentiment'] = 1
data.loc[(data['rating'] < 5), 'Review_Sentiment'] = 0

data['Review_Sentiment'].value_counts()



0.0    4155
1.0    4004
Name: Review_Sentiment, dtype: int64

### test data


In [16]:
# three data given by the COMP5434 project

test['review_clean'] = review_clean(test['reviewComment'])
# Removing the stopwords
stop_words = set(stopwords.words('english'))
test['review_clean'] = test['review_clean'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
# Removing the word stems using the Snowball Stemmer
Snow_ball = SnowballStemmer("english")
test['review_clean'] = test['review_clean'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

test['sentiment'] = sentiment(test['reviewComment'])
test['sentiment_clean'] = sentiment(test['review_clean'])
# Cleaning the reviews without removing the stop words and using snowball stemmer
test['review_clean_ss'] = review_clean(test['reviewComment'])
test['sentiment_clean_ss'] = sentiment(test['review_clean_ss'])
test = test.dropna(how="any", axis=0)
#Word count in each review
test['count_word']=test["review_clean_ss"].apply(lambda x: len(str(x).split()))
#Unique word count 
test['count_unique_word']=test["review_clean_ss"].apply(lambda x: len(set(str(x).split())))
#Letter count
test['count_letters']=test["review_clean_ss"].apply(lambda x: len(str(x)))
#punctuation count
test["count_punctuations"] = test["reviewComment"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
#upper case words count
test["count_words_upper"] = test["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
#title case words count
test["count_words_title"] = test["reviewComment"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
#Number of stopwords
test["count_stopwords"] = test["reviewComment"].apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]))
#Average length of the words
test["mean_word_len"] = test["review_clean_ss"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# Label Encoding Drugname and Conditions
label_encoder_feat = {}
for feature in ['drugName', 'condition']:
    label_encoder_feat[feature] = LabelEncoder()
    test[feature] = label_encoder_feat[feature].fit_transform(test[feature])

# converting the date into datetime format
test['date'] = pd.to_datetime(test['date'], errors = 'coerce')

# now extracting year from date
test['Year'] = test['date'].dt.year

# extracting the month from the date
test['month'] = test['date'].dt.month

# extracting the days from the date
test['day'] = test['date'].dt.day

test.loc[(test['rating'] >= 5), 'Review_Sentiment'] = 1
test.loc[(test['rating'] < 5), 'Review_Sentiment'] = 0

test['Review_Sentiment'].value_counts()



0.0    3553
1.0    3412
Name: Review_Sentiment, dtype: int64

In [17]:
print('Training data shape : ', X_train.shape, y_train.shape)
print('Testing data shape : ', X_test.shape, y_test.shape)

NameError: name 'X_train' is not defined

In [18]:
b = "'@#$%^()&*;!.-"
X_train = np.array(train['reviewComment'])
X_test = np.array(validation['reviewComment'])

def clean(X):
    for index, review in enumerate(X):
        for char in b:
            X[index] = X[index].replace(char, "")
    return(X)

X_train = clean(X_train)
X_test = clean(X_test)
print(X_train[:2])

['"I039ve tried a few antidepressants over the years citalopram, fluoxetine, amitriptyline, but none of those helped with my depression, insomnia amp anxiety My doctor suggested and changed me onto 45mg mirtazapine and this medicine has saved my life Thankfully I have had no side effects especially the most common  weight gain, I039ve actually lost alot of weight I still have suicidal thoughts but mirtazapine has saved me"'
 '"My son has Crohn039s disease and has done very well on the Asacol  He has no complaints and shows no side effects  He has taken as many as nine tablets per day at one time  I039ve been very happy with the results, reducing his bouts of diarrhea drastically"']


In [19]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from tensorflow.keras.utils import to_categorical
from gensim.models import Word2Vec
from nltk.cluster import KMeansClusterer
import nltk

vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'),lowercase=True, max_features=5000)
#vectorizer = TfidfVectorizer(binary=True, stop_words=stopwords.words('english'), lowercase=True, max_features=5000)
test_train = np.concatenate([X_train, X_test])
print(test_train.shape)
X_onehot = vectorizer.fit_transform(test_train)
stop_words = vectorizer.get_stop_words()
print(type(X_onehot))

2022-04-01 14:47:18.772116: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-01 14:47:18.772157: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


(8198,)
<class 'scipy.sparse._csr.csr_matrix'>


In [None]:
print(X_onehot.shape)
print(X_onehot.toarray())

In [None]:
names_list = vectorizer.get_feature_names()
names = [[i] for i in names_list]
names = Word2Vec(names, min_count=1)

In [None]:
def score_transform(X):
    y_reshaped = np.reshape(X['rating'].values, (-1, 1))
    for index, val in enumerate(y_reshaped):
        if val >= 8:
            y_reshaped[index] = 1
        elif val >= 5:
            y_reshaped[index] = 2
        else:
            y_reshaped[index] = 0
    y_result = to_categorical(y_reshaped)
    return y_result
    
    print(X_onehot)

In [None]:
y_train_test = pd.concat([train, test], ignore_index=True)
y_train = score_transform(y_train_test)
print(y_train)
print(y_train.shape)

In [None]:
from numpy.random import seed

np.random.seed(1)
model = Sequential()
model.add(Dense(units=256, activation='relu', input_dim=len(vectorizer.get_feature_names())))
model.add(Dense(units=3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary

In [None]:
import tensorflow as tf
X_onehot = tf.sparse.reorder(X_onehot)

In [22]:
features = data[['condition', 'usefulCount', 'sentiment', 'day', 'month', 'Year',
                'sentiment_clean_ss', 'count_word', 'count_unique_word', 'count_letters',
                'count_punctuations', 'count_words_upper', 'count_words_title',
                'count_stopwords', 'mean_word_len', 'rating', 'review_clean']]
df_train, df_test = train_test_split(features, test_size=0.33, random_state=42) 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

vectorizer = CountVectorizer(analyzer = 'word', 
                             tokenizer = None,
                             preprocessor = None, 
                             stop_words = None, 
                             min_df = 2, # 토큰이 나타날 최소 문서 개수
                             ngram_range=(4, 4),
                             max_features = 20000
                            )
vectorizer

In [None]:
#https://stackoverflow.com/questions/28160335/plot-a-document-tfidf-2d-graph
pipeline = Pipeline([
    ('vect', vectorizer),
])

In [None]:
%time train_data_features = pipeline.fit_transform(df_train['review_clean'])
%time test_data_features = pipeline.fit_transform(df_test['review_clean'])

In [21]:
train_data_features = df_train[['condition', 'usefulCount', 'sentiment', 'day', 'month', 'Year',
                'sentiment_clean_ss', 'count_word', 'count_unique_word', 'count_letters',
                'count_punctuations', 'count_words_upper', 'count_words_title',
                'count_stopwords', 'mean_word_len', 'rating', 'review_clean']]
test_data_features = df_test[['condition', 'usefulCount', 'sentiment', 'day', 'month', 'Year',
                'sentiment_clean_ss', 'count_word', 'count_unique_word', 'count_letters',
                'count_punctuations', 'count_words_upper', 'count_words_title',
                'count_stopwords', 'mean_word_len', 'rating', 'review_clean']]
%time train_data_features = pipeline.fit_transform(train_data_features)
%time test_data_features = pipeline.fit_transform(test_data_features)

NameError: name 'df_train' is not defined

In [None]:
from tensorflow.python.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, BatchNormalization, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
import random

# 1. Dataset
y_train = df_train['rating']
y_test = df_test['rating']
solution = y_test.copy()

# 2. Model Structure
model = keras.models.Sequential()

model.add(keras.layers.Dense(200, input_shape=(3426,)))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dropout(0.5))

model.add(keras.layers.Dense(300))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Activation('relu'))
model.add(keras.layers.Dropout(0.5))

model.add(keras.layers.Dense(100, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# 3. Model compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2022-04-01 14:47:32.169308: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-01 14:47:32.169345: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-01 14:47:32.169368: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (VM-20-12-ubuntu): /proc/driver/nvidia/version does not exist
2022-04-01 14:47:32.169595: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 200)               685400    
                                                                 
 batch_normalization (BatchN  (None, 200)              800       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 200)               0         
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 300)               60300     
                                                                 
 batch_normalization_1 (Batc  (None, 300)              1200      
 hNormalization)                                        

In [25]:
# 4. Train model
train_data_features = train_data_features.toarray()
hist = model.fit(train_data_features, y_train, epochs=10, batch_size=64)

# 5. Traing process
%matplotlib inline
import matplotlib.pyplot as plt

fig, loss_ax = plt.subplots()

acc_ax = loss_ax.twinx()

loss_ax.set_ylim([0.0, 1.0])
acc_ax.set_ylim([0.0, 1.0])

loss_ax.plot(hist.history['loss'], 'y', label='train loss')
acc_ax.plot(hist.history['acc'], 'b', label='train acc')

loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
acc_ax.set_ylabel('accuray')

loss_ax.legend(loc='upper left')
acc_ax.legend(loc='lower left')

plt.show()

# 6. Evaluation
loss_and_metrics = model.evaluate(test_data_features, y_test, batch_size=32)
print('loss_and_metrics : ' + str(loss_and_metrics))

NameError: name 'train_data_features' is not defined