In [None]:
# Third-party libraries used by the notebook.
import lightgbm as lgb
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Colab-only step: mount Google Drive so files under /content/drive are readable.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


# 1. load data

In [None]:
# Read the cleaned review file, keep only complete Bumble rows, and make a
# stratified 70/30 train/test split on the rating.
path = '/content/drive/MyDrive/DSO 560 NLP Team Project/'
df_1 = pd.read_csv(f'{path}clean_data/bumble_hinge_review.csv')

df = (
    df_1.loc[df_1['App'] == 'Bumble']
    .dropna()
    .reset_index()  # the previous row label becomes an 'index' column, renamed to 'id' next
    .rename(columns={'Review': 'text', 'App': 'app', 'Rating': 'score', 'index': 'id'})
    .loc[:, ['id', 'text', 'score', 'app']]
)

train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
    stratify=df['score'],
)

# 2. preprocessing

In [None]:
# Pull the review text (as a list) and the ratings (as an array) out of the training frame.
train_docs = train_df['text'].tolist()
train_labels = train_df['score'].to_numpy(copy=True)
train_labels

array([5, 2, 1, ..., 1, 2, 4])

In [None]:
# Same extraction for the held-out frame: review text as a list, ratings as an array.
test_docs = test_df['text'].tolist()
test_labels = test_df['score'].to_numpy(copy=True)

In [None]:
# Use spaCy to remove stop words from the review text.
import spacy

# NOTE(review): "pos" does not look like a component name in current
# en_core_web_sm pipelines — confirm whether it is needed in this list.
nlp = spacy.load('en_core_web_sm', disable=["ner", "pos", "tagger"])


def remove_stopwords(docs, nlp):
    """Return each document re-joined from its non-stop-word tokens.

    Args:
        docs: iterable of raw text strings.
        nlp: a loaded spaCy pipeline; only its tokenisation and the per-token
            ``is_stop`` flag are used.

    Returns:
        A list of strings, one per input document and in the same order,
        where the surviving token texts are joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # calling nlp() once per review; the resulting Doc objects come back in order.
    return [
        " ".join(token.text for token in doc if not token.is_stop)
        for doc in nlp.pipe(docs)
    ]


# The same cleaning is applied to both splits.
stopwords_removed_train_docs = remove_stopwords(train_docs, nlp)
stopwords_removed_test_docs = remove_stopwords(test_docs, nlp)

In [None]:
# Build the word -> integer id mapping from the cleaned training reviews only;
# the test reviews are encoded later with this same fitted tokenizer.
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    oov_token="UNKNOWN_TOKEN",  # placeholder for words outside the kept vocabulary
    num_words=500,              # cap on the word ids used when encoding
)
tokenizer.fit_on_texts(texts=stopwords_removed_train_docs)

In [None]:
def integer_encode_documents(docs, tokenizer):
    """Encode each document as a sequence of integer ids.

    Args:
        docs: the documents to encode.
        tokenizer: an object exposing ``texts_to_sequences``; here, the
            tokenizer already fitted on the training text.

    Returns:
        Whatever ``tokenizer.texts_to_sequences(docs)`` returns.
    """
    encoded = tokenizer.texts_to_sequences(docs)
    return encoded

In [None]:
# Turn both cleaned corpora into integer-id sequences with the fitted tokenizer.
encoded_train_docs, encoded_test_docs = (
    integer_encode_documents(corpus, tokenizer)
    for corpus in (stopwords_removed_train_docs, stopwords_removed_test_docs)
)

In [None]:
# Bring every encoded review to a fixed length of `max_length` ids.
# padding='post' appends zeros at the end of shorter sequences; no embeddings
# are produced in this cell.
# NOTE(review): reviews longer than max_length are cut by pad_sequences'
# default truncation rule — confirm which end it drops if that matters.
from keras.preprocessing.sequence import pad_sequences

max_length = 90
padded_train_docs, padded_test_docs = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (encoded_train_docs, encoded_test_docs)
)
padded_train_docs

array([[  1,   2,   1, ...,   0,   0,   0],
       [  1,   2,  28, ...,   0,   0,   0],
       [  1,  61,   1, ...,   0,   0,   0],
       ...,
       [262, 168,  79, ...,   0,   0,   0],
       [  2,  55, 113, ...,   0,   0,   0],
       [  3, 248,   0, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Size the embedding table to the word ids the tokenizer can actually emit,
# rather than an arbitrary 1.3x of the full word index.
# When the tokenizer has a `num_words` cap, texts_to_sequences only produces
# ids below that cap (rarer words are mapped to the OOV id), so no more rows
# than the cap are ever looked up. Without a cap, ids run from 1 to
# len(word_index), and one extra row is needed for the padding id 0.
full_vocab_rows = len(tokenizer.word_index) + 1
vocab_size = (
    min(tokenizer.num_words, full_vocab_rows)
    if tokenizer.num_words
    else full_vocab_rows
)
print(f"Vocab size is {vocab_size} unique tokens.")

Vocab size is 24397 unique tokens.


In [None]:
# Passed as the second argument (output dimension) of the Embedding layer when
# the model is built. It has the same value as max_length (90), but the two
# settings are defined independently.
EMBEDDING_SIZE = 90

# 3. construct model

In [None]:
# define a custom R-squared metric using the Keras backend,
# so it can be passed to model.compile(metrics=...)
from keras import backend as K
def r2_score(y_true, y_pred):
    """Coefficient of determination (R squared) built from Keras backend ops.

    Computes ``1 - SS_res / (SS_tot + epsilon)``, where SS_res is the sum of
    squared residuals and SS_tot is the sum of squared deviations of
    ``y_true`` from its mean. ``K.epsilon()`` keeps the division finite when
    every target is identical.

    NOTE(review): when used as a Keras metric this is presumably evaluated on
    each batch and then averaged, which differs from R squared over the whole
    dataset — confirm before quoting the number.

    Args:
        y_true: tensor of ground-truth targets.
        y_pred: tensor of model predictions.

    Returns:
        The R squared value for the values passed in.
    """
    residual = y_true - y_pred
    deviation = y_true - K.mean(y_true)
    ss_res = K.sum(K.square(residual))
    ss_tot = K.sum(K.square(deviation))
    return 1 - ss_res / (ss_tot + K.epsilon())

In [None]:
# Build a small Keras regression network:
# token ids -> learned embeddings -> flatten -> one output unit.
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Flatten

model = Sequential()
# One EMBEDDING_SIZE-wide vector per token id, for sequences of max_length ids.
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
# Concatenate the per-token vectors into a single feature vector per review.
model.add(Flatten())
# Linear output for regression on the rating. A single ReLU output unit can
# get stuck predicting 0 with zero gradient whenever its pre-activation is
# negative, so the unit is left unconstrained and the MSE loss pulls it
# towards the target range.
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[r2_score, 'mse'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 90, 90)            2195730   
                                                                 
 flatten_4 (Flatten)         (None, 8100)              0         
                                                                 
 dense_4 (Dense)             (None, 1)                 8101      
                                                                 
Total params: 2,203,831
Trainable params: 2,203,831
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 20 epochs on the padded training reviews, then score the fitted
# model on that same training data.
# NOTE(review): this evaluation reuses the training set, so it does not measure
# generalisation; padded_test_docs / test_labels are not used in this cell.
# NOTE(review): the model is compiled with extra metrics, so `loss` presumably
# holds a list (loss, r2_score, mse) rather than a single number — confirm.
model.fit(x=padded_train_docs, y=train_labels, verbose=1, epochs=20)
loss = model.evaluate(x=padded_train_docs, y=train_labels, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
