In [1]:
import io
import re
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Mount Google Drive at the relative path 'gdrive' so the dataset CSVs can be
# read from it. NOTE(review): `files` is imported but not used in the cells
# shown here.
from google.colab import drive, files
drive.mount('gdrive')

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


In [3]:
# Locations of the training and test CSVs inside the mounted Drive folder
training_file = 'gdrive/My Drive/shopee/train.csv'
test_file = 'gdrive/My Drive/shopee/test.csv'

In [4]:
# Read both CSVs into dataframes (training first, then test)
training_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (training_file, test_file)
)

In [5]:
# Preview the first five training rows
training_data.head(n=5)

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsn...,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its ...,1


In [6]:
# Number of training samples (row count of the dataframe)
len(training_data)

146811

In [7]:
# Preview the first five test rows
test_data.head(n=5)

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg model..."
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...
4,5,it's sooooo cute! i like playing with the glit...


In [8]:
# Convert rating to score with vectorised column arithmetic (same values as a
# per-row lambda, without the Python-level loop).
# score = rating / 5 - 0.1, so ratings 1..5 become 0.1, 0.3, 0.5, 0.7, 0.9 —
# the midpoints of the 0.2-wide buckets that score_to_rating maps back to 1..5.
training_data['score'] = training_data.rating / 5 - 0.1

In [9]:
# Pull the review texts out as a plain Python list and peek at the last one
reviews = training_data['review'].tolist()
print(reviews[-1])

 Excellent product quality excellent product price is very good delivery speed


In [10]:
# Fit a tokenizer on the texts and encode them as padded id sequences
def tokenize(texts, num_words=None):
    """Fit a Keras tokenizer on `texts` and encode those same texts.

    Args:
        texts: Strings used both to build the vocabulary and to be encoded.
        num_words: Optional vocabulary cap forwarded to the Keras `Tokenizer`.

    Returns:
        A `(sequences, tokenizer)` tuple: the post-padded integer sequences
        returned by `pad_sequences`, and the fitted tokenizer so that other
        texts can be encoded with the same vocabulary.
    """
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        oov_token='<UNK>',
        filters='',  # empty filter string: no characters are stripped
    )
    text_tokenizer.fit_on_texts(texts)

    # No maxlen is given, so pad_sequences picks the width itself.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        text_tokenizer.texts_to_sequences(texts),
        padding='post',
    )
    return padded, text_tokenizer

In [11]:
# Encode the training reviews with a vocabulary capped at 8000 words
sequences, tokenizer = tokenize(texts=reviews, num_words=8000)

In [12]:
# Regression targets: the 'score' column as a plain Python list
scores = training_data['score'].tolist()

In [13]:
# Create training and validation sets using an 80-20 split.
# random_state is fixed so the shuffle — and therefore every validation
# metric computed from this split — is repeatable across kernel restarts.
x_train, x_validation, y_train, y_validation = train_test_split(
    sequences,
    scores,
    test_size=0.2,
    random_state=42
)

In [14]:
# Build the tf.data input pipelines
BATCH_SIZE = 64
BUFFER_SIZE = len(x_train)  # shuffle buffer spans the whole training set

# Training pipeline: shuffle, then batch
train_set = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Validation pipeline: batched only, order left as-is
validation_set = (
    tf.data.Dataset.from_tensor_slices((x_validation, y_validation))
    .batch(BATCH_SIZE)
)

In [15]:
# Model: embedding -> bidirectional LSTM -> dense -> single sigmoid unit.
# The sigmoid output is read as a score and later bucketed by score_to_rating.
# NOTE(review): the embedding is created without mask_zero, so the LSTM also
# sees the padded positions — confirm this is intended.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=tokenizer.num_words,
        output_dim=64,
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=64)
    ),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [16]:
# Compile for regression on the score: mean absolute error as the loss,
# Adam with a 1e-4 learning rate. The metric repeats the loss, so both
# reported numbers are the same quantity.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.MeanAbsoluteError(),
    metrics=['mean_absolute_error'],
)

In [17]:
# Checkpoint callback: writes the model to 'model.h5' only when the monitored
# value improves, printing a line each epoch (verbose=1)
callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.h5',
    verbose=1,
    save_best_only=True,
)

In [18]:
# Fit for up to 20 epochs, validating after each one; the checkpoint callback
# keeps the best model on disk
history = model.fit(
    train_set,
    validation_data=validation_set,
    epochs=20,
    callbacks=[callback],
)

Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.14677, saving model to model.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.14677 to 0.13944, saving model to model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.13944 to 0.13839, saving model to model.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.13839 to 0.13672, saving model to model.h5
Epoch 5/20
Epoch 00005: val_loss improved from 0.13672 to 0.13663, saving model to model.h5
Epoch 6/20
Epoch 00006: val_loss improved from 0.13663 to 0.13620, saving model to model.h5
Epoch 7/20
Epoch 00007: val_loss improved from 0.13620 to 0.13521, saving model to model.h5
Epoch 8/20
Epoch 00008: val_loss did not improve from 0.13521
Epoch 9/20
Epoch 00009: val_loss improved from 0.13521 to 0.13386, saving model to model.h5
Epoch 10/20
Epoch 00010: val_loss improved from 0.13386 to 0.13329, saving model to model.h5
Epoch 11/20
Epoch 00011: val_loss did not improve from 0.13329
Epoch 12/20
Epoch 00012: val_loss did not improve 

In [19]:
# Replace the in-memory model with the checkpoint saved to 'model.h5', then
# report its loss and metric on the validation set
model = tf.keras.models.load_model(filepath='model.h5')
model.evaluate(validation_set)



[0.13164809346199036, 0.13164809346199036]

In [20]:
# Predict validation scores and flatten the model output into a plain list
score_prediction = model.predict(x_validation).flatten().tolist()

In [21]:
# Map scores back to 1-5 ratings
def score_to_rating(input_list):
    """Convert scores into integer ratings from 1 to 5.

    The score axis is cut at 0.2, 0.4, 0.6 and 0.8. A score's rating is one
    plus the number of cut points it has reached, so anything below 0.2 maps
    to 1 and anything at or above 0.8 maps to 5.

    Args:
        input_list: Iterable of numeric scores.

    Returns:
        A list of ints, one rating per input score, in the same order.
    """
    cut_points = (0.2, 0.4, 0.6, 0.8)
    # `not (value < cut)` instead of `value >= cut`: a value that compares
    # False against everything (NaN) must still land in the top bucket.
    return [
        1 + sum(not (value < cut) for cut in cut_points)
        for value in input_list
    ]

In [22]:
# Bucket both the predicted and the true validation scores into ratings.
# Despite its name, `actual_prediction` holds the ground-truth ratings
# recovered from y_validation.
rating_prediction = score_to_rating(input_list=score_prediction)
actual_prediction = score_to_rating(input_list=y_validation)

In [23]:
# Fraction of validation reviews whose predicted rating matches the true one
accuracy_score(y_true=actual_prediction, y_pred=rating_prediction)

0.4623505772570923

In [24]:
# Encode the test reviews with the tokenizer fitted on the training reviews,
# then pad/cut them to the same width as the training sequences
test_reviews = test_data['review'].tolist()
test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_reviews),
    maxlen=sequences.shape[1],
    padding='post',
)
print(test_reviews[-1])

Rapid response and detail ...
Thanks gan, the goods have been received well n packing a secure ....


In [25]:
# Predict test scores and flatten the model output into a plain list
prediction = model.predict(test_sequences).flatten().tolist()

In [26]:
# Bucket the test scores into 1-5 ratings (rebinds `prediction` from scores
# to ratings)
prediction = score_to_rating(input_list=prediction)

In [27]:
# Attach the predicted ratings to the test dataframe as a new 'rating' column
# (one list entry per test row, in row order)
test_data['rating'] = prediction

In [28]:
# Write the submission file: review_id and predicted rating only, with a
# header row and without the dataframe index
test_data[['review_id', 'rating']].to_csv(
    'submission.csv',
    header=True,
    index=False,
)