In [1]:
!pip install pytypo
!pip install emoji

Collecting pytypo
[?25l  Downloading https://files.pythonhosted.org/packages/9f/80/b0578690bcac288cf9af76abecf2fd30978ea75ebe67da817c018c444abb/pytypo-0.3.0.tar.gz (74kB)
[K     |████▍                           | 10kB 27.5MB/s eta 0:00:01[K     |████████▉                       | 20kB 2.8MB/s eta 0:00:01[K     |█████████████▎                  | 30kB 3.8MB/s eta 0:00:01[K     |█████████████████▋              | 40kB 4.2MB/s eta 0:00:01[K     |██████████████████████          | 51kB 3.4MB/s eta 0:00:01[K     |██████████████████████████▌     | 61kB 3.8MB/s eta 0:00:01[K     |███████████████████████████████ | 71kB 4.1MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 3.6MB/s 
[?25hBuilding wheels for collected packages: pytypo
  Building wheel for pytypo (setup.py) ... [?25l[?25hdone
  Created wheel for pytypo: filename=pytypo-0.3.0-cp36-none-any.whl size=72687 sha256=c63effc266a6189f44a9484da0260a53dc91f1ac6437e3226678f4dd0eae0dad
  Stored in directory: /root/

In [2]:
import io
import re
import pytypo
import numpy as np
import pandas as pd
import tensorflow as tf

from emoji import UNICODE_EMOJI
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Mount Google Drive at the relative path 'gdrive' so the CSV paths that
# start with 'gdrive/My Drive/...' can be read.
# NOTE(review): `files` is imported here but is not used in the visible cells.
from google.colab import drive, files
drive.mount('gdrive')

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


In [4]:
# Specify the location of training and test file.
# Both paths are relative to the working directory and go through the
# 'gdrive' mount point.
training_file = 'gdrive/My Drive/shopee/train.csv'
test_file = 'gdrive/My Drive/shopee/test.csv'

In [5]:
# Load both CSV files into pandas dataframes (default read_csv options).
training_data = pd.read_csv(training_file)
test_data = pd.read_csv(test_file)

In [6]:
# Show the first rows of the training data as the cell output
training_data.head()

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsn...,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its ...,1


In [7]:
# Show the number of rows (samples) in the training dataframe
training_data.shape[0]

146811

In [8]:
# Show the first rows of the test data as the cell output
test_data.head()

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg model..."
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...
4,5,it's sooooo cute! i like playing with the glit...


In [9]:
# Convert rating to score: score = rating / 5 - 0.1, computed on the whole
# column at once (1 -> 0.1, 2 -> 0.3, 3 -> 0.5, 4 -> 0.7, 5 -> 0.9).
training_data['score'] = training_data['rating'] / 5 - 0.1

In [10]:
# Check if a character is emoji
def is_emoji(char):
    """Return True when `char` is contained in `UNICODE_EMOJI`.

    NOTE(review): this assumes `UNICODE_EMOJI` is keyed directly by emoji
    strings; confirm against the installed emoji package version.
    """
    return char in UNICODE_EMOJI

# Add whitespaces and remove duplicate emoji
def preprocess_emoji(text):
    """Surround each distinct emoji with spaces and drop its repeats.

    The text is scanned character by character. Non-emoji characters are
    copied through unchanged. The first occurrence of an emoji is emitted as
    ' <emoji> '; any later occurrence of that same emoji anywhere in the
    text is skipped.
    """
    pieces = []
    seen = set()

    for symbol in text:
        if not is_emoji(symbol):
            pieces.append(symbol)
        elif symbol not in seen:
            seen.add(symbol)
            pieces.append(' ' + symbol + ' ')

    return ''.join(pieces)

In [11]:
# Function to check elongated word
def has_long(sentence):
    """Return True if `sentence` contains the same ASCII letter three or more
    times in a row (case-sensitive), e.g. 'sooo'; otherwise return False."""
    return re.search("([a-zA-Z])\\1{2,}", sentence) is not None

In [12]:
# Function to fix a sentence with elongated word
def fix_long(sentence):
    """Return `sentence` with every elongated word passed through pytypo.

    The sentence is split on single spaces; a word for which `has_long` is
    true is replaced by `pytypo.correct(word)`, every other word is kept
    as-is. The words are re-joined with single spaces, so the result no
    longer carries the extra trailing space that appending `word + ' '` for
    each word used to produce.
    """
    return ' '.join(
        pytypo.correct(word) if has_long(word) else word
        for word in sentence.split(' ')
    )

In [13]:
# Function for text preprocessing
def preprocessing(text):
    """Normalise one review string before tokenisation.

    Steps, in order:
      1. `preprocess_emoji`: isolate each distinct emoji, drop its repeats.
      2. Put a space on both sides of every punctuation character.
      3. Collapse runs of spaces / double quotes into a single space.
      4. If any elongated word is present, run `fix_long` on the text.

    Returns the processed string.
    """
    result = preprocess_emoji(text)

    # Create a space between a word and punctuation.
    # The class is spelled out explicitly: ? . ! , ¿ ( ) * + - /
    # It used to end in `)-/`, which the regex engine reads as the RANGE
    # ')'..'/' and therefore also matches '*' and '+'. The matched set is
    # kept exactly the same here, just without the accidental range.
    result = re.sub(r"([?.!,¿()*+\-/])", r" \1 ", result)

    # Collapse any run of spaces and/or double quotes into one space.
    # This is what the former `[" "]+` class did too: the quotes inside the
    # brackets are literal members of the class, so '"' is removed from
    # the text.
    result = re.sub(r'[" ]+', " ", result)

    # Fix elongated word (the check avoids splitting texts that need no fix)
    if has_long(result):
        result = fix_long(result)

    return result

In [14]:
# Get review data: preprocess every training review, then show the last one
reviews = [
    preprocessing(review) for review in training_data.review.tolist()
]
print(reviews[-1])

 Excellent product quality excellent product price is very good delivery speed


In [15]:
# Function to tokenize the words
def tokenize(texts, num_words=None):
    """Fit a Keras `Tokenizer` on `texts` and encode them.

    Args:
        texts: the strings to fit on and to encode.
        num_words: forwarded to the `Tokenizer` (None by default).

    Returns:
        A tuple `(sequences, tokenizer)`: the output of `pad_sequences`
        with padding='post' applied to the encoded texts, and the fitted
        tokenizer. The tokenizer is built with an empty `filters` string and
        '<UNK>' as the out-of-vocabulary token.
    """
    fitted = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        oov_token='<UNK>',
        filters=''
    )
    fitted.fit_on_texts(texts)

    padded = tf.keras.preprocessing.sequence.pad_sequences(
        fitted.texts_to_sequences(texts),
        padding='post'
    )

    return padded, fitted

In [16]:
# Tokenize the preprocessed training reviews, passing num_words=8000 to the
# tokenizer; keeps both the padded sequences and the fitted tokenizer.
sequences, tokenizer = tokenize(reviews, num_words=8000)

In [17]:
# Get score data: the 'score' column as a plain Python list
scores = training_data.score.tolist()

In [18]:
# Creating training and validation sets using an 80-20 split.
# random_state is fixed so the same rows land in each set on every run;
# without it the split changes on each execution.
x_train, x_validation, y_train, y_validation = train_test_split(
    sequences,
    scores,
    test_size=0.2,
    random_state=42
)

In [19]:
# Create tf dataset
BATCH_SIZE = 64
BUFFER_SIZE = len(x_train)  # shuffle buffer as large as the training set

# Training pipeline: shuffle, then batch
train_set = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Validation pipeline: batch only, no shuffling
validation_set = (
    tf.data.Dataset.from_tensor_slices((x_validation, y_validation))
    .batch(BATCH_SIZE)
)

In [20]:
# Define the model: embedding -> bidirectional LSTM -> dense -> sigmoid output
model = tf.keras.Sequential()
# Embedding table sized by the tokenizer's num_words, 64-dimensional vectors
model.add(tf.keras.layers.Embedding(tokenizer.num_words, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Single sigmoid unit: the predicted score lies between 0 and 1
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [21]:
# Compile the model: Adam with learning rate 1e-4, mean absolute error as
# both the loss and the reported metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.MeanAbsoluteError(),
    metrics=['mean_absolute_error']
)

In [22]:
# Create a callback that saves the model to 'model.h5'.
# save_best_only=True keeps only the best checkpoint; verbose=1 logs each
# decision (the recorded training log reports it in terms of val_loss).
callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.h5',
    save_best_only=True,
    verbose=1
)

In [23]:
# Train the model for 20 epochs, validating after each epoch and letting the
# checkpoint callback write 'model.h5'
history = model.fit(
    train_set,
    validation_data=validation_set,
    epochs=20,
    callbacks=[callback]
)

Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.14099, saving model to model.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.14099 to 0.13767, saving model to model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.13767 to 0.13615, saving model to model.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.13615 to 0.13552, saving model to model.h5
Epoch 5/20
Epoch 00005: val_loss improved from 0.13552 to 0.13437, saving model to model.h5
Epoch 6/20
Epoch 00006: val_loss improved from 0.13437 to 0.13389, saving model to model.h5
Epoch 7/20
Epoch 00007: val_loss did not improve from 0.13389
Epoch 8/20
Epoch 00008: val_loss improved from 0.13389 to 0.13241, saving model to model.h5
Epoch 9/20
Epoch 00009: val_loss did not improve from 0.13241
Epoch 10/20
Epoch 00010: val_loss improved from 0.13241 to 0.13121, saving model to model.h5
Epoch 11/20
Epoch 00011: val_loss improved from 0.13121 to 0.13106, saving model to model.h5
Epoch 12/20
Epoch 00012: val_loss improved from 0.

In [24]:
# Load the best model: re-read the checkpoint written to 'model.h5' during
# training (replacing the in-memory model) and evaluate it on the
# validation set.
model = tf.keras.models.load_model('model.h5')
model.evaluate(validation_set)



[0.12998637557029724, 0.12998637557029724]

In [25]:
# Make prediction on the validation inputs and flatten the model output
# into a plain list of scores
score_prediction = model.predict(x_validation).flatten().tolist()

In [26]:
# Function to convert score to rating
def score_to_rating(input_list):
    """Map each score to a 1-5 rating using fixed 0.2-wide bins.

    score < 0.2 -> 1, [0.2, 0.4) -> 2, [0.4, 0.6) -> 3, [0.6, 0.8) -> 4,
    anything else -> 5. Returns a new list with one rating per input score.
    """
    upper_bounds = (0.2, 0.4, 0.6, 0.8)

    def to_rating(score):
        # The first bound the score falls below decides the rating
        for rating, bound in enumerate(upper_bounds, start=1):
            if score < bound:
                return rating
        # Not below any bound: top rating
        return 5

    return [to_rating(score) for score in input_list]

In [27]:
# Convert score to rating for both the predicted and the actual validation
# scores, so they can be compared as discrete ratings
rating_prediction = score_to_rating(score_prediction)
rating_actual = score_to_rating(y_validation)

In [28]:
# Show accuracy of the predicted ratings against the actual ratings on the
# validation set
accuracy_score(rating_actual, rating_prediction)

0.46395123114123216

In [29]:
# Get test data: same preprocessing as the training reviews, encoded with
# the tokenizer fitted on the training set
test_reviews = [
    preprocessing(review) for review in test_data.review.tolist()
]
# Pad to the width of the training matrix
test_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_reviews),
    maxlen=sequences.shape[1],
    padding='post'
)
print(test_reviews[-1])

Rapid response and detail . . . 
Thanks gan , the goods have been received well n packing a secure . . . . 


In [30]:
# Predict the test data and flatten the model output into a plain list of
# scores
prediction = model.predict(test_sequences).flatten().tolist()

In [31]:
# Convert score to rating; `prediction` is rebound from the list of scores
# to the list of ratings
prediction = score_to_rating(prediction)

In [32]:
# Add prediction to dataframe as a new 'rating' column (modifies test_data)
test_data['rating'] = prediction

In [33]:
# Export to a csv file: only the review_id and rating columns, with a header
# row and without the dataframe index
submission = test_data[['review_id', 'rating']]
submission.to_csv('submission.csv', header=True, index=False)