In [42]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report 
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import re

from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

In [43]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [44]:
# input pipeline that delivers data for the next step before the current step has finished.
# The tf.data API helps to build flexible and efficient input pipelines.
# This document demonstrates how to use the tf.data 
# API to build highly performant TensorFlow input pipelines.
AUTO = tf.data.experimental.AUTOTUNE
# upload data into google cloud storage
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
# Configuration
EPOCHS = 10
BATCH_SIZE = 24 * strategy.num_replicas_in_sync
MAX_LEN = 256
MODEL = 'albert-xlarge-v2' # MODEL = 'bert-base-multilingual-cased'

## Functions

In [45]:
def _PREPROCESS(text):
    """
    Function:     Acts as a sub function to the bigger _PREPROCESS.
                  This function seeks to only preprocess the questions
                     
    Input:        Questions column
       
    Returns:      A column of reprocessed questions
    """
    ## for manipulating the questions
    stop_words = stopwords.words('english')
    # stop_words.extend(['hi', 'hello', 'amp'])

    #ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    contractions = {
              "ain't": "am not / are not",
              "aren't": "are not / am not",
              "can't": "cannot",
              "can't've": "cannot have",
              "'cause": "because",
              "could've": "could have",
              "couldn't": "could not",
              "couldn't've": "could not have",
              "didn't": "did not",
              "doesn't": "does not",
              "don't": "do not",
              "hadn't": "had not",
              "hadn't've": "had not have",
              "hasn't": "has not",
              "haven't": "have not",
              "he'd": "he had / he would",
              "he'd've": "he would have",
              "he'll": "he shall / he will",
              "he'll've": "he shall have / he will have",
              "he's": "he has / he is",
              "how'd": "how did",
              "how'd'y": "how do you",
              "how'll": "how will",
              "how's": "how has / how is",
              "i'd": "I had / I would",
              "i'd've": "I would have",
              "i'll": "I shall / I will",
              "i'll've": "I shall have / I will have",
              "i'm": "I am",
              "i've": "I have",
              "isn't": "is not",
              "it'd": "it had / it would",
              "it'd've": "it would have",
              "it'll": "it shall / it will",
              "it'll've": "it shall have / it will have",
              "it's": "it has / it is",
              "let's": "let us",
              "ma'am": "madam",
              "mayn't": "may not",
              "might've": "might have",
              "mightn't": "might not",
              "mightn't've": "might not have",
              "must've": "must have",
              "mustn't": "must not",
              "mustn't've": "must not have",
              "needn't": "need not",
              "needn't've": "need not have",
              "o'clock": "of the clock",
              "oughtn't": "ought not",
              "oughtn't've": "ought not have",
              "shan't": "shall not",
              "sha'n't": "shall not",
              "shan't've": "shall not have",
              "she'd": "she had / she would",
              "she'd've": "she would have",
              "she'll": "she shall / she will",
              "she'll've": "she shall have / she will have",
              "she's": "she has / she is",
              "should've": "should have",
              "shouldn't": "should not",
              "shouldn't've": "should not have",
              "so've": "so have",
              "so's": "so as / so is",
              "that'd": "that would / that had",
              "that'd've": "that would have",
              "that's": "that has / that is",
              "there'd": "there had / there would",
              "there'd've": "there would have",
              "there's": "there has / there is",
              "they'd": "they had / they would",
              "they'd've": "they would have",
              "they'll": "they shall / they will",
              "they'll've": "they shall have / they will have",
              "they're": "they are",
              "they've": "they have",
              "to've": "to have",
              "wasn't": "was not",
              "we'd": "we had / we would",
              "we'd've": "we would have",
              "we'll": "we will",
              "we'll've": "we will have",
              "we're": "we are",
              "we've": "we have",
              "weren't": "were not",
              "what'll": "what shall / what will",
              "what'll've": "what shall have / what will have",
              "what're": "what are",
              "what's": "what has / what is",
              "what've": "what have",
              "when's": "when has / when is",
              "when've": "when have",
              "where'd": "where did",
              "where's": "where has / where is",
              "where've": "where have",
              "who'll": "who shall / who will",
              "who'll've": "who shall have / who will have",
              "who's": "who has / who is",
              "who've": "who have",
              "why's": "why has / why is",
              "why've": "why have",
              "will've": "will have",
              "won't": "will not",
              "won't've": "will not have",
              "would've": "would have",
              "wouldn't": "would not",
              "wouldn't've": "would not have",
              "y'all": "you all",
              "y'all'd": "you all would",
              "y'all'd've": "you all would have",
              "y'all're": "you all are",
              "y'all've": "you all have",
              "you'd": "you had / you would",
              "you'd've": "you would have",
              "you'll": "you shall / you will",
              "you'll've": "you shall have / you will have",
              "you're": "you are",
              "you've": "you have"}

    def contract(text):
        for word in text.split():
            if word.lower() in contractions:
                text = text.replace(word, contractions[word.lower()])
        return text
       
    def preprocess(text_column):
        """
        Function:     This NLP pre processing function takes in a sentence,
                      replaces all the useless letters and symbols, and takes 
                      out all the stop words. This would hopefully leave only 
                      the important key words
                            
        Input:        A list of sentences
              
        Returns:      A list of sentences that has been cleaned
        """
        # Remove link,user and special characters
        # And Lemmatize the words
        new_review = []
        for review in tqdm(text_column):
            # this text is a list of tokens for the review
            text = re.sub("w/", 'with', str(review).lower()).strip()
            text = re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9\U00010000-\U0010ffff]+", ' ', text)
            text = contract(text).split()
                     
            # Stemming and removing stopwords
            # text = [wnl.lemmatize(i) for i in text if i not in stop_words]
                     
            new_review.append(' '.join(text))
        return new_review
       
    text = preprocess(text)
    return text

In [46]:
def regular_encode(texts, tokenizer, maxlen = 512):
    """
    Function to encode the word
    """
    # encode the word to vector of integer
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [47]:
def build_model(transformer, max_len=512):
    """
    This function to build and compile Keras model
    
    """
    #Input: for define input layer
    #shape is vector with 512-dimensional vectors
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids") # name is optional 
    sequence_output = transformer(input_word_ids)[0]
    # to get the vector
    cls_token = sequence_output[:, 0, :]
    # define output layers
    out = Dense(5, activation = 'softmax')(cls_token)
    
    # initiate the model with inputs and outputs
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Nadam(lr=1e-5), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    
    return model

In [48]:
def split_data(data, n):
    ref = {i: j for i, j in data['rating'].value_counts().reset_index().values.tolist()}
    result = data.loc[data['rating'] == 1, :].sample(min(n, ref[1]))
    for i in range(2, 6):
        result = result.append(data.loc[data['rating'] == i, :].sample(min(n, ref[i])))
    return result

## Preprocessing

In [49]:
# importing dataset
data = pd.read_csv("/kaggle/input/shopee-sentiment-analysis/dataset/train.csv")
data = split_data(data, 18500)
data = data.sample(frac = 1)
train, val = train_test_split(data, test_size = 0.1)
test = pd.read_csv('/kaggle/input/shopee-sentiment-analysis/dataset/test.csv')

In [50]:
#use the pre-trained model bert as a tokenizer 
#bert tokenizer has vocabulary for emoji. this is the reason we don't need to remove emoji from 
#datasets, for more details see the (EDA & data cleaning) notebook

tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=685.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




In [51]:
%%time 
#call the function regular encode on for all the 3 dataset to convert each words after the tokenizer
#into a vector
#x_train,x_test, and x_validation will have the comment text column only,(in test called "content")
x_train = regular_encode(_PREPROCESS(train.review.values), tokenizer, maxlen=MAX_LEN)
x_val = regular_encode(_PREPROCESS(val.review.values), tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(_PREPROCESS(test.review.values), tokenizer, maxlen=MAX_LEN)

y_train = train.rating.values - 1
y_train = tf.keras.utils.to_categorical(y_train, 5)
y_val = val.rating.values - 1
y_val = tf.keras.utils.to_categorical(y_val, 5)

HBox(children=(FloatProgress(value=0.0, max=82935.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9215.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=60427.0), HTML(value='')))


CPU times: user 42 s, sys: 224 ms, total: 42.3 s
Wall time: 42.1 s


In [52]:
# Create a source dataset from your input data.
# Apply dataset transformations to preprocess the data.
# Iterate over the dataset and process the elements.
train_dataset = (
    tf.data.Dataset # create dataset
    .from_tensor_slices((x_train, y_train)) # Once you have a dataset, you can apply transformations 
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE)# Combines consecutive elements of this dataset into batches.
    .prefetch(AUTO) #This allows later elements to be prepared while the current element is being processed.
)
valid_dataset = (
    tf.data.Dataset # create dataset
    .from_tensor_slices((x_val, y_val)) # Once you have a dataset, you can apply transformations 
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE)# Combines consecutive elements of this dataset into batches.
    .prefetch(AUTO) #This allows later elements to be prepared while the current element is being processed.
)
test_dataset = (
    tf.data.Dataset# create dataset
    .from_tensor_slices(x_test) # Once you have a dataset, you can apply transformations 
    .batch(BATCH_SIZE)
)

## Build the Model

In [53]:
%%time
# in the TPU
with strategy.scope():
    #take the encoder results of bert from transformers and use it as an input in the NN model
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=251868920.0, style=ProgressStyle(descri…


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 256)]             0         
_________________________________________________________________
tf_albert_model (TFAlbertMod ((None, 256, 2048), (None 58724864  
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 2048)]            0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 10245     
Total params: 58,735,109
Trainable params: 58,735,109
Non-trainable params: 0
_________________________________________________________________
CPU times: user 19.7 s, sys: 3.45 s, total: 23.2 s
Wall time: 25.9 s


In [54]:
#train the model
# training the data and tune our model with the results of the metrics we get from the validation dataset
n_steps = x_train.shape[0] // BATCH_SIZE
val_steps = x_val.shape[0] // BATCH_SIZE
train_history = model.fit(train_dataset, 
                          steps_per_epoch = n_steps, 
                          validation_data = valid_dataset,
                          validation_steps = val_steps,
                          epochs = EPOCHS,
                          callbacks = [EarlyStopping(monitor='val_loss', min_delta = 0.01, patience = 1)])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


Testing on validation first

In [55]:
v_pred = model.predict(tf.data.Dataset.from_tensor_slices(x_val).batch(BATCH_SIZE), verbose = 1)

v_pred = np.argmax(v_pred, axis = 1) + 1

print(classification_report(np.argmax(y_val, axis = 1) + 1, v_pred))

              precision    recall  f1-score   support

           1       0.70      0.73      0.72      1835
           2       0.54      0.54      0.54      1800
           3       0.57      0.51      0.54      1833
           4       0.42      0.64      0.51      1885
           5       0.44      0.24      0.31      1862

    accuracy                           0.53      9215
   macro avg       0.54      0.53      0.52      9215
weighted avg       0.53      0.53      0.52      9215



Exporting file

In [56]:
pred = model.predict(test_dataset, verbose = 1)
pred = np.argmax(pred, axis = 1) + 1
result = pd.DataFrame({'review_id': test.review_id.values, 'rating': pred})
result.to_csv('submission_8 (BERT).csv', index = False)

