# Importing the Required Libraries

load all necessary libraries here

In [1]:
import copy
from googletrans import Translator
translator = Translator()
import pandas as pd 
import numpy as np
import re
import pickle
%matplotlib notebook
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer
from gensim.parsing.preprocessing import remove_stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense,LSTM,Bidirectional
from tensorflow.keras.layers import Flatten,Input
from tensorflow.keras.layers import GlobalMaxPooling1D,Conv1D,MaxPooling1D
from tensorflow.keras.layers import Embedding,concatenate
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras import regularizers
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import bert
import tensorflow_hub as hub

# function
import sys
sys.path.append('function/')
from ursar import nlp

# Importing the Dataset

In [None]:
test  = pd.read_csv('DATA/test_data_restaurant_rest.tsv', sep='\t',header=None)
test.columns = ['sentence', 'label']

# Text Preprocessing before translate

In [None]:
def preprocess_text(sentence):
    # Remove all the special characters
    # remove all the non-word characters (letters and numbers) from a string and keep the remaining characters
    sentence = re.sub(r'\W', ' ', str(sentence))

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    
    # Single character removal
    # Sometimes removing punctuation marks, such as an apostrophe, results in a single character which has no meaning
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Remove single characters from the start
    sentence = re.sub(r'\^[a-zA-Z]\s+', ' ', sentence)
    
    # Remove enter type to space
    sentence = sentence.replace("\n"," ")
    
    # Substituting multiple spaces with single space
    sentence = re.sub(r'\s+', ' ', sentence)
    
    #removes spaces from the start and end 
    sentence = re.sub(r"\s+$", "", sentence)
    sentence = re.sub(r"^\s+", "", sentence)
    
    # Converting to Lowercase
    sentence = sentence.lower()
    return(sentence)

# Translating the Sentences

In [None]:
translatedList = []
for index, row in test.iterrows():
    # REINITIALIZE THE API
    translator = Translator()
    newrow = copy.deepcopy(row)
    try:
        # translate the 'text' column
        translated = translator.translate(preprocess_text(row['sentence']), dest='en')
        newrow['translated'] = translated.text
    except Exception as e:
        print(str(e))
        continue
    translatedList.append(newrow)
    print(newrow)

In [None]:
qwer = pd.DataFrame(translatedList)

In [None]:
qwer["translated"].to_csv('DATA/test_raw_translate.txt',index=False,header=False)

## Save to CSV

In [None]:
pd.DataFrame(translatedList).to_csv('DATA/train_raw_translate.txt',index=False,header=False)

# Load file

In [2]:
train  = pd.read_csv('DATA/train_raw_translate.txt', header=None)
test = pd.read_csv('DATA/test_raw_translate.txt', header=None)
train.columns = ['sentence']
test.columns = ['sentence']
print("data training shape")
print(train.shape)
print("data testing shape")
print(test.shape)

data training shape
(1780, 1)
data testing shape
(185, 1)


In [3]:
# load label train dataset file here
with open('DATA/label_train', 'rb') as picklefile:
    y_train = pickle.load(picklefile)

# load label test dataset file here
with open('DATA/label_test', 'rb') as picklefile:
    y_test = pickle.load(picklefile)

# Text Preprocessing

In [4]:
stemmer = WordNetLemmatizer()

def preprocess_text(sen):
    # Converting to Lowercase
    sentence = sen.lower()
    # Lemmatization
    # reduce the word into dictionary root form
    sentence = sentence.split()
    sentence = [stemmer.lemmatize(word) for word in sentence]
    sentence = ' '.join(sentence)
    sentence = remove_stopwords(sentence)
    return sentence

In [5]:
X_train = []
sentences = list(train['sentence'])
for sen in sentences:
    X_train.append(preprocess_text(sen))

In [6]:
X_test = []
sentences = list(test['sentence'])
for sen in sentences:
    X_test.append(preprocess_text(sen))

# Creating a BERT Tokenizer

In order to use BERT text embeddings as input to train text classification model, we need to tokenize our text reviews. Tokenization refers to dividing a sentence into individual words. To tokenize our text, we will be using the BERT tokenizer.

# example BERT

In [8]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In the script above we first 

1. create an object of the FullTokenizer class from the bert.bert_tokenization module. 
2. create a BERT embedding layer by importing the BERT model from hub.KerasLayer. 
    The trainable parameter is set to False, which means that we will not be training the BERT embedding. 
3. create a BERT vocabulary file in the form a numpy array. 
4. set the text to lowercase 
5. pass our vocabulary_file and to_lower_case variables to the BertTokenizer object.

In [9]:
# tokenize a random sentence
tokenizer.tokenize("don't be so judgmental")
# get the ids of the tokens using the convert_tokens_to_ids() of the tokenizer object
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("dont be so judgmental"))

[2123, 2102, 2022, 2061, 8689, 2389]

In [10]:
def tokenize_reviews(text_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_reviews))

### to actually tokenize all the reviews in the input dataset

In [12]:
tokenized_train = [tokenize_reviews(review) for review in X_train]

In [13]:
tokenized_test = [tokenize_reviews(review) for review in X_test]

# Preparing Data For Training

To train the model, the input sentences should be of equal length. 
To create sentences of equal length, one way is to pad the shorter sentences by 0s. 
However, this can result in a sparse matrix contain large number of 0s. 
The other way is to pad sentences within each batch. Since we will be training the model in batches, 
we can pad the sentences within the training batch locally depending upon the length of the longest sentence.

In [15]:
# list contains tokenized review, the label of the review and 
#length of the review:
reviews_with_len = [[review, y_train[i], len(review)]
                 for i, review in enumerate(tokenized_train)]

In [16]:
# shuffle the review
import random
random.shuffle(reviews_with_len)

In [17]:
# sort the data by the length of the reviews
#sort base on third columns
reviews_with_len.sort(key=lambda x: x[2])

In [18]:
#remove the length attribute from all the reviews
sorted_reviews_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len]

# convert dataset for train TensorFlow 2.0 models

In [19]:
# train TensorFlow 2.0 models
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))

NameError: name 'tf' is not defined

# pad  dataset for each batch

The batch size we are going to use is 32 which means that after processing 32 reviews, the weights of the neural network will be updated and pad the reviews locally with respect to batches

In [29]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [30]:
# print the first batch and see how padding has been applied to it
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 21), dtype=int32, numpy=
 array([[ 3191,  1996,  2338,  5293,  1996,  3185,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 3078,  5436,  3078,  3257,  3532,  7613,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2054,  5896,  2054,  2466,  2054,  6752,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2062, 23873,  3993,  2062, 11259,  2172,  2172,  2062, 14888,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2876,  9278,  2023,  2028,  2130,  2006,  7922, 12635,  2305,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2023,  3185,  2003,  6659,  2021,  2009,  2038,  2070,  2204,
    

From the last five reviews, you can see that the total number of words in the largest sentence were 21.

in the first five reviews the 0s are added at the end of the sentences so that their total length is also 21. The padding for the next batch will be different depending upon the size of the largest sentence in the batch.