<a href="https://colab.research.google.com/github/hw-tan/Capstone-Project/blob/main/3_Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

BERT, or Bidirectional Encoder Representations from Transformers, is a language representation model that is pre-trained on a huge amount of plain text from the web. The model can be fine-tuned with an additional output layer and can handle a wide range of tasks. [Read more](https://arxiv.org/abs/1810.04805).

LaBSE, or Language-agnostic BERT Sentence Embedding, is a multilingual adaptation of BERT. [Read more](https://arxiv.org/abs/2007.01852)

In this notebook, we process the title data (the description of each item) from the Shopee dataset and feed it into a pre-trained BERT model to extract word embeddings of the titles.

With the word embeddings, we will find each title's nearest neighbors and select a cut-off score to determine which other titles can be classified as duplicates.



In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Show which GPU (if any) is attached to this runtime
!nvidia-smi

Wed Aug  4 04:02:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     7W /  75W |      0MiB /  7611MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# New Section

In [None]:
!pip install bert-for-tf2

import bert



In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.neighbors import NearestNeighbors

# Neural Network
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense



#NLP libraries
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [None]:
# Load the training metadata from the mounted Drive folder
directory = '/content/drive/MyDrive/Capstone/'
train = pd.read_csv(f'{directory}Data/train.csv')

# Full path of every listing's image file
train['filepath'] = train['image'].map(
    lambda name: f'{directory}Data/train_images/{name}').to_numpy()

# For each grouping key, build a {key value -> unique posting_ids} lookup and
# attach to every row the postings that share its key:
#   label_group -> 'matches'           (matching products)
#   image_phash -> 'image_duplicates'  (images with an identical phash)
# After the loop `label_dict` holds the image_phash lookup.
for key_col, out_col in (('label_group', 'matches'),
                         ('image_phash', 'image_duplicates')):
    label_dict = train.groupby(key_col)['posting_id'].unique().to_dict()
    train[out_col] = train[key_col].map(label_dict)

Preprocessing documents

In [None]:
# Download the NLTK stop-word corpus; stopwords.words() reads from it when the
# stop-word list is built later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Create function to clean text

def tokenize_text(text, stop_words=None):
    """Lower-case a title and reduce it to space-separated word tokens.

    Every maximal run of word characters (letters, digits, underscore) is
    kept as one token; punctuation and other symbols only act as separators
    and are dropped.

    Parameters
    ----------
    text : str
        Raw title.
    stop_words : collection of str, optional
        Currently unused -- no stop words are removed. The parameter is kept
        so existing calls of the form ``tokenize_text(title, stop_words)``
        keep working.

    Returns
    -------
    str
        The tokens joined by single spaces (an empty string when the title
        contains no word characters).
    """
    # Function-scope import keeps this cell runnable on its own.
    import re

    # Raw string: '\w' in a normal string literal is an invalid escape and
    # triggers a warning on recent Python versions. re.findall with this
    # pattern yields the same tokens as RegexpTokenizer(pattern=r'\w+') without
    # constructing a tokenizer object for every row.
    tokens = re.findall(r'\w+', text.lower())

    return ' '.join(tokens)


In [None]:
# Combined stop-word list: Indonesian entries first, then English
stop_words = [
    word
    for language in ('indonesian', 'english')
    for word in stopwords.words(language)
]

In [None]:
# Clean every title and keep the results as a NumPy array of strings
# (the DataFrame itself is not modified)
cleaned_titles = train['title'].apply(tokenize_text, args=(stop_words,))
token = cleaned_titles.to_numpy()

TFIDF Vectorizer Embeddings

In [None]:
# TF-IDF vectorizer created with its default settings
tvec = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the cleaned titles and transform them in one step
tfidf_embedding = tvec.fit_transform(token)


In [None]:
# Inspect the container type returned by the vectorizer
type(tfidf_embedding)

scipy.sparse.csr.csr_matrix

In [None]:
# Persist the sparse TF-IDF matrix to Drive. `directory` already ends with '/',
# so no extra separator is inserted before 'Data' (previously this produced
# '.../Capstone//Data/...').
sparse.save_npz(f'{directory}Data/tfidf_embedding.npz', tfidf_embedding)

LaBSE Embeddings

https://tfhub.dev/google/LaBSE/1

In [None]:
def get_model(model_url, max_seq_length):
  """Build a Keras model that outputs L2-normalised LaBSE embeddings.

  Args:
    model_url: TF Hub handle handed to hub.KerasLayer.
    max_seq_length: Fixed length of each of the three integer inputs.

  Returns:
    A (model, labse_layer) tuple: the Keras model mapping
    [input_word_ids, input_mask, segment_ids] to the normalised pooled
    output, and the hub layer the model was built from.
  """
  def int_input(name):
    # The three model inputs share shape and dtype and differ only by name.
    return tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name=name)

  labse_layer = hub.KerasLayer(model_url, trainable=True)

  model_inputs = [
      int_input("input_word_ids"),
      int_input("input_mask"),
      int_input("segment_ids"),
  ]

  # The hub layer returns two outputs; only the first (pooled) one is used.
  pooled_output, _ = labse_layer(model_inputs)

  # Scale every embedding to unit L2 norm along the feature axis.
  unit_norm = tf.keras.layers.Lambda(
      lambda x: tf.nn.l2_normalize(x, axis=1))

  model = tf.keras.Model(inputs=model_inputs, outputs=unit_norm(pooled_output))
  return model, labse_layer


In [None]:
# Number of token positions fed to the model per title; this count includes
# the [CLS] and [SEP] markers added when the inputs are created.
max_seq_length = 64

# Build the embedding model and keep the hub layer for tokenizer setup
labse_model, labse_layer = get_model(
    "https://tfhub.dev/google/LaBSE/1", max_seq_length)

In [None]:
# Read the vocabulary path and casing flag stored with the hub module and
# build a matching BERT tokenizer from them
labse_assets = labse_layer.resolved_object
vocab_file = labse_assets.vocab_file.asset_path.numpy()
do_lower_case = labse_assets.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
def create_input(input_strings, tokenizer, max_seq_length):
  """Convert raw strings into fixed-length BERT-style model inputs.

  Each string is tokenized, wrapped in "[CLS]" ... "[SEP]", converted to ids
  and then padded with 0 up to `max_seq_length`. Over-long strings are
  truncated *before* the special tokens are added, so every sequence still
  ends with "[SEP]".

  Args:
    input_strings: Iterable of strings to encode.
    tokenizer: Object providing `tokenize(str)` and
      `convert_tokens_to_ids(list_of_tokens)`.
    max_seq_length: Length of every output row; must be at least 2 so the two
      special tokens fit.

  Returns:
    A tuple (input_ids, input_mask, segment_ids) of int32 NumPy arrays, each
    of shape (number of strings, max_seq_length). `input_mask` is 1 on real
    tokens and 0 on padding; `segment_ids` is all zeros.

  Raises:
    ValueError: If `max_seq_length` is smaller than 2.
  """
  if max_seq_length < 2:
    raise ValueError(
        "max_seq_length must be at least 2 to fit [CLS] and [SEP]")

  # Two positions are reserved for [CLS] and [SEP]; truncating the word pieces
  # (rather than the finished id list) keeps the closing [SEP] in place.
  max_tokens = max_seq_length - 2

  input_ids_all, input_mask_all = [], []
  for input_string in input_strings:
    # Tokenize input.
    word_pieces = tokenizer.tokenize(input_string)[:max_tokens]
    input_tokens = ["[CLS]"] + word_pieces + ["[SEP]"]
    input_ids = list(tokenizer.convert_tokens_to_ids(input_tokens))

    # Padding (id 0) up to the fixed length; the mask marks real tokens.
    sequence_length = len(input_ids)
    padding_length = max_seq_length - sequence_length

    input_ids_all.append(input_ids + [0] * padding_length)
    input_mask_all.append([1] * sequence_length + [0] * padding_length)

  # int32 matches the dtype declared for the model inputs; the reshape keeps
  # the result two-dimensional even when `input_strings` is empty.
  input_ids_arr = np.array(
      input_ids_all, dtype=np.int32).reshape(-1, max_seq_length)
  input_mask_arr = np.array(
      input_mask_all, dtype=np.int32).reshape(-1, max_seq_length)
  segment_ids_arr = np.zeros_like(input_ids_arr)

  return input_ids_arr, input_mask_arr, segment_ids_arr


In [None]:
def encode(input_text):
  """Embed a batch of strings with the LaBSE model.

  Uses the notebook-level `tokenizer`, `max_seq_length` and `labse_model`.

  Args:
    input_text: Iterable of strings to embed.

  Returns:
    The output of `labse_model` for the batch.
  """
  model_inputs = create_input(input_text, tokenizer, max_seq_length)
  return labse_model(list(model_inputs))

### Generate Word Embedding

In [None]:
# To deal with the dataset size, we run the model in groups
group_size = 1000

# Integer group indices, so slice bounds need no casting and the status line
# reads "Group 0 completed" rather than "Group 0.0 completed"
groups = range(int(np.ceil(len(train) / group_size)))

# Create empty list for embeddings
embeddings = []

for i in groups:
  # Start and end index of this group within `token`
  start = i * group_size
  end = start + group_size

  # Generate embeddings for the group and convert them to a NumPy array
  # straight away, so the list only ever holds plain arrays rather than
  # model output tensors
  embeddings.append(np.asarray(encode(token[start:end])))

  # Print status
  print(f'Group {i} completed')

train_word_embeddings = np.concatenate(embeddings)

# Every title must have received exactly one embedding row
assert len(train_word_embeddings) == len(train)

# Delete the temporary list to free memory
del embeddings

Group 0.0 completed
Group 1.0 completed
Group 2.0 completed
Group 3.0 completed
Group 4.0 completed
Group 5.0 completed
Group 6.0 completed
Group 7.0 completed
Group 8.0 completed
Group 9.0 completed
Group 10.0 completed
Group 11.0 completed
Group 12.0 completed
Group 13.0 completed
Group 14.0 completed
Group 15.0 completed
Group 16.0 completed
Group 17.0 completed
Group 18.0 completed
Group 19.0 completed
Group 20.0 completed
Group 21.0 completed
Group 22.0 completed
Group 23.0 completed
Group 24.0 completed
Group 25.0 completed
Group 26.0 completed
Group 27.0 completed
Group 28.0 completed
Group 29.0 completed
Group 30.0 completed
Group 31.0 completed
Group 32.0 completed
Group 33.0 completed
Group 34.0 completed


In [None]:
# Write the LaBSE embedding matrix to Drive in .npy format
labse_path = directory + 'Data/labse_embeddings.npy'
np.save(labse_path, train_word_embeddings)