# Img Embs + BERT emb concatenation for posting matching
We concatenate the image embeddings with the BERT title embeddings (each reduced with PCA) and match postings by cosine similarity.

# 0. Setup

### Imports

In [1]:
#! pip install annoy
#! pip install efficientnet
#! pip install cupy
#import gc
# Install EfficientNet and Keras-Applications from wheel files that are attached
# to the notebook as an input dataset.
# NOTE(review): presumably installed from local wheels because the submission
# kernel runs without internet access — confirm.
!pip install ../input/keras-efficientnet-whl/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/keras-efficientnet-whl/efficientnet-1.1.1-py3-none-any.whl

Processing /kaggle/input/keras-efficientnet-whl/Keras_Applications-1.0.8-py3-none-any.whl
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Processing /kaggle/input/keras-efficientnet-whl/efficientnet-1.1.1-py3-none-any.whl
Installing collected packages: efficientnet
Successfully installed efficientnet-1.1.1


In [2]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import cupy as cp # for efficiently calculating cosine similarity
from tqdm.auto import tqdm # same too
import glob
import os.path
from PIL import Image
from efficientnet.preprocessing import center_crop_and_resize
import time
from efficientnet.tfkeras import EfficientNetB3
import matplotlib.image as mpimg
import json
#from annoy import AnnoyIndex
from scipy import spatial
import math
#from tensorflow.keras.applications import EfficientNetB0

In [3]:
# we'll use AutoModel and AutoTokenizer for easy use
from transformers import AutoModel, AutoTokenizer
from transformers import get_linear_schedule_with_warmup

# import torch
import torch

In [4]:
# Directories holding the competition images for each split.
train_path =  "../input/shopee-product-matching/train_images/"
test_path = "../input/shopee-product-matching/test_images/"
# NOTE(review): the three "drive/My Drive/..." paths look like leftovers from a
# Google Drive setup; no visible cell reads them — confirm before removing.
path_emb = "drive/My Drive/NN/train/emb/"
path_emb_test = "drive/My Drive/NN/train/emb_test/"
path_tfs = "drive/My Drive/NN/train/tfs/"
# NOTE(review): no visible cell reads this checkpoint path (the image model
# loads its weights from an .h5 file instead) — confirm before removing.
ckp_pth = "../input/ef-ckpt/cp-0000.ckpt"

# 1. Shopee Data

### Get data

<font color="red">NOTE: </font> Set `DATA = "test"` when submitting.

In [5]:
# Which split the notebook runs on. It selects the CSV that is loaded and which
# of the "train" (F1 evaluation) / "test" (submission) branches below execute.
DATA = "test" # set to "test" when submitting

In [6]:
import csv

# Read the selected split into `data`: a list with one plain dict per CSV row,
# keyed by column name.
with open("../input/shopee-product-matching/"+DATA+".csv", "r", encoding="utf8") as f:
    data = list(map(dict, csv.DictReader(f, skipinitialspace=True)))

In [7]:
# Posting ids in file order; displayed below as a quick sanity check.
posting_ids = [record["posting_id"] for record in data]
posting_ids[0]

'test_2255846744'

In [8]:
# Listing titles in file order; these are the texts fed to the tokenizer later.
descriptions = [record["title"] for record in data]
descriptions[0]

'Edufuntoys - CHARACTER PHONE ada lampu dan musik/ mainan telepon'

### Create index2ID dictionary
This dictionary maps row indices to posting ids, following the order of the loaded data. Indices are 0-based: if posting id `'test_2255846744'` is in the row at index 5 (i.e. the <b>6th</b> row), then `index2ID[5] = 'test_2255846744'`. The dictionary is used later on to convert the indices of an instance's most similar instances into the required submission format.

In [9]:
# Row position (0-based) -> posting id, in the same order as `data`.
index2ID = {position: record["posting_id"] for position, record in enumerate(data)}

# 2. Image embeddings


### Model setup

In [10]:
# Target size every image is resized to; also the spatial shape of the model input.
IMAGE_SIZE = [512, 512]
# Number of classes of the ArcMargin head; needed to rebuild the model before
# the saved weights are loaded.
N_CLASSES = 11014
# Images per batch in the tf.data inference pipeline.
BATCH_SIZE = 8
# Lets tf.data pick the parallelism / prefetch depth on its own.
AUTO = tf.data.experimental.AUTOTUNE

In [11]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    ArcFace-style head: cosine logits with an additive angular margin.

    The layer takes ``[features, labels]`` and returns scaled logits of shape
    (batch, n_classes). For the entry of each sample's labelled class the
    cosine ``cos(theta)`` is replaced by ``cos(theta + m)``; every other entry
    keeps the plain cosine. The result is multiplied by ``s``.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):
        '''
        Args:
            n_classes: number of classes (columns of the weight matrix).
            s: scale factor applied to the final logits.
            m: angular margin, in radians.
            easy_margin: selects which fallback rule is used in ``call``.
            ls_eps: label-smoothing amount; values <= 0 disable smoothing.
            **kwargs: forwarded to ``tf.keras.layers.Layer``.
        '''

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Precomputed terms of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Threshold and penalty used by the non-easy-margin branch of ``call``.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the base layer config extended with this layer's constructor arguments.'''

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        '''Create the weight matrix ``W`` of shape (feature_dim, n_classes).

        ``input_shape`` is a pair (features shape, labels shape); only the
        features shape is used.
        '''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''Compute the margin-adjusted, scaled logits for ``inputs = (X, y)``.'''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine between each L2-normalised feature row and each L2-normalised
        # column of W.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # NOTE(review): if rounding pushes cosine**2 slightly above 1 the sqrt
        # argument turns negative and yields NaN — confirm whether a clip is needed.
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # phi = cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            # Apply the margin only where the cosine is positive.
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            # Where cosine <= cos(pi - m), fall back to the penalty cosine - mm.
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            # Label smoothing of the one-hot targets.
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted value for the labelled class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [12]:
def load_img(path):
    """Load one JPEG as a float32 batch of size 1.

    The file at ``path`` is decoded to 3 channels, resized to ``IMAGE_SIZE``,
    scaled to the [0, 1] range and given a leading batch axis.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.io.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled[tf.newaxis, ...]


def get_image_embeddings_2(model, nr, df, path=train_path):
    """Embed images one at a time with ``model`` and return the embeddings.

    Args:
        model: object with a Keras-style ``predict`` method.
        nr: unused; kept so existing positional calls keep working.
        df: iterable of row mappings that each have an ``"image"`` key (for
            example the list of dicts read from the CSV). A pandas DataFrame
            does NOT work here: iterating it yields column labels, not rows.
        path: directory prefix prepended to each image file name.

    Returns:
        A list with one squeezed NumPy embedding per row, in input order.
    """
    embs = []
    for i, row in enumerate(df):
        img = load_img(path + row["image"])
        features = model.predict(img)

        # `predict` returns a NumPy array, so drop the batch axis with NumPy.
        # CuPy routines are documented for cupy.ndarray inputs only.
        feature_set = np.squeeze(features)
        embs.append(feature_set)

        # Lightweight progress report every 1000 images.
        if i % 1000 == 0:
            print(i)
            print(feature_set.shape)
    return embs

In [13]:
# Load the metadata of the selected split and build the full path of every image.
# Any DATA value other than "test" falls back to the training split.
if DATA != "test":
    df = pd.read_csv('../input/shopee-product-matching/train.csv')
    image_paths = '../input/shopee-product-matching/train_images/' + df['image']
else:
    df = pd.read_csv('../input/shopee-product-matching/test.csv')
    image_paths = '../input/shopee-product-matching/test_images/' + df['image']


In [14]:
def read_image(image):
    """Read the file at path ``image`` and return the decoded, preprocessed image tensor."""
    return decode_image(tf.io.read_file(image))

def decode_image(image_data):
    """Decode JPEG bytes to 3 channels, resize to ``IMAGE_SIZE`` and scale to [0, 1] float32."""
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(decoded, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

def get_dataset(image):
    """Build a batched, prefetched ``tf.data`` pipeline that loads the images at the given paths."""
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Rebuild the full image + label graph (EfficientNetB3 backbone -> global average
# pooling -> ArcMargin head -> softmax) so the saved weights can be loaded into it.
margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.7,
            name='head/arc_margin',
            dtype='float32'
            )
inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
label = tf.keras.layers.Input(shape = (), name = 'inp2')
# weights=None: the backbone starts uninitialised; weights come from load_weights below.
x = EfficientNetB3(weights=None, include_top=False)(inp)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = margin([x, label])

output = tf.keras.layers.Softmax(dtype='float32')(x)

model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
model.load_weights('../input/new-weights2/EfficientNetB3_512_42V2.h5')

# Keep only the image input and cut the graph at an intermediate layer, so that
# predict() returns embeddings instead of class probabilities.
# NOTE(review): layers[-4] is presumably the GlobalAveragePooling2D output; this
# relies on Keras' internal layer ordering — confirm, or name the layer and use get_layer.
model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

# Embed the images in chunks of `chunk` paths; each chunk gets its own tf.data pipeline.
embeds = []
chunk = 5000
# Number of chunks as a float range; the slice bounds are cast back to int below.
iterator = np.arange(np.ceil(len(df) / chunk))
for j in iterator:
    a = int(j * chunk)
    b = int((j + 1) * chunk)
    image_dataset = get_dataset(image_paths[a:b])
    image_embeddings = model.predict(image_dataset)
    embeds.append(image_embeddings)
# Drop the Python reference to the model; it is not needed after this point.
del model
# Stack the per-chunk arrays into one array with one row per image.
# np.concatenate raises ValueError when `embeds` is empty (i.e. `df` has no rows).
image_embeddings = np.concatenate(embeds)

In [15]:
#embeddings = image_embeddings

In [16]:
from sklearn.decomposition import PCA
# n_components=0.9 keeps the smallest number of components that explains 90% of
# the variance; whiten=True rescales the kept components to unit variance.
# The PCA is fitted on the embeddings of whichever split is currently loaded.
pca = PCA(0.9, whiten = True).fit(image_embeddings)
img_embeddings_pca = pca.transform(image_embeddings)
img_embeddings_pca.shape

(3, 2)

In [17]:
#if DATA == 'train':
#with open('../input/img-emb-pca09whiten/img_emb_pca.npy', 'rb') as f:
#    img_embeddings_pca = np.load(f)

# 3. Description embeddings (BERT)

### Clear GPU memory

In [18]:
# clear GPU memory
from numba import cuda
# Select GPU 0, close its CUDA context, then select the device again.
# NOTE(review): presumably this releases the memory still held by TensorFlow
# before PyTorch loads BERT; TensorFlow may not be usable on the GPU afterwards
# in this kernel — confirm.
cuda.select_device(0)
cuda.close()
cuda.select_device(0)

<weakproxy at 0x7f74f31df0b0 to Device at 0x7f74f3e95f90>

### Model setup

In [19]:
# Directory of the uploaded (fine-tuned) checkpoint; model and tokenizer are
# both loaded from it.
MODEL_NAME = "../input/finetunebert2/archive(1)"

# AutoModel / AutoTokenizer pick the concrete classes from the checkpoint's config.
# padding_side="right" appends padding tokens after the real tokens.
bert_model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side = "right")

### Tokenize Data

In [20]:
# truncation=True cuts titles that are longer than the model's maximum input length
# padding=True pads every title to the length of the LONGEST title in this call
# (not to a fixed length of 512)
tokenized_descriptions = tokenizer(descriptions, truncation=True, padding=True)

In [21]:
# Token ids and attention masks, one entry per title. They are not tensors yet:
# each batch is converted with torch.tensor(...) inside the embedding loop.
inputs = tokenized_descriptions["input_ids"] # torch.tensor()
masks = tokenized_descriptions["attention_mask"]

### Split into batches

In [22]:
# Slice token ids and attention masks into consecutive batches of `batch_size`
# rows (the last batch may be shorter). Both lists are sliced with the same step,
# so batch k of the inputs lines up with batch k of the masks.
batch_size = 32
input_batches = [inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)]
mask_batches = [masks[i:i + batch_size] for i in range(0, len(masks), batch_size)]

### Connect to GPU

In [23]:
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
# move the model to the selected device (the batches are moved inside the loop)
bert_model = bert_model.to(device)

### Pass descriptions through pre-trained DistilBERT model and extract embeddings from last hidden states.

In [25]:
# Embed every title: run the model batch by batch (forward pass only) and keep,
# for each sequence, the last-hidden-state vector of its first token.
# The per-batch arrays are collected in a list and concatenated once at the end;
# concatenating onto a growing array inside the loop re-copies all earlier rows
# on every iteration.
batch_embeddings_list = []

# loop over batches
for batch in range(len(input_batches)):

    if batch % 100 == 0:
        print(f"batch {batch} / {len(input_batches)}")

    # Turn off the gradient (we only want to do the forward pass)
    with torch.no_grad():

        # convert the batch (plain lists) to tensors and move them to the device
        input_batch = torch.tensor(input_batches[batch]).to(device)
        mask_batch = torch.tensor(mask_batches[batch]).to(device)

        # element 0 of the model output is the last hidden state
        batch_last_hidden_states = bert_model(input_batch, attention_mask = mask_batch)

        # keep the vector at sequence position 0 for every row of the batch
        batch_embeddings = batch_last_hidden_states[0][:,0,:].cpu().numpy()

    batch_embeddings_list.append(batch_embeddings)

# One row per title, in input order.
# np.concatenate raises ValueError if there were no batches at all.
bert_embeddings = np.concatenate(batch_embeddings_list)

batch 0 / 1


In [26]:
# Sanity check: one row per title, one column per hidden unit of the model.
print(bert_embeddings.shape)
bert_embeddings

(3, 768)


array([[-0.4534102 ,  0.43418622,  0.14663725, ...,  0.38369814,
        -0.3333526 , -0.27270252],
       [-0.19466557,  0.16739792,  0.13205379, ...,  0.6204294 ,
        -0.50449145, -0.5414049 ],
       [-0.41036266,  0.32696   ,  0.14186713, ...,  0.41688055,
        -0.44970396, -0.4980893 ]], dtype=float32)

### Clear GPU memory

In [27]:
# Drop the references to the model and tokenizer, then ask PyTorch to release
# its cached GPU memory.
del bert_model
del tokenizer
torch.cuda.empty_cache()

# 4. Concatenate embeddings

In [28]:
from sklearn.decomposition import PCA
# Same reduction as for the image embeddings: keep the components that explain
# 90% of the variance and whiten them. Note that `pca` is re-bound here, so the
# PCA fitted on the image embeddings is no longer reachable under this name.
pca = PCA(0.9,whiten=True).fit(bert_embeddings)
bert_pca = pca.transform(bert_embeddings)
bert_pca.shape

(3, 2)

In [29]:
# One row per posting: [image PCA components | title PCA components].
# Both parts must have the same number of rows, in the same posting order.
concat_embeddings = np.concatenate((img_embeddings_pca,bert_pca), axis=1)
concat_embeddings.shape

(3, 4)

### PCA

In [30]:
#from sklearn.decomposition import PCA
#pca = PCA(0.9,whiten=True).fit(concat_embeddings)
#embeddings_pca2 = pca.transform(concat_embeddings)
#embeddings_pca2.shape

# 5. Get most similar instances for each instance


### [<font color = "red"><b>DATA == "train"</b></font>] Get posting ids from `label_group` and get <b><u>ground truth matches</u></b> for training data

Here we want to get all the posting ids for a certain label group and use this to get the ground truth matches for each training instance.

In [31]:
if DATA == "train":

    # Collect, for every label group, the posting ids that belong to it, in order
    # of first appearance. Ids are de-duplicated by exact match (dict keys).
    # A substring test against the joined string would wrongly treat e.g.
    # "train_12" as already present once "train_123" is in the group.
    group2members = {}
    for row in data:
        group2members.setdefault(row["label_group"], {})[row["posting_id"]] = None

    # Label groups in order of first appearance
    # i.e. list of ["249114794", ...]
    label_groups = list(group2members)

    # Space-separated posting ids per label group
    # i.e. {"249114794": "train_2464356923", "34911679": "train_2464356923 train_3464356923", ...}
    group2postings = {
        label_group: " ".join(members)
        for label_group, members in group2members.items()
    }

    # Ground truth matches: for every row, all postings of its label group
    true_matches = [group2postings[row["label_group"]] for row in data]

In [32]:
if DATA == "train":
    # Show the ground-truth match strings of the first 8 rows.
    print(true_matches[:8])

### Match all postings that have the same perceptual hash


In [33]:
def match_hashes(rows=None):
    """Match postings whose images share the same perceptual hash.

    Args:
        rows: re-iterable collection of row mappings with ``"posting_id"`` and
            ``"image_phash"`` keys. Defaults to the module-level ``data``.

    Returns:
        A list aligned with ``rows``: for each row, the space-separated posting
        ids of all rows with the same ``image_phash`` (the row itself included),
        in order of first appearance.
    """
    if rows is None:
        rows = data

    # Posting ids per hash, de-duplicated by exact match (dict keys keep
    # insertion order). A substring test against the joined string would
    # wrongly skip e.g. "train_1" once "train_12" is already in the group.
    hash2members = {}
    for row in rows:
        hash2members.setdefault(row["image_phash"], {})[row["posting_id"]] = None

    # i.e. {"94974f937d4c2433": "train_2464356923 train_3464356923", ...}
    hash2postings = {
        hash_: " ".join(members) for hash_, members in hash2members.items()
    }

    # Predicted matches, one string per input row
    return [hash2postings[row["image_phash"]] for row in rows]

In [34]:
# Per-row matches based on identical perceptual hashes only; these are later
# merged with the embedding-based matches.
baseline_matches = match_hashes()

### Convert from indices to submission format
What we do is calculate the similarity between one test instance and all test instances. The result for one instance is a list of the indices of the most similar instances. For example, we might get `[0, 2, 1]`, meaning instance 0 is most similar to instance 0, instance 2 and instance 1. We then need to convert this list to the string <b>'test_2255846744 test_4015706929 test_3588702337'</b>. This is where our `index2ID` dictionary comes in handy ;). We define a function to help us do this conversion:

In [35]:
def indices_to_submission_format(indices):
    """Translate row indices into a space-separated string of posting ids via ``index2ID``."""
    posting_ids_for_indices = (index2ID[index] for index in indices)
    return " ".join(posting_ids_for_indices)

### Predict matches for each instance
For this we define a function `predict_matches` that creates a list of the matches for every instance. We use cosine similarity.  

In [36]:
def predict_matches(baseline_matches, embeddings, threshold):
    """Predict the matches of every posting from cosine similarity.

    Args:
        baseline_matches: list of space-separated posting-id strings, one per
            row; always included in that row's result.
        embeddings: 2-D array with one embedding row per posting.
        threshold: a posting j is matched to posting i when the cosine
            similarity of their embeddings is strictly greater than this value.

    Returns:
        A list with one space-separated posting-id string per row. Ids found
        through the embeddings come first (in row order), followed by baseline
        ids not already listed, so the output is the same on every run.
    """
    matches = []
    print(embeddings.shape)

    # L2-normalise the rows so that a dot product equals the cosine similarity.
    # A zero-norm row becomes NaN and therefore never exceeds the threshold.
    emb = cp.array(embeddings)
    emb = emb / cp.linalg.norm(emb, axis=1)[:, None]
    N = emb.shape[0]

    for i in tqdm(range(N)):
        similarities = cp.dot(emb, emb[i, :])
        # indices of all rows above the threshold, copied back to the host
        thresholded_ix = cp.argwhere(similarities > threshold).squeeze(-1).get()
        embedding_ids = indices_to_submission_format(thresholded_ix).split()

        # Ordered union: dict keys de-duplicate while keeping insertion order,
        # unlike a set, whose iteration order for strings varies between runs.
        merged_ids = dict.fromkeys(embedding_ids + baseline_matches[i].split())
        matches.append(" ".join(merged_ids))

    return matches

### [<font color = "red"><b>DATA == "train"</b></font>] F1-score on training data
Here we determine how well our predicted matches are on the training data.

In [37]:
def f1(y_true, y_pred):
    """Mean per-row F1 score between true and predicted match strings.

    Args:
        y_true: sequence of space-separated posting-id strings (ground truth).
        y_pred: sequence of space-separated posting-id strings (predictions),
            aligned with ``y_true``.

    Returns:
        The average of the per-row F1 scores as a NumPy float.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    if len(true_sets) != len(pred_sets):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(true_sets)} and {len(pred_sets)}"
        )

    # F1 = 2*tp / (|true| + |pred|), which equals 2*P*R / (P + R) whenever that
    # is defined. A row without any true positive scores 0 instead of the
    # 0/0 = NaN that would otherwise turn the whole mean into NaN.
    scores = np.array(
        [
            2.0 * len(true & pred) / (len(true) + len(pred)) if (true or pred) else 0.0
            for true, pred in zip(true_sets, pred_sets)
        ],
        dtype=float,
    )
    return scores.mean()

### [<font color = "red"><b>DATA == "train"</b></font>] Hyperparameter search for the threshold $t$
<b>(Or inference in testing case)</b>

We need to **define a threshold** to determine what counts as a similar instance. For example, we might set the threshold to 0.998, meaning we consider two test instances the same if their cosine similarity is greater than 0.998.

In [38]:
# Cosine-similarity threshold shared by the train evaluation and the test inference.
THRESHOLD = 0.46

if DATA == "train":
    # Candidate thresholds to evaluate; add more values here to run a search.
    for t in [THRESHOLD]: # 0.42 | PCA 0.9
        train_predictions = predict_matches(baseline_matches, concat_embeddings, t)
        # f1 takes (y_true, y_pred) and already returns the mean over all rows.
        print(f"t: {t}, F1: {f1(true_matches, train_predictions)}")
elif DATA == "test":
    matches = predict_matches(baseline_matches, concat_embeddings, THRESHOLD)

(3, 4)


  0%|          | 0/3 [00:00<?, ?it/s]

### Train results

| $t$ | F1 |
|----|----|
|0.3 | 0.663|
|0.35 |0.68277    |
|<b>0.4</b> | <b>0.6875</b>    |
| 0.42 | 0.6871  |
| 0.45 | 0.6839  |
| 0.5| 0.6783  |
| 0.7  |  0.63760 |
| 0.8 | 0.6144  |

# 6. [<font color = "red"><b>DATA == "test"</b></font>] Submit
Finally we can create the submission.csv file

In [39]:
if DATA == "test":
    # One row per posting: its id and the space-separated ids of its predicted matches.
    # `posting_ids` and `matches` are both in file order, so the rows line up.
    submission = pd.DataFrame({"posting_id": posting_ids, "matches": matches})
    submission.to_csv("submission.csv", index=False)

In [40]:
if DATA == "test":
    # Sanity check: show the header and the first two rows of the written file.
    !head -n 3 /kaggle/working/submission.csv

posting_id,matches
test_2255846744,test_2255846744
test_3588702337,test_3588702337
