# Setting up the Environment for Google Colab

In [1]:
!yes | pip install torch==1.8.0

# Importing Utils and Reading Data

In [2]:
import torch

# Pick the compute device once; later cells pass `dev` to the model and dataset.
dev = "cuda" if torch.cuda.is_available() else "cpu"
if dev == "cuda":
    print(f"Available gpu is :{torch.cuda.get_device_name(dev)}.")
else:
    # torch.cuda.get_device_name() only accepts CUDA devices, so calling it
    # with "cpu" would raise exactly in the situation this branch handles.
    print("GPU not available: running on CPU.")

In [3]:
import sys

# Make the bundled `geffnet` package importable. The membership check keeps
# repeated executions of this cell from stacking duplicate entries on sys.path.
GEFFNET_DIR = "../input/geffnet-20200820"
if GEFFNET_DIR not in sys.path:
    sys.path.insert(0, GEFFNET_DIR)

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
from torch import multiprocessing as mp
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel
import os, cv2 as cv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import geffnet
import albumentations as A
from sklearn.metrics import f1_score

In [5]:
# Locations of the training metadata table and of the image files it references.
csv, img_dir = (
    "../input/shopee-product-matching/train.csv",
    "../input/shopee-product-matching/train_images/",
)

In [6]:
# Load the training metadata into a DataFrame and preview it.
dataset = pd.read_csv(filepath_or_buffer=csv)
dataset.head()  # first five rows (the default)

In [7]:
# Preparing dataset

class ShopData(Dataset):
    """Dataset that yields one listing (image plus title) per row of `df`.

    Args:
        df: table with at least the columns `image`, `title` and `posting_id`.
        img_dir: directory that the `image` file names are relative to.
        tokenizer: optional callable applied to the title; it is called with
            `padding='max_length'`, `truncation=True`, `max_length=100` and
            `return_tensors='pt'`. When None, the raw title is returned instead.
        transformer: optional albumentations-style callable invoked as
            `transformer(image=img)['image']`. Defaults to a 256x256 resize
            followed by `A.Normalize()`.
        device: device that the returned tensors are moved to.
    """

    def __init__(self, df=dataset, img_dir=img_dir, tokenizer=None,
                 transformer=A.Compose([A.Resize(256, 256), A.Normalize()]), device="cpu"):
        self.df = df                  # imported csv table
        self.img_dir = img_dir        # image directory path
        self.transform = transformer
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the sample at positional `index` as a dict.

        Keys: `posting_id`, `image` (float32 tensor, channels first) and either
        `input_ids` + `attention_mask` (tokenizer given) or `title` (no tokenizer).

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        row = self.df.iloc[index]
        path = os.path.join(self.img_dir, row.image)

        img = cv.imread(path)
        if img is None:
            # cv.imread signals failure by returning None instead of raising,
            # which would otherwise surface as an obscure cvtColor error.
            raise FileNotFoundError(f"Could not read image: {path}")
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

        if self.transform is not None:
            img = self.transform(image=img)['image']
        # Always hand the model float32 in channels-first layout, whether or
        # not a transform was applied (previously this only happened with one).
        img = img.astype(np.float32).transpose(2, 0, 1)

        item = {'posting_id': row.posting_id,
                'image': torch.tensor(img).to(self.device)}

        if self.tokenizer is not None:
            encoded = self.tokenizer(row.title, padding='max_length', truncation=True,
                                     max_length=100, return_tensors='pt')
            # return_tensors='pt' adds a leading batch axis of size 1; strip it
            # so the DataLoader can add its own.
            item['input_ids'] = encoded['input_ids'][0].to(self.device)
            item['attention_mask'] = encoded['attention_mask'][0].to(self.device)
        else:
            # Without a tokenizer there are no token tensors to return; expose
            # the raw title rather than failing on undefined variables.
            item['title'] = row.title

        return item

# Preparing Model

In [11]:
# Text encoder and its matching tokenizer, both read from the same local directory.
text_model, tokenizer = (
    AutoModel.from_pretrained('../input/bert-base-uncased'),
    AutoTokenizer.from_pretrained('../input/bert-base-uncased'),
)
# Image backbone; `pretrained=None` is falsy, so presumably no pretrained
# weights are fetched here (a checkpoint is loaded into the full network later).
image_model = geffnet.create_model('tf_efficientnet_b0_ns', pretrained=None)

  ## Network

In [12]:
class Network(nn.Module):
    """Joint image + text embedder producing unit-length 512-d vectors.

    The image backbone output and the second element of the text model's
    output are concatenated, projected to 512 features, passed through ReLU
    and L2-normalised.

    Args:
        bert: text model called as `bert(input_ids, attention_mask)`.
            Defaults to the notebook-level `text_model`.
        enet: image model; its `classifier` attribute is replaced by
            `nn.Identity()`. Defaults to the notebook-level `image_model`.
    """

    def __init__(self, bert=None, enet=None):
        super().__init__()
        # Fall back to the notebook globals so that `Network()` keeps working.
        self.bert = text_model if bert is None else bert
        self.enet = image_model if enet is None else enet
        # Lazy layer: the input width is inferred on first use (or when a
        # state dict is loaded), so the concatenated size need not be known here.
        self.feat = nn.LazyLinear(512)
        self.relu = nn.ReLU()

        # Remove the classification head so the backbone returns features.
        # Note: this mutates the module that was passed in.
        self.enet.classifier = nn.Identity()

    def forward(self, image, input_ids, attention_mask):
        """Embed a batch; returns a `(batch, 512)` float tensor with unit L2 rows."""
        # `.float()` converts the dtype but keeps the tensor on its current
        # device; the previous `torch.cuda.FloatTensor` cast failed without a GPU.
        image = self.enet(image.float())
        text = self.bert(input_ids, attention_mask)
        # text[1] is the second output of the text model (the pooled sentence
        # vector for BERT-style models — TODO confirm for other encoders).
        cat = torch.cat([image, text[1]], dim=1)
        cat = self.feat(cat)
        cat = self.relu(cat)
        cat = nn.functional.normalize(cat)

        return cat

# Finding Similar Embeddings with Cosine Similarity

In [13]:
BATCH_SIZE = 32

model = Network().to(dev)

# `map_location` remaps the stored tensors onto `dev`, so a checkpoint written
# from a GPU session still loads on a CPU-only host.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load('../input/shopee-b0-bert/b0ns_256_bert_20ep_fold0_epoch27.pth',
                        map_location=dev)
state_dict = checkpoint['model_state_dict']
# Drop the final entry of the checkpoint: presumably a training-only head that
# `Network` does not define (verify against the training code). With
# strict=True, every remaining key must match the model exactly.
state_dict = dict(list(state_dict.items())[:-1])
model.load_state_dict(state_dict, strict=True)

data = ShopData(device=dev, tokenizer=tokenizer)
data_loader = DataLoader(data, batch_size=BATCH_SIZE)

In [14]:
# Run the whole dataset through the network in inference mode, collecting the
# embedding matrix and the posting ids of every batch.
embeddings, posting_id = [], []
model.eval()
with torch.no_grad():
    for batch in tqdm(data_loader):
        vectors = model(batch['image'], batch['input_ids'], batch['attention_mask'])
        embeddings.append(vectors.detach().cpu().numpy())
        posting_id.append(batch['posting_id'])

In [15]:
# Flatten the per-batch lists into one array of embeddings and one of posting ids.
embeds = np.concatenate(embeddings, axis=0)
pids = np.concatenate(posting_id, axis=0)

In [16]:
# Pairwise cosine similarity between all embeddings (square matrix, one row per listing).
sim = cosine_similarity(X=embeds)

In [45]:
# Positions of the listings whose similarity to the example row exceeds 0.5.
# Comparing directly yields the boolean mask; multiplying the scores by that
# mask first only allocated a temporary (and let NaN scores count as matches).
query_row = 3
idx = np.nonzero(sim[query_row] > 0.5)  # tuple of index arrays, usable as pids[idx]

In [46]:
# Show the image of every listing matched for the example row.
# Build the posting_id -> file name lookup once instead of rebuilding and
# scanning the full id list on each iteration. `drop_duplicates` keeps the first
# occurrence, matching what `list.index` returned before.
image_by_pid = (
    dataset.drop_duplicates('posting_id')
    .set_index('posting_id')['image']
    .to_dict()
)
for pid in pids[idx]:
    img = plt.imread(os.path.join(img_dir, image_by_pid[pid]))
    plt.imshow(img)
    plt.title(pid)
    plt.show()

In [19]:
# For each listing, collect the posting ids of ALL listings whose similarity
# to it exceeds 0.5.
#
# The earlier version iterated over (row, col) pairs and indexed posting ids
# with the whole pair, so each row ended up with only [its own id, the id of
# its last match]; rows with no match were missing entirely. Working row by
# row fixes both and avoids materialising every above-threshold pair at once.
pid_values = dataset.posting_id.values
predictions = {
    row: pid_values[np.flatnonzero(sim[row] > 0.5)]
    for row in range(sim.shape[0])
}

In [26]:
# Store the matches as a new column; this relies on `predictions` holding one
# entry per row of `dataset`, inserted in row order. The dict is then released.
dataset['predictions'] = [*predictions.values()]
del predictions

In [30]:
dataset.head()  # preview the first five rows, now including `predictions`

In [31]:
# Ground truth for each row: the unique posting ids that share its label_group.
ids_by_group = dataset.groupby('label_group')['posting_id'].unique().to_dict()
dataset['target'] = dataset['label_group'].map(ids_by_group)

In [33]:
dataset.head()  # preview the first five rows, now including `target`

# F1 Score

In [48]:
# Mean per-row F1 between the true and the predicted set of posting ids:
#   F1 = 2 * |target ∩ predicted| / (|target| + |predicted|)
# Both sides are deduplicated first. np.intersect1d already deduplicates the
# numerator, so counting raw lengths in the denominator would under-score any
# row whose prediction repeats an id.
f1 = []
for target, predicted in zip(dataset.target, dataset.predictions):
    target_ids = np.unique(target)
    predicted_ids = np.unique(predicted)
    n_common = len(np.intersect1d(target_ids, predicted_ids, assume_unique=True))
    f1.append(2 * n_common / (len(target_ids) + len(predicted_ids)))
print("F1 Score: ", np.mean(f1))