## 라이브러리 모음

In [1]:
!pip install ../input/external-model/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/external-model/efficientnet-1.1.0-py3-none-any.whl

# 파일 처리
import os

# Data 처리
import pandas as pd
import numpy as np

##############################################################

# RAPIDS 라이브러리
import cudf, cuml, cupy 
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
##############################################################

# ML, DNN, CNN 관련 라이브러리
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Input, GlobalAveragePooling2D, Softmax

#!pip install efficientnet
import efficientnet.tfkeras as efn
import math
##############################################################

# 이미지 및 그래프 출력
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

##############################################################

# 해쉬(phash) 값 처리
import imagehash

##############################################################

# Text Data NLP 처리
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.neighbors import NearestNeighbors
import re
import nltk
# nltk.download('popular')
# nltk.download('stopwords')

from shutil import copyfile
copyfile(src = "../input/bert-baseline/tokenization.py", dst = "../working/tokenization.py")

import tokenization
import tensorflow_hub as hub

from sklearn.preprocessing import LabelEncoder
##############################################################

# 메모리 관리
import gc

# 경고메시지 지우기
import warnings
warnings.filterwarnings(action='ignore')

# 상태바 진행상태
from tqdm import tqdm

# Text Color
from termcolor import colored

# 실행시간 확인
import time
import datetime

Processing /kaggle/input/external-model/Keras_Applications-1.0.8-py3-none-any.whl
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Processing /kaggle/input/external-model/efficientnet-1.1.0-py3-none-any.whl
Installing collected packages: efficientnet
Successfully installed efficientnet-1.1.0


## 메모리 관리

In [2]:
# RESTRICT TENSORFLOW TO 2GB OF GPU RAM
# SO THAT WE HAVE 14GB RAM FOR RAPIDS
# LIMIT is expressed in GB; it is multiplied by 1024 when passed as memory_limit below.
LIMIT = 2.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Only the first physical GPU is capped, with a single virtual device.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # NOTE(review): presumably raised when the GPU has already been initialised
        # before this cell runs -- confirm against the TensorFlow docs.
        print(e)
# The RAPIDS figure printed below hard-codes a 16GB card (16 - LIMIT).
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

We will restrict TensorFlow to max 2GB GPU RAM
then RAPIDS can use 14GB GPU RAM


## 변수 모음

In [3]:
# Target [height, width] passed to tf.image.resize and used for the image model's Input shape.
IMG_SIZE = [224, 224]

# Batch size for the tf.data image pipeline and for the BERT model.predict call.
BATCH_SIZE = 5

# Number of output classes of the ArcMarginProduct head built in image_embedding.
N_CLASSES = 11014

# NOTE(review): SEED is not referenced by any code visible in this notebook -- confirm it is still needed.
SEED = 42

# True: read train.csv and search for the best distance thresholds; False: predict on test.csv.
GET_CV = False

# When True (and GET_CV is True) read_dataset concatenates the train frame with itself.
CHECK_SUB =False

## CV 설정

In [4]:
# Peek at the test set only to learn its row count; the frame is deleted right after.
df = pd.read_csv('../input/shopee-product-matching/test.csv')
# If we are committing (test.csv has more than 3 rows), never run cross-validation.
# NOTE(review): GET_CV already defaults to False in the config cell, so this guard
# only has an effect when that default is switched to True.
if len(df) > 3:
    GET_CV = False
del df

## 함수 모음

In [5]:
# DataSet 불러오기
def read_dataset():
    """Load the posting table and build the image paths for the current mode.

    With GET_CV enabled the train split is read and a ``matches`` column is
    added holding the space-joined posting ids that share the row's
    ``label_group``; with CHECK_SUB also enabled the frame is concatenated
    with itself. Otherwise the test split is read unchanged.

    Returns:
        tuple: ``(df, df_cu, image_path)`` -- the pandas frame, a cuDF frame
        built from it, and a Series of image file paths.
    """
    if not GET_CV:
        frame = pd.read_csv('../input/shopee-product-matching/test.csv')
        frame_gpu = cudf.DataFrame(frame)
        paths = '../input/shopee-product-matching/test_images/' + frame['image']
        return frame, frame_gpu, paths

    frame = pd.read_csv('../input/shopee-product-matching/train.csv')
    # Ground truth: every posting in the same label_group is a match.
    group_members = frame.groupby(['label_group'])['posting_id'].unique().to_dict()
    frame['matches'] = (
        frame['label_group']
        .map(group_members)
        .map(lambda ids: ' '.join(ids))
    )

    if CHECK_SUB:
        frame = pd.concat([frame, frame], axis=0).reset_index(drop=True)

    frame_gpu = cudf.DataFrame(frame)
    paths = '../input/shopee-product-matching/train_images/' + frame['image']
    return frame, frame_gpu, paths

# 예측값 결합
def combine_preds(row):
    """Merge the image, text and phash predictions of one row.

    Args:
        row: mapping (e.g. a DataFrame row) with array-like entries under
            'image_pred', 'text_pred' and 'phash_pred'.

    Returns:
        str: the sorted, de-duplicated posting ids joined by single spaces.
    """
    prediction_columns = ('image_pred', 'text_pred', 'phash_pred')
    merged = np.concatenate([row[column] for column in prediction_columns])
    distinct_ids = np.unique(merged)
    return ' '.join(distinct_ids)


# F1 Score 함수
def f1_score(t_true, t_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Args:
        t_true: pandas Series of ground-truth id strings.
        t_pred: pandas Series of predicted id strings, aligned with ``t_true``.

    Returns:
        numpy.ndarray: ``2 * |true & pred| / (|true| + |pred|)`` for each row.
    """
    truth_sets = t_true.apply(lambda ids: set(ids.split()))
    guess_sets = t_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(truth & guess) for truth, guess in zip(truth_sets, guess_sets)])
    n_truth = truth_sets.apply(lambda members: len(members)).values
    n_guess = guess_sets.apply(lambda members: len(members)).values

    return 2 * overlap / (n_truth + n_guess)

# ArcFace loss 생성 Class
class ArcMarginProduct(Layer):
    '''
    Layer that computes the GDis (geodesic distance) margin.
    Implements large margin arc distance.

    The layer is called with a pair ``[X, y]`` (features and integer class
    labels) and returns scaled logits in which the target class's cosine is
    replaced by the margin-adjusted value.
    '''
    
    def __init__(self, n_classes, s=30, m=0.5, easy_margin=False, ls_eps=0.0, **kwargs):
        '''
        Args:
            n_classes: number of classes (columns of the weight matrix W).
            s: factor the output logits are multiplied by.
            m: angular margin added to the target-class angle.
            easy_margin: selects which fallback is used in ``call`` when the
                margin is not applied.
            ls_eps: label-smoothing epsilon; applied only when > 0.
            **kwargs: forwarded to the base ``Layer`` constructor.
        '''
        
        super(ArcMarginProduct, self).__init__(**kwargs)
        
        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of the margin formula, precomputed once from m.
        self.cos_m =tf.math.cos(m)
        self.sin_m =tf.math.sin(m)
        # th: cosine threshold compared against in call(); mm: value subtracted from the cosine at or below it.
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m
        
    def get_config(self):
        '''Return the base layer config extended with this layer's constructor arguments.'''
        config = super().get_config().copy()
        config.update({
            'n_classes':self.n_classes,
            's' : self.s,
            'm' : self.m,
            'ls_eps' : self.ls_eps,
            'easy_margin' : self.easy_margin
        })
        return config
    
    def build(self, input_shape):
        '''Create the weight matrix W of shape (feature_dim, n_classes).

        ``input_shape`` is indexed as a pair; only its first entry (the
        feature shape) is used.
        '''
        super(ArcMarginProduct, self).build(input_shape[0])
        
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer = 'glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None
            )
        
    def call(self, inputs):
        '''Compute the scaled margin logits for ``inputs = (X, y)``.'''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine between the L2-normalised feature rows and L2-normalised columns of W.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # cos(theta + m) through the angle-addition identity.
        phi = cosine * self.cos_m - sine * self.sin_m
        
        if self.easy_margin:
            # Keep the margin only where the cosine is positive.
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            # At or below the threshold use cosine - mm instead of the margin value.
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
            
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        
        if self.ls_eps > 0:
            # Label smoothing of the one-hot target.
            one_hot = (1-self.ls_eps) * one_hot + self.ls_eps / self.n_classes
            
        # Margin value on the target class, plain cosine everywhere else, then scale by s.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

# KNN 이웃 구하기
def _ids_within_threshold(distances, indices, posting_ids, threshold, show_progress=False):
    """Return, per row, the posting ids of the neighbours closer than ``threshold``."""
    rows = range(distances.shape[0])
    if show_progress:
        rows = tqdm(rows)

    selected = []
    for k in rows:
        keep = np.where(distances[k,] < threshold)[0]
        selected.append(posting_ids[indices[k, keep]])
    return selected


def get_neighbors(df, embeddings, KNN=50, image=True,
                  image_threshold=3.6, text_threshold=20.0):
    """Predict matches for every posting from nearest neighbours in embedding space.

    With the module-level GET_CV flag enabled, a grid of distance thresholds is
    scored against ``df['matches']`` (mean row-wise F1) and the best threshold
    is used for the returned predictions. As a side effect ``df`` gains the
    ``pred_matches`` and ``f1`` columns of the *last* threshold tried.
    Otherwise a fixed threshold is applied.

    Args:
        df: pandas DataFrame with a ``posting_id`` column (and ``matches`` when
            GET_CV is enabled), row-aligned with ``embeddings``.
        embeddings: 2-D array of embeddings, one row per posting.
        KNN: number of neighbours to retrieve; must not exceed the row count.
        image: True for image embeddings, False for text embeddings; selects
            the threshold grid / fixed threshold.
        image_threshold: fixed distance cut-off for image embeddings when
            GET_CV is disabled.
        text_threshold: fixed distance cut-off for text embeddings when GET_CV
            is disabled.

    Returns:
        tuple: ``(df, predictions)`` where ``predictions`` is a list holding,
        for each row, an array of matching posting ids.
    """
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Positional lookup table, built once instead of re-indexing the Series per row.
    posting_ids = df['posting_id'].values

    if GET_CV:
        if image:
            thresholds = list(np.arange(3.0, 5.0, 0.1))
        else:
            thresholds = list(np.arange(15, 35, 1))

        scores = []
        for threshold in thresholds:
            candidates = _ids_within_threshold(distances, indices, posting_ids, threshold)
            df['pred_matches'] = [' '.join(ids) for ids in candidates]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()

            print('F1 score for threshold {} is {}'.format(threshold, score))
            scores.append(score)

        thresholds_scores = pd.DataFrame({
            'thresholds':thresholds,
            'scores': scores
        })
        # First threshold reaching the maximum score wins ties.
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]

        print('Our best score : {} , threshold : {}'.format(best_score, best_threshold))

        predictions = _ids_within_threshold(distances, indices, posting_ids, best_threshold)

    else:
        threshold = image_threshold if image else text_threshold
        predictions = _ids_within_threshold(
            distances, indices, posting_ids, threshold, show_progress=True)

    # Only names that are always bound are deleted, so an empty input cannot
    # trigger an UnboundLocalError here.
    del model, distances, indices
    gc.collect()

    return df, predictions

############################################################

# read & decode image
def read_and_decode_img(image):
    """Read one JPEG file and return it as a float32 tensor scaled to [0, 1].

    Args:
        image: path of the image file.

    Returns:
        The 3-channel image resized to IMG_SIZE.
    """
    raw_bytes = tf.io.read_file(image)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, IMG_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# dataset load
def get_dataset(image):
    """Build the batched, prefetched tf.data pipeline over image paths.

    Args:
        image: sequence of image file paths.

    Returns:
        A tf.data.Dataset yielding batches of BATCH_SIZE decoded images.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_and_decode_img, num_parallel_calls = autotune)
        .batch(BATCH_SIZE)
        .prefetch(autotune)
    )

# image embedding
def image_embedding(img_path):
    """Compute image embeddings with the fine-tuned EfficientNetB3 model.

    The full training graph (backbone -> global average pooling -> ArcFace
    head -> softmax) is rebuilt so the saved weights can be loaded, then the
    model is cut at ``model.layers[-4]`` and run over the images in chunks.

    Args:
        img_path: sliceable sequence (e.g. pandas Series) of image file paths.

    Returns:
        numpy.ndarray: one embedding row per path, in input order.
    """
    embeds = []
    
    margin = ArcMarginProduct(
            n_classes = N_CLASSES, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )
    
    input_layer = Input(shape = (*IMG_SIZE, 3))
    label = Input(shape= ())
    
    x = efn.EfficientNetB3(weights = None, include_top=False)(input_layer)
    x = GlobalAveragePooling2D()(x)
    x = margin([x, label])
    
    output_layer = Softmax(dtype='float32')(x)
    
    model = Model(inputs=[input_layer, label], 
                  outputs=[output_layer])
    model.load_weights('../input/shopeeefficientnetb3512/EfficientNetB3_512_42.h5')
    # Drop the label input and the classification head; keep the feature output.
    # NOTE(review): layers[-4] is presumably the pooling layer (the notebook
    # output shows 1536-d rows) -- confirm if the graph is ever changed.
    model = Model(inputs = model.input[0], 
                  outputs = model.layers[-4].output)

    # Chunk over the *argument*, not the notebook-global `df`, so the function
    # gives the right result for any list of paths.
    chunk = 5000
    for start in range(0, len(img_path), chunk):
        image_dataset = get_dataset(img_path[start:start + chunk])
        embeds.append(model.predict(image_dataset))
    
    del model
    
    image_embeddings = np.concatenate(embeds)
    
    del embeds
    gc.collect()
    
    return image_embeddings

############################################################

# title 단어 preprocessing
def text_preprocessing(text, flg_stem, flg_lemm):
    """Normalise a title: lowercase, strip punctuation, drop English stopwords.

    Args:
        text: raw title (converted with ``str()``).
        flg_stem: when truthy, apply NLTK's Porter stemmer to every word.
        flg_lemm: when truthy, apply NLTK's WordNet lemmatizer to every word
            (after stemming, if both are enabled).

    Returns:
        str: the remaining words joined by single spaces.
    """
    # A set gives O(1) membership tests for the filter below.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Remove punctuation, lowercase everything, trim surrounding whitespace.
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    # Drop stopwords (compare against the stopword list, not the title itself).
    words = [word for word in text.split() if word not in stopwords]

    # Strip suffixes such as -ing, -ly, ... (stemming).
    if flg_stem:
        stemmer = nltk.stem.porter.PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    # Reduce words to their root form (lemmatization).
    if flg_lemm:
        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    # Return the cleaned text, not the intermediate string.
    return " ".join(words)

# BERT encoding 함수
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT inputs.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: iterable of strings.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: length of every encoded sequence.

    Returns:
        tuple of numpy arrays ``(token_ids, attention_masks, segment_ids)``;
        the segment ids are all zeros.
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:max_len-2]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * n_pad

        id_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    gc.collect()

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

# BERT Model 전이학습 함수
def get_text_embeddings(df, max_len = 70):
    """Compute title embeddings with the fine-tuned BERT model.

    The training graph (BERT layer -> first-token output -> ArcMarginProduct
    head -> softmax) is rebuilt so the saved weights can be loaded, then the
    model is cut at ``model.layers[-4]`` and run over the titles in chunks.

    Args:
        df: DataFrame with a ``title`` column.
        max_len: token length every title is truncated/padded to.

    Returns:
        numpy.ndarray: one embedding row per title, in input order.
    """
    embeds = []
    # BERT is loaded from a local dataset path rather than downloaded.
    module_url = "../input/external-model/bert_en_uncased_L-24_H-1024_A-16_1"
    bert_layer = hub.KerasLayer(module_url, trainable = True)
    # The tokenizer is configured from the vocabulary and casing stored with the BERT module.
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
    
    # text is a 3-tuple: (token ids, attention masks, segment ids).
    text = bert_encode(df['title'].values, tokenizer, max_len = max_len)
    
    # NOTE(review): 11014 duplicates the module-level N_CLASSES constant -- keep them in sync.
    margin = ArcMarginProduct(
            n_classes = 11014, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )
    
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    label = Input(shape = (), name = 'label')
    
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only the output at position 0 (the [CLS] token added by bert_encode).
    clf_output = sequence_output[:, 0, :]
    
    x = margin([clf_output, label])
    output = Softmax(dtype='float32')(x)
    
    model = Model(inputs = [input_word_ids, input_mask, segment_ids, label], 
                  outputs = [output])
    model.load_weights('../input/bert-baseline/Bert_123.h5')
    
    # Drop the label input and the classification head.
    # NOTE(review): layers[-4] presumably corresponds to clf_output (the notebook
    # output shows 1024-d rows) -- confirm if the graph is ever changed.
    model = Model(inputs = model.input[0:3], outputs = model.layers[-4].output)
    
    # Predict in chunks of 5000 rows and stitch the results back together.
    chunk = 5000
    iterator = np.arange(np.ceil(len(df) / chunk))
    for j in iterator:
        
        # j is a float produced by np.arange, hence the int() casts.
        a = int(j * chunk)
        b = int((j + 1) * chunk)

        text_chunk = ((text[0][a:b], text[1][a:b], text[2][a:b]))
        
        text_embeddings = model.predict(text_chunk, batch_size = BATCH_SIZE)
        embeds.append(text_embeddings)
        
    del model
    
    text_embeddings = np.concatenate(embeds)
    
    del embeds
    gc.collect()
    
    return text_embeddings

###########################################################################
# phash data

def phash_match(phash_array, element):
    """Subtract the entry at ``element`` from every entry of ``phash_array``.

    Args:
        phash_array: indexable container supporting element-wise subtraction.
        element: key of the reference entry.

    Returns:
        The element-wise differences, same container type as the input.
    """
    reference = phash_array[element]
    return phash_array - reference

def add_match(phash, i, dataset, threshold = 5):
    
    diffs = phash_match(phash, i)
    matches = [x for x in diffs[diffs <= threshold].index.drop(i).values]
    
    str_matches = ''
    str_matches = str_matches + dataset.iloc[i, 0] + ' '
    
    for k in matches:
        str_matches = str_matches + dataset.iloc[k, 0] + ' '
    str_matches = str_matches[:-1]
    
    return str_matches

def simple_match(dataset, element):
    """Find the postings whose ``image_phash`` equals that of row ``element``.

    Args:
        dataset: DataFrame with ``image_phash`` and ``posting_id`` columns;
            its first column supplies the name of the reference row.
        element: index label of the reference row.

    Returns:
        list: a single-item list holding the reference name followed by the
        posting ids of the other rows with the same hash, space-separated.
    """
    target_hash = dataset['image_phash'][element]
    same_hash = dataset['image_phash'] == target_hash
    other_ids = dataset[same_hash]['posting_id'].drop(element).values

    names = [dataset.iloc[element, 0]]
    names.extend(other_ids)
    return [' '.join(names)]


## DataSet 설정

In [6]:
# Load the posting table (pandas and cuDF copies) together with the per-row image paths.
df, df_cu, img_paths = read_dataset()
df.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


## Image embeddings

In [7]:
# Embed every image; the trailing expression displays (n_rows, embedding_dim).
image_embeddings = image_embedding(img_paths)
image_embeddings.shape

(3, 1536)

## Text embeddings

In [8]:
# Optional title clean-up, currently disabled: the raw titles are fed to BERT.
#df['title'] = df['title'].map(lambda x: text_preprocessing(x, True, True))

# Embed every title; the trailing expression displays (n_rows, embedding_dim).
text_embeddings = get_text_embeddings(df)
text_embeddings.shape

(3, 1024)

## TF-IDF Text embedding

In [9]:
# model = TfidfVectorizer(stop_words=None, binary=True, max_features=15500)
# text_tfidf = model.fit_transform(df_cu.title).toarray()

In [10]:
'''
preds = []
CHUNK = 1024*4

print('Find similar titles by TF-IDF model')
CTS = len(df_cu)//CHUNK
if len(df_cu)%CHUNK!=0: CTS += 1
for i in range(CTS):
    a = i*CHUNK
    b = (i+1)*CHUNK
    b = min(b,len(df_cu))
    print('chunk',a,'to',b)
    
    # Cosine Similarity Distance
    cts = cupy.matmul(text_tfidf, text_tfidf[a:b].T).T
    
    for k in range(b-a):
        IDX = cupy.where(cts[k,]>0.75)[0]
        x = df_cu.iloc[cupy.asnumpy(IDX)].posting_id.to_pandas().values
        preds.append(x)

gc.collect()
        
df_cu['tfidf_text_pred'] = preds

del model, text_tfidf, preds
'''

"\npreds = []\nCHUNK = 1024*4\n\nprint('Find similar titles by TF-IDF model')\nCTS = len(df_cu)//CHUNK\nif len(df_cu)%CHUNK!=0: CTS += 1\nfor i in range(CTS):\n    a = i*CHUNK\n    b = (i+1)*CHUNK\n    b = min(b,len(df_cu))\n    print('chunk',a,'to',b)\n    \n    # Cosine Similarity Distance\n    cts = cupy.matmul(text_tfidf, text_tfidf[a:b].T).T\n    \n    for k in range(b-a):\n        IDX = cupy.where(cts[k,]>0.75)[0]\n        x = df_cu.iloc[cupy.asnumpy(IDX)].posting_id.to_pandas().values\n        preds.append(x)\n\ngc.collect()\n        \ndf_cu['tfidf_text_pred'] = preds\n\ndel model, text_tfidf, preds\n"

## image embedding 값으로 prediction data 구하기

In [11]:
# Ask for at most 100 neighbours, but never more than there are rows: a
# nearest-neighbour query cannot return more neighbours than indexed samples,
# so a fixed 100 breaks on sets of 4-99 rows (and a fixed 3 on fewer than 3).
KNN = min(100, len(df))

df, image_predictions = get_neighbors(df, image_embeddings, KNN=KNN, image=True)


100%|██████████| 3/3 [00:00<00:00, 4002.20it/s]


## text embeddings 값으로 prediction data 구하기

In [12]:
# Ask for at most 100 neighbours, but never more than there are rows: a
# nearest-neighbour query cannot return more neighbours than indexed samples,
# so a fixed 100 breaks on sets of 4-99 rows (and a fixed 3 on fewer than 3).
KNN = min(100, len(df))

df, text_predictions = get_neighbors(df, text_embeddings, KNN=KNN, image=False)

gc.collect()


100%|██████████| 3/3 [00:00<00:00, 2903.30it/s]


0

## 해시값으로 동일상품 분류하기

In [13]:
'''
# 해시값으로 동일상품 분류
#phashs = df['image_phash'].apply(lambda x: imagehash.hex_to_hash(x))

phash_df = df[['posting_id', 'image_phash']]
str_matches = []
for i in tqdm(range(len(phash_df)), 
                   desc = 'Progress:', 
                   position = 0, 
                   leave = True):
    str_matches.append(simple_match(phash_df, i))

df['phash_pred'] = str_matches
df.head()
'''
# Exact-hash matching: map every perceptual hash to the array of posting ids
# that share it, so each row's phash_pred always contains at least itself.
tmp = df.groupby('image_phash').posting_id.unique().to_dict()
df['phash_pred'] = df.image_phash.map(tmp)

df.head()

Unnamed: 0,posting_id,image,image_phash,title,phash_pred
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...,[test_2255846744]
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...,[test_3588702337]
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng,[test_4015706929]


## 예측값 결합

In [14]:
# Both modes fuse the same prediction sources and write the same file;
# only the cross-validation run additionally scores the fused result.
df['image_pred'] = image_predictions
df['text_pred'] = text_predictions
#df['tfidf_text_pred'] = df_cu['tfidf_text_pred'].to_pandas().values

if GET_CV:
    df['pred_matches'] = df.apply(combine_preds, axis = 1)
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df.f1.mean()
    print('Final F1 CV Score :', score)
    # The submission column is overwritten with the predictions (ground truth is discarded).
    df['matches'] = df['pred_matches']
else:
    df['matches'] = df.apply(combine_preds, axis = 1)

df[['posting_id', 'matches']].to_csv('submission.csv', index = False)