# 라이브러리 모음

In [2]:
# 파일 처리
import os

# Data 처리
import pandas as pd
import numpy as np

# ML, DNN, CNN 관련 라이브러리
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Input, GlobalAveragePooling2D, Softmax

import efficientnet.tfkeras as efn
import math
##############################################################

# 이미지 및 그래프 출력
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

##############################################################

# 해쉬(phash) 값 처리
import imagehash

##############################################################

# Text Data NLP 처리
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import re
import nltk
#nltk.download('popular')

from shutil import copyfile
#copyfile(src = "./tokenization.py", dst = "./working/tokenization.py")

import tokenization
import tensorflow_hub as hub

from sklearn.preprocessing import LabelEncoder
##############################################################

# 메모리 관리
import gc

# 경고메시지 지우기
import warnings
warnings.filterwarnings(action='ignore')

# 상태바 진행상태
from tqdm import tqdm

# Text Color
from termcolor import colored


# 전체 Data 간단 요약

In [4]:
# Load the competition CSV files into DataFrames

# Root directory of the dataset; the file names below are appended to it
BASE_DIR = './shopee-product-matching/'

# CSV files
train_df = pd.read_csv(BASE_DIR + 'train.csv')
test_df = pd.read_csv(BASE_DIR + 'test.csv')
sample_df = pd.read_csv(BASE_DIR + 'sample_submission.csv')

### 공통 변수 모음

In [13]:
# NOTE(review): presumably the [height, width] images are resized to -- confirm at the point of use
IMG_SIZE = [512, 512]

# Batch size handed to model.predict when embeddings are computed
BATCH_SIZE = 10

# Number of distinct label_group values in train_df
N_CLASSES = len(train_df['label_group'].unique()) # 11014

# True: work on train.csv and score predictions; False: work on test.csv
GET_CV = True

# True (together with GET_CV): read_dataset() duplicates every row of the frame
CHECK_SUB =False

## Dataset 만들기

In [11]:
def read_dataset():
    """Load the posting table and build the path of every image it lists.

    The behaviour is driven by the module-level flags ``GET_CV`` and
    ``CHECK_SUB`` and by the ``BASE_DIR`` data root:

    * ``GET_CV`` truthy -- ``train.csv`` is loaded and a ``matches`` column
      is added that holds, for every row, the space-separated
      ``posting_id`` values sharing that row's ``label_group``.  When
      ``CHECK_SUB`` is also truthy the frame is concatenated with itself
      and re-indexed.
    * ``GET_CV`` falsy -- ``test.csv`` is loaded unchanged.

    Returns:
        tuple: ``(df, image_path)`` where ``df`` is the loaded DataFrame and
        ``image_path`` is a Series of image file paths aligned with its rows.
    """
    if GET_CV:
        df = pd.read_csv(BASE_DIR + 'train.csv')
        # label_group -> array of every posting_id in that group
        tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(tmp)
        df['matches'] = df['matches'].map(lambda x: ' '.join(x))

        if CHECK_SUB:
            df = pd.concat([df, df], axis=0)
            df.reset_index(drop=True, inplace=True)

        # Derive the image directory from BASE_DIR so the CSV and the images
        # are always read from the same data root.
        image_path = BASE_DIR + 'train_images/' + df['image']
    else:
        df = pd.read_csv(BASE_DIR + 'test.csv')
        image_path = BASE_DIR + 'test_images/' + df['image']

    return df, image_path

In [14]:
# Load the posting table and its image paths, then display how many paths there are
train, img_path = read_dataset()
img_path.shape

(34250,)

### <train.csv 상위 5 row data>

In [12]:
# Preview the first five rows of train_df
train_df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe Éclair Latte 220ml,3648931069


## <train.csv Data 요약 내용>

In [14]:
# Print column dtypes, non-null counts and memory usage of train_df
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


## <train data 중 각 Column 별 Unique 개수>

In [23]:
# Print, for every column of train_df, its number of distinct values (count shown in blue)
for col in train_df.columns:
    print('{} : {}'.format(col, colored(len(train_df[col].unique()), 'blue')))

posting_id : 34250
image : 32412
image_phash : 28735
title : 33117
label_group : 11014


In [20]:
# Row-wise F1 score
def f1_score(t_true, t_pred):
    """Compute the F1 score of every row of two aligned pandas Series.

    Each element of both Series is a whitespace-separated string of ids.
    The strings are split into sets and, row by row, the score is
    ``2 * |true & pred| / (|true| + |pred|)``.

    Args:
        t_true: Series of ground-truth id strings.
        t_pred: Series of predicted id strings, in the same row order.

    Returns:
        numpy array holding one F1 value per row.
    """
    true_sets = t_true.apply(lambda ids: set(ids.split()))
    pred_sets = t_pred.apply(lambda ids: set(ids.split()))

    # Number of ids found in both the truth and the prediction, per row
    overlap = np.array([len(truth & guess) for truth, guess in zip(true_sets, pred_sets)])

    n_true = true_sets.apply(lambda ids: len(ids)).values
    n_pred = pred_sets.apply(lambda ids: len(ids)).values

    return 2 * overlap / (n_true + n_pred)

In [19]:
# ArcFace margin layer
class ArcMarginProduct(Layer):
    '''
    Keras layer that applies the ArcFace geodesic distance margin (GDis).

    The layer owns a weight matrix ``W`` of shape ``(features, n_classes)``.
    Given ``[X, y]`` it computes the cosine between the L2-normalised rows
    of ``X`` and the L2-normalised columns of ``W``, replaces the cosine of
    the target class by ``cos(theta + m)`` and multiplies the result by ``s``.

    Args:
        n_classes: number of classes (columns of ``W``).
        s: factor the output logits are multiplied by.
        m: angular margin in radians added to the target-class angle.
        easy_margin: if True the margin is only applied where the cosine
            is positive; otherwise the ``th`` / ``mm`` fallback is used.
        ls_eps: label-smoothing amount applied to the one-hot targets
            (skipped unless it is greater than 0).
        **kwargs: forwarded to ``Layer.__init__``.
    '''

    def __init__(self, n_classes, s=30, m=0.5, easy_margin=False, ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of the angle-addition formula and of the fallback branch
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the layer config extended with the constructor arguments.'''
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin
        })
        return config

    def build(self, input_shape):
        '''Create ``W``; ``input_shape[0]`` is the shape of the embedding input.'''
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None
            )

    def call(self, inputs):
        '''
        Compute the margin-adjusted, scaled logits.

        Args:
            inputs: pair ``(X, y)`` -- ``X`` holds the embeddings
                (normalised along axis 1) and ``y`` the class labels,
                which are cast to int32 and one-hot encoded.

        Returns:
            Tensor of logits with one column per class, multiplied by ``s``.
        '''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Floating-point rounding can push cosine**2 marginally above 1,
        # which would make the square root NaN; clamp the argument first.
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.pow(cosine, 2), 0.0, 1.0)
        )
        # cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m

        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)

        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )

        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted value for the target class, plain cosine elsewhere
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [18]:
# Find the nearest neighbours of every posting
def get_neighbors(df, embeddings, KNN=50, image=True):
    """Predict matching postings from nearest neighbours in embedding space.

    A ``NearestNeighbors`` model is fitted on ``embeddings`` and queried with
    the same embeddings.  For every row, the postings whose distance is
    below a fixed cut-off (3.6 for image embeddings, 20.0 otherwise) are
    returned as its predicted matches.

    When the module-level flag ``GET_CV`` is truthy, a range of thresholds is
    additionally scored with ``f1_score`` against ``df['matches']`` and the
    results are printed.  This writes the columns ``pred_matches`` and ``f1``
    into ``df`` (holding the values of the last threshold tried).  The best
    threshold is only reported; the returned predictions always use the
    fixed cut-off.

    Args:
        df: DataFrame with a ``posting_id`` column (and a ``matches`` column
            when ``GET_CV`` is truthy), row-aligned with ``embeddings``.
        embeddings: 2-D array with one embedding per row of ``df``.
        KNN: number of neighbours to retrieve per row; capped at the number
            of rows so small inputs do not make ``kneighbors`` fail.
        image: True for image embeddings, False for text embeddings; selects
            the threshold range and the fixed cut-off.

    Returns:
        tuple: ``(df, predictions)`` where ``predictions`` is a list with one
        array of matching ``posting_id`` values per row.
    """
    n_rows = embeddings.shape[0]

    # kneighbors() raises when asked for more neighbours than fitted samples.
    model = NearestNeighbors(n_neighbors=min(KNN, n_rows))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    posting_ids = df['posting_id'].values

    if GET_CV:
        if image:
            thresholds = list(np.arange(3.0, 5.0, 0.1))
        else:
            thresholds = list(np.arange(15, 35, 1))

        scores = []
        for threshold in thresholds:
            df['pred_matches'] = [
                ' '.join(posting_ids[indices[k, distances[k] < threshold]])
                for k in range(n_rows)
            ]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()

            print('Our F1 score for threshold {} is {}'.format(threshold, score))
            scores.append(score)

        thresholds_scores = pd.DataFrame({
            'thresholds': thresholds,
            'scores': scores
        })
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]

        print('Our best score : {} , threshold : {}'.format(best_score, best_threshold))

    # Final predictions use the fixed cut-off in both modes; the progress bar
    # is only shown outside cross-validation.
    cutoff = 3.6 if image else 20.0
    rows = range(n_rows) if GET_CV else tqdm(range(n_rows))
    predictions = [posting_ids[indices[k, distances[k] < cutoff]] for k in rows]

    # Release the neighbour model and its result matrices before returning
    del model, distances, indices
    gc.collect()

    return df, predictions

In [None]:
import time
import datetime

start = time.time()

# NOTE(review): get_image_embeddings and train_image_path are not defined in the
# visible part of this notebook -- confirm they exist before running this cell.
image_embedding = get_image_embeddings(train_image_path)
print(image_embedding.shape)

# Elapsed wall-clock time as H:MM:SS (fractional seconds are split off)
sec = time.time() - start
times = str(datetime.timedelta(seconds=sec)).split('.')
print('총 실행시간 :', times[0])

In [None]:
import time
import datetime

start = time.time()

# Neighbour search on the image embeddings with 100 neighbours per posting.
# NOTE(review): when GET_CV is true get_neighbors reads df['matches']; train_df as
# loaded from train.csv shows no such column in the visible code -- confirm that
# train_df (rather than the frame returned by read_dataset()) is intended here.
df, image_prediction = get_neighbors(train_df, image_embedding, KNN=100, image=True)

# Elapsed wall-clock time as H:MM:SS (fractional seconds are split off)
sec = time.time() - start
times = str(datetime.timedelta(seconds=sec)).split('.')
print(times[0])

# #####################################

# Text Data

# #####################################

## Text embedding

In [15]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenised, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with 0 up to
    ``max_len``.

    Args:
        texts: iterable of strings.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row.

    Returns:
        tuple of three numpy arrays with one row per text: the token ids,
        the mask (1 for real tokens, 0 for padding) and the segment ids
        (all 0).
    """
    room_for_text = max_len - 2  # two slots are taken by [CLS] and [SEP]
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:room_for_text]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        token_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [17]:
# Text embeddings from the BERT model with loaded weights
def get_text_embeddings(df, max_len = 70):
    """Embed the ``title`` column of ``df`` with a BERT-based model.

    The BERT layer is loaded from ``./bert_en_uncased_L-24_H-1024_A-16_1``
    and its vocabulary drives the tokenizer used by ``bert_encode``.  The
    model (BERT -> first sequence position -> ``ArcMarginProduct`` ->
    softmax) is built, weights are loaded from
    ``./Bert_Baseline/Bert_123.h5``, and a second model exposing an
    intermediate layer output is used for prediction in chunks of 5000 rows.

    Args:
        df: DataFrame with a ``title`` column.
        max_len: token length every title is truncated/padded to.

    Returns:
        numpy array with the embeddings of all rows, concatenated in order.
        The shape is also printed.
    """
    module_url = "./bert_en_uncased_L-24_H-1024_A-16_1"
    bert_layer = hub.KerasLayer(module_url, trainable = True)
    resolved = bert_layer.resolved_object
    vocab_file = resolved.vocab_file.asset_path.numpy()
    do_lower_case = resolved.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    # Tuple of (token ids, masks, segment ids), one row per title
    encoded = bert_encode(df['title'].values, tokenizer, max_len = max_len)

    margin = ArcMarginProduct(
            n_classes = 11014,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    label = Input(shape = (), name = 'label')

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Output at the first sequence position
    clf_output = sequence_output[:, 0, :]

    x = margin([clf_output, label])
    output = Softmax(dtype='float32')(x)

    classifier = Model(inputs = [input_word_ids, input_mask, segment_ids, label],
                       outputs = [output])
    classifier.load_weights('./Bert_Baseline/Bert_123.h5')

    # Drop the label input and expose an intermediate layer output.
    # NOTE(review): layers[-4] is presumably the embedding fed to the margin
    # head -- confirm against the model summary.
    encoder = Model(inputs = classifier.input[0:3], outputs = classifier.layers[-4].output)
    del classifier

    # Predict chunk by chunk and collect the partial results
    chunk = 5000
    collected = []
    for start in range(0, len(df), chunk):
        stop = start + chunk
        batch_inputs = (encoded[0][start:stop], encoded[1][start:stop], encoded[2][start:stop])
        collected.append(encoder.predict(batch_inputs, batch_size = BATCH_SIZE))

    del encoder

    text_embeddings = np.concatenate(collected)
    print(f'Our text embeddings shape is {text_embeddings.shape}')

    del collected
    gc.collect()

    return text_embeddings

In [None]:
# Compute the text embedding data

# Reload the posting table (the returned image paths are not used in this cell)
df, img_paths = read_dataset()

# Embed the title of every row
text_embeddings = get_text_embeddings(df)

print(text_embeddings.shape)

# Ask the garbage collector to reclaim memory released by the embedding step
gc.collect()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
