In [None]:
!nvidia-smi

Mon Jul 12 13:53:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive (Colab only) so the dataset, checkpoints and outputs
# referenced under /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# List the working directory to confirm the mount point is present.
!ls

Mounted at /content/drive
drive  sample_data


In [None]:
import os
import cv2
import math
import random
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm

import tensorflow as tf
print(tf.__version__)
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras import datasets, layers, optimizers, Sequential, metrics
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.applications import EfficientNetB4
from tensorflow.keras import backend as K
import albumentations

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

AUTO = tf.data.experimental.AUTOTUNE

2.5.0


In [None]:
class CFG:
    """Notebook-wide settings, read everywhere as plain class attributes."""

    # --- reproducibility ---
    seed = 2021          # used for the train/validation split and seed_everything

    # --- input pipeline ---
    img_size = 384       # images are resized to img_size x img_size
    channels = 3         # colour channels expected by the random-crop augmentation
    batch_size = 16      # batch size of both tf.data pipelines

    # --- model head ---
    classes = 11014      # placeholder; reassigned after label encoding
    fc_dim = 512         # width of the embedding Dense layer

    # --- ArcFace hyper-parameters ---
    # NOTE(review): the model-building cell passes s=30, m=0.5 literally and
    # does not read these two values -- confirm which margin is intended.
    scale = 30
    margin = 0.1

    # --- schedule ---
    # NOTE(review): the training cell passes epochs=20 literally; this value
    # is not read by any cell visible in the notebook.
    Epochs = 60

In [None]:
def read_dataset(csv_path, image_path):
    """Load the metadata CSV and build the full path of every image.

    Args:
        csv_path: path of a CSV file that has an 'image' column of file names.
        image_path: directory prefix (including the trailing separator) that
            is prepended verbatim to each file name.

    Returns:
        (frame, full_paths): the parsed DataFrame and a Series of
        ``image_path + file name`` strings aligned with its rows.
    """
    frame = pd.read_csv(csv_path)
    # radd(prefix) evaluates ``prefix + series`` element-wise.
    full_paths = frame['image'].radd(image_path)
    return frame, full_paths

In [None]:
# Locations of the training images and of the metadata CSV on the mounted Drive.
image_path = '/content/drive/MyDrive/shopee/shopee-product-matching/train_images/'
csv_path = '/content/drive/MyDrive/shopee/shopee-product-matching/train.csv' # replace with the path to your own train.csv
# df holds the metadata rows; image_paths holds one full file path per row.
df, image_paths = read_dataset(csv_path, image_path)

In [None]:
df.shape

(34250, 5)

In [None]:
# Replace the raw label_group values with LabelEncoder's integer codes so they
# can be fed to the sparse categorical loss and to tf.one_hot in the ArcFace head.
labelencoder= LabelEncoder()
df['label_group'] = labelencoder.fit_transform(df['label_group'])
# Overwrite the placeholder class count with the number of distinct groups found.
CFG.classes = df['label_group'].nunique()
CFG.classes

11014

In [None]:
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425


In [None]:
def f1_score(y_true, y_pred):
  """Row-wise F1 between two columns of whitespace-separated id strings.

  Each entry is split on whitespace and treated as a set; the score of a row
  is ``2 * |true & pred| / (|true| + |pred|)``.

  Args:
    y_true: sequence of strings with the expected ids of every row.
    y_pred: sequence of strings with the predicted ids of every row.

  Returns:
    numpy array with one F1 value per row.
  """
  truth_sets = [set(ids.split()) for ids in y_true]
  guess_sets = [set(ids.split()) for ids in y_pred]

  shared = np.array([len(truth & guess) for truth, guess in zip(truth_sets, guess_sets)])
  n_guess = np.array([len(guess) for guess in guess_sets])
  n_truth = np.array([len(truth) for truth in truth_sets])

  return 2 * shared / (n_guess + n_truth)

In [None]:
def get_image_neighbors(df, embeddings, KNN=50, threshold = 4.5):
    """Find, for every row, the postings whose embedding lies within `threshold`.

    A NearestNeighbors index (scikit-learn defaults) is fitted on `embeddings`
    and queried with the same embeddings, so each row is normally its own
    closest neighbour.

    Args:
        df: DataFrame with a 'posting_id' column whose rows are positionally
            aligned with `embeddings`.
        embeddings: 2-D array, one embedding per row of `df`.
        KNN: maximum number of neighbours retrieved per row. It is capped at
            the number of rows, because scikit-learn rejects n_neighbors larger
            than the number of fitted samples.
        threshold: a neighbour is kept only if its distance is strictly below
            this value.

    Returns:
        (df, predictions): `df` unchanged and a list holding, for each row, a
        numpy array of the matching posting ids.
    """
    n_rows = embeddings.shape[0]
    if n_rows == 0:
        # Nothing to index; NearestNeighbors cannot be fitted on zero samples.
        return df, []

    # Small frames (fewer than KNN rows) would otherwise raise inside kneighbors.
    model = NearestNeighbors(n_neighbors = min(KNN, n_rows))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(n_rows)):
        close_enough = distances[k] < threshold
        ids = indices[k, close_enough]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    # The index and the (n_rows x KNN) result matrices can be large; release them.
    del model, distances, indices
    gc.collect()
    return df, predictions

In [None]:
# data_augment
def data_augment(image, label):
    """Randomly augment one training image; the label is passed through untouched.

    Applies, in order: random flips / transpose, a random multiple-of-90-degree
    rotation, random saturation / contrast / brightness changes, an optional
    crop, and finally a resize back to (CFG.img_size, CFG.img_size).

    NOTE(review): assumes `image` is already a square CFG.img_size image with
    CFG.channels channels (as produced by preprocess_image); the random crop
    below requests windows of up to that size -- confirm for other callers.

    Args:
        image: image tensor to augment.
        label: value returned unchanged alongside the augmented image.

    Returns:
        (image, label) with `image` resized to CFG.img_size x CFG.img_size.
    """
    # One independent uniform draw in [0, 1) per augmentation family.
    p_spatial = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_rotate = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_1 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_2 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_3 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_crop = tf.random.uniform([], 0, 1.0, dtype=tf.float32)

    # Flips (each decided internally by TF), plus a transpose for p_spatial > .75
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    if p_spatial > .75:
        image = tf.image.transpose(image)

    # Rotates: one of three quarter-turn counts (k=3, 2, 1) or no rotation,
    # selected by which quarter of [0, 1) p_rotate falls in
    if p_rotate > .75:
        image = tf.image.rot90(image, k=3) 
    elif p_rotate > .5:
        image = tf.image.rot90(image, k=2) 
    elif p_rotate > .25:
        image = tf.image.rot90(image, k=1) 

    # Pixel-level jitter: each of the three is applied when its draw is >= .4
    if p_pixel_1 >= .4:
        image = tf.image.random_saturation(image, lower=.7, upper=1.3)
    if p_pixel_2 >= .4:
        image = tf.image.random_contrast(image, lower=.8, upper=1.2)
    if p_pixel_3 >= .4:
        image = tf.image.random_brightness(image, max_delta=.1)

    # Crops: p_crop > .7 -> central crop keeping 70/80/90% depending on the
    # sub-range; .4 < p_crop <= .7 -> random square crop; otherwise no crop
    if p_crop > .7:
        if p_crop > .9:
            image = tf.image.central_crop(image, central_fraction=.7)
        elif p_crop > .8:
            image = tf.image.central_crop(image, central_fraction=.8)
        else:
            image = tf.image.central_crop(image, central_fraction=.9)
    elif p_crop > .4:
        # Side length drawn between 80% of img_size and img_size
        crop_size = tf.random.uniform([], int(CFG.img_size*.8), CFG.img_size, dtype=tf.int32)
        image = tf.image.random_crop(image, size=[crop_size, crop_size, CFG.channels])

    # Crops change the spatial size, so bring every image back to the model input size
    image = tf.image.resize(image, [CFG.img_size, CFG.img_size])
    return image, label

In [None]:
# Function to decode our images
def preprocess_image(image):
    """Decode JPEG bytes into a float32 RGB image of size CFG.img_size x CFG.img_size.

    Args:
        image: scalar string tensor holding the encoded JPEG file contents.

    Returns:
        float32 tensor produced by decode -> dtype conversion -> resize.
    """
    target_size = (CFG.img_size, CFG.img_size)
    decoded = tf.image.decode_jpeg(image, channels=3)
    # convert_image_dtype rescales integer pixels while converting to float32.
    # NOTE(review): the Keras EfficientNet documentation asks for float inputs
    # in the [0, 255] range because the network rescales internally -- confirm
    # that feeding rescaled pixels is intended. Weights already trained with
    # this preprocessing depend on it, so do not change it without retraining.
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, target_size)

def load_and_preprocess_image(image, label_group):
    """Read an image file from disk and decode it; the label is passed through.

    Args:
        image: path of the image file.
        label_group: label returned unchanged next to the decoded image.

    Returns:
        (decoded image tensor, label_group)
    """
    raw_bytes = tf.io.read_file(image)
    return preprocess_image(raw_bytes), label_group

def get_training_dataset(image, label_group):
    """Build the endless, shuffled, augmented training pipeline.

    Every element has the structure ``((image, label), label)``: the model takes
    the image and its label as inputs (the ArcFace head needs the label), and
    the label again as the training target.

    Args:
        image: sequence of image file paths.
        label_group: sequence of integer labels aligned with `image`.

    Returns:
        A repeated tf.data.Dataset of batches of size CFG.batch_size.
    """
    dataset = tf.data.Dataset.from_tensor_slices((image, label_group))

    # Shuffle the lightweight (path, label) pairs BEFORE decoding. Shuffling
    # after the map stages would keep a buffer of fully decoded float images
    # in memory and delay the first batch until that buffer is filled. A buffer
    # covering the whole split gives a uniform shuffle, and placing it before
    # repeat() means each pass visits every sample once before reshuffling.
    dataset = dataset.shuffle(max(len(image), 1), reshuffle_each_iteration = True)

    dataset = dataset.map(load_and_preprocess_image, num_parallel_calls = AUTO)
    dataset = dataset.map(data_augment, num_parallel_calls = AUTO)
    # Duplicate the label: once as model input, once as target. This replaces
    # zipping with a separate label dataset, which would no longer line up
    # with the shuffled samples.
    dataset = dataset.map(lambda img, lbl: ((img, lbl), lbl), num_parallel_calls = AUTO)

    dataset = dataset.repeat()
    dataset = dataset.batch(CFG.batch_size)
    dataset = dataset.prefetch(AUTO)
    return dataset

# Function to get our validation dataset
def get_validation_dataset(image, label_group):
    """Build the deterministic (unshuffled, un-augmented) validation pipeline.

    Elements have the structure ``((image, label), label)``, i.e. the two model
    inputs followed by the target.

    Args:
        image: sequence of image file paths.
        label_group: sequence of labels aligned with `image`.

    Returns:
        A tf.data.Dataset of batches of size CFG.batch_size, in input order.
    """
    def _to_model_sample(path, label):
        # Decode the image, then emit the label both as input and as target.
        decoded, passthrough = load_and_preprocess_image(path, label)
        return (decoded, passthrough), label

    return (
        tf.data.Dataset.from_tensor_slices((image, label_group))
        .map(_to_model_sample, num_parallel_calls = AUTO)
        .batch(CFG.batch_size)
        .prefetch(AUTO)
    )

def train_and_eval_split(image, label_group):
    """Shuffle and split images and labels into train / validation parts.

    The split is seeded with CFG.seed; the split proportions are left at the
    train_test_split defaults.

    Returns:
        (trn_image, val_image, trn_labels, val_labels)
    """
    parts = train_test_split(image, label_group, random_state = CFG.seed, shuffle = True)
    return tuple(parts)

In [None]:
# split the datasets
# image_paths and df are split together so every path stays aligned with its
# metadata row; trn_labels / val_labels are therefore DataFrames, not label arrays.
trn_image, val_image, trn_labels, val_labels = train_test_split(image_paths, df, random_state = CFG.seed, shuffle = True)
train_dataset = get_training_dataset(trn_image, trn_labels['label_group'])
val_dataset = get_validation_dataset(val_image, val_labels['label_group'])

# train_ds = train_ds.cache(filename='./cache.tf-data').shuffle(buffer_size=1000).batch(CFG.batch_size).prefetch(AUTO) # cache can exhaust RAM; when memory is insufficient, cache to a file instead
# val_ds = val_ds.cache(filename='./cache.val-data').batch(CFG.batch_size).prefetch(AUTO) # no shuffle on test; prefetch preloads data, so cache may be unnecessary

In [None]:
train_dataset.take(1)

<TakeDataset shapes: (((None, 384, 384, 3), (None,)), (None,)), types: ((tf.float32, tf.int64), tf.int64)>

In [None]:
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes ``[X, y]``: a batch of embeddings and the class index of
    every sample. It returns the cosine similarity between each embedding and
    each class weight vector, with an additive angular margin `m` applied on
    the column of the sample's own class, multiplied by the scale `s`.

    Args:
        n_classes: number of classes (columns of the weight matrix).
        s: factor the output is multiplied by.
        m: angular margin, in radians, added to the target-class angle.
        easy_margin: if True, the margin is applied only where cosine > 0;
            otherwise it is applied where cosine > cos(pi - m) and
            ``cosine - sin(pi - m) * m`` is used elsewhere.
        ls_eps: label-smoothing factor applied to the one-hot mask (0 disables).

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants of cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Boundary and fallback offset used when easy_margin is False
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        '''Return the constructor arguments so the layer can be re-created
        from its config (e.g. when a full model is saved and reloaded).'''
        config = super(ArcMarginProduct, self).get_config()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'easy_margin': self.easy_margin,
            'ls_eps': self.ls_eps,
        })
        return config

    def build(self, input_shape):
        # input_shape is a pair: (embedding shape, label shape)
        super(ArcMarginProduct, self).build(input_shape[0])

        # One weight column per class; rows match the embedding width
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        '''Return the scaled, margin-adjusted cosine logits for ``[X, y]``.'''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine similarity = dot product of L2-normalised embeddings and weights
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Round-off can push |cosine| marginally past 1, which would make
        # 1 - cos^2 negative (sqrt -> NaN), and sqrt has an unbounded gradient
        # at 0. Clamping to a tiny positive floor keeps values and gradients finite.
        sine = tf.math.sqrt(
            tf.clip_by_value(1.0 - tf.math.square(cosine), 1e-7, 1.0)
        )
        # cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted value on the target class, plain cosine elsewhere
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [None]:
def create_model():
  """Build and compile the EfficientNetB4 + ArcFace classification model.

  The model has two inputs, the image and its integer label (the ArcFace head
  needs the label to apply the margin), and outputs softmax probabilities over
  CFG.classes classes.

  NOTE(review): the scale and margin are passed literally (s=30, m=0.5); the
  CFG.scale / CFG.margin settings are not read here -- confirm which is intended.

  Returns:
    The compiled tf.keras Model.
  """

  # Created first; statement order is kept as-is because later cells index
  # model.layers positionally.
  margin = ArcMarginProduct(n_classes = CFG.classes, s = 30, m = 0.5, 
                name = 'head/arc_margin', dtype = 'float32')

  image = tf.keras.layers.Input(shape=(CFG.img_size, CFG.img_size, 3), name='input/image')
  label = tf.keras.layers.Input((), name='input/label')

  # ImageNet-pretrained backbone without its classifier, pooled to one vector per image
  x = EfficientNetB4(include_top = False, weights = 'imagenet')(image)
  x = tf.keras.layers.GlobalAveragePooling2D(name='head/pooling')(x)
  
  # Embedding head: BN -> dropout -> Dense(CFG.fc_dim) -> BN
  x = tf.keras.layers.BatchNormalization(name='head/bn1')(x)
  x = tf.keras.layers.Dropout(rate = 0.5, name='head/dropout')(x)
  x = tf.keras.layers.Dense(CFG.fc_dim,name='head/dense')(x)
  x = tf.keras.layers.BatchNormalization(name='head/bn2')(x)

  x = margin([x, label]) # ArcFace head: margin-adjusted, scaled cosine logits
  # Probabilities are produced here, so the loss below is left at its defaults
  output = tf.keras.layers.Softmax()(x)

  model = tf.keras.models.Model(inputs = [image, label], outputs = output)

  # Initial learning rate only; CustomCallback, when used, sets it every epoch
  opt = tf.keras.optimizers.Adam(learning_rate = 0.001)

  model.compile(
        optimizer = opt,
        loss = [tf.keras.losses.SparseCategoricalCrossentropy()],
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    )

  return model

In [None]:
# 2. create_model
model = create_model()
model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb4_notop.h5
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input/image (InputLayer)        [(None, 384, 384, 3) 0                                            
__________________________________________________________________________________________________
efficientnetb4 (Functional)     (None, None, None, 1 17673823    input/image[0][0]                
__________________________________________________________________________________________________
head/pooling (GlobalAveragePool (None, 1792)         0           efficientnetb4[0][0]             
__________________________________________________________________________________________________
head/bn1 (BatchNormalization)   (None, 1792)         7168        head/pooling[0][0]              

In [None]:
# Function to seed everything
def seed_everything(seed):
    """Seed Python's, NumPy's and TensorFlow's random generators with `seed`
    and export it as the PYTHONHASHSEED environment variable."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that handles checkpointing and the learning-rate schedule.

    * on train begin: restores the latest checkpoint known to `manager`, if any;
    * on each epoch begin: sets the optimizer learning rate from `lrfn`;
    * on each epoch end: saves a checkpoint through `manager`;
    * on train end: writes the model weights to `saved_model_path`.

    Args:
        checkpoint: tf.train.Checkpoint tracking the objects to restore.
        manager: tf.train.CheckpointManager used to find and save checkpoints.
        saved_model_path: destination of the final weights file.
    """
    def __init__(self, checkpoint, manager, saved_model_path):
        super(CustomCallback, self).__init__()
        # load and save model
        self.checkpoint = checkpoint
        self.manager = manager
        self.saved_model_path = saved_model_path

        # learning-rate schedule: linear ramp-up, optional plateau, exponential decay
        self.lr_start = 0.000001
        self.lr_max = 0.000005 * CFG.batch_size
        self.lr_min = 0.000001
        self.lr_ramp_ep = 5      # number of warm-up epochs
        self.lr_sus_ep = 0       # number of epochs held at lr_max
        self.lr_decay = 0.8      # per-epoch decay factor after the plateau

    def on_train_begin(self, logs=None):
        # Load the latest checkpoint when training begins. Use the manager this
        # callback was given rather than a global that happens to share its name.
        latest = self.manager.latest_checkpoint
        self.checkpoint.restore(latest)
        if latest:
          print("Restored from {}".format(latest))
        else:
          print("Initializing from scratch.")

    def on_train_end(self, logs=None):
        self.model.save_weights(self.saved_model_path)
        print('Save the model weights on end {}'.format(self.saved_model_path))

    def on_epoch_begin(self, epoch, logs=None):
        # update learning rate
        scheduled_lr = self.lrfn(epoch)
        tf.keras.backend.set_value(self.model.optimizer.lr, scheduled_lr)
        print("\nEpoch %05d: Learning rate is %6.6f." % (epoch, scheduled_lr))

    def on_epoch_end(self, epoch, logs=None):
        self.manager.save() # save a checkpoint after every epoch
        print('Save model on epoch {}'.format(epoch))

    def lrfn(self, epoch):
        """Return the learning rate for the given zero-based `epoch`."""
        # learning rate warmup: straight line from lr_start to lr_max
        if epoch < self.lr_ramp_ep:
            lr = (self.lr_max - self.lr_start) / self.lr_ramp_ep * epoch + self.lr_start
        # plateau at lr_max
        elif epoch < self.lr_ramp_ep + self.lr_sus_ep:
            lr = self.lr_max
        # exponential decay from lr_max towards lr_min
        else:
            lr = (self.lr_max - self.lr_min) * self.lr_decay**(epoch - self.lr_ramp_ep - self.lr_sus_ep) + self.lr_min
        return lr

In [None]:
saved_model_path = '/content/drive/MyDrive/shopee/my_model/effnetb4_arc.h5'

In [None]:
# Checkpoint object tracking the optimizer and model state, used to save and load the model.
checkpoint = tf.train.Checkpoint(optimizer = model.optimizer, model = model) # save and load the model
# Keep only the three most recent checkpoints in the Drive folder.
manager = tf.train.CheckpointManager(checkpoint, directory = "/content/drive/MyDrive/shopee/my_model/model/", max_to_keep = 3)
custom_cb = CustomCallback(checkpoint, manager, saved_model_path)

In [None]:
def training_model():
    """Train (or resume training of) the global `model` and return it.

    Relies on notebook globals: `model`, `manager`, `custom_cb`,
    `train_dataset`, `val_dataset` and `trn_image`.

    Training resumes at the epoch encoded in the latest checkpoint name
    (``<prefix>-<number>``) and starts from epoch 0 when no checkpoint exists.

    NOTE(review): the epoch count is the literal 20, not CFG.Epochs -- confirm
    which one is intended.

    Returns:
        The trained model.
    """
    print('\n')
    print('-'*50)
    # Seed everything
    seed_everything(CFG.seed)

    # One epoch = one pass over the training split; the validation images
    # are not part of train_dataset, so they must not be counted here.
    STEPS_PER_EPOCH = len(trn_image) // CFG.batch_size

    # latest_checkpoint is None on a first run; take the number after the last
    # '-' so dashes elsewhere in the path cannot be picked up by mistake.
    latest = manager.latest_checkpoint
    initial_epoch = int(latest.rsplit('-', 1)[-1]) if latest else 0

    K.clear_session()

    model.fit(train_dataset, steps_per_epoch = STEPS_PER_EPOCH,
              epochs = 20, callbacks = [custom_cb],
              validation_data = val_dataset, initial_epoch = initial_epoch,
              verbose = 1)
    print('\n')
    print('-'*50)
    print('Training Complete...')

    return model
# model = training_model()

In [None]:
# Rebuild the full training graph and load the trained weights into it.
reconstructed_model = create_model()
reconstructed_model.load_weights(saved_model_path)
# Re-wire inputs and outputs: keep only the image input and expose the embedding
# layer as output. The layer is looked up by name instead of by position, so the
# cut point does not silently move if layers are added or reordered.
reconstructed_model = tf.keras.models.Model(inputs = reconstructed_model.input[0], outputs = reconstructed_model.get_layer('head/bn2').output) 
reconstructed_model.build((None, CFG.img_size, CFG.img_size, 3))
reconstructed_model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input/image (InputLayer)     [(None, 384, 384, 3)]     0         
_________________________________________________________________
efficientnetb4 (Functional)  (None, None, None, 1792)  17673823  
_________________________________________________________________
head/pooling (GlobalAverageP (None, 1792)              0         
_________________________________________________________________
head/bn1 (BatchNormalization (None, 1792)              7168      
_________________________________________________________________
head/dropout (Dropout)       (None, 1792)              0         
_________________________________________________________________
head/dense (Dense)           (None, 512)               918016    
_________________________________________________________________
head/bn2 (BatchNormalization (None, 512)               2048

In [None]:
# Embed the validation images batch by batch. Each dataset element is
# ((image_batch, label_batch), label_batch); only the images are needed.
embeds = []
for (image_batch, _), _ in tqdm(val_dataset):
  # Calling the model directly avoids the per-call setup cost of predict(),
  # which is meant for whole datasets rather than one small batch at a time.
  # training=False keeps BatchNormalization and Dropout in inference mode.
  embeds.append(reconstructed_model(image_batch, training = False).numpy())

100%|██████████| 536/536 [01:51<00:00,  4.81it/s]


In [None]:
# Stack the per-batch embeddings into one (n_images, embedding_dim) array.
image_embeddings = np.concatenate(embeds)
# get_image_neighbors returns the frame it was given, so test_df_pred is the
# same object as val_labels; image_predictions holds one id array per row.
test_df_pred, image_predictions = get_image_neighbors(val_labels, image_embeddings, KNN = 50, threshold = 3.6) 

100%|██████████| 8563/8563 [00:01<00:00, 7503.42it/s]


In [None]:
image_embeddings.shape

(8563, 512)

In [None]:
image_predictions[4].shape

(1,)

In [None]:
len(image_predictions)

8563

In [None]:
def get_text_predictions(df, max_features=25000, threshold=0.75):
    """Match postings whose titles have a high TF-IDF similarity.

    Titles are vectorised with a binary TF-IDF model; the similarity of two
    titles is the dot product of their vectors. Similarities are computed
    chunk by chunk to bound memory.

    Args:
        df: DataFrame with 'title' and 'posting_id' columns.
        max_features: vocabulary size limit passed to TfidfVectorizer.
        threshold: a posting is matched when its similarity is strictly
            greater than this value (default 0.75, the previously fixed value).

    Returns:
        List with, for every row of `df`, a numpy array of matched posting ids.
    """
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    # Keep the matrix sparse: densifying all rows x max_features at once costs
    # gigabytes, while only one chunk of similarities is ever needed in memory.
    text_embeddings = model.fit_transform(df['title'])

    print('Finding similar titles...')
    CHUNK = 1024 * 4
    n_rows = len(df)

    preds = []
    for a in range(0, n_rows, CHUNK):
        b = min(a + CHUNK, n_rows)
        print('chunk', a, 'to', b)

        # COSINE SIMILARITY DISTANCE
        # Similarity of rows a..b against every row; dense only for this chunk.
        cts = text_embeddings[a:b].dot(text_embeddings.T).toarray()
        for k in range(b - a):
            IDX = np.where(cts[k] > threshold)[0]
            o = df.iloc[IDX].posting_id.values
            preds.append(o)

    del model, text_embeddings
    gc.collect()
    return preds

In [None]:
text_predictions = get_text_predictions(val_labels, max_features=25000)

Finding similar titles...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 8563


In [None]:
def combine_predictions(row):
  """Merge a row's image and text matches into one space-separated string.

  Args:
    row: mapping with 'image_predictions' and 'text_predictions' entries, each
      an array of posting ids.

  Returns:
    The distinct ids from both entries, sorted and joined with single spaces.
  """
  sources = (row['image_predictions'], row['text_predictions'])
  merged_ids = np.unique(np.concatenate(sources))
  return ' '.join(merged_ids)

In [None]:
# Attach both prediction lists to the frame (one array of posting ids per row).
test_df_pred['image_predictions'] = image_predictions
test_df_pred['text_predictions'] = text_predictions
# 'matches' = space-separated union of the image and text matches of each row.
test_df_pred['matches'] = test_df_pred.apply(combine_predictions, axis=1)
# Only posting_id and matches are written; the index is dropped.
test_df_pred[['posting_id', 'matches']].to_csv('/content/drive/MyDrive/shopee/submission.csv', index=False)

In [None]:
test_df_pred.head()

Unnamed: 0,posting_id,matches,f1,image_predictions,text_predictions
0,train_290863952,train_290863952,1.0,[train_290863952],[train_290863952]
1,train_2543555082,train_2543555082 train_3013055580,1.0,[train_2543555082],"[train_2543555082, train_3013055580]"
2,train_19057560,train_19057560,1.0,[train_19057560],[train_19057560]
3,train_3909851547,train_2640036529 train_3909851547 train_880041580,0.5,"[train_3909851547, train_880041580, train_2640...",[train_3909851547]
4,train_2578897315,train_2552330451 train_2578897315,1.0,[train_2578897315],"[train_2578897315, train_2552330451]"


In [None]:
test_df_pred = pd.read_csv('/content/drive/MyDrive/shopee/submission.csv')

In [None]:
test_df_pred.head()

Unnamed: 0,posting_id,matches
0,train_290863952,train_290863952
1,train_2543555082,train_2543555082 train_3013055580
2,train_19057560,train_19057560
3,train_3909851547,train_2640036529 train_3909851547 train_880041580
4,train_2578897315,train_2552330451 train_2578897315


In [None]:
# Ground truth of a posting = every posting of the validation split that shares
# its label_group (itself included). Scoring against posting_id alone would only
# measure how often a posting is matched with nothing but itself.
group_members = val_labels.groupby('label_group')['posting_id'].agg(lambda ids: ' '.join(ids))
target = val_labels['label_group'].map(group_members)
# The CSV was written from val_labels without its index, so rows line up by
# position; make sure that still holds before scoring row against row.
assert val_labels['posting_id'].tolist() == test_df_pred['posting_id'].tolist()
test_df_pred['f1'] = f1_score(target, test_df_pred['matches'])

In [None]:
test_df_pred['f1'].mean()

0.880404723800746

text: 0.9624643874643882
text+image: 0.5052306952306961
image: 0.7310256410256404