In [1]:
!nvidia-smi

Sat Jun 26 23:34:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!fuser -v /dev/nvidia0

In [4]:
!kill -9 2432  

/bin/bash: line 0: kill: (2432) - No such process


In [5]:
!ls

drive  sample_data


In [6]:
import os
import cv2
import math
import random
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm

import tensorflow as tf
print(tf.__version__)
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras import datasets, layers, optimizers, Sequential, metrics
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.applications import EfficientNetB0
import albumentations

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
AUTO = tf.data.experimental.AUTOTUNE

2.5.0


In [7]:
class CFG:
    seed = 54
    Epochs = 10
    # classes = 11014 
    scale = 30 
    margin = 0.5
    fc_dim = 512
    img_size = 384  #对应EfficientNetB4的输入分辨率 
    batch_size = 16
    channels = 3

In [8]:
def read_dataset(csv_path, image_path):
    df = pd.read_csv(csv_path)
    image_paths = image_path + df['image']
    return df, image_paths

In [9]:
image_path = '/content/drive/MyDrive/shopee/shopee-product-matching/train_images/'
csv_path = '/content/drive/MyDrive/shopee/shopee-product-matching/train.csv' # repalce the train.csv
df, image_paths = read_dataset(csv_path, image_path)

In [10]:
# # The pictures pulled out by kaggle are not complete, so you need to delete related non-existent paths
# df['isexist'] = df.apply(lambda x: 1 if os.path.isfile(image_path + x['image']) else 0, axis = 1)
# df = df[df['isexist'] == 1]
# df.drop(columns = ['isexist'], inplace=True)
# df.to_csv('/content/drive/MyDrive/shopee/shopee-product-matching/train_r.csv', sep = ',', index=None)

In [11]:
df.shape

(34250, 5)

In [12]:
labelencoder= LabelEncoder()
df['label_group'] = labelencoder.fit_transform(df['label_group'])
CFG.classes = df['label_group'].nunique()

In [13]:
CFG.classes

11014

In [14]:
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425


In [15]:
# train_df, test_df = train_test_split(df, test_size = 0.1)
# train_image_paths, test_image_paths = train_test_split(image_paths, test_size = 0.1)

In [16]:
def f1_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    intersection = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    len_y_pred = y_pred.apply(lambda x: len(x)).values
    len_y_true = y_true.apply(lambda x: len(x)).values
    f1 = 2 * intersection / (len_y_pred + len_y_true)
    return f1

In [17]:
#@title
def get_image_neighbors(df, embeddings, KNN=50):

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    
    threshold = 4.5
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)
        
    del model, distances, indices
    gc.collect()
    return df, predictions

In [18]:
#@title
def data_augment(image, label):
    p_spatial = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_rotate = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_1 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_2 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_3 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_crop = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
            
    # Flips
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    if p_spatial > .75:
        image = tf.image.transpose(image)
        
    # Rotates
    if p_rotate > .75:
        image = tf.image.rot90(image, k=3) 
    elif p_rotate > .5:
        image = tf.image.rot90(image, k=2) 
    elif p_rotate > .25:
        image = tf.image.rot90(image, k=1) 
        
    
    if p_pixel_1 >= .4:
        image = tf.image.random_saturation(image, lower=.7, upper=1.3)
    if p_pixel_2 >= .4:
        image = tf.image.random_contrast(image, lower=.8, upper=1.2)
    if p_pixel_3 >= .4:
        image = tf.image.random_brightness(image, max_delta=.1)
        
    
    if p_crop > .7:
        if p_crop > .9:
            image = tf.image.central_crop(image, central_fraction=.7)
        elif p_crop > .8:
            image = tf.image.central_crop(image, central_fraction=.8)
        else:
            image = tf.image.central_crop(image, central_fraction=.9)
    elif p_crop > .4:
        crop_size = tf.random.uniform([], int(CFG.img_size*.8), CFG.img_size, dtype=tf.int32)
        image = tf.image.random_crop(image, size=[crop_size, crop_size, CFG.channels])
    
    image = tf.image.resize(image, [CFG.img_size, CFG.img_size])
    return image, label

In [19]:
# Function to decode our images
def preprocess_image(image):
    
    image = tf.image.decode_jpeg(image, channels=3)
    #image = tf.image.convert_image_dtype(image, tf.float32)  # / 255.0  # normalize to [0,1], but effnet has normlayer
    image = tf.image.resize(image, (CFG.img_size, CFG.img_size))
    image = tf.cast(image, tf.float32)
    return image

def load_and_preprocess_image(image, label_group):
    image = tf.io.read_file(image)
    image = preprocess_image(image)
    return image, label_group

def get_training_dataset(image, label_group):
    dataset = tf.data.Dataset.from_tensor_slices((image, label_group))
    dataset = dataset.map(load_and_preprocess_image, num_parallel_calls = AUTO)
    # dataset = dataset.repeat()
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(CFG.batch_size)
    dataset = dataset.prefetch(AUTO)
    return dataset

# Function to get our validation dataset
def get_validation_dataset(image, label_group):
    dataset = tf.data.Dataset.from_tensor_slices((image, label_group))
    dataset = dataset.map(load_and_preprocess_image, num_parallel_calls = AUTO)
    dataset = dataset.batch(CFG.batch_size)
    dataset = dataset.prefetch(AUTO)
    return dataset

def train_and_eval_split(image, label_group):
    trn_image, val_image, trn_labels, val_labels = train_test_split(image, label_group, random_state = CFG.seed, shuffle = True)
    return trn_image, val_image, trn_labels, val_labels


In [20]:
trn_image, val_image, trn_labels, val_labels = train_test_split(image_paths, df['label_group'], random_state = CFG.seed, shuffle = True)
train_dataset = get_training_dataset(trn_image, trn_labels)
val_dataset = get_validation_dataset(val_image, val_labels)
# split the datasets
# train_ds = train_ds.cache(filename='./cache.tf-data').shuffle(buffer_size=1000).batch(CFG.batch_size).prefetch(AUTO) # cache会导致爆内存，当内存不够是使用文件进行缓存
# val_ds = val_ds.cache(filename='./cache.val-data').batch(CFG.batch_size).prefetch(AUTO) # no shuffle on test prefetch 预取数据，maybe无需使用cache

In [22]:
#@title
# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [24]:

def create_model():

    inp = tf.keras.layers.Input(shape=(CFG.img_size, CFG.img_size, 3))
    x = EfficientNetB0(include_top = False, weights = 'imagenet')(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(CFG.fc_dim, activation = 'relu')(x)
    output = tf.keras.layers.Dense(CFG.classes, activation = 'softmax')(x)

    model = tf.keras.models.Model(inputs = [inp], outputs = [output])

    return model

In [25]:
# 2. create_model
model = create_model()
model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 384, 384, 3)]     0         
_________________________________________________________________
efficientnetb0 (Functional)  (None, None, None, 1280)  4049571   
_________________________________________________________________
global_average_pooling2d (Gl (None, 1280)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1280)              5120      
_________________________________________________________________
dense (Dense)                (None, 512)               655872    
_________________________________________________________________
dense_1 (Dense)              (None, 11014)             5650182   
Total params: 10,360,745
Traina

In [26]:
# 3. loss and test
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
mean_accuracy_train = tf.keras.metrics.SparseCategoricalAccuracy() 

In [27]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [28]:
learning_rate = CustomSchedule(d_model = CFG.classes)
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)

In [29]:
# Save the model, because colab will crash at any time
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer = optimizer, model = model)

manager = tf.train.CheckpointManager(ckpt, '/content/drive/MyDrive/shopee/my_model_b0', max_to_keep=2)

In [None]:
# 手动保存权重

def training_model():
  
    ckpt.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
      print("Restored from {}".format(manager.latest_checkpoint))
    else:
      print("Initializing from scratch.")

    train_loss_results = []
    train_accuracy_results = []
 
    for epoch in tqdm(range(CFG.Epochs)):
      train_loss = tf.keras.metrics.Mean(name='train_loss')
      train_acc = tf.keras.metrics.Mean(name='train_acc')

      for images, labels in train_dataset:
        with tf.GradientTape() as tape:
          probs = model(images)
          loss_value = loss_object(labels, probs)
          acc = mean_accuracy_train(labels, probs)

        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # 追踪进度
        train_loss(loss_value)  # 添加当前的 batch loss
        # 比较预测标签与真实标签
        train_acc(acc)
 
        ckpt.step.assign_add(1)

        if int(ckpt.step) % 500 == 0:
          p_out = 'loss: {}, acc: {}'
          print(p_out.format(train_loss.result().numpy(), train_acc.result().numpy()))
 
      save_path = manager.save()
      print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))

      train_loss_results.append(train_loss.result())
      train_accuracy_results.append(train_acc.result())

      if epoch % 1 == 0:
        print("Epoch {:03d}: Loss: {:.4f}, Accuracy: {:.4%}".format(
            epoch,train_loss.result(), train_acc.result()))
        
    return model

model = training_model()


  0%|          | 0/10 [00:00<?, ?it/s][A

Restored from /content/drive/MyDrive/shopee/my_model_b0/ckpt-10
loss: 0.04215363785624504, acc: 0.9824244379997253
loss: 0.03807296231389046, acc: 0.984658420085907
loss: 0.04302948713302612, acc: 0.9851571917533875



 10%|█         | 1/10 [32:28<4:52:13, 1948.22s/it][A

Saved checkpoint for step 17667: /content/drive/MyDrive/shopee/my_model_b0/ckpt-11
Epoch 000: Loss: 0.0444, Accuracy: 98.5216%
loss: 0.041136227548122406, acc: 0.9858793616294861
loss: 0.0393037274479866, acc: 0.9859554171562195
loss: 0.039150919765233994, acc: 0.9860960841178894



 20%|██        | 2/10 [39:17<3:18:11, 1486.43s/it][A

Saved checkpoint for step 19273: /content/drive/MyDrive/shopee/my_model_b0/ckpt-12
Epoch 001: Loss: 0.0420, Accuracy: 98.6148%
loss: 0.03822874650359154, acc: 0.9863402247428894
loss: 0.035548239946365356, acc: 0.9864304065704346
loss: 0.03469464182853699, acc: 0.9865444302558899



 30%|███       | 3/10 [45:56<2:15:22, 1160.42s/it][A

Saved checkpoint for step 20879: /content/drive/MyDrive/shopee/my_model_b0/ckpt-13
Epoch 002: Loss: 0.0351, Accuracy: 98.6622%
loss: 0.033139199018478394, acc: 0.9869670271873474
loss: 0.030320556834340096, acc: 0.9871546030044556
loss: 0.03284657001495361, acc: 0.9873147010803223



 40%|████      | 4/10 [52:46<1:33:31, 935.18s/it] [A

Saved checkpoint for step 22485: /content/drive/MyDrive/shopee/my_model_b0/ckpt-14
Epoch 003: Loss: 0.0322, Accuracy: 98.7397%
loss: 0.09573417156934738, acc: 0.9876822829246521
loss: 0.02641565352678299, acc: 0.9877457022666931
loss: 0.028942985460162163, acc: 0.9878596663475037
loss: 0.02902979403734207, acc: 0.9879200458526611



 50%|█████     | 5/10 [59:35<1:04:47, 777.47s/it][A

Saved checkpoint for step 24091: /content/drive/MyDrive/shopee/my_model_b0/ckpt-15
Epoch 004: Loss: 0.0295, Accuracy: 98.7934%


In [None]:
saved_model_path = "/content/drive/MyDrive/shopee/my_model/tf_model/"
model.save_weights(saved_model_path)

In [None]:
reconstructed_model = create_model()
reconstructed_model.load_weights(saved_model_path)
# 重新取输入和输出 选取embedding层作为输出
reconstructed_model = tf.keras.models.Model(inputs = reconstructed_model.input[0], outputs = reconstructed_model.layers[-3].output) 
reconstructed_model.build((None, CFG.img_size, CFG.img_size, 3))
reconstructed_model.summary()

In [None]:
embeds = []
for images, labels in tqdm(val_ds):
      embeds.append(model.predict(images))

In [None]:
image_embeddings = np.concatenate(embeds)
test_df_pred, image_predictions = get_image_neighbors(df.iloc[SPLIT:,:], image_embeddings, KNN = 50) 

In [None]:
test_df_pred.head()

In [None]:
len(image_predictions)

In [None]:
def get_text_predictions(df, max_features=25000):
    
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    text_embeddings = model.fit_transform(df['title']).toarray()

    print('Finding similar titles...')
    CHUNK = 1024 * 4
    CTS = len(df) // CHUNK
    if (len(df)%CHUNK) != 0:
        CTS += 1

    preds = []
    for j in range( CTS ):
        a = j * CHUNK
        b = (j+1) * CHUNK
        b = min(b, len(df))
        print('chunk', a, 'to', b)

        # COSINE SIMILARITY DISTANCE
        cts = np.matmul(text_embeddings, text_embeddings[a:b].T).T
        for k in range(b-a):
            IDX = np.where(cts[k,]>0.75)[0]
            o = df.iloc[np.asarray(IDX)].posting_id.values
            preds.append(o)

    del model,text_embeddings
    gc.collect()
    return preds

In [None]:
text_predictions = get_text_predictions(df.iloc[SPLIT:,:], max_features=25000)

In [None]:
# test_df_pred = test_df_pred.iloc[SPLIT:,:]

In [None]:
def combine_predictions(row):
    x = np.concatenate([row['image_predictions'], row['text_predictions']])
    # x = row['image_predictions']
    return ' '.join( np.unique(x) )

In [None]:
test_df_pred['image_predictions'] = image_predictions
test_df_pred['text_predictions'] = text_predictions
test_df_pred['matches'] = test_df_pred.apply(combine_predictions, axis=1)
test_df_pred[['posting_id', 'matches']].to_csv('/content/drive/MyDrive/shopee/submission.csv', index=False)

In [None]:
test_df_pred.head()

In [None]:
test_df_pred = pd.read_csv('/content/drive/MyDrive/shopee/submission.csv')

In [None]:
test_df_pred.head()

In [None]:
test_df_pred['f1'] = f1_score(test_df_pred['posting_id'], test_df_pred['matches'])

In [None]:
test_df_pred['f1'].mean()