In [1]:
!nvidia-smi
!fuser -v /dev/nvidia0

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.

Specified filename /dev/nvidia0 does not exist.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!kill -9 2432  

/bin/bash: line 0: kill: (2432) - No such process


In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [3]:
!ls                                      

drive  sample_data


In [4]:
import os
import cv2
import math
import random
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm

import tensorflow as tf
print(tf.__version__)
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras import datasets, layers, optimizers, Sequential, metrics
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.applications import EfficientNetB4
import albumentations

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
AUTO = tf.data.experimental.AUTOTUNE


2.5.0


In [7]:
class CFG:
    """Notebook-wide hyper-parameters, accessed as plain class attributes."""

    # Reproducibility
    seed = 54

    # Input pipeline
    img_size = 380  # matches the input resolution of EfficientNetB4
    channels = 3
    batch_size = 8

    # Embedding and margin head
    classes = 11014
    fc_dim = 512
    scale = 30
    margin = 0.5

In [8]:
def read_dataset(csv_path, image_path):
    """Load the metadata CSV and derive the full path of every image.

    Args:
        csv_path: Path of a CSV file that contains an ``image`` column of file names.
        image_path: Prefix prepended verbatim to each file name (plain string
            concatenation, so it must already end with the path separator).

    Returns:
        A ``(frame, paths)`` tuple: the parsed DataFrame and a Series holding
        the prefixed image path of each row.
    """
    frame = pd.read_csv(csv_path)
    # radd(prefix) evaluates `prefix + value` element-wise.
    full_paths = frame['image'].radd(image_path)
    return frame, full_paths

In [9]:
# Locations of the image folder and the metadata CSV on the mounted Google Drive.
image_path = '/content/drive/MyDrive/shopee/shopee-product-matching/train_images/'
csv_path = '/content/drive/MyDrive/shopee/shopee-product-matching/train_r.csv' # train_r.csv replaces the original train.csv
# df holds the metadata rows; image_paths holds the matching full image paths.
df, image_paths = read_dataset(csv_path, image_path)

In [10]:
# # The pictures pulled out by kaggle are not complete, so you need to delete related non-existent paths
# df['isexist'] = df.apply(lambda x: 1 if os.path.isfile(image_path + x['image']) else 0, axis = 1)
# df = df[df['isexist'] == 1]
# df.drop(columns = ['isexist'], inplace=True)
# df.to_csv('/content/drive/MyDrive/shopee/shopee-product-matching/train_r.csv', sep = ',', index_col=None)

In [11]:
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
1,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
2,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
3,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069
4,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,2660605217


In [12]:
# Give every distinct label_group a dense integer id, numbered in order of first appearance.
unique_groups = df.label_group.unique()
dense_ids = range(df.label_group.nunique())
# NOTE(review): the two names read the opposite way round from their contents:
# `label2id` is keyed by the dense id, `id2label` is keyed by the raw label_group.
label2id = dict(zip(dense_ids, unique_groups))
id2label = dict(zip(unique_groups, dense_ids))
# Dense class index used as the training target.
df["labels"] = df["label_group"].map(id2label)

In [13]:
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,labels
0,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,0
1,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,1
2,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,2
3,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,3
4,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,2660605217,4


In [14]:
# train_df, test_df = train_test_split(df, test_size = 0.1)
# train_image_paths, test_image_paths = train_test_split(image_paths, test_size = 0.1)

In [15]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Args:
        y_true: Series whose entries are strings of ground-truth ids.
        y_pred: Series whose entries are strings of predicted ids.

    Returns:
        NumPy array with one value per row:
        ``2 * |true & pred| / (|true| + |pred|)``, where each side is the set
        of distinct tokens of the corresponding string.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]

    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    total_size = np.array([len(p) for p in pred_sets]) + np.array([len(t) for t in true_sets])
    return 2 * overlap / total_size

In [16]:
def get_image_neighbors(df, embeddings, KNN=50, threshold=4.5):
    """Predict matching postings from nearest neighbours in embedding space.

    Args:
        df: DataFrame with a ``posting_id`` column; row ``i`` must correspond
            to ``embeddings[i]``.
        embeddings: 2-D array with one embedding per row.
        KNN: Maximum number of neighbours inspected per row. It is capped at
            the number of rows, because ``kneighbors`` raises when asked for
            more neighbours than there are fitted samples.
        threshold: A neighbour is kept when its distance (NearestNeighbors'
            default metric) is strictly below this value.

    Returns:
        ``(df, predictions)`` where ``df`` is returned unchanged and
        ``predictions[i]`` is an array of the posting ids matched to row ``i``.
    """
    n_samples = embeddings.shape[0]
    model = NearestNeighbors(n_neighbors=min(KNN, n_samples))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Positional lookup table, extracted once instead of once per row.
    posting_ids = df['posting_id'].values

    predictions = []
    for k in tqdm(range(n_samples)):
        close_enough = distances[k] < threshold
        predictions.append(posting_ids[indices[k, close_enough]])

    del model, distances, indices
    gc.collect()
    return df, predictions

In [17]:
def data_augment(image, label):
    """Apply random flips, rotations, colour jitter and cropping to one image.

    Each stage is gated by its own uniform draw in [0, 1), so the stages are
    switched on independently of each other. The result is always resized
    back to ``CFG.img_size`` x ``CFG.img_size``; ``label`` is returned
    untouched.

    NOTE(review): assumes ``image`` is a float tensor of shape
    (CFG.img_size, CFG.img_size, CFG.channels) -- the random crop below
    requests up to CFG.img_size pixels per side. TODO confirm against caller.
    """
    p_spatial = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_rotate = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_1 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_2 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_3 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_crop = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
            
    # Flips
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    # Transpose in roughly a quarter of the cases.
    if p_spatial > .75:
        image = tf.image.transpose(image)
        
    # Rotates
    # rot90 with k = 3, 2, 1 or no rotation, one quarter of the draws each.
    if p_rotate > .75:
        image = tf.image.rot90(image, k=3) 
    elif p_rotate > .5:
        image = tf.image.rot90(image, k=2) 
    elif p_rotate > .25:
        image = tf.image.rot90(image, k=1) 
        
    
    # Colour jitter: saturation, contrast and brightness are each applied
    # when their own draw is >= 0.4.
    if p_pixel_1 >= .4:
        image = tf.image.random_saturation(image, lower=.7, upper=1.3)
    if p_pixel_2 >= .4:
        image = tf.image.random_contrast(image, lower=.8, upper=1.2)
    if p_pixel_3 >= .4:
        image = tf.image.random_brightness(image, max_delta=.1)
        
    
    # Cropping: p_crop > 0.7 -> central crop keeping 70/80/90 % of the image,
    # 0.4 < p_crop <= 0.7 -> random square crop, otherwise no crop.
    if p_crop > .7:
        if p_crop > .9:
            image = tf.image.central_crop(image, central_fraction=.7)
        elif p_crop > .8:
            image = tf.image.central_crop(image, central_fraction=.8)
        else:
            image = tf.image.central_crop(image, central_fraction=.9)
    elif p_crop > .4:
        # Side length is drawn between 80 % of CFG.img_size and CFG.img_size.
        crop_size = tf.random.uniform([], int(CFG.img_size*.8), CFG.img_size, dtype=tf.int32)
        image = tf.image.random_crop(image, size=[crop_size, crop_size, CFG.channels])
    
    # Cropping changes the spatial size, so bring every sample back to the model resolution.
    image = tf.image.resize(image, [CFG.img_size, CFG.img_size])
    return image, label

In [18]:
# Function to decode our images
def preprocess_image(image):
    """Decode JPEG bytes into a float32 RGB image of the configured size.

    ``tf.image.convert_image_dtype`` rescales the decoded uint8 pixels to
    [0, 1] while converting them to float32.

    NOTE(review): Keras EfficientNet models rescale their input internally
    and are documented to expect pixels in [0, 255]; whichever stage feeds
    the model has to account for that.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    target_size = [CFG.img_size, CFG.img_size]
    return tf.image.resize(as_float, target_size)

# Function to read our test image and return image
def load_and_preprocess_image(image, label):
    """Read the file at path ``image`` and decode it; ``label`` is passed through unchanged."""
    raw_bytes = tf.io.read_file(image)
    return preprocess_image(raw_bytes), label

def get_dataset(image, label, augment=True):
    """Build an unbatched ``tf.data`` pipeline of ``(image, label)`` pairs.

    Args:
        image: Sequence of image file paths.
        label: Sequence of labels aligned with ``image``.
        augment: When True (the default, matching the previous behaviour)
            every sample goes through ``data_augment``. Pass False for
            evaluation data that must be deterministic.

    Returns:
        A ``tf.data.Dataset`` yielding float32 images with pixel values in
        [0, 255] together with their labels.
    """
    def _to_model_range(img, lbl):
        # Decoding and augmentation work on [0, 1] floats, but the Keras
        # EfficientNet backbone applies its own 1/255 rescaling and expects
        # [0, 255]. Clip first because contrast/brightness jitter can
        # overshoot [0, 1].
        img = tf.clip_by_value(img, 0.0, 1.0) * 255.0
        return img, lbl

    dataset = tf.data.Dataset.from_tensor_slices((image, label))
    dataset = dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
    if augment:
        dataset = dataset.map(data_augment, num_parallel_calls=AUTO)
    dataset = dataset.map(_to_model_range, num_parallel_calls=AUTO)
    return dataset


In [19]:
# Number of leading rows used for training; the remaining rows are held out.
SPLIT = int(0.8*len(df))
# 1. datasets
dataset = get_dataset(image_paths, df.labels)
train_ds = dataset.take(SPLIT)
val_ds = dataset.skip(SPLIT)

# Batch the two splits.
# - No .repeat(): the training loop iterates the dataset once per epoch, and an
#   infinite dataset would keep the first epoch from ever finishing.
# - No .cache(): caching after the random augmentation replays the same
#   augmented pixels every epoch and keeps every decoded 380x380 float image
#   in memory.
train_ds = train_ds.shuffle(CFG.batch_size*20).batch(CFG.batch_size).prefetch(AUTO)
val_ds = val_ds.batch(CFG.batch_size).prefetch(AUTO) # no shuffle on test

In [20]:
# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer takes ``[embeddings, labels]`` and returns scaled cosine logits
    in which the entry of the labelled class has been replaced by
    ``cos(theta + m)``.

    Args:
        n_classes: Number of classes (columns of the weight matrix).
        s: Scale applied to the final logits.
        m: Angular margin in radians.
        easy_margin: If True, the margin is only applied where cosine > 0;
            otherwise the ``cos(pi - m)`` threshold is used and entries below
            it fall back to ``cosine - sin(pi - m) * m``.
        ls_eps: Label-smoothing factor applied to the one-hot mask (0 disables it).

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Margin constants are plain Python floats: they never change, and
        # floats broadcast against whatever dtype `cosine` has at call time.
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        """Create the class-centre matrix W of shape (embedding_dim, n_classes).

        ``input_shape`` is the pair of shapes for ``[embeddings, labels]``;
        only the embedding shape is used.
        """
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        """Compute the margin-adjusted, scaled logits for ``[X, y]``."""
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine between L2-normalised embeddings and L2-normalised class centres.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # Floor the radicand: rounding can push cosine^2 slightly above 1
        # (sqrt of a negative number is NaN), and the gradient of sqrt is
        # infinite at exactly 0.
        sine = tf.math.sqrt(tf.maximum(1.0 - tf.math.square(cosine), 1e-7))
        # cos(theta + m) = cos(theta) cos(m) - sin(theta) sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Use phi for the labelled class and the plain cosine everywhere else.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [21]:

class EffModel(Model):
    """Frozen EfficientNetB4 backbone followed by a small embedding head.

    Args:
        n_classes: Accepted for interface compatibility; not used by this class.
        img_size: Side length of the square RGB input.
        fc_dim: Width of the dense embedding layer.
    """

    def __init__(self, n_classes = CFG.classes,
        img_size = CFG.img_size,
        fc_dim = CFG.fc_dim
        ):
        super(EffModel, self).__init__()

        image_input = layers.Input(shape=(img_size, img_size, 3))
        self.backbone = EfficientNetB4(
            include_top=False,
            input_tensor=image_input,
            weights="imagenet",
        )
        # The pretrained backbone stays frozen; only the head below is trained.
        self.backbone.trainable = False

        # Spatial pooling of the backbone feature map.
        self.pooling = layers.GlobalAveragePooling2D(name="avg_pool")
        self.flatten = layers.Flatten()

        # Embedding head: dropout -> dense (ReLU) -> batch norm.
        self.dropout = layers.Dropout(rate = 0.1, name="dropout")
        self.classifier = layers.Dense(fc_dim, activation="relu", name="pred")
        self.bn = layers.BatchNormalization()

    def call(self, inputs):
        """Forward pass; identical to ``extract_features``."""
        return self.extract_features(inputs)

    def extract_features(self, x):
        """Map a batch of images to ``fc_dim``-dimensional embeddings."""
        pooled = self.flatten(self.pooling(self.backbone(x)))
        embedding = self.classifier(self.dropout(pooled))
        return self.bn(embedding)

In [22]:

def create_model():
    """Assemble the training model: image -> EffModel embedding -> ArcFace logits.

    The margin head needs the ground-truth label to place the margin, so the
    returned functional model takes two inputs, ``[image, label]``, and
    outputs one scaled logit per class.

    Returns:
        A ``tf.keras.Model`` with inputs ``input/image`` and ``input/label``.
    """
    backbone = EffModel()

    # Scale and margin come from CFG so that editing the config actually
    # changes the head (they used to be duplicated here as literals).
    margin = ArcMarginProduct(n_classes = CFG.classes,
                  s = CFG.scale,
                  m = CFG.margin,
                  name='head/arc_margin',
                  dtype='float32'
                  )

    image = tf.keras.layers.Input(shape=(CFG.img_size, CFG.img_size, 3), name='input/image')
    label = tf.keras.layers.Input((), name='input/label')

    x = backbone(image)
    x = margin([x, label]) # Archead

    model = tf.keras.models.Model(inputs = [image, label], outputs = x)

    return model

In [23]:
# 2. create_model
model = create_model()
model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb4_notop.h5
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input/image (InputLayer)        [(None, 380, 380, 3) 0                                            
__________________________________________________________________________________________________
eff_model (EffModel)            (None, 512)          18593887    input/image[0][0]                
__________________________________________________________________________________________________
input/label (InputLayer)        [(None,)]            0                                            
__________________________________________________________________________________________________
head/arc_margin (ArcMarginProdu (None, 11014)        5639168     eff_model[0][0]                 

In [24]:
# 3. loss and test
# The model outputs unnormalised scores, hence from_logits=True; labels are integer class ids.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Top-10 accuracy over integer labels. Keras metric objects keep running state
# until reset_states() is called.
mean_accuracy_train = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=10)

In [25]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)

In [26]:
# Save the model, because colab will crash at any time
# Tracks a step counter together with the optimizer and model state so all three are saved and restored as one unit.
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer = optimizer, model = model)

# Checkpoints are written to the Drive folder; only the two most recent ones are kept.
manager = tf.train.CheckpointManager(ckpt, '/content/drive/MyDrive/shopee/my_model', max_to_keep=2)

In [None]:
# Save the weights manually (periodic checkpoints inside the training loop)

def training_model(epochs=50):
    """Run the custom training loop, resuming from the latest checkpoint if any.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_object``,
    ``ckpt``, ``manager``, ``train_ds``, ``val_ds``, ``SPLIT`` and ``CFG``.

    Args:
        epochs: Number of passes over the training split.

    Returns:
        The trained notebook-level ``model``.
    """
    ckpt.restore(manager.latest_checkpoint)

    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    # One top-10 accuracy metric per split. A single shared metric that is
    # never reset would mix train and validation batches across all epochs.
    train_acc = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=10, name='train_acc')
    test_acc = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=10, name='test_acc')
    epoch_metrics = (train_loss, test_loss, train_acc, test_acc)

    # Bound every epoch explicitly so the loop also terminates when train_ds
    # repeats indefinitely.
    steps_per_epoch = math.ceil(SPLIT / CFG.batch_size)

    def train_step(images, labels):
        with tf.GradientTape() as tape:
            logits = model([images, labels], training = True)
            loss = loss_object(labels, logits)

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Metric bookkeeping needs no gradients, so it stays outside the tape.
        train_loss.update_state(loss)
        train_acc.update_state(labels, logits)

    def test_step(images, labels):
        logits = model([images, labels], training = False)
        loss = loss_object(labels, logits)

        test_loss.update_state(loss)
        test_acc.update_state(labels, logits)

    for epoch in tqdm(range(epochs)):

        for metric in epoch_metrics:
            metric.reset_states()

        for images, labels in train_ds.take(steps_per_epoch):
            train_step(images, labels)

            ckpt.step.assign_add(1)
            step = int(ckpt.step)
            if step % 100 == 0:
                p_out = 'loss: {}, acc: {}'
                print(p_out.format(train_loss.result().numpy(), train_acc.result().numpy()))

            if step % 500 == 0:
                save_path = manager.save()
                print("Saved checkpoint for step {}: {}".format(step, save_path))

        for images, labels in val_ds:
            test_step(images, labels)

        tmp = 'Epoch: {}, train_loss: {}, test_loss: {}, train_acc: {}, test_acc: {}'
        print(tmp.format(epoch+1, train_loss.result(), test_loss.result(),train_acc.result(), test_acc.result()))

    return model

model = training_model()

In [27]:
# Final weights go to their own folder. Saving them with a prefix inside the
# CheckpointManager directory ('.../my_model/') makes save_weights rewrite that
# directory's `checkpoint` state file, which the manager relies on to find the
# latest training checkpoint after a restart.
saved_model_path = "/content/drive/MyDrive/shopee/my_model_weights/final"
os.makedirs(os.path.dirname(saved_model_path), exist_ok=True)
model.save_weights(saved_model_path)

In [30]:
# Rebuild the full training graph so the saved weights can be loaded into it.
trained_model = create_model()
trained_model.load_weights(saved_model_path)
# Re-select input and output: keep only the image input and use the embedding
# sub-model as the output. The sub-model is located by type rather than by a
# positional index into `layers`, which depends on layer ordering.
embedding_layer = next(
    layer for layer in trained_model.layers if isinstance(layer, EffModel)
)
reconstructed_model = tf.keras.models.Model(inputs = trained_model.input[0], outputs = embedding_layer.output)
reconstructed_model.build((None, 380, 380, 3))
reconstructed_model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input/image (InputLayer)     [(None, 380, 380, 3)]     0         
_________________________________________________________________
eff_model_3 (EffModel)       (None, 512)               18593887  
Total params: 18,593,887
Trainable params: 919,040
Non-trainable params: 17,674,847
_________________________________________________________________


In [None]:
# Embed the held-out images batch by batch.
embeds = []
for images, _ in tqdm(val_ds):
    # Call the model directly: Model.predict builds its own data pipeline on
    # every invocation and is meant for whole datasets, not for one batch per
    # call inside a Python loop.
    embeds.append(reconstructed_model(images, training=False).numpy())

In [None]:
image_embeddings = np.concatenate(embeds)
# val_ds was built from the rows after the first SPLIT ones, so those rows are
# the metadata matching the embeddings. `test_df` was not defined anywhere else
# in the notebook.
test_df = df.iloc[SPLIT:].reset_index(drop=True)
assert len(test_df) == len(image_embeddings), "embeddings and metadata rows are misaligned"
test_df_pred, image_predictions = get_image_neighbors(test_df, image_embeddings, KNN = 50)

In [None]:
def get_text_predictions(df, max_features=25000, threshold=0.75, chunk_size=1024 * 4):
    """Predict matching postings from TF-IDF similarity of their titles.

    Args:
        df: DataFrame with ``title`` and ``posting_id`` columns.
        max_features: Vocabulary size limit passed to ``TfidfVectorizer``.
        threshold: A pair of titles matches when the dot product of their
            TF-IDF vectors is strictly above this value (the vectorizer's
            default L2 normalisation makes this the cosine similarity).
        chunk_size: Number of rows compared against the whole corpus at once.

    Returns:
        List with one array of matched posting ids per row of ``df``.
    """
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    # Keep the matrix sparse: densifying n_rows x max_features floats needs
    # several GB, while the sparse product gives the same similarities.
    text_embeddings = model.fit_transform(df['title'])
    posting_ids = df['posting_id'].values
    n_rows = len(df)

    print('Finding similar titles...')
    preds = []
    for a in range(0, n_rows, chunk_size):
        b = min(a + chunk_size, n_rows)
        print('chunk', a, 'to', b)

        # COSINE SIMILARITY DISTANCE
        # Only one (chunk x n_rows) block is dense at any time.
        cts = text_embeddings[a:b].dot(text_embeddings.T).toarray()
        for k in range(b - a):
            IDX = np.where(cts[k] > threshold)[0]
            preds.append(posting_ids[IDX])

    del model, text_embeddings
    gc.collect()
    return preds

In [None]:
text_predictions = get_text_predictions(test_df, max_features=25000)

In [None]:
len(text_predictions)

In [None]:
def combine_predictions(row):
    """Merge a row's image and text matches into one space-separated string.

    The two id arrays are concatenated, de-duplicated and sorted by
    ``np.unique`` before being joined.
    """
    sources = [row[column] for column in ('image_predictions', 'text_predictions')]
    merged_ids = np.unique(np.concatenate(sources))
    return ' '.join(merged_ids)

In [None]:
# Attach both prediction lists to the frame (one array of posting ids per row).
test_df_pred['image_predictions'] = image_predictions
test_df_pred['text_predictions'] = text_predictions
# Union of image and text matches, serialised as a space-separated string.
test_df_pred['matches'] = test_df_pred.apply(combine_predictions, axis=1)
# Persist only the two submission columns.
test_df_pred[['posting_id', 'matches']].to_csv('/content/drive/MyDrive/shopee/outputs/submission.csv', index=False)

In [None]:
test_df_pred = pd.read_csv('/content/drive/MyDrive/shopee/outputs/submission.csv')

In [None]:
test_df_pred.head()

In [None]:
# The ground truth for a posting is every evaluated posting that shares its
# label_group, not just the posting's own id.
label_of_posting = df.set_index('posting_id')['label_group']
eval_labels = test_df_pred['posting_id'].map(label_of_posting)
target = test_df_pred.groupby(eval_labels)['posting_id'].transform(lambda ids: ' '.join(ids))
test_df_pred['f1'] = f1_score(target, test_df_pred['matches'])

In [None]:
test_df_pred['f1'].mean()