## 라이브러리 모음

In [1]:
!pip install ../input/external-model/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/external-model/efficientnet-1.1.0-py3-none-any.whl

# 파일 처리
import os

# Data 처리
import pandas as pd
import numpy as np

##############################################################

# RAPIDS 라이브러리
import cudf, cuml, cupy 
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
##############################################################

# ML, DNN, CNN 관련 라이브러리
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Input, GlobalAveragePooling2D, Softmax

#!pip install efficientnet
import efficientnet.tfkeras as efn
import math
##############################################################

# 이미지 및 그래프 출력
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

##############################################################

# 해쉬(phash) 값 처리
#import imagehash

##############################################################

# Text Data NLP 처리
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.neighbors import NearestNeighbors
import re
import nltk
#nltk.download('popular')

from shutil import copyfile
copyfile(src = "../input/you-need-more-tensors-in-neighbourhood/tokenization.py", dst = "../working/tokenization.py")

import tokenization
import tensorflow_hub as hub

from sklearn.preprocessing import LabelEncoder
##############################################################

# 메모리 관리
import gc

# 경고메시지 지우기
import warnings
warnings.filterwarnings(action='ignore')

# 상태바 진행상태
from tqdm import tqdm

# Text Color
from termcolor import colored

# 실행시간 확인
import time
import datetime

Processing /kaggle/input/external-model/Keras_Applications-1.0.8-py3-none-any.whl
Installing collected packages: Keras-Applications
Successfully installed Keras-Applications-1.0.8
Processing /kaggle/input/external-model/efficientnet-1.1.0-py3-none-any.whl
Installing collected packages: efficientnet
Successfully installed efficientnet-1.1.0


## 메모리 관리

In [2]:
# Restrict TensorFlow to LIMIT GB of GPU RAM so that the rest of the card
# stays available for RAPIDS (cudf / cuml), which shares the same GPU.
LIMIT = 2.0  # in GB, per the messages printed below
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Only the first GPU is capped; LIMIT is multiplied by 1024 before being
        # passed as memory_limit (presumably megabytes — confirm against TF docs).
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Best effort: the error is reported and the notebook keeps running.
        # NOTE(review): presumably raised when the GPU was already initialised — confirm.
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# NOTE(review): the 16 assumes a GPU with 16 GB of memory — TODO confirm for the target machine.
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

We will restrict TensorFlow to max 2GB GPU RAM
then RAPIDS can use 14GB GPU RAM


## 변수 모음

In [3]:
# Target [height, width] passed to tf.image.resize in read_and_decode_img.
IMG_SIZE = [512, 512]

# Batch size for the image tf.data pipeline and for BERT model.predict.
BATCH_SIZE = 5

# Number of classes of the ArcMarginProduct head built in image_embedding.
N_CLASSES = 11014

# Random seed. NOTE(review): not referenced anywhere in the visible notebook.
SEED = 42

# True  -> work on train.csv and report cross-validation F1 scores.
# False -> work on test.csv only (set automatically by the CV-setup cell).
GET_CV = True

# When True (and GET_CV is True) read_dataset duplicates the training frame.
# presumably used to rehearse a larger submission-sized run — verify intent.
CHECK_SUB =False

## CV 설정

In [4]:
# Peek at test.csv only to decide which mode the notebook runs in.
df = pd.read_csv('../input/shopee-product-matching/test.csv')
# If we are committing (test.csv has more than 3 rows), use the test set
# instead of the train set and don't compute CV scores.
if len(df) > 3:
    GET_CV = False
# The frame was only needed for its length; read_dataset() reloads the data.
del df

## 함수 모음

In [5]:
# Load the dataset
def read_dataset():
    """Load the train or test table, depending on the global GET_CV flag.

    In CV mode (GET_CV is True) train.csv is read and a 'matches' column is
    added holding the space-separated posting_ids that share the row's
    label_group. When CHECK_SUB is also True the frame is stacked on top of
    itself and re-indexed. Otherwise test.csv is read unchanged.

    Returns:
        tuple: (pandas DataFrame, cudf DataFrame built from it, Series of
        image file paths for every row).
    """
    if not GET_CV:
        frame = pd.read_csv('../input/shopee-product-matching/test.csv')
        frame_gpu = cudf.DataFrame(frame)
        paths = '../input/shopee-product-matching/test_images/' + frame['image']
        return frame, frame_gpu, paths

    frame = pd.read_csv('../input/shopee-product-matching/train.csv')

    # label_group -> array of every posting_id that belongs to that group
    group_members = frame.groupby(['label_group'])['posting_id'].unique().to_dict()
    frame['matches'] = frame['label_group'].map(group_members)
    frame['matches'] = frame['matches'].map(lambda members: ' '.join(members))

    if CHECK_SUB:
        frame = pd.concat([frame, frame], axis=0)
        frame = frame.reset_index(drop=True)

    frame_gpu = cudf.DataFrame(frame)
    paths = '../input/shopee-product-matching/train_images/' + frame['image']
    return frame, frame_gpu, paths

# Merge the predictions
def combine_preds(row):
    """Union the image, text and phash predictions of one row.

    Args:
        row: mapping / DataFrame row with array-like 'image_pred',
            'text_pred' and 'phash_pred' entries of posting_ids.

    Returns:
        str: the distinct posting_ids in sorted order, joined by single spaces.
    """
    sources = (row['image_pred'], row['text_pred'], row['phash_pred'])
    distinct_ids = np.unique(np.concatenate(sources))
    return ' '.join(distinct_ids)


# F1 score function
def f1_score(t_true, t_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace and treated as a set; the score of a
    row is 2 * |true & pred| / (|true| + |pred|).

    Args:
        t_true: pandas Series of ground-truth id strings.
        t_pred: pandas Series of predicted id strings, aligned by position.

    Returns:
        numpy array with one F1 value per row.
    """
    true_sets = t_true.apply(lambda ids: set(ids.split()))
    pred_sets = t_pred.apply(lambda ids: set(ids.split()))

    shared = np.array([len(truth & guess) for truth, guess in zip(true_sets, pred_sets)])
    n_true = true_sets.apply(len).values
    n_pred = pred_sets.apply(len).values

    return 2 * shared / (n_true + n_pred)

# ArcFace loss layer
class ArcMarginProduct(Layer):
    '''
    Computes the GDis (geodesic distance) margin.
    Implements large margin arc distance.

    The layer takes a pair [features, labels] and returns scaled logits of
    shape (batch, n_classes): the cosine between the L2-normalised features
    and the L2-normalised class weights, with the margin-adjusted value used
    at the labelled class.

    Args:
        n_classes: number of classes (columns of the weight matrix W).
        s: factor the final logits are multiplied by.
        m: angular margin; cos(m) and sin(m) are precomputed from it.
        easy_margin: selects which fallback rule is used in call().
        ls_eps: label-smoothing epsilon; smoothing is applied only when > 0.
        **kwargs: forwarded to the Keras Layer constructor.
    '''
    
    def __init__(self, n_classes, s=30, m=0.5, easy_margin=False, ls_eps=0.0, **kwargs):
        '''Store the hyper-parameters and precompute the margin constants.'''
        
        super(ArcMarginProduct, self).__init__(**kwargs)
        
        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants for expanding cos(theta + m) = cos*cos_m - sin*sin_m.
        self.cos_m =tf.math.cos(m)
        self.sin_m =tf.math.sin(m)
        # th: cosine value below which the margin formula is replaced in call().
        self.th = tf.math.cos(math.pi - m)
        # mm: amount subtracted from the cosine in that fallback case.
        self.mm = tf.math.sin(math.pi - m) * m
        
    def get_config(self):
        '''Return the layer config extended with this layer's constructor arguments.'''
        config = super().get_config().copy()
        config.update({
            'n_classes':self.n_classes,
            's' : self.s,
            'm' : self.m,
            'ls_eps' : self.ls_eps,
            'easy_margin' : self.easy_margin
        })
        return config
    
    def build(self, input_shape):
        '''Create the class-weight matrix W of shape (feature_dim, n_classes).

        input_shape is indexed as a pair; element 0 is the feature shape and
        its last dimension gives feature_dim.
        '''
        super(ArcMarginProduct, self).build(input_shape[0])
        
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer = 'glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None
            )
        
    def call(self, inputs):
        '''Compute the margin-adjusted, scaled logits.

        Args:
            inputs: pair (X, y); X are the features, y the class labels
                (cast to int32 here).

        Returns:
            Tensor of logits multiplied by s.
        '''
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine between each normalised feature row and each normalised class column.
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        # NOTE(review): if rounding pushes cosine**2 slightly above 1 the sqrt
        # argument is negative and the result is NaN — consider clamping; confirm.
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # phi = cos(theta + m) via the angle-addition identity.
        phi = cosine * self.cos_m - sine * self.sin_m
        
        if self.easy_margin:
            # Keep the margin value only where the cosine is positive.
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            # Where cosine <= th, use the linear fallback cosine - mm instead.
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
            
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        
        if self.ls_eps > 0:
            # Label smoothing: spread ls_eps uniformly over all classes.
            one_hot = (1-self.ls_eps) * one_hot + self.ls_eps / self.n_classes
            
        # Margin-adjusted value at the labelled class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

# Find the K nearest neighbours
def get_neighbors(df, embeddings, KNN=50, image=True, threshold=None):
    """Predict matching postings for every row from its nearest neighbours.

    A NearestNeighbors model is fitted on `embeddings` and queried with the
    same embeddings. A neighbour counts as a match when its distance is
    below the threshold.

    When the global GET_CV flag is True, a range of thresholds is first
    evaluated against df['matches'] and the F1 score of each is printed
    (3.0-4.9 step 0.1 for images, 15-34 step 1 for text). This sweep is for
    reporting only: it overwrites df['pred_matches'] and df['f1'] with the
    values of the last threshold tried, and the returned predictions still
    use `threshold`, not the best threshold found.

    Args:
        df: DataFrame with a 'posting_id' column (and 'matches' in CV mode),
            row-aligned with `embeddings`. Mutated in place in CV mode.
        embeddings: 2-D array, one embedding per row of `df`.
        KNN: number of neighbours retrieved per row.
        image: True for image embeddings, False for text embeddings; selects
            the sweep range and the default threshold.
        threshold: distance cut-off for the returned predictions. Defaults to
            3.6 for images and 20.0 for text when None.

    Returns:
        tuple: (df, predictions) where predictions is a list holding, per
        row, an array of the matched posting_ids.
    """
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)
    n_rows = embeddings.shape[0]

    if GET_CV:
        if image:
            thresholds = list(np.arange(3.0, 5.0, 0.1))
        else:
            thresholds = list(np.arange(15, 35, 1))

        scores = []
        for candidate in thresholds:
            matched = _ids_within_threshold(df, distances, indices, candidate, range(n_rows))
            df['pred_matches'] = [' '.join(ids) for ids in matched]
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()

            print('Our F1 score for threshold {} is {}'.format(candidate, score))
            scores.append(score)

        thresholds_scores = pd.DataFrame({
            'thresholds': thresholds,
            'scores': scores
        })
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]

        print('Our best score : {} , threshold : {}'.format(best_score, best_threshold))

    if threshold is None:
        threshold = 3.6 if image else 20.0

    # The progress bar is shown only on the submission (non-CV) path, as before.
    rows = range(n_rows) if GET_CV else tqdm(range(n_rows))
    # The original non-CV branch indexed with an undefined name (`ids`) and so
    # raised NameError; both paths now share one correct implementation.
    predictions = _ids_within_threshold(df, distances, indices, threshold, rows)

    del model, distances, indices
    gc.collect()

    return df, predictions


def _ids_within_threshold(df, distances, indices, threshold, rows):
    """Return, for each row in `rows`, the posting_ids of neighbours closer than `threshold`."""
    matched = []
    for k in rows:
        close = np.where(distances[k,] < threshold)[0]
        neighbour_rows = indices[k, close]
        matched.append(df['posting_id'].iloc[neighbour_rows].values)
    return matched

############################################################

# read & decode image
def read_and_decode_img(image):
    """Load one JPEG file and turn it into a model-ready tensor.

    Args:
        image: path of the image file.

    Returns:
        float32 tensor with 3 channels, resized to IMG_SIZE and divided by 255.
    """
    raw_bytes = tf.io.read_file(image)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, IMG_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# dataset load
def get_dataset(image):
    """Build the tf.data input pipeline for a collection of image paths.

    Args:
        image: sliceable collection of image file paths.

    Returns:
        tf.data.Dataset yielding decoded images in batches of BATCH_SIZE,
        with parallel decoding and prefetching left to AUTOTUNE.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_and_decode_img, num_parallel_calls=autotune)
        .batch(BATCH_SIZE)
        .prefetch(autotune)
    )

# image embedding
def image_embedding(img_path):
    """Compute an EfficientNetB3 embedding for every image path.

    The training-time network (backbone -> global average pooling -> ArcFace
    margin head -> softmax) is rebuilt so the saved weights can be loaded,
    then cut down to an embedding model that takes only the image input.
    Prediction runs in chunks of 5000 paths.

    Args:
        img_path: sliceable, sized collection of image file paths.

    Returns:
        numpy array with one embedding row per path, in input order.
    """
    embeds = []

    margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )

    input_layer = Input(shape = (*IMG_SIZE, 3))
    label = Input(shape= ())

    x = efn.EfficientNetB3(weights = None, include_top=False)(input_layer)
    x = GlobalAveragePooling2D()(x)
    x = margin([x, label])

    output_layer = Softmax(dtype='float32')(x)

    model = Model(inputs=[input_layer, label],
                  outputs=[output_layer])
    model.load_weights('../input/shopeeefficientnetb3512/EfficientNetB3_512_42.h5')
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    # Keep only the image input; layers[-4] is presumably the
    # GlobalAveragePooling2D output — verify against the summary above.
    model = Model(inputs = model.input[0],
                  outputs = model.layers[-4].output)

    chunk = 5000
    # Chunk over the argument itself rather than the notebook-global `df`,
    # so the function works for any list of paths.
    for first in range(0, len(img_path), chunk):
        image_dataset = get_dataset(img_path[first:first + chunk])
        embeds.append(model.predict(image_dataset))

    del model

    image_embeddings = np.concatenate(embeds)
    print('Image embeddings shape :', image_embeddings.shape)

    del embeds
    gc.collect()
    return image_embeddings

# BERT encoding function
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenised, truncated to max_len - 2 tokens, wrapped in
    "[CLS]" ... "[SEP]", converted to ids and right-padded with zeros.

    Args:
        texts: iterable of strings.
        tokenizer: object providing tokenize(text) -> list of tokens and
            convert_tokens_to_ids(tokens) -> list of ids.
        max_len: length of every output row.

    Returns:
        tuple of three numpy arrays (token ids, padding masks, segment ids);
        masks are 1 on real tokens and 0 on padding, segment ids are all 0.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_limit = max_len - 2  # leave room for [CLS] and [SEP]

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_limit]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids.extend([0] * n_pad)

        token_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    gc.collect()

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

# BERT model transfer-learning function
def get_text_embeddings(df, max_len = 70):
    """Compute a BERT embedding for every title in `df`.

    The training-time network (BERT -> first-token vector -> ArcFace margin
    head -> softmax) is rebuilt so the saved weights can be loaded, then cut
    down to a model that takes only the three BERT inputs. Prediction runs in
    chunks of 5000 rows with progress messages printed per chunk.

    Args:
        df: DataFrame with a 'title' column.
        max_len: token length each title is truncated / padded to.

    Returns:
        numpy array with one embedding row per row of `df`.
    """
    embeds = []
    # BERT is loaded from a local copy of the hub module.
    module_url = "../input/external-model/bert_en_uncased_L-24_H-1024_A-16_1"
    # NOTE(review): trainable=True although only predict() is called here —
    # presumably needed so the weight layout matches the saved .h5; confirm.
    bert_layer = hub.KerasLayer(module_url, trainable = True)
    # The tokenizer is configured from assets shipped with the module.
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
    
    # text is a tuple (token ids, padding masks, segment ids).
    text = bert_encode(df['title'].values, tokenizer, max_len = max_len)
    
    # 11014 is the same value as the N_CLASSES constant used for the image model.
    margin = ArcMarginProduct(
            n_classes = 11014, 
            s = 30, 
            m = 0.5, 
            name='head/arc_margin', 
            dtype='float32'
            )
    
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    label = Input(shape = (), name = 'label')
    
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Vector at sequence position 0, i.e. the position of the "[CLS]" token.
    clf_output = sequence_output[:, 0, :]
    
    x = margin([clf_output, label])
    output = Softmax(dtype='float32')(x)
    
    model = Model(inputs = [input_word_ids, input_mask, segment_ids, label], 
                  outputs = [output])
    model.load_weights('../input/bert-baseline/Bert_123.h5')
    
    # Drop the label input and the classification head. layers[-4] is
    # presumably the sliced first-token output feeding the margin layer —
    # this depends on Keras' layer ordering; verify before changing the graph.
    model = Model(inputs = model.input[0:3], outputs = model.layers[-4].output)
    
    chunk = 5000
    # One float index per chunk: 0.0, 1.0, ... ceil(len(df) / chunk) - 1.
    iterator = np.arange(np.ceil(len(df) / chunk))
    for j in iterator:
        
        a = int(j * chunk)
        b = int((j + 1) * chunk)

        text_chunk = ((text[0][a:b], text[1][a:b], text[2][a:b]))
        
        # Progress messages: "<j+1>-th of <total> embedding jobs started / finished".
        print('총 {}번 작업 중 {}번째 embedding 시작'.format(int(iterator[-1])+1, int(j)+1))
        text_embeddings = model.predict(text_chunk, batch_size = BATCH_SIZE)
        embeds.append(text_embeddings)
        print('{}번째 embedding 끝'.format(int(j)+1))
        
    del model
    
    text_embeddings = np.concatenate(embeds)
    print('Our text embeddings shape :', text_embeddings.shape)
    
    del embeds
    gc.collect()
    
    return text_embeddings


## DataSet 설정

In [6]:
# Load the working table (train or test, depending on GET_CV), its cudf copy
# and the per-row image paths, then preview the first rows.
df, df_cu, img_paths = read_dataset()
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_129225211 train_2278313361
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3386243561 train_3423213080
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_2288590299 train_3803689425
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,train_2406599165 train_3342059966
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,train_3369186413 train_921438619


## Image embeddings

In [7]:
# Compute the image embeddings and report the wall-clock time taken.
start = time.time()

image_embeddings = image_embedding(img_paths)

print(image_embeddings.shape)

gc.collect()

sec = time.time() - start
# Split on '.' so that only the H:MM:SS part of the timedelta string is printed.
times = str(datetime.timedelta(seconds=sec)).split('.')
# Message reads "total run time".
print('총 실행시간 :', times[0])

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512, 512, 3) 0                                            
__________________________________________________________________________________________________
efficientnet-b3 (Functional)    (None, None, None, 1 10783528    input_1[0][0]                    
__________________________________________________________________________________________________
global_average_pooling2d (Globa (None, 1536)         0           efficientnet-b3[0][0]            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None,)]            0                                            
______________________________________________________________________________________________

## Text embeddings

In [8]:
# Compute the BERT title embeddings and report the wall-clock time taken.
start = time.time()

text_embeddings = get_text_embeddings(df)

print(text_embeddings.shape)

gc.collect()

sec = time.time() - start
# Split on '.' so that only the H:MM:SS part of the timedelta string is printed.
times = str(datetime.timedelta(seconds=sec)).split('.')
# Message reads "total run time".
print('총 실행시간 :', times[0])

총 7번 작업 중 1번째 embedding 시작
1번째 embedding 끝
총 7번 작업 중 2번째 embedding 시작
2번째 embedding 끝
총 7번 작업 중 3번째 embedding 시작
3번째 embedding 끝
총 7번 작업 중 4번째 embedding 시작
4번째 embedding 끝
총 7번 작업 중 5번째 embedding 시작
5번째 embedding 끝
총 7번 작업 중 6번째 embedding 시작
6번째 embedding 끝
총 7번 작업 중 7번째 embedding 시작
7번째 embedding 끝
Our text embeddings shape : (34250, 1024)
(34250, 1024)
총 실행시간 : 0:06:47


## image embedding 값으로 prediction data 구하기

In [10]:
# Match postings by image embedding (100 neighbours per row) and time the step.
start = time.time()

df, image_predictions = get_neighbors(df, image_embeddings, KNN=100, image=True)

gc.collect()

sec = time.time() - start
# Split on '.' so that only the H:MM:SS part of the timedelta string is printed.
times = str(datetime.timedelta(seconds=sec)).split('.')
# Message reads "total run time".
print('총 실행시간 :', times[0])

# In CV mode get_neighbors leaves 'pred_matches' and 'f1' columns on df.
df.head()

Our F1 score for threshold 3.0 is 0.7060730652202043
Our F1 score for threshold 3.1 is 0.7160522989781256
Our F1 score for threshold 3.2 is 0.7277696148321522
Our F1 score for threshold 3.3000000000000003 is 0.7397825990142964
Our F1 score for threshold 3.4000000000000004 is 0.7516964513614298
Our F1 score for threshold 3.5000000000000004 is 0.7640264855741594
Our F1 score for threshold 3.6000000000000005 is 0.7772109520816285
Our F1 score for threshold 3.7000000000000006 is 0.7903100905418591
Our F1 score for threshold 3.8000000000000007 is 0.8030970222483025
Our F1 score for threshold 3.900000000000001 is 0.8153789968925708
Our F1 score for threshold 4.000000000000001 is 0.8257123577977383
Our F1 score for threshold 4.100000000000001 is 0.833837438596564
Our F1 score for threshold 4.200000000000001 is 0.83790237211349
Our F1 score for threshold 4.300000000000001 is 0.8365055765380428
Our F1 score for threshold 4.400000000000001 is 0.8268638556543608
Our F1 score for threshold 4.50000

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,pred_matches,f1
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_129225211 train_2278313361,train_129225211,0.666667
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3386243561 train_3423213080,train_3386243561 train_3423213080 train_183194...,0.222222
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_2288590299 train_3803689425,train_2288590299 train_3803689425,1.0
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,train_2406599165 train_3342059966,train_2406599165 train_3576714541 train_334205...,0.105263
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,train_3369186413 train_921438619,train_3369186413 train_921438619 train_2522158...,0.039216


## text embeddings 값으로 prediction data 구하기

In [11]:
# Match postings by text embedding (100 neighbours per row) and time the step.
start = time.time()

df, text_predictions = get_neighbors(df, text_embeddings, KNN=100, image=False)

gc.collect()

sec = time.time() - start
# Split on '.' so that only the H:MM:SS part of the timedelta string is printed.
times = str(datetime.timedelta(seconds=sec)).split('.')
# Message reads "total run time".
print('총 실행시간 :', times[0])

# In CV mode 'pred_matches' and 'f1' now hold the values from the text sweep.
df.head()

Our F1 score for threshold 15 is 0.7238646050555128
Our F1 score for threshold 16 is 0.7521519309645522
Our F1 score for threshold 17 is 0.7735370949071356
Our F1 score for threshold 18 is 0.7888929328931714
Our F1 score for threshold 19 is 0.8005198328263164
Our F1 score for threshold 20 is 0.8101082361910849
Our F1 score for threshold 21 is 0.8186452450831521
Our F1 score for threshold 22 is 0.8258172895366394
Our F1 score for threshold 23 is 0.8323012301249347
Our F1 score for threshold 24 is 0.8358256455775362
Our F1 score for threshold 25 is 0.8350281588717811
Our F1 score for threshold 26 is 0.8210681058884545
Our F1 score for threshold 27 is 0.7754930829460942
Our F1 score for threshold 28 is 0.6772034216911009
Our F1 score for threshold 29 is 0.46215415549048783
Our F1 score for threshold 30 is 0.11412530606190396
Our F1 score for threshold 31 is 0.09992188838949856
Our F1 score for threshold 32 is 0.09992188838949856
Our F1 score for threshold 33 is 0.09992188838949856
Our F1 

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,pred_matches,f1
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_129225211 train_2278313361,train_129225211 train_2278313361 train_4025803...,0.039216
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3386243561 train_3423213080,train_3386243561 train_3423213080 train_380550...,0.039216
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_2288590299 train_3803689425,train_2288590299 train_3803689425 train_295625...,0.039216
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,train_2406599165 train_3342059966,train_2406599165 train_1744956981 train_352677...,0.039216
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,train_3369186413 train_921438619,train_3369186413 train_921438619 train_2579931...,0.039216


## 해시값으로 동일상품 분류하기

In [12]:
# Group identical products by perceptual-hash value: every row gets the array
# of posting_ids that share its image_phash (always including the row itself).
tmp = df.groupby('image_phash').posting_id.unique().to_dict()
df['phash_pred'] = df.image_phash.map(tmp)

df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,pred_matches,f1,phash_pred
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_129225211 train_2278313361,train_129225211 train_2278313361 train_4025803...,0.039216,[train_129225211]
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3386243561 train_3423213080,train_3386243561 train_3423213080 train_380550...,0.039216,[train_3386243561]
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_2288590299 train_3803689425,train_2288590299 train_3803689425 train_295625...,0.039216,[train_2288590299]
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,train_2406599165 train_3342059966,train_2406599165 train_1744956981 train_352677...,0.039216,[train_2406599165]
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,train_3369186413 train_921438619,train_3369186413 train_921438619 train_2579931...,0.039216,[train_3369186413]


## 예측값 결합

In [15]:
# Attach both prediction lists; this is needed in CV and submission mode alike.
df['image_pred'] = image_predictions
df['text_pred'] = text_predictions

if GET_CV:
    # CV mode: score the combined predictions against the ground truth first,
    # then overwrite 'matches' with the predictions for the submission file.
    df['pred_matches'] = df.apply(combine_preds, axis = 1)
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df.f1.mean()
    print('Final F1 CV Score :', score)
    df['matches'] = df['pred_matches']
else:
    # Submission mode: the combined predictions are the answer.
    df['matches'] = df.apply(combine_preds, axis = 1)

df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

Final F1 CV Score : 0.8976507157068873
