# Train-Baseline

In [36]:
# DATA_PATH = '../input/'
# 数据输入路径
DATA_PATH = '../input/shopee-product-matching/'

In [37]:
import numpy as np # linear algebra
import pandas as pd, gc # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
from tqdm import tqdm_notebook

import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

In [38]:
# 计算F1 score
def getMetric(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score

In [39]:
# 标识是不是计算验证得分
COMPUTE_CV = True

test = pd.read_csv(DATA_PATH + 'test.csv')
if len(test)>3: COMPUTE_CV = False
# 如果测试集大于3行，进入提交模式
else: print('this submission notebook will compute CV score, but commit notebook will not')

# COMPUTE_CV = False

if COMPUTE_CV:
    train = pd.read_csv(DATA_PATH + 'train.csv')
    # train['image'] = DATA_PATH + 'train_images/' + train['image']
    tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
    train['target'] = train.label_group.map(tmp)
    # train_gf = cudf.read_csv(DATA_PATH + 'train.csv')
    train_gf = cudf.DataFrame(train)
else:
    train = pd.read_csv(DATA_PATH + 'test.csv')
    # train['image'] = DATA_PATH + 'test_images/' + train['image']
    # train_gf = cudf.read_csv(DATA_PATH + 'test.csv')
    train_gf = cudf.DataFrame(train)
    
print('train shape is', train.shape )

this submission notebook will compute CV score, but commit notebook will not
train shape is (34250, 6)


In [None]:
train.head()

# image hash

In [None]:
# 相同哈希值当作一组
tmp = train.groupby('image_phash').posting_id.agg('unique').to_dict()
train['oof_hash'] = train.image_phash.map(tmp)
train.head()

In [None]:
if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_hash'),axis=1)
    print('CV score for baseline =',train.f1.mean())

# 聚类包括自己的，应该是大于0.5的

# Efficientnetb0

In [40]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0

In [41]:
# RESTRICT TENSORFLOW TO 1GB OF GPU RAM
# SO THAT WE HAVE 15GB RAM FOR RAPIDS
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

We will restrict TensorFlow to max 1GB GPU RAM
then RAPIDS can use 15GB GPU RAM


In [42]:
class DataGenerator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, df, img_size=256, batch_size=32, path=''): 
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange( len(self.df) )
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        ct = len(self.df) // self.batch_size
        ct += int(( (len(self.df)) % self.batch_size)!=0)
        return ct

    def __getitem__(self, index):
        'Generate one batch of data'
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X = self.__data_generation(indexes)
        return X
            
    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' 
        X = np.zeros((len(indexes),self.img_size,self.img_size,3),dtype='float32')
        df = self.df.iloc[indexes]
        for i,(index,row) in enumerate(df.iterrows()):
            img = cv2.imread(self.path+row.image)
            X[i,] = cv2.resize(img,(self.img_size,self.img_size)) #/128.0 - 1.0
        return X

In [None]:
# model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg', input_shape=None)
# model.save_weights('efficientnetb0_notop.h5')

In [43]:
BASE = '../input/shopee-product-matching/test_images/'
if COMPUTE_CV: BASE = '../input/shopee-product-matching/train_images/'

WGT = '../input/effnetb0/efficientnetb0_notop.h5'
model = EfficientNetB0(weights=WGT,include_top=False, pooling='avg', input_shape=None)

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
CTS = len(train)//CHUNK
if len(train)%CHUNK!=0: CTS += 1
for i,j in enumerate( range( CTS ) ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(train))
    print('chunk',a,'to',b)
    
    test_gen = DataGenerator(train.iloc[a:b], batch_size=32, path=BASE)
    image_embeddings = model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)

    #if i>=1: break
    
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)

Computing image embeddings...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250
image embeddings shape (34250, 1280)


In [44]:
KNN = 50
if len(train)==3: KNN = 2
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)

NearestNeighbors(n_neighbors=50, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7fcf4a816fb0>, algorithm='brute', metric='euclidean', p=2, metric_params=None, output_type='numpy')

In [None]:
CHUNK = 1024*4
distance = []

print('Finding similar images...')
CTS = len(image_embeddings)//CHUNK
if len(image_embeddings)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(image_embeddings))
    print('chunk',a,'to',b)
    distances, indices = model.kneighbors(image_embeddings[a:b,])
    distance.append(distances)
    
temp = np.array(distance)
plt.hist(temp.flatten(),bins = 100)
plt.show()

We notice that there is a tail to the left indicating duplicating items in the histogram. The spike at dist = 0 are the images that are exact duplicates and low numbers are near duplicates. So choosing 6.0 will cut that tail off.

Second, we try different distances and choose the one that maximizes CV score.

In [45]:
def get_knn_preds(df, embeddings, knnmodel, threshold = 6.0, PRINT_CHUNK=False):
    preds = []
    CHUNK = 1024*4

    print('Finding similar embeddings...')
    CTS = len(embeddings)//CHUNK
    if len(embeddings)%CHUNK!=0: CTS += 1
    for j in range( CTS ):

        a = j*CHUNK
        b = (j+1)*CHUNK
        b = min(b,len(embeddings))
        if PRINT_CHUNK:
            print('chunk', a, 'to', b)

        distances, indices = knnmodel.kneighbors(embeddings[a:b,])

        for k in range(b-a):
            IDX = np.where(distances[k,]<threshold)[0]
            IDS = indices[k,IDX]
            o = df.iloc[IDS].posting_id.values
            preds.append(o)
    return preds

In [46]:
def distance_threshold_searching(df, embeddings, knnmodel, LB=4.0, UB=7.0):
    df1 = pd.DataFrame(columns = ['target', 'pred_matches'])
    df1.target = df.target
        
    thresholds = list(np.arange(LB, UB, 0.2))
    scores = []
    for threshold in thresholds:
        preds = get_knn_preds(df, embeddings, knnmodel, threshold)
        df1.pred_matches = preds
        MyCVScore = df1.apply(getMetric('pred_matches'), axis=1)
        score = MyCVScore.mean()
        print(f'CV score for threshold {round(threshold, 2)} = {score}')
        scores.append(score)
    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
    max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
    best_threshold = max_score['thresholds'].values[0]
    best_score = max_score['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {round(best_threshold, 2)}')

In [47]:
distance_threshold_searching(train, image_embeddings, model, LB=6.0, UB=7.6)

Finding similar embeddings...
CV score for threshold 4.0 = 0.604331913022176
Finding similar embeddings...
CV score for threshold 4.2 = 0.6085015290133909
Finding similar embeddings...
CV score for threshold 4.4 = 0.6126903603905952
Finding similar embeddings...
CV score for threshold 4.6 = 0.6166444106514297
Finding similar embeddings...
CV score for threshold 4.8 = 0.6200762311910106
Finding similar embeddings...
CV score for threshold 5.0 = 0.6241102965641212
Finding similar embeddings...
CV score for threshold 5.2 = 0.6275635861413381
Finding similar embeddings...
CV score for threshold 5.4 = 0.630622327260816
Finding similar embeddings...
CV score for threshold 5.6 = 0.6340223808341016
Finding similar embeddings...
CV score for threshold 5.8 = 0.6369304688092922
Finding similar embeddings...
CV score for threshold 6.0 = 0.6393196101848189
Finding similar embeddings...
CV score for threshold 6.2 = 0.6421880171402373
Finding similar embeddings...
CV score for threshold 6.4 = 0.64398

In [50]:
preds = get_knn_preds(train, image_embeddings, model, threshold=7.0)
train['oof_enet'] = preds

if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_enet'),axis=1)
    print('CV score for baseline =',train.f1.mean())

Finding similar embeddings...
CV score for baseline = 0.646752767970401


# image CNN

In [None]:
from PIL import Image

import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

# 自定义一个数据集
# 实例化d=ShopeeImageDataset()
# d[10] getitem
# len(d)
class ShopeeImageDataset(Dataset):
    def __init__(self, img_path, transform):
        self.img_path = img_path
        self.transform = transform
        
    def __getitem__(self, index):
        img = Image.open(self.img_path[index]).convert('RGB')
        img = self.transform(img)
        return img
    
    def __len__(self):
        return len(self.img_path)

In [None]:
# 实例化

imagedataset = ShopeeImageDataset(
    train['image'].values,
    transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),# pillow->tensor,0-255=>0-1
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]))

# dataloader批量读取，batch_size=10,shuffle不打乱
imageloader = torch.utils.data.DataLoader(
    imagedataset,
    batch_size=10, shuffle=False, num_workers=2
)

In [None]:
# 用resnet18
class ShopeeImageEmbeddingNet(nn.Module):
    def __init__(self):
        super(ShopeeImageEmbeddingNet, self).__init__()
        
        #
        model = models.resnet18(True) # True表示使用预训练参数
        # mean-pooling=>max-pooling 会好一些
        model.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))
        
        model = nn.Sequential(*list(model.children())[:-1])
        
        # 原始image_net1000类，不需要全连接，只需要embedding
        model.eval()# 关闭bn，关闭dropout
        self.model = model
    # 正向传播
    def forward(self, img):        
        out = self.model(img)
        return out

In [None]:
# 本地与训练模型保存的文件夹
!mkdir -p /root/.cache/torch/hub/checkpoints/
!cp ../input/pretrained-pytorch-models/resnet18-5c106cde.pth /root/.cache/torch/hub/checkpoints/

In [None]:
DEVICE = 'cuda'

imgmodel = ShopeeImageEmbeddingNet()
imgmodel = imgmodel.to(DEVICE)

imagefeat = []
with torch.no_grad():
    for data in tqdm_notebook(imageloader):
        data = data.to(DEVICE)
        feat = imgmodel(data)
        
        feat = feat.reshape(feat.shape[0], -1)
        
        feat = feat.data.cpu().numpy()
        
        imagefeat.append(feat)

In [None]:
from sklearn.preprocessing import normalize

# l2 norm to kill all the sim in 0-1
imagefeat = np.vstack(imagefeat)
imagefeat = normalize(imagefeat)

In [None]:
'''
KNN = 50
if len(test)==3: KNN = 2
model = NearestNeighbors(n_neighbors=KNN)
model.fit(imagefeat)
'''

In [None]:
preds = []
# 4096一批计算相似度
CHUNK = 1024*4

imagefeat = cupy.array(imagefeat)

print('Finding similar images...')
CTS = len(imagefeat)//CHUNK
if len(imagefeat)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b, len(imagefeat))
    print('chunk',a,'to',b)
    
    distances = cupy.matmul(imagefeat, imagefeat[a:b].T).T
    # distances = np.dot(imagefeat[a:b,], imagefeat.T)
    
    for k in range(b-a):
        # 如果相似度大于0.95就算
        IDX = cupy.where(distances[k,]>0.95)[0]
        # IDX = np.where(distances[k,]>0.95)[0][:]
        o = train.iloc[cupy.asnumpy(IDX)].posting_id.values
        preds.append(o)
        
# del imagefeat, imgmodel

In [None]:
train['oof_cnn'] = preds

if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_cnn'),axis=1)
    print('CV score for baseline =',train.f1.mean())

# title TFIDF

In [51]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
tqdm.pandas()

def preprocess_text(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    # sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    sentence = sentence.split()
    sentence = [word for word in sentence if word not in stopwords.words('english')]
    wl = WordNetLemmatizer()
    sentence = [wl.lemmatize(word) for word in sentence]
    sentence = ' '.join(sentence)
    return sentence

train['title_clean'] = train['title'].progress_apply(preprocess_text)

  from pandas import Panel
100%|██████████| 34250/34250 [00:38<00:00, 893.67it/s]


In [None]:
train.head()

In [52]:
train_gf['title_clean'] = train['title_clean']
train_gf

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,title_clean
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]",paper bag victoria secret
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]","double tape 3m vhb 12 mm x 4,5 original / doub..."
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",maling tt canned pork luncheon meat 397 gr
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]",daster batik lengan pendek - motif acak / camp...
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]",nescafe \xc3\x89clair latte 220ml
...,...,...,...,...,...,...,...
34245,train_4028265689,fff1c07ceefc2c970a7964cfb81981c5.jpg,e3cd72389f248f21,Masker Bahan Kain Spunbond Non Woven 75 gsm 3 ...,3776555725,"[train_2829161572, train_4028265689]",masker bahan kain spunbond non woven 75 gsm 3 ...
34246,train_769054909,fff401691371bdcb382a0d9075dfea6a.jpg,be86851f72e2853c,MamyPoko Pants Royal Soft - S 70 - Popok Celana,2736479533,"[train_1463059254, train_769054909]",mamypoko pant royal soft - 70 - popok celana
34247,train_614977732,fff421b78fa7284284724baf249f522e.jpg,ad27f0d08c0fcbf0,KHANZAACC Robot RE101S 1.2mm Subwoofer Bass Me...,4101248785,"[train_4126022211, train_3926241003, train_232...",khanzaacc robot re101s 1.2mm subwoofer bass me...
34248,train_3630949769,fff51b87916dbfb6d0f8faa01bee67b8.jpg,e3b13bd1d896c05c,"Kaldu NON MSG HALAL Mama Kamu Ayam Kampung , S...",1663538013,"[train_3419392575, train_1431563868, train_363...","kaldu non msg halal mama kamu ayam kampung , s..."


In [53]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# 25000 
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
# text_embeddings = model.fit_transform(train_gf.title).toarray()
text_embeddings = model.fit_transform(train_gf.title_clean).toarray()
print('text embeddings shape',text_embeddings.shape)

text embeddings shape (34250, 24652)


In [54]:
def get_text_preds(df, embeddings, threshold=0.75, PRINT_CHUNK=False):
    print('Finding similar titles...')
    CHUNK = 1024 * 4
    CTS = len(df) // CHUNK
    if (len(df)%CHUNK) != 0:
        CTS += 1

    preds = []
    for j in range( CTS ):
        a = j * CHUNK
        b = (j+1) * CHUNK
        b = min(b, len(df))
        if PRINT_CHUNK:
            print('chunk', a, 'to', b)

        # COSINE SIMILARITY DISTANCE
        cts = cupy.matmul(embeddings, embeddings[a:b].T).T
        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)
            
    return preds

In [55]:
def cos_threshold_searching(df, embeddings, LB=0.70, UB=0.80):
    df1 = pd.DataFrame(columns = ['target', 'pred_matches'])
    df1.target = df.target
        
    thresholds = list(np.arange(LB, UB, 0.02))
    scores = []
    for threshold in thresholds:
        preds = get_text_preds(df, embeddings, threshold)
        df1.pred_matches = preds
        MyCVScore = df1.apply(getMetric('pred_matches'), axis=1)
        score = MyCVScore.mean()
        print(f'CV score for threshold {round(threshold, 2)} = {score}')
        scores.append(score)
    thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
    max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
    best_threshold = max_score['thresholds'].values[0]
    best_score = max_score['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {round(best_threshold, 2)}')

In [56]:
cos_threshold_searching(train, text_embeddings, LB=0.50, UB=0.60)

Finding similar titles...
CV score for threshold 0.5 = 0.6591369293684559
Finding similar titles...
CV score for threshold 0.52 = 0.6610204806839047
Finding similar titles...
CV score for threshold 0.54 = 0.6614022356034018
Finding similar titles...
CV score for threshold 0.56 = 0.660011710952917
Finding similar titles...
CV score for threshold 0.58 = 0.6569429099082632
Our best score is 0.6614022356034018 and has a threshold 0.54


In [57]:
preds = get_text_preds(train, text_embeddings, threshold=0.54, PRINT_CHUNK=False)
train['oof_text'] = preds

if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_text'),axis=1)
    print('CV score for baseline =',train.f1.mean())

Finding similar titles...
CV score for baseline = 0.6614022356034018


In [58]:
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,oof_enet,f1,title_clean,oof_text
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]",[train_129225211],1.0,paper bag victoria secret,"[train_129225211, train_2278313361]"
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]",0.666667,"double tape 3m vhb 12 mm x 4,5 original / doub...","[train_3386243561, train_3423213080, train_183..."
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",[train_2288590299],1.0,maling tt canned pork luncheon meat 397 gr,"[train_2288590299, train_3803689425]"
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]",[train_2406599165],0.2,daster batik lengan pendek - motif acak / camp...,"[train_2406599165, train_3576714541, train_150..."
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]","[train_3369186413, train_921438619]",0.666667,nescafe \xc3\x89clair latte 220ml,[train_3369186413]


In [60]:
def combine_for_sub(row):
    # x = np.concatenate([row.oof_text,row.oof_cnn, row.oof_hash])
    x = np.concatenate([row.oof_text, row.oof_enet])
    return ' '.join( np.unique(x) )

def combine_for_cv(row):
    # x = np.concatenate([row.oof_text,row.oof_cnn, row.oof_hash])
    x = np.concatenate([row.oof_text, row.oof_enet])
    return np.unique(x)

In [61]:
if COMPUTE_CV:
    # tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
    # train['target'] = train.label_group.map(tmp)
    train['oof'] = train.apply(combine_for_cv,axis=1)
    train['f1'] = train.apply(getMetric('oof'),axis=1)
    print('CV Score =', train.f1.mean() )

train['matches'] = train.apply(combine_for_sub,axis=1)

CV Score = 0.7260622833279889


In [62]:
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,oof_enet,f1,title_clean,oof_text,oof,matches
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]",[train_129225211],1.0,paper bag victoria secret,"[train_129225211, train_2278313361]","[train_129225211, train_2278313361]",train_129225211 train_2278313361
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]",0.666667,"double tape 3m vhb 12 mm x 4,5 original / doub...","[train_3386243561, train_3423213080, train_183...","[train_1831941588, train_3386243561, train_342...",train_1831941588 train_3386243561 train_342321...
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",[train_2288590299],1.0,maling tt canned pork luncheon meat 397 gr,"[train_2288590299, train_3803689425]","[train_2288590299, train_3803689425]",train_2288590299 train_3803689425
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]",[train_2406599165],0.2,daster batik lengan pendek - motif acak / camp...,"[train_2406599165, train_3576714541, train_150...","[train_1508100548, train_1744956981, train_204...",train_1508100548 train_1744956981 train_204309...
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]","[train_3369186413, train_921438619]",1.0,nescafe \xc3\x89clair latte 220ml,[train_3369186413],"[train_3369186413, train_921438619]",train_3369186413 train_921438619


In [None]:
train[['posting_id','matches']].to_csv('submission.csv',index=False)
sub = pd.read_csv('submission.csv')
sub.head()