In [24]:
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Loading the CSV file into Pandas dataframe

In [25]:
df = pd.read_csv("/kaggle/input/shopee-product-matching/train.csv")

In [26]:
# Split into train and a temporary df (temp_df)
train, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Split the temp_df into val and test
val, test = train_test_split(temp_df, test_size=0.8, random_state=42)

In [27]:
len(train), len(val), len(test)

(23975, 2055, 8220)

In [28]:
train.columns

Index(['posting_id', 'image', 'image_phash', 'title', 'label_group'], dtype='object')

In [29]:
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
33858,train_518321497,fd12573a06d9996474f913166bce34cb.jpg,9d9c62633336c69c,[PROMO] Botol Spray Bening 60ml / 100ml Tebal ...,2316087548
25273,train_2336150288,bd21a9d83b0b35abf973e1110703ed7a.jpg,b6ff2ea2c0366ac0,HANDUK DEWASA UK 70X135 RANDOM WARNA,30633790
28136,train_149093745,d29d6e2827c9557b0c8e5f6d6401c01c.jpg,8b2ed4d3f05123ec,DOUBLE GAME CONSOLE RESTO FC400 GAMES - GAMEBO...,3891527925
12967,train_2557383584,6198d496b38c9d3b9aab76869aab33aa.jpg,ba87e5bc0271870f,GELYA KAFTAN with BEADS ( PROMOOO ),1712807720
21261,train_2460952678,9fab19d72d0c03b2708d3a51eb5abc8e.jpg,b16161cbce869e9a,Purbasari Hand Gel 50ml - Hand Sanitizer,244622761


In [30]:
def binary_encode_data(data):
    data_without_dup = data.drop_duplicates(subset='label_group')
    posting_id_dict = data.groupby('label_group')['posting_id'].unique().to_dict()
    title_dict = data.groupby('label_group')['title'].unique().to_dict()
    image_dict = data.groupby('label_group')['image'].unique().to_dict()
    label_groups = data_without_dup.label_group.values.tolist()
    
    
    new_data = []
    
    for i,label_group in tqdm(enumerate(data_without_dup['label_group'])):
        matches = posting_id_dict[label_group].tolist()
        titles = title_dict[label_group].tolist()
        images = image_dict[label_group].tolist()

        index = np.random.randint(2,len(label_groups)-2)
        while (index== i):
            index = np.random.randint(0,len(label_groups))

        if len(matches) == 2:
            if len(titles) == 2:
                matches.extend(titles)
                matches.extend(images)
                matches.extend([1])
            else:
                matches.extend([titles[0],titles[0]])
                matches.extend(images)
                matches.extend([1])
            new_data.append(matches)
            new_data.append([matches[0],posting_id_dict[label_groups[index]][0],titles[0],title_dict[label_groups[index]][0],images[0],image_dict[label_groups[index]][0],0])
            new_data.append([matches[0],posting_id_dict[label_groups[index+1]][0],titles[0],title_dict[label_groups[index+1]][0],images[0],image_dict[label_groups[index+1]][0],0])
            new_data.append([matches[0],posting_id_dict[label_groups[index-1]][0],titles[0],title_dict[label_groups[index-1]][0],images[0],image_dict[label_groups[index-1]][0],0])

        else:
            for match,title,image in zip(matches[1:],titles[1:],images[1:]):
                new_data.append([matches[0],match,titles[0],title,images[0],image,1])
                new_data.append([matches[0],posting_id_dict[label_groups[index]][0],titles[0],title_dict[label_groups[index]][0],images[0],image_dict[label_groups[index]][0],0])
    
    siamese_data = pd.DataFrame(new_data,columns=['posting_id_1','posting_id_2','title_1','title_2','image_1','image_2','label'])
    
    return siamese_data

In [31]:
train_df = binary_encode_data(train)
val_df = binary_encode_data(val)
test_df = binary_encode_data(test)

10298it [00:00, 59765.50it/s]
1818it [00:00, 86469.71it/s]
5698it [00:00, 83692.78it/s]


In [32]:
len(train_df), len(val_df), len(test_df)

(33860, 782, 7170)

In [33]:
train_df.head()

Unnamed: 0,posting_id_1,posting_id_2,title_1,title_2,image_1,image_2,label
0,train_518321497,train_1764357950,[PROMO] Botol Spray Bening 60ml / 100ml Tebal ...,READY BOTOL SPRAY BENING KOSONG 100ML botol ha...,fd12573a06d9996474f913166bce34cb.jpg,4bcacaa686389d4505eedbba23a4f888.jpg,1.0
1,train_518321497,train_2774648570,[PROMO] Botol Spray Bening 60ml / 100ml Tebal ...,ALL TYPE Clear Tempered Glass Anti Gores Kaca ...,fd12573a06d9996474f913166bce34cb.jpg,1f1bf35de6a0f760702d87d902833465.jpg,0.0
2,train_518321497,train_192570027,[PROMO] Botol Spray Bening 60ml / 100ml Tebal ...,Botol Spray Bening 50ml / 60ml / 100ml / 250ml...,fd12573a06d9996474f913166bce34cb.jpg,4e2b5518d6a872d254c857740a4e77c4.jpg,1.0
3,train_518321497,train_2774648570,[PROMO] Botol Spray Bening 60ml / 100ml Tebal ...,ALL TYPE Clear Tempered Glass Anti Gores Kaca ...,fd12573a06d9996474f913166bce34cb.jpg,1f1bf35de6a0f760702d87d902833465.jpg,0.0
4,train_518321497,train_2542478416,[PROMO] Botol Spray Bening 60ml / 100ml Tebal ...,BOTOL SPRAY KOSONG 100ML TRAVELLING BAHAN PLAS...,fd12573a06d9996474f913166bce34cb.jpg,44834c652b988cf4f389189154cef8f7.jpg,1.0


In [34]:
val_df.head()

Unnamed: 0,posting_id_1,posting_id_2,title_1,title_2,image_1,image_2,label
0,train_3730694224,train_1605540675,Viva Air Mawar 100ml,viva air mawar,fe89b2de8803236a729bcab1a57864f5.jpg,28a8ce6ec44054606ec51e29b31dcfc3.jpg,1.0
1,train_3730694224,train_4045312279,Viva Air Mawar 100ml,Promo Bella Square New \xe2\x80\x9cwarna part ...,fe89b2de8803236a729bcab1a57864f5.jpg,14d48b3e7d1c2cffcf92521ef6d49760.jpg,0.0
2,train_3730694224,train_3307385969,Viva Air Mawar 100ml,Viva air mawar/netto 100 ml. Exp 2023,fe89b2de8803236a729bcab1a57864f5.jpg,aea9f9bac898152d82f5aed52e833a31.jpg,1.0
3,train_3730694224,train_4045312279,Viva Air Mawar 100ml,Promo Bella Square New \xe2\x80\x9cwarna part ...,fe89b2de8803236a729bcab1a57864f5.jpg,14d48b3e7d1c2cffcf92521ef6d49760.jpg,0.0
4,train_1089341912,train_3518026116,TONER MAXI PEEL/MAXI PEEL TONER,[ORIGINAL FILIPINA] MAXIPEEL TRETINOIN HIDROQU...,afeac1b833d8a87c7a31a2277ded0f7c.jpg,a9da75cbdd2a9d20b5576cfe28dcf64f.jpg,1.0


In [35]:
test_df.head()

Unnamed: 0,posting_id_1,posting_id_2,title_1,title_2,image_1,image_2,label
0,train_2658780777,train_873844439,E33 Sponge Cendol Spons Busa Cuci Mobil Microf...,SPONGE BUSA CUCI MOBIL / MICROFIBEL CENDOL / M...,4411807042c8a1cbef787a9ad2c87015.jpg,a072e3cdc2fc3b4929e9a13e4532cd34.jpg,1.0
1,train_2658780777,train_1672431996,E33 Sponge Cendol Spons Busa Cuci Mobil Microf...,Tepung Cap Putri 250gr,4411807042c8a1cbef787a9ad2c87015.jpg,86d2274c062423374b5b06e044592ac7.jpg,0.0
2,train_2658780777,train_31527834,E33 Sponge Cendol Spons Busa Cuci Mobil Microf...,COSRX Advanced Snail 92 Cream 100gr,4411807042c8a1cbef787a9ad2c87015.jpg,2107d3013e0a91fa5dd20926582fa798.jpg,0.0
3,train_2658780777,train_35568213,E33 Sponge Cendol Spons Busa Cuci Mobil Microf...,Sandal Jepit Dulux Captain America 293D Size 2...,4411807042c8a1cbef787a9ad2c87015.jpg,94c9efff22e041a6f4e4a66f235438cf.jpg,0.0
4,train_2022113923,train_4204231951,\xf0\x9f\x92\x8bLipstick Matte Velvet Tahan La...,HANDAIYAN Lipstick Matte Warna Nude Mewah,dae97dd1c1023ad020b98a863ff21e40.jpg,b106fafd3059190c85f1867e64ef1da0.jpg,1.0


In [36]:
train_df.to_csv("train_df.csv")
test_df.to_csv("test_df.csv")
val_df.to_csv("val_df.csv")