In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')

In [None]:
train_df.head(10)

In [None]:
train_df.describe()

In [None]:
# label_group -> array of the distinct posting_ids that carry that label.
tmp = train_df.groupby(['label_group'])['posting_id'].unique().to_dict()
# Ground-truth "matches" string for every row: the space-separated posting_ids
# that share the row's label_group (the row's own posting_id included).
train_df['matches'] = [' '.join(tmp[label]) for label in train_df['label_group']]

In [None]:
train_df.head()

# インドネシア語を英語に変換

In [None]:
# Hand-written Indonesian -> English dictionary for common product-title words.
# NOTE: "bahan" used to be listed twice ("material", then "ingredient"). Python
# keeps the last value of a repeated key, so the effective translation has always
# been "ingredient"; it is now written once, at the key's original position.
# TODO(review): confirm "ingredient" (rather than "material") is the intended sense.
cm_map = {
    "wanita": "woman", "anak": "child", "bayi": "baby", "tas": "bag",
    "masker": "face mask", "pria": "men", "murah": "cheap", "tangan": "hand",
    "alat": "tool", "motif": "motive", "warna": "color", "bahan": "ingredient",
    "celana": "pants", "baju": "clothes", "kaos": "t-shirt", "sepatu": "shoes",
    "rambut": "hair", "mainan": "toy", "sarung": "holster", "polos": "plain",
    "rak": "rack", "botol": "bottle", "sabun": "soap", "kain": "fabric",
    "panjang": "long", "kabel": "cable", "buku": "book", "plastik": "plastic",
    "mobil": "car", "hitam": "black", "karakter": "character", "putih": "white",
    "dompet": "purse", "kaki": "feet", "pembersih": "cleaners", "lipat": "folding",
    "silikon": "silicone", "minyak": "oil", "isi": "contents", "paket": "package",
    "susu": "milk", "gamis": "robe", "mandi": "bath", "madu": "honey",
    "kulit": "skin", "serbaguna": "multipurpose", "bisa": "can", "kacamata": "spectacles",
    "pendek": "short", "tali": "rope", "selempang": "sash", "topi": "hat",
    "obat": "drug", "gantungan": "hanger", "tahun": "year", "jilbab": "hijab",
    "dapur": "kitchen", "dinding": "wall", "kuas": "brush", "perempuan": "woman",
    "katun": "cotton", "sepeda": "bike", "lucu": "funny", "lengan": "arm",
    "kaca": "glass", "garansi": "warranty", "bunga": "flower", "handuk": "towel",
    "dewasa": "adult", "elektrik": "electric", "timbangan": "balance", "besar": "big",
    "ransel": "backpack", "kertas": "paper",
}

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Directory holding the pretrained seq2seq checkpoint; tokenizer and model are
# both loaded from it (presumably an Indonesian -> English translator, going by
# the dataset name - confirm).
ID_EN_MODEL_DIR = "../input/id-en-converter"

tokenizer = AutoTokenizer.from_pretrained(ID_EN_MODEL_DIR)
txt_model = AutoModelForSeq2SeqLM.from_pretrained(ID_EN_MODEL_DIR)

# Move the model to the GPU and switch it to eval mode. Both calls return the
# model itself, so the cell still ends by displaying it.
txt_model.cuda().eval()

In [None]:
# Translate every title in train_df with txt_model, CHUNK titles at a time,
# collecting the decoded strings in trans_texts (same order as the rows).
trans_texts = []
CHUNK = 30  # number of titles sent through generate() per batch

print('translating texts')
titles = train_df['title'].tolist()
CTS = (len(titles) + CHUNK - 1) // CHUNK  # number of batches, last one may be short
for j in tqdm(range(CTS), total=CTS):
    a = j*CHUNK
    # Slicing clamps at the end of the list, so the final partial batch needs no special case.
    batch = tokenizer(titles[a:a + CHUNK], return_tensors="pt", truncation=True, padding=True)
    # Titles in a batch are padded to a common length; pass the attention mask
    # explicitly so generate() does not have to guess which positions are padding.
    # Tensors go to whatever device the model is on instead of assuming CUDA.
    outputs = txt_model.generate(
        input_ids=batch["input_ids"].to(txt_model.device),
        attention_mask=batch["attention_mask"].to(txt_model.device),
        num_return_sequences=1,
    )
    trans_texts.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# One translation per row is required before the title column gets overwritten.
assert len(trans_texts) == len(train_df), "translation count does not match row count"

In [None]:
print(trans_texts[:5])

In [None]:
# Replace the original titles with the translated ones. The old column is
# removed first, so the new 'title' column ends up as the last column.
del train_df['title']
train_df['title'] = trans_texts
train_df.head(10)

In [None]:
train_df.to_csv('en_title_train.csv', index=False)

# titleをきれいにする

In [None]:
import re, string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean(title, stop_words=None):
    """Normalise a product title for text matching.

    Steps, in order: lower-case, drop stop words, delete ``-``, ``+``, ``|``
    and backslashes, turn ``&`` into ``and``, replace every remaining non-word
    character with a space, delete leftover ASCII punctuation (in practice the
    underscore, which ``\\W`` does not match), and collapse whitespace runs
    into a single space.

    Parameters
    ----------
    title : str
        Raw title text.
    stop_words : container of str, optional
        Lower-case words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The cleaned title. Leading/trailing spaces are not stripped.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Lower-case BEFORE filtering: the stop-word list is lower-case, so
    # filtering first let capitalised stop words ("The", "AND", ...) through.
    title = title.lower()
    title = " ".join(word for word in title.split() if word not in stop_words)

    title = re.sub(r"[-+|\\]", "", title)  # glue "t-shirt" -> "tshirt", drop + | \
    title = re.sub(r"&", "and", title)
    title = re.sub(r"\W", " ", title)
    # The old loop used the pattern r"f{p}" (a raw string, not an f-string), so
    # it never matched anything; strip the punctuation characters directly.
    title = title.translate(str.maketrans("", "", string.punctuation))

    title = re.sub(r"\s+", " ", title)

    return title
train_df['title'] = train_df['title'].map(clean)

In [None]:
train_df['title'].head(10)

In [None]:
train_df.to_csv('en_title_train.csv', index=False)

# TPUに使うdatasetsの呼び出し

In [None]:
from kaggle_datasets import KaggleDatasets
import tensorflow as tf

In [None]:
GCS_PATH = KaggleDatasets().get_gcs_path('predict-python-files')
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/*.py')

In [None]:
print(GCS_PATH)
print(TRAINING_FILENAMES)

# 画像のサイズ確認

In [None]:
# Print the size reported by PIL for each of the first ten training images.
for img_name in train_df['image'].to_list()[:10]:
    input_path = '../input/shopee-product-matching/train_images/'+img_name
    # The context manager closes the file handle as soon as the size is read,
    # instead of leaving one open handle behind per image.
    with Image.open(input_path) as im:
        print(im.size)

# 同一アイテムの画像を確認

In [None]:
# The 15 most frequent label_groups and their row counts
# (the variables are called "top10" but hold 15 entries).
label_group_counts = train_df['label_group'].value_counts().head(15)
top10_names = label_group_counts.index.tolist()
top10_values = label_group_counts.tolist()

display(top10_names)
display(top10_values)

In [None]:
# 同一画像の枚数
print(train_df['label_group'].value_counts().value_counts())

In [None]:
def show_images(row, col, group_num, figsize=(100, 100),
                image_dir='../input/shopee-product-matching/train_images/'):
    """Plot every image of one label_group on a single grid figure.

    Parameters
    ----------
    row, col : int
        Grid shape. If the group holds more than ``row * col`` images, the
        number of rows is increased so that every image gets a cell.
    group_num : int
        ``label_group`` value whose images are shown (looked up in ``train_df``).
    figsize : tuple, optional
        Figure size passed to ``plt.figure``.
    image_dir : str, optional
        Directory prefix (with trailing slash) prepended to each image name.
    """
    image_names = train_df.loc[train_df['label_group'] == group_num, 'image'].to_list()
    # plt.subplot raises ValueError for an index beyond row*col; grow the grid
    # instead of failing halfway through the plot.
    row = max(row, -(-len(image_names) // col))

    plt.figure(figsize=figsize)
    for i, img_name in enumerate(image_names):
        # Close each file once it has been handed to imshow, so large groups
        # do not accumulate open file handles.
        with Image.open(image_dir + img_name, 'r') as img:
            plt.subplot(row, col, i+1)
            plt.imshow(img)
            plt.axis('off')

In [None]:
row = 11 #行数
col = 5 #列数
group_num = 159351600

show_images(row, col, group_num)

In [None]:
row = 7 #行数
col = 5 #列数
group_num = 2008989859

show_images(row, col, group_num)

In [None]:
row = 6 #行数
col = 6 #列数
group_num = 1746655739

show_images(row, col, group_num)

In [None]:
row = 5 #行数
col = 9 #列数
group_num = 3489985175

show_images(row, col, group_num)

# 特定の画像を確認

In [None]:
filename = '../input/shopee-product-matching/train_images/'+'0013e7355ffc5ff8fb1ccad3e42d92fe.jpg'
img = Image.open(filename, 'r')  # 画像読み込み
plt.imshow(img)

# imageのパスに重複がある!

In [None]:
# csvの行数
train_df.shape

In [None]:
# 画像数
!ls -1 ../input/shopee-product-matching/train_images | wc -l

In [None]:
# 画像検出の枚数
!ls -1 ../input/shopee-object-detected-images/object_recognized_images_th_0-3_box_3/object_recognized_images_th_0-3_box_3 | wc -l

In [None]:
# The 15 image file names that appear on the most rows, with their row counts
# (the variables are called "top10" but hold 15 entries).
image_counts = train_df['image'].value_counts().head(15)
top10_names = image_counts.index.tolist()
top10_values = image_counts.tolist()

display(top10_names)
display(top10_values)

In [None]:
print(len(train_df[train_df['label_group']==4198148727]))
len(train_df[train_df['label_group']==2403374241])

In [None]:
filename = '../input/shopee-product-matching/train_images/'+'0cca4afba97e106abd0843ce72881ca4.jpg'
img = Image.open(filename, 'r')  # 画像読み込み
plt.imshow(img)


In [None]:
# NOTE: show_images is also defined in an earlier cell; this later definition
# is the one in effect for the cells below.
def show_images(row, col, group_num, figsize=(100, 100),
                image_dir='../input/shopee-product-matching/train_images/'):
    """Plot every image of one label_group on a single grid figure.

    Parameters
    ----------
    row, col : int
        Grid shape. If the group holds more than ``row * col`` images, the
        number of rows is increased so that every image gets a cell.
    group_num : int
        ``label_group`` value whose images are shown (looked up in ``train_df``).
    figsize : tuple, optional
        Figure size passed to ``plt.figure``.
    image_dir : str, optional
        Directory prefix (with trailing slash) prepended to each image name.
    """
    image_names = train_df.loc[train_df['label_group'] == group_num, 'image'].to_list()
    # plt.subplot raises ValueError for an index beyond row*col; grow the grid
    # instead of failing halfway through the plot.
    row = max(row, -(-len(image_names) // col))

    plt.figure(figsize=figsize)
    for i, img_name in enumerate(image_names):
        # Close each file once it has been handed to imshow, so large groups
        # do not accumulate open file handles.
        with Image.open(image_dir + img_name, 'r') as img:
            plt.subplot(row, col, i+1)
            plt.imshow(img)
            plt.axis('off')

In [None]:
row = 5 #行数
col = 3 #列数
group_num = 2403374241

show_images(row, col, group_num)

In [None]:
row = 4 #行数
col = 4 #列数
group_num = 4198148727

show_images(row, col, group_num)

In [None]:
train_df[train_df['image']=='5ee62d13d49ea74cc3553f8ba5f6220d.jpg']

# 重複削除csv

In [None]:
# For every image file that appears on more than one row, report the ones whose
# rows are split across exactly two label_groups, distinguishing an uneven
# split from an even one.
for i, v in train_df['image'].value_counts().items():
    if v==1:
        # value_counts() is sorted in descending order, so from here on every
        # image occurs only once.
        break
    label_counts = train_df.loc[train_df['image']==i, 'label_group'].value_counts().tolist()
    if len(label_counts) != 2:
        continue
    if label_counts[0]!=label_counts[1]:
        print(f'2種類の異なる数のラベルが存在：{i}, {v}')
    else:
        print(f'2種類で同一数のラベルが存在：{i}, {v}')

In [None]:
# Keep one row per (image, label_group) pair and save the result.
no_duplicate_train_df = train_df.drop_duplicates(subset=['image', 'label_group'])
# A bare `.shape` expression in the middle of a cell is discarded; display it
# explicitly so the row count after de-duplication is actually shown.
display(no_duplicate_train_df.shape)
no_duplicate_train_df.to_csv('train.csv', index=False)

In [None]:
# De-duplicated CSV for the plainly augmented data:
# keep one row per (image, label_group) pair, show the shape before and after,
# then write the result next to the notebook.
df1 = pd.read_csv('../input/shoee-augmented-data/shopee_augmented_data/train.csv')
no_dup_df1 = df1.drop_duplicates(subset=['image', 'label_group'])
display(df1.shape)
display(no_dup_df1.shape)
no_dup_df1.to_csv('simple_aug_train.csv', index=False)

In [None]:
# De-duplicated CSV for the object-detection data:
# keep one row per (image, label_group) pair, show the shape before and after,
# then write the result next to the notebook.
df2 = pd.read_csv('../input/shopee-object-detected-aug/shopee_augmented_data/train.csv')
no_dup_df2 = df2.drop_duplicates(subset=['image', 'label_group'])
display(df2.shape)
display(no_dup_df2.shape)
no_dup_df2.to_csv('object_detected_train.csv', index=False)

# Inferenceの結果を調査

# submission結果を表示させる関数

In [None]:
def show_result(df):
    """Display a quick overview of a frame that has a ``matches`` column.

    Shows, in order: the first rows, the 100 rows with the most matches,
    ``describe()``, and the 15 most common match counts followed by how many
    rows have each of them.

    Side effect: a ``matches_num`` column (number of space-separated entries
    in ``matches``) is added to ``df`` in place.
    """
    display(df.head())

    # Number of ids listed in each row's space-separated `matches` string.
    df['matches_num'] = df['matches'].map(lambda matches: len(matches.split(' ')))
    by_match_count = df.sort_values('matches_num', ascending=False)
    display(by_match_count.head(100))

    display(df.describe())

    # Most common match counts (index) and the number of rows with each (values).
    match_count_freq = df['matches_num'].value_counts().iloc[:15]
    display(match_count_freq.index.tolist())
    display(match_count_freq.tolist())

# train_df

In [None]:
# Reload the untouched training CSV, rebuild the ground-truth `matches` column
# (space-separated posting_ids sharing the row's label_group) and summarise it.
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
tmp = train_df.groupby(['label_group'])['posting_id'].unique().to_dict()
train_df['matches'] = [' '.join(tmp[label]) for label in train_df['label_group']]
show_result(train_df)

In [None]:
display(train_df.iloc[30488])
display(train_df.iloc[9318])

# ガチベスト(0.337)の結果

In [None]:
best_df = pd.read_csv('../input/shopee-best-sub/submission_0.337.csv')
show_result(best_df)

# ベスト(0.334)の結果

In [None]:
best_df = pd.read_csv('../input/shopee-best-sub/original_submission.csv')
show_result(best_df)

# EFN7単体ベストの結果

In [None]:
sp_df = pd.read_csv('../input/shopee-best-sub/first_best_submission.csv')
show_result(sp_df)

# 0.334のefn7のknn=30

In [None]:
tf_df = pd.read_csv('../input/modified-tf-idf-rapids-arc-margin-shopee/submission.csv')
show_result(tf_df)

# best + カリキュラ (CurricularFace)

In [None]:
cf_df = pd.read_csv('../input/shopee-best-sub/submission_curricularface.csv')
show_result(cf_df)