In [38]:
# Root directory that the data-loading cell prefixes to every CSV file name and
# image folder. It is joined with plain string concatenation, so it must keep
# its trailing slash. The commented-out line below is an alternative location.
# DATA_PATH = '../input/'
DATA_PATH = './data/'

import psutil

### 라이브러리 모음

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import gc

def getMetric(col):
    """Build a row-wise F1 scorer that compares ``row.target`` with ``row[col]``.

    Parameters
    ----------
    col : str
        Key of the row entry that holds the predicted ids.

    Returns
    -------
    callable
        ``f1score(row) -> float``, usable with ``DataFrame.apply(..., axis=1)``.
        The score is ``2 * n / (len(row.target) + len(row[col]))`` where ``n``
        is the number of distinct values present in both sequences. When both
        sequences are empty the score is defined as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    def f1score(row):
        target = row.target
        predicted = row[col]
        total = len(target) + len(predicted)
        # Guard the degenerate case: with nothing on either side the original
        # expression divided by zero.
        if total == 0:
            return 0.0
        # np.intersect1d yields the unique values common to both inputs.
        n = len(np.intersect1d(target, predicted))
        return 2 * n / total
    return f1score

In [3]:
# Read the test file first: its row count decides which mode the notebook runs in.
test = pd.read_csv(DATA_PATH + 'test.csv')

# More than 3 test rows switches CV off; otherwise CV stays on and a notice is printed.
COMPUTE_CV = not len(test) > 3
if COMPUTE_CV:
    print('this submission notebook will compute CV score, but commit notebook will not')

# COMPUTE_CV = False

if not COMPUTE_CV:
    # No CV: the frame named `train` is built from the test file instead.
    train = pd.read_csv(DATA_PATH + 'test.csv')
    train['image'] = DATA_PATH + 'test_images/' + train['image']
else:
    train = pd.read_csv(DATA_PATH + 'train.csv')
    # Turn the bare file names in the image column into full paths.
    train['image'] = DATA_PATH + 'train_images/' + train['image']
    # target = the unique posting_ids that share the row's label_group
    # (the row's own posting_id is part of that group).
    tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
    train['target'] = train['label_group'].map(tmp)

print('train shape is', train.shape)
train.head()

this submission notebook will compute CV score, but commit notebook will not
train shape is (34250, 6)


Unnamed: 0,posting_id,image,image_phash,title,label_group,target
0,train_129225211,./data/train_images/0000a68812bc7e98c42888dfb1...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]"
1,train_3386243561,./data/train_images/00039780dfc94d01db8676fe78...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]"
2,train_2288590299,./data/train_images/000a190fdd715a2a36faed16e2...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]"
3,train_2406599165,./data/train_images/00117e4fc239b1b641ff08340b...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]"
4,train_3369186413,./data/train_images/00136d1cf4edede0203f32f05f...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]"


##### target이란? 자기 자신을 포함해 label_group이 같은 게시물들의 posting_id 목록

In [10]:
# Look up two postings by id and show their label_group and target side by side.
display(
    train.loc[
        train['posting_id'].eq('train_129225211'),
        ['posting_id', 'label_group', 'target'],
    ]
)
display(
    train.loc[
        train['posting_id'].eq('train_2278313361'),
        ['posting_id', 'label_group', 'target'],
    ]
)

Unnamed: 0,posting_id,label_group,target
0,train_129225211,249114794,"[train_129225211, train_2278313361]"


Unnamed: 0,posting_id,label_group,target
33161,train_2278313361,249114794,"[train_129225211, train_2278313361]"


### TF-IDF: 
#### 특정 단어의 상대적인 빈도수를 나타내주는 값
#### 문서 내 단어들의 빈도를 기준으로 가중치를 정해 특징을 추출하는 것

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# stop_words='english' drops frequent but uninformative words such as
# "you", "your", "i", "me" before the vocabulary is built.
model = TfidfVectorizer(
    max_features=55000,
    binary=True,
    stop_words='english',
)

# Fit on the titles and turn every title into a dense vector (one row per title).
text_embeddings = model.fit_transform(train['title']).toarray()
print('text embeddings shape', text_embeddings.shape)

text embeddings shape (34250, 24939)


In [14]:
import torch


# CUDA is NVIDIA's parallel-computing platform; use the GPU when one is
# available and fall back to the CPU otherwise, so the notebook still runs on
# hosts without a GPU (the unconditional .cuda() call raised there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# torch.as_tensor accepts both the NumPy array produced by the TF-IDF cell and
# an already-converted tensor, so re-running this cell no longer fails the way
# torch.from_numpy(<Tensor>) did. The dtype of the input is kept.
text_embeddings = torch.as_tensor(text_embeddings, device=device)

#### Embedding: 우리가 표현하고자 하는 대상을 벡터공간의 좌표로 매핑하고 표현하는 과정

In [18]:
# Sanity check: print the embedding tensor, then the product of all rows with
# the first 1024 rows (transposed so each printed row belongs to one of those 1024).
print(text_embeddings)
print((text_embeddings @ text_embeddings[:1024].T).T)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', dtype=torch.float64)
tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1067],
        [0.0000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', dtype=torch.float64)


### 학습

In [20]:
preds = []
CHUNK = 1024

print('Finding similar titles...')
# Ceiling division: number of CHUNK-sized row blocks needed to cover `train`.
CTS = -(-len(train) // CHUNK)
CTS_index = 0  # running count of rows handled so far
for j in range(CTS):
    a = j * CHUNK
    b = min(a + CHUNK, len(train))
    print('chunk', a, 'to', b)

    # Dot product of every title vector with the vectors of rows a..b, transposed
    # so that each row of `cts` holds the scores of one chunk row against all titles.
    # NOTE(review): this is a cosine similarity only if the rows are L2-normalised
    # (assumed to be the vectorizer's default) — confirm.
    cts = torch.matmul(text_embeddings, text_embeddings[a:b].T).T
    cts = cts.data.cpu().numpy()
    print(cts.shape)

    for row_scores in cts:
        # Positions of all titles scoring above the 0.7 threshold for this row.
        matched = np.where(row_scores > 0.7)[0]
        preds.append(train.iloc[matched].posting_id.values.tolist())
    CTS_index += b - a
# del model, text_embeddings

Finding similar titles...
chunk 0 to 1024
(1024, 34250)
chunk 1024 to 2048
(1024, 34250)
chunk 2048 to 3072
(1024, 34250)
chunk 3072 to 4096
(1024, 34250)
chunk 4096 to 5120
(1024, 34250)
chunk 5120 to 6144
(1024, 34250)
chunk 6144 to 7168
(1024, 34250)
chunk 7168 to 8192
(1024, 34250)
chunk 8192 to 9216
(1024, 34250)
chunk 9216 to 10240
(1024, 34250)
chunk 10240 to 11264
(1024, 34250)
chunk 11264 to 12288
(1024, 34250)
chunk 12288 to 13312
(1024, 34250)
chunk 13312 to 14336
(1024, 34250)
chunk 14336 to 15360
(1024, 34250)
chunk 15360 to 16384
(1024, 34250)
chunk 16384 to 17408
(1024, 34250)
chunk 17408 to 18432
(1024, 34250)
chunk 18432 to 19456
(1024, 34250)
chunk 19456 to 20480
(1024, 34250)
chunk 20480 to 21504
(1024, 34250)
chunk 21504 to 22528
(1024, 34250)
chunk 22528 to 23552
(1024, 34250)
chunk 23552 to 24576
(1024, 34250)
chunk 24576 to 25600
(1024, 34250)
chunk 25600 to 26624
(1024, 34250)
chunk 26624 to 27648
(1024, 34250)
chunk 27648 to 28672
(1024, 34250)
chunk 28672 to 2

### 최종 CV 점수(평균 F1) 출력
#### 0.6139718474362906

In [33]:
# Store the predictions in the oof_text column.
train['oof_text'] = preds

if COMPUTE_CV:
    # Row-wise F1 between target and oof_text, then the mean over all rows.
    train['f1'] = train.apply(getMetric('oof_text'), axis='columns')
    print('CV score for baseline =', train['f1'].mean())

CV score for baseline = 0.6139718474362906


#### oof_text(prediction 값) 확인

In [34]:
# Preview the first five rows, now including the oof_text column.
display(train.head(n=5))

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,oof_text,f1
0,train_129225211,./data/train_images/0000a68812bc7e98c42888dfb1...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]","[train_129225211, train_2278313361]",1.0
1,train_3386243561,./data/train_images/00039780dfc94d01db8676fe78...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]",[train_3386243561],0.666667
2,train_2288590299,./data/train_images/000a190fdd715a2a36faed16e2...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",[train_2288590299],0.666667
3,train_2406599165,./data/train_images/00117e4fc239b1b641ff08340b...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]","[train_2406599165, train_3576714541, train_150...",0.285714
4,train_3369186413,./data/train_images/00136d1cf4edede0203f32f05f...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]",[train_3369186413],0.666667


#### oof_text에 label_group이 다른 posting_id가 섞여 있으면 F1 점수가 낮게 나옴

In [37]:
# Compare two postings: their label_group, target, predictions and F1.
display(
    train.loc[
        train['posting_id'].eq('train_2406599165'),
        ['posting_id', 'label_group', 'target', 'oof_text', 'f1'],
    ]
)
display(
    train.loc[
        train['posting_id'].eq('train_3576714541'),
        ['posting_id', 'label_group', 'target', 'oof_text', 'f1'],
    ]
)

Unnamed: 0,posting_id,label_group,target,oof_text,f1
3,train_2406599165,4093212188,"[train_2406599165, train_3342059966]","[train_2406599165, train_3576714541, train_150...",0.285714


Unnamed: 0,posting_id,label_group,target,oof_text,f1
2522,train_3576714541,264184112,"[train_3576714541, train_1762379453]","[train_2406599165, train_3576714541, train_174...",0.333333
