## dataset 준비
dataset을 negative item을 추가해서 준비한다.

1. outfit을 구성하는 item의 갯수 중 50퍼센트가 넘지않게 item을 랜덤하게 선택한다.
2. 해당 item과 같은 category의 다른 item을 negative item으로 대체한다.
3. negative item이라 하면은 outfit의 나머지 item들과 다른 outfit에서도 함께 등장하지 않은 item을 뜻한다.

In [1]:
import os
import numpy as np
import json
from matplotlib import pyplot as plt
from collections import defaultdict     ### dictionary의 value에 list를 추가하고싶을 때 이 라이브러리를 쓰면됨
from random import *
import secrets
import random
import time
from tqdm import tqdm
from PIL import Image

## Json data load

train data는 17316개의 outfit으로 구성되어 있다.
<br/>
valid data는 1497개의 outfit으로 구성되어 있다.
<br/>
test data는 3076개의 outfit으로 구성되어 있다.

In [2]:
### json, image의 path
json_path = './polyvore-dataset-master/'
image_path = json_path + 'images/'

### train, valid, testset의 json read
with open(json_path + 'train_no_dup.json', 'r') as f:
    train_data = json.load(f)
    
with open(json_path + 'valid_no_dup.json', 'r') as f:
    valid_data = json.load(f)

with open(json_path + 'test_no_dup.json', 'r') as f:
    test_data = json.load(f)

### data 구조 확인
print('train_data shape: {}'.format(np.shape(train_data)))
print('valid_data shape: {}'.format(np.shape(valid_data)))
print('test_data shape:  {}'.format(np.shape(test_data)))

train_data shape: (17316,)
valid_data shape: (1497,)
test_data shape:  (3076,)


### Outfit을 구성하는 item의 갯수 중 50% 넘지 않게 item을 random하게 선택한다.

* 각 item 별로 등장하는 outfit을 찾는다.

In [3]:
### item별로 등장하는 outfit을 찾는다.
### defaultdict()를 사용하면 dictionary의 value에 list를 넣기 쉬움
train_item2outfit = defaultdict(list)
valid_item2outfit = defaultdict(list)
test_item2outfit  = defaultdict(list)

### train data
for i in range(len(train_data)):
    list_ids = []
    ### 각 outfit에 있는 item들을 list에 저장한다.
    for j in range(len(train_data[i]['items'])):
        _, ids = train_data[i]['items'][j]['image'].split('id=')
        list_ids.append(ids)
    ### 저장한 list의 item이 등장하는 outfit을 붙인다.
    for j in range(len(list_ids)):
        train_item2outfit[list_ids[j]].append(train_data[i]['set_id'])

### valid data
for i in range(len(valid_data)):
    list_ids = []
    for j in range(len(valid_data[i]['items'])):
        _, ids = valid_data[i]['items'][j]['image'].split('id=')
        list_ids.append(ids)
    for j in range(len(list_ids)):
        valid_item2outfit[list_ids[j]].append(valid_data[i]['set_id'])

### test data
for i in range(len(test_data)):
    list_ids = []
    for j in range(len(test_data[i]['items'])):
        _, ids = test_data[i]['items'][j]['image'].split('id=')
        list_ids.append(ids)
    for j in range(len(list_ids)):
        test_item2outfit[list_ids[j]].append(test_data[i]['set_id'])


data를 저장해서 확인해본다.
<br/>
key: item_id, value: outfit_list

In [4]:
save_path = './negative_outfit/'

with open(save_path + 'train_item2outfit.json', 'w') as f:
    json.dump(train_item2outfit, f, indent=4)

with open(save_path + 'valid_item2outfit.json', 'w') as f:
    json.dump(valid_item2outfit, f, indent=4)

with open(save_path + 'test_item2outfit.json', 'w') as f:
    json.dump(test_item2outfit, f, indent=4)

## Outfit에 있는 item 랜덤하게 sampling하기

outfit에 있는 item을 절반이 넘지 않게 랜덤하게 선택해본다.
<br/>
먼저 outfit 별로 갖고 있는 item을 확인할 수 있도록 간략하게 json file을 만들어본다.

In [5]:
### outfit에 등장하는 item들을 저장한다.
### defaultdict()를 사용하면 dictionary의 value에 list를 넣기 쉬움
train_outfit2item = defaultdict(list)
valid_outfit2item = defaultdict(list)
test_outfit2item  = defaultdict(list)

train_set_ids = []
valid_set_ids = []
test_set_ids  = []

### train data
for i in range(len(train_data)):
    train_list_ids = []
    set_id = train_data[i]['set_id']
    train_set_ids.append(set_id)
    for j in range(len(train_data[i]['items'])):
        _, ids = train_data[i]['items'][j]['image'].split('id=')
        train_list_ids.append(ids)

    train_outfit2item[set_id] = train_list_ids

### valid data
for i in range(len(valid_data)):
    valid_list_ids = []
    set_id = valid_data[i]['set_id']
    valid_set_ids.append(set_id)
    for j in range(len(valid_data[i]['items'])):
        _, ids = valid_data[i]['items'][j]['image'].split('id=')
        valid_list_ids.append(ids)

    valid_outfit2item[set_id] = valid_list_ids

### test data
for i in range(len(test_data)):
    test_list_ids = []
    set_id = test_data[i]['set_id']
    test_set_ids.append(set_id)
    for j in range(len(test_data[i]['items'])):
        _, ids = test_data[i]['items'][j]['image'].split('id=')
        test_list_ids.append(ids)

    test_outfit2item[set_id] = test_list_ids

In [6]:
with open(save_path + 'train_outfit2item.json', 'w') as f:
    json.dump(train_outfit2item, f, indent=4)

with open(save_path + 'valid_outfit2item.json', 'w') as f:
    json.dump(valid_outfit2item, f, indent=4)

with open(save_path + 'test_outfit2item.json', 'w') as f:
    json.dump(test_outfit2item, f, indent=4)

outfit 내에서 item을 랜덤하게 50%가 넘지 않게 선택한다. <br/>
이 때, 이 item을 대체할 item은 다른 outfit에서 한번도 같이 등장한 적이 없어야한다.

In [7]:
train_choice_items = defaultdict(list)
valid_choice_items = defaultdict(list)
test_choice_items = defaultdict(list)

### train data에 대하여
for i in range(len(train_data)):
    ### rand_num: 각 outfit 별로 random하게 선택할 item의 갯수
    set_id = train_data[i]['set_id']
    length = len(train_outfit2item[train_set_ids[i]])//2
    rand_num = np.random.randint(1, length)
    # print(rand_num)
    
    ### random한 item을 선택한다.
    choice = random.choices(train_outfit2item[train_set_ids[i]], k=rand_num)
    train_choice_items[set_id] = choice

### valid data에 대하여
for i in range(len(valid_data)):
    set_id = valid_data[i]['set_id']
    length = len(valid_outfit2item[valid_set_ids[i]])//2
    rand_num = np.random.randint(1, length)

    choice = random.choices(valid_outfit2item[valid_set_ids[i]], k=rand_num)
    valid_choice_items[set_id] = choice

### test data에 대하여
for i in range(len(test_data)):
    set_id = test_data[i]['set_id']
    length = len(test_outfit2item[test_set_ids[i]])//2
    rand_num = np.random.randint(1, length)

    choice = random.choices(test_outfit2item[test_set_ids[i]], k=rand_num)
    test_choice_items[set_id] = choice

In [8]:
# key: outfit id, value: negative item으로 선택될 n개의 items
with open(save_path + 'train_choice_item.json', 'w') as f:
    json.dump(train_choice_items, f, indent=4)

with open(save_path + 'valid_choice_item.json', 'w') as f:
    json.dump(valid_choice_items, f, indent=4)

with open(save_path + 'test_choice_item.json', 'w') as f:
    json.dump(test_choice_items, f, indent=4)

## Category별로 item 집합 만들기

In [9]:
train_cate2item_dict = defaultdict(list)
valid_cate2item_dict = defaultdict(list)
test_cate2item_dict  = defaultdict(list)

for i in range(len(train_data)):
    for j in range(len(train_data[i]['items'])):
        _, ids = train_data[i]['items'][j]['image'].split('id=')
        category_ids = train_data[i]['items'][j]['categoryid']
        train_cate2item_dict[category_ids].append(ids)

for i in range(len(valid_data)):
    for j in range(len(valid_data[i]['items'])):
        _, ids = valid_data[i]['items'][j]['image'].split('id=')
        category_ids = valid_data[i]['items'][j]['categoryid']
        valid_cate2item_dict[category_ids].append(ids)

for i in range(len(test_data)):
    for j in range(len(test_data[i]['items'])):
        _, ids = test_data[i]['items'][j]['image'].split('id=')
        category_ids = test_data[i]['items'][j]['categoryid']
        test_cate2item_dict[category_ids].append(ids)


In [10]:
### key: category id, value: item_id
with open(save_path + 'train_cate2item.json', 'w') as f:
    json.dump(train_cate2item_dict, f, indent=4)

with open(save_path + 'valid_cate2item.json', 'w') as f:
    json.dump(valid_cate2item_dict, f, indent=4)

with open(save_path + 'test_cate2item.json', 'w') as f:
    json.dump(test_cate2item_dict, f, indent=4)


## 대체 이미지 찾기
1. 같은 category여야한다.
2. 다른 outfit에서 item이 함께 등장하지 않아야한다.

In [11]:
with open(save_path + 'train_choice_item.json', 'r') as f:
    train_choice = json.load(f)
with open(save_path + 'valid_choice_item.json', 'r') as f:
    valid_choice = json.load(f) 
with open(save_path + 'test_choice_item.json', 'r') as f:
    test_choice = json.load(f)

with open(save_path + 'train_cate2item.json', 'r') as f:
    train_cate2item = json.load(f)
with open(save_path + 'valid_cate2item.json', 'r') as f:
    valid_cate2item = json.load(f)
with open(save_path + 'test_cate2item.json', 'r') as f:
    test_cate2item = json.load(f)
    
with open(save_path + 'all_item2outfit.json', 'r') as f:
    item2outfit = json.load(f)
with open(save_path + 'all_outfit2item.json', 'r') as f:
    outfit2item = json.load(f)

with open(save_path + 'train_outfit2item.json', 'r') as f:
    train_item2outfit = json.load(f)
with open(save_path + 'valid_outfit2item.json', 'r') as f:
    valid_outfit2item = json.load(f)
with open(save_path + 'test_outfit2item.json', 'r') as f:
    test_outfit2item = json.load(f)

In [12]:

before = time.time()

train_list = []
for outfit, positive_item in tqdm(train_choice.items()):

    now_outfit_id = str
    now_outfit_list = []
    
    for key, value in outfit2item.items():
        if key == outfit:
            now_outfit_id = key
            for i in range(len(value)):
                now_outfit_list.append(value[i])
            break
    
    replace_items = value

    for i in range(len(positive_item)):
        item_category = str
        for category, item_id in train_cate2item.items():
            for j in range(len(item_id)):
                if positive_item[i] == item_id[j]:
                    item_category = category
                    break
        
        for category, item_id in train_cate2item.items():
            if category == item_category:
                cand_items = []
                for j in range(len(item_id)):
                    cand_items.append(item_id[j])
               
        replace_item = str
        while True:
            cnt = 0
            len_cnt = 0
            if len_cnt == len(cand_items):
                break
            negative_item = random.choice(cand_items)
            negative_outfit_list = []
            for key, value in item2outfit.items():
                if key == negative_item:
                    for j in range(len(value)):
                        negative_outfit_list.append(value[j])
            
            for a in range(len(negative_outfit_list)):
                for b in range(len(now_outfit_list)):
                    if negative_outfit_list[a] == now_outfit_list[b]:
                        cnt += 1
            if cnt == 0:
                replace_item = negative_item
                break
            len_cnt += 1
        
        # 대체 item을 원래의 item과 교체한다.
        positive_item[i] == replace_item
        for j in range(len(replace_items)):
            if positive_item[i] == replace_items[j]:
                replace_items[j] = replace_item
    # if len(positive_item) > 1:
        # print(len(positive_item), ' ', replace_items)
    train_list.append(replace_items)         
print(time.time()-before)  

100%|██████████| 17316/17316 [15:19<00:00, 18.84it/s]919.1365401744843



In [13]:

before = time.time()

valid_list = []
for outfit, positive_item in tqdm(valid_choice.items()):

    now_outfit_id = str
    now_outfit_list = []
    
    for key, value in outfit2item.items():
        if key == outfit:
            now_outfit_id = key
            for i in range(len(value)):
                now_outfit_list.append(value[i])
            break
    
    replace_items = value

    for i in range(len(positive_item)):
        item_category = str
        for category, item_id in valid_cate2item.items():
            for j in range(len(item_id)):
                if positive_item[i] == item_id[j]:
                    item_category = category
                    break
        
        for category, item_id in valid_cate2item.items():
            if category == item_category:
                cand_items = []
                for j in range(len(item_id)):
                    cand_items.append(item_id[j])
               
        replace_item = str
        while True:
            cnt = 0
            len_cnt = 0
            if len_cnt == len(cand_items):
                break
            negative_item = random.choice(cand_items)
            negative_outfit_list = []
            for key, value in item2outfit.items():
                if key == negative_item:
                    for j in range(len(value)):
                        negative_outfit_list.append(value[j])
            
            for a in range(len(negative_outfit_list)):
                for b in range(len(now_outfit_list)):
                    if negative_outfit_list[a] == now_outfit_list[b]:
                        cnt += 1
            if cnt == 0:
                replace_item = negative_item
                break
            len_cnt += 1
        
        # 대체 item을 원래의 item과 교체한다.
        positive_item[i] == replace_item
        for j in range(len(replace_items)):
            if positive_item[i] == replace_items[j]:
                replace_items[j] = replace_item
    # if len(positive_item) > 1:
        # print(len(positive_item), ' ', replace_items)
    valid_list.append(replace_items)         
print(time.time()-before)  

100%|██████████| 1497/1497 [00:41<00:00, 36.38it/s]41.156988859176636



In [14]:

before = time.time()

test_list = []
for outfit, positive_item in tqdm(test_choice.items()):

    now_outfit_id = str
    now_outfit_list = []
    
    for key, value in outfit2item.items():
        if key == outfit:
            now_outfit_id = key
            for i in range(len(value)):
                now_outfit_list.append(value[i])
            break
    
    replace_items = value

    for i in range(len(positive_item)):
        item_category = str
        for category, item_id in test_cate2item.items():
            for j in range(len(item_id)):
                if positive_item[i] == item_id[j]:
                    item_category = category
                    break
        
        for category, item_id in test_cate2item.items():
            if category == item_category:
                cand_items = []
                for j in range(len(item_id)):
                    cand_items.append(item_id[j])
               
        replace_item = str
        while True:
            cnt = 0
            len_cnt = 0
            if len_cnt == len(cand_items):
                break
            negative_item = random.choice(cand_items)
            negative_outfit_list = []
            for key, value in item2outfit.items():
                if key == negative_item:
                    for j in range(len(value)):
                        negative_outfit_list.append(value[j])
            
            for a in range(len(negative_outfit_list)):
                for b in range(len(now_outfit_list)):
                    if negative_outfit_list[a] == now_outfit_list[b]:
                        cnt += 1
            if cnt == 0:
                replace_item = negative_item
                break
            len_cnt += 1
        
        # 대체 item을 원래의 item과 교체한다.
        positive_item[i] == replace_item
        for j in range(len(replace_items)):
            if positive_item[i] == replace_items[j]:
                replace_items[j] = replace_item
    # if len(positive_item) > 1:
        # print(len(positive_item), ' ', replace_items)
    test_list.append(replace_items)         
print(time.time()-before)  

100%|██████████| 3076/3076 [01:32<00:00, 33.27it/s]92.45711064338684



In [15]:
for i in range(len(valid_list)):
    print(valid_list[i])

543']
['116067447', '98504776', '76805936', '97806863', '88577636']
['172918272', '57805737', '21254676', '44893885']
['148259102', '98080008', '101458569', '93410767', '101375129', '67460310', '157396941', '162012748']
['127628179', '121094998', '127690783', '183503755', '119492890', '113689314', '85879605', '125266145']
['126542656', '164933366', '127887736', '103997656', '129900237', '112540418']
['121681698', '111162502', '151369626', '129989677', '61687670', '102533535']
['144873554', '111019632', '159586567', '182801618']
['181880569', '135743878', '179203798', '157968873', '169810018', '193932639']
['114440717', '133296417', '129732874', '110604802', '148297190', '131601904', '132928958']
['162456866', '133122120', '133875394', '133875387', '133875389', '146737077']
['92094490', '66368710', '85884359', '105576133']
['195744116', '195744111', '195528436', '195079694', '195745425']
['195078869', '182976972', '195079255', '140325597', '194492128', '195080867']
['102452615', '785622

In [16]:
# key: outfit id, value: negative item으로 선택될 n개의 items
with open(save_path + 'train_new_outfit.json', 'w') as f:
    json.dump(train_list, f, indent=4)

with open(save_path + 'valid_new_outfit.json', 'w') as f:
    json.dump(valid_list, f, indent=4)

with open(save_path + 'test_new_outfit.json', 'w') as f:
    json.dump(test_list, f, indent=4)

In [17]:
import os 
def createDirectory(directory): 
    try: 
        if not os.path.exists(directory): 
            os.makedirs(directory) 
    except OSError: 
        print("Error: Failed to create the directory.")


이미지를 저장해보자
<br/>
원본의 outfit과 새로 구성한 outfit의 id를 비교한다. <br/>
서로 다른 item id가 나오기 전까지 해당 outfit 폴더에 image를 저장한다. <br/>
다른 item id가 나온다면, 새로 구성한 outfit의 그 item id가 등장하는 outfit id를 찾는다. <br/>
그 outfit에서 몇번째에 등장하는지 찾는다. <br/>

In [18]:
for outfit, items in train_outfit2item.items():
    createDirectory(save_path + 'images/{}'.format(outfit))

for outfit, items in valid_outfit2item.items():
    createDirectory(save_path + 'images/{}'.format(outfit))

for outfit, items in test_outfit2item.items():
    createDirectory(save_path + 'images/{}'.format(outfit))


In [19]:
item_list = []
set_ids = []
# 원본 outfit을 구성하는 item들
for i in tqdm(range(len(train_data))):
    lst = []
    set_id = train_data[i]['set_id']
    for j in range(len(train_data[i]['items'])):
        _, item_id = train_data[i]['items'][j]['image'].split('id=')
        lst.append(item_id)
    item_list.append(lst)
    set_ids.append(set_id)
print('\n')
print(item_list[0])     # positive
print(train_list[0])    # negative
print((set_ids[0]))
before = time.time()
for i in tqdm(range(len(train_list))):
    for j in range(len(train_list[i])):
        item_outfit = str
        train_outfit = str

        if train_list[i][j] == item_list[i][j]:
            img = Image.open(image_path + '{}/{}.jpg'.format(set_ids[i], j+1)).convert('RGB')
            img.save(save_path + 'images/{}/{}.jpg'.format(set_ids[i], j+1))
        else:
            for item, outfit in item2outfit.items():
                if train_list[i][j] == item:
                    for train_outfit, train_items in train_outfit2item.items():
                        for k in range(len(train_items)):
                            if train_list[i][j] == train_items[k]:
                                img = Image.open(image_path + '{}/{}.jpg'.format(train_outfit, k+1)).convert('RGB')
                                break
                    img.save(save_path+ 'images/{}/{}.jpg'.format(set_ids[i],j+1))
                        

print(time.time()-before)

100%|██████████| 17316/17316 [00:00<00:00, 171447.04it/s]
  0%|          | 0/17316 [00:00<?, ?it/s]

['194508109', '188778349', '188977857', '194942557', '194941874', '194578327', '190204630']
['194088670', '133292224', '188977857', '194942557', '194941874', '194578327', '190204630']
214181831
100%|██████████| 17316/17316 [50:07<00:00,  5.76it/s]3007.4126467704773



In [20]:
item_list = []
set_ids = []
# 원본 outfit을 구성하는 item들
for i in tqdm(range(len(valid_data))):
    lst = []
    set_id = valid_data[i]['set_id']
    for j in range(len(valid_data[i]['items'])):
        _, item_id = valid_data[i]['items'][j]['image'].split('id=')
        lst.append(item_id)
    item_list.append(lst)
    set_ids.append(set_id)
print('\n')
print(item_list[0])     # positive
print(valid_list[0])    # negative
print((set_ids[0]))
before = time.time()
for i in tqdm(range(len(valid_list))):
    for j in range(len(valid_list[i])):
        item_outfit = str
        valid_outfit = str

        if valid_list[i][j] == item_list[i][j]:
            img = Image.open(image_path + '{}/{}.jpg'.format(set_ids[i], j+1)).convert('RGB')
            img.save(save_path + 'images/{}/{}.jpg'.format(set_ids[i], j+1))
        else:
            for item, outfit in item2outfit.items():
                if valid_list[i][j] == item:
                    for valid_outfit, valid_items in valid_outfit2item.items():
                        for k in range(len(valid_items)):
                            if valid_list[i][j] == valid_items[k]:
                                img = Image.open(image_path + '{}/{}.jpg'.format(valid_outfit, k+1)).convert('RGB')
                                break
                    img.save(save_path+ 'images/{}/{}.jpg'.format(set_ids[i],j+1))
                        

print(time.time()-before)

100%|██████████| 1497/1497 [00:00<00:00, 166314.55it/s]
  0%|          | 3/1497 [00:00<01:02, 23.81it/s]

['148259102', '78433673', '121464208', '151052339']
['148259102', '78433673', '121464208', '151052339']
209512492
100%|██████████| 1497/1497 [01:41<00:00, 14.73it/s]101.6490740776062



In [21]:
item_list = []
set_ids = []
# 원본 outfit을 구성하는 item들
for i in tqdm(range(len(test_data))):
    lst = []
    set_id = test_data[i]['set_id']
    for j in range(len(test_data[i]['items'])):
        _, item_id = test_data[i]['items'][j]['image'].split('id=')
        lst.append(item_id)
    item_list.append(lst)
    set_ids.append(set_id)
print('\n')
print(item_list[0])     # positive
print(test_list[0])    # negative
print((set_ids[0]))
before = time.time()
for i in tqdm(range(len(test_list))):
    for j in range(len(test_list[i])):
        item_outfit = str
        test_outfit = str

        if test_list[i][j] == item_list[i][j]:
            img = Image.open(image_path + '{}/{}.jpg'.format(set_ids[i], j+1)).convert('RGB')
            img.save(save_path + 'images/{}/{}.jpg'.format(set_ids[i], j+1))
        else:
            for item, outfit in item2outfit.items():
                if test_list[i][j] == item:
                    for test_outfit, test_items in test_outfit2item.items():
                        for k in range(len(test_items)):
                            if test_list[i][j] == test_items[k]:
                                img = Image.open(image_path + '{}/{}.jpg'.format(test_outfit, k+1)).convert('RGB')
                                break
                    img.save(save_path+ 'images/{}/{}.jpg'.format(set_ids[i],j+1))
                        

print(time.time()-before)

100%|██████████| 3076/3076 [00:00<00:00, 161902.41it/s]
  0%|          | 0/3076 [00:00<?, ?it/s]

['102972440', '103394173', '91303250', '94989504', '103184729']
['172684470', '103394173', '91303250', '94989504', '103184729']
119704139
100%|██████████| 3076/3076 [06:38<00:00,  7.72it/s]398.5117516517639

