## Generate training image triplets from polyvore data (2 compatible fashion item and 1 not compatible fashion item)

In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
DATA_DIR = "../data/polyvore_outfits/"
SUB_DATASET = "disjoint/"
SUB_DATASET_DIR = f"{DATA_DIR}/{SUB_DATASET}"

In [3]:
def json2df(data):
    df = pd.DataFrame()
    for index, el in enumerate(data):
        for key, val in el.items():
            df.loc[index, key] = val
    return df

In [4]:
dict_df = pd.read_json(f"{SUB_DATASET_DIR}/train.json").explode(column="items")
dict_df = dict_df.join(pd.DataFrame(dict_df["items"].values.tolist(), index=dict_df.index))
dict_df.item_id = dict_df.item_id.astype("int")
dict_df = dict_df.drop(columns="items")

In [6]:
item_attrs = pd.read_json(f"{DATA_DIR}/polyvore_item_metadata.json",orient = "index")
item_attrs.head()

Unnamed: 0,category_id,catgeories,description,related,semantic_category,title,url_name
100004189,57,,,,sunglasses,,retro hippie fashion metal lennon
100005237,53,,,,accessories,,amazon.com 100 imported cashmere gloves
100007550,4,,,,all-body,,mcq alexander mcqueen tailored tuxedo
100010397,41,,,,shoes,,nfinity vengeance cheerleading shoe
100010564,41,"[Women's Fashion, Shoes, Pumps, Giuseppe Zanot...",,"[Giuseppe Zanotti, High heeled footwear, Heel ...",shoes,i36033 001 - Décolleté Donna - Scarpe Donna su...,i36033 001 d%C3%A9collet%C3%A9 donna scarpe


In [7]:
merged_df = dict_df.merge(item_attrs[["category_id", "semantic_category"]], left_on = "item_id", right_index = True, how = "left")
merged_df.head()

Unnamed: 0,set_id,item_id,index,category_id,semantic_category
0,199244701,132621870,1,38,bags
0,199244701,153967122,2,46,shoes
0,199244701,171169800,3,62,jewellery
0,199244701,162799044,4,64,jewellery
0,199244701,172538912,5,65,jewellery


In [8]:
merged_df[["set_id","item_id"]]

Unnamed: 0,set_id,item_id
0,199244701,132621870
0,199244701,153967122
0,199244701,171169800
0,199244701,162799044
0,199244701,172538912
...,...,...
16993,205613256,180608937
16994,225327855,180383429
16994,225327855,201915007
16994,225327855,180383429


In [9]:
merged_df["identifier"] = merged_df[["set_id","index"]].astype("str").apply("_".join, axis=1)

In [10]:
df = pd.read_table(f"{DATA_DIR}/disjoint/compatibility_train.txt", header=None, skipinitialspace=True)
df = df[0].str.split(" ", expand = True)

df.columns = ["is_compat", *[f"item_{i}" for i in range(1, len(df.columns))]]
df.is_compat = df.is_compat.astype("int")
df.loc[df.is_compat == 0, "item_1":] = df.loc[df.is_compat == 0, "item_1":].shift(periods=-1, axis='columns')
df["outfit_num"] = np.tile(range(int(df.shape[0]/2)), 2)
df

Unnamed: 0,is_compat,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16,item_17,outfit_num
0,1,199244701_1,199244701_2,199244701_3,199244701_4,199244701_5,199244701_6,,,,,,,,,,,,0
1,1,200742384_1,200742384_2,200742384_3,200742384_4,200742384_5,,,,,,,,,,,,,1
2,1,206955877_1,206955877_2,206955877_3,206955877_4,206955877_5,206955877_6,,,,,,,,,,,,2
3,1,220340816_1,220340816_2,220340816_3,220340816_4,220340816_5,,,,,,,,,,,,,3
4,1,219393187_1,219393187_2,219393187_3,219393187_4,219393187_5,,,,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33985,0,218506324_1,221405016_1,206461374_3,218864675_6,,,,,,,,,,,,,,16990
33986,0,105184990_1,211494606_3,204086912_4,220595501_3,,,,,,,,,,,,,,16991
33987,0,66630383_1,132553628_1,224927539_2,223480393_3,219992174_3,85057420_4,195654231_5,206629909_3,,,,,,,,,,16992
33988,0,149252664_1,224932578_2,205854787_3,206523026_4,212115201_6,218241080_6,,,,,,,,,,,,16993


In [11]:
melted_df = df.melt(id_vars=["is_compat", "outfit_num"], var_name="item", value_name="identifier").query("identifier.notnull()")
melted_df

Unnamed: 0,is_compat,outfit_num,item,identifier
0,1,0,item_1,199244701_1
1,1,1,item_1,200742384_1
2,1,2,item_1,206955877_1
3,1,3,item_1,220340816_1
4,1,4,item_1,219393187_1
...,...,...,...,...
475683,0,16818,item_14,216771451_14
484206,1,8346,item_15,220541340_15
501201,0,8346,item_15,220060656_5
518196,1,8346,item_16,220541340_16


In [12]:
melted_df[["set_id", "index"]] = melted_df["identifier"].str.split("_", expand=True).astype("int")

In [13]:
len(melted_df.outfit_num.unique())

16995

In [14]:
merged_df

Unnamed: 0,set_id,item_id,index,category_id,semantic_category,identifier
0,199244701,132621870,1,38,bags,199244701_1
0,199244701,153967122,2,46,shoes,199244701_2
0,199244701,171169800,3,62,jewellery,199244701_3
0,199244701,162799044,4,64,jewellery,199244701_4
0,199244701,172538912,5,65,jewellery,199244701_5
...,...,...,...,...,...,...
16993,205613256,180608937,6,51,scarves,205613256_6
16994,225327855,180383429,1,49,shoes,225327855_1
16994,225327855,201915007,2,4,all-body,225327855_2
16994,225327855,180383429,1,49,shoes,225327855_1


In [15]:
melted_df.query("outfit_num == 0").sort_values(by=["is_compat", "item"])

Unnamed: 0,is_compat,outfit_num,item,identifier,set_id,index
16995,0,0,item_1,219713029_1,219713029,1
50985,0,0,item_2,223118810_1,223118810,1
84975,0,0,item_3,224078562_3,224078562,3
118965,0,0,item_4,222231874_3,222231874,3
152955,0,0,item_5,222344225_3,222344225,3
186945,0,0,item_6,76483638_7,76483638,7
0,1,0,item_1,199244701_1,199244701,1
33990,1,0,item_2,199244701_2,199244701,2
67980,1,0,item_3,199244701_3,199244701,3
101970,1,0,item_4,199244701_4,199244701,4


In [16]:
merged_df

Unnamed: 0,set_id,item_id,index,category_id,semantic_category,identifier
0,199244701,132621870,1,38,bags,199244701_1
0,199244701,153967122,2,46,shoes,199244701_2
0,199244701,171169800,3,62,jewellery,199244701_3
0,199244701,162799044,4,64,jewellery,199244701_4
0,199244701,172538912,5,65,jewellery,199244701_5
...,...,...,...,...,...,...
16993,205613256,180608937,6,51,scarves,205613256_6
16994,225327855,180383429,1,49,shoes,225327855_1
16994,225327855,201915007,2,4,all-body,225327855_2
16994,225327855,180383429,1,49,shoes,225327855_1


In [25]:
final_df = melted_df.merge(merged_df, on="identifier").drop_duplicates()

In [28]:
final_df.semantic_category.unique()

array(['bags', 'shoes', 'jewellery', 'tops', 'hats', 'sunglasses',
       'all-body', 'accessories', 'bottoms', 'outerwear', 'scarves'],
      dtype=object)

In [27]:
final_df.query("outfit_num == 0").sort_values(by=["is_compat", "item"])

Unnamed: 0,is_compat,outfit_num,item,identifier,set_id_x,index_x,set_id_y,item_id,index_y,category_id,semantic_category
127201,0,0,item_1,219713029_1,219713029,1,219713029,204656212,1,38,bags
94161,0,0,item_2,223118810_1,223118810,1,223118810,211483114,1,46,shoes
444358,0,0,item_3,224078562_3,224078562,3,224078562,203867196,3,62,jewellery
496627,0,0,item_4,222231874_3,222231874,3,222231874,118116943,3,64,jewellery
400114,0,0,item_5,222344225_3,222344225,3,222344225,179210717,3,65,jewellery
917974,0,0,item_6,76483638_7,76483638,7,76483638,55333264,7,5,all-body
0,1,0,item_1,199244701_1,199244701,1,199244701,132621870,1,38,bags
203150,1,0,item_2,199244701_2,199244701,2,199244701,153967122,2,46,shoes
390616,1,0,item_3,199244701_3,199244701,3,199244701,171169800,3,62,jewellery
612252,1,0,item_4,199244701_4,199244701,4,199244701,162799044,4,64,jewellery


In [39]:
compat_df = final_df.query("is_compat == 1").sort_values(by=["outfit_num", "item"])
not_compat_df = final_df.query("is_compat == 0").sort_values(by=["outfit_num", "item"])
compat_df.head()

Unnamed: 0,is_compat,outfit_num,item,identifier,set_id_x,index_x,set_id_y,item_id,index_y,category_id,semantic_category
0,1,0,item_1,199244701_1,199244701,1,199244701,132621870,1,38,bags
203150,1,0,item_2,199244701_2,199244701,2,199244701,153967122,2,46,shoes
390616,1,0,item_3,199244701_3,199244701,3,199244701,171169800,3,62,jewellery
612252,1,0,item_4,199244701_4,199244701,4,199244701,162799044,4,64,jewellery
770852,1,0,item_5,199244701_5,199244701,5,199244701,172538912,5,65,jewellery


In [43]:
compat_dict = dict(zip(compat_df.item_id.tolist(), not_compat_df.item_id.tolist()))
compat_dict[171169800]

203867196

In [50]:
!ls ../data/polyvore_outfits/images/| head

100004189.jpg
100005237.jpg
100007550.jpg
100010397.jpg
100010564.jpg
100010612.jpg
100015328.jpg
100015595.jpg
100018459.jpg
100018712.jpg


In [45]:
import itertools
all_triplets= []
for outfit_num in compat_df.outfit_num.unique():
    all_triplets += [(item1, item2, compat_dict[item3]) for (item1,item2,item3) in itertools.permutations(compat_df.query(f"outfit_num == {outfit_num}").item_id,3)]

**final triplet list**

In [53]:
pd.DataFrame(all_triplets)

Unnamed: 0,0,1,2
0,132621870,153967122,203867196
1,132621870,153967122,194316716
2,132621870,153967122,179210717
3,132621870,153967122,55333264
4,132621870,171169800,211483114
...,...,...,...
1552177,180608937,175961768,181738641
1552178,180608937,162927065,79746902
1552179,180608937,162927065,213750382
1552180,180608937,162927065,181225408


In [52]:
pd.DataFrame(all_triplets).to_csv(f"{SUB_DATASET_DIR}/triplets_train_disjoint", sep=" ", header=False, index=False)