In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import tqdm
from sklearn import preprocessing
import random
import cv2

In [2]:
# Load the training metadata and prefix every image name with its folder.
tr = pd.read_csv('./f_tr.csv', index_col=0).assign(
    image_name=lambda frame: 'train/' + frame['image_name']
)
print(tr.shape)
tr.head(2)

(996475, 9)


Unnamed: 0,image_name,colors,color,sub_category_x,sub_category_y,style,fit,print,print_length
0,train/street_1330115_t-shirt.jpg,lightgray,화이트,t-shirt,t-shirt,street,루즈,['레터링'],1
1,train/romantic_1168617_jean.jpg,darkgray,블루,jean,jean,romantic,와이드,['무지'],1


In [3]:
# Load the validation metadata and prefix every image name with its folder.
val = pd.read_csv('./f_val.csv', index_col=0).assign(
    image_name=lambda frame: 'val/' + frame['image_name']
)
print(val.shape)
val.head(2)

(123437, 9)


Unnamed: 0,image_name,colors,color,sub_category_x,sub_category_y,style,fit,print,print_length
0,val/street_411249_shirt.jpg,brown,브라운,shirt,shirt,street,루즈,['체크'],1
1,val/resort_1030135_jacket.jpg,gray,그레이,jacket,jacket,resort,루즈,['무지'],1


In [4]:
# Stack both splits into one frame with a fresh 0..n-1 index.
df = pd.concat([tr, val], ignore_index=True)

In [5]:
# df = df.loc[df.colors!="etc", :].reset_index(drop=True)

In [6]:
# Keep a single category column under a neutral name.
df["sub_category"] = df["sub_category_x"]
# `print` holds stringified lists such as "['레터링']"; drop the leading "['"
# and trailing "']" in one vectorised pass instead of a per-row Python loop.
# NOTE(review): rows whose list has several entries keep the inner "', '"
# separators — confirm whether those should be parsed properly.
# Non-string cells (e.g. NaN) now stay NaN instead of raising TypeError.
df["print"] = df["print"].str[2:-2]
df = df.drop(['style', 'sub_category_x', 'sub_category_y', 'colors'],axis=1)

In [7]:
def get_label_groups_basic(dt):
    """Label every row with an integer id for its (sub_category, color) pair.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must provide string columns ``sub_category`` and ``color``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dt`` with two added columns: ``comb`` (the
        "sub_category,color" string) and ``label_group`` (the integer code a
        LabelEncoder fitted on this frame assigns to that string).
    """
    labelled = dt.copy()
    parts = zip(labelled["sub_category"].to_list(), labelled["color"].to_list())
    combos = [",".join(pieces) for pieces in parts]

    # Encode the combined strings as integer class ids.
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(combos)

    labelled["comb"] = combos
    labelled["label_group"] = codes
    return labelled

In [8]:
def get_label_groups_semi(dt):
    """Label every row with an integer id for its (sub_category, color, print) triple.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must provide string columns ``sub_category``, ``color`` and ``print``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dt`` with two added columns: ``comb`` (the
        "sub_category,color,print" string) and ``label_group`` (the integer
        code a LabelEncoder fitted on this frame assigns to that string).
    """
    labelled = dt.copy()
    parts = zip(
        labelled["sub_category"].to_list(),
        labelled["color"].to_list(),
        labelled["print"].to_list(),
    )
    combos = [",".join(pieces) for pieces in parts]

    # Encode the combined strings as integer class ids.
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(combos)

    labelled["comb"] = combos
    labelled["label_group"] = codes
    return labelled

In [9]:
def get_label_groups_full(dt):
    """Label every row with an integer id for its (sub_category, color, print, fit) tuple.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must provide string columns ``sub_category``, ``color``, ``print``
        and ``fit``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dt`` with two added columns: ``comb`` (the
        "sub_category,color,print,fit" string) and ``label_group`` (the
        integer code a LabelEncoder fitted on this frame assigns to it).
    """
    labelled = dt.copy()
    parts = zip(
        labelled["sub_category"].to_list(),
        labelled["color"].to_list(),
        labelled["print"].to_list(),
        labelled["fit"].to_list(),
    )
    combos = [",".join(pieces) for pieces in parts]

    # Encode the combined strings as integer class ids.
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(combos)

    labelled["comb"] = combos
    labelled["label_group"] = codes
    return labelled

In [10]:
def get_encoded_groups(df):
    """Collect the image names of every label group into one row per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide columns ``label_group`` and ``image_name``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``label_group`` (ascending), with columns
        ``label_group``, ``image_group`` (list of that group's image names in
        their original row order) and ``len`` (number of images in the list).
    """
    # Aggregate straight into lists: joining on ',' and splitting again would
    # break any image name that itself contains a comma.
    grouped = df.groupby('label_group')['image_name'].agg(list)
    encoded_df = pd.DataFrame({'label_group': grouped.index,
                               'image_group': grouped.tolist()})
    encoded_df['len'] = [len(names) for names in encoded_df['image_group']]
    return encoded_df

In [11]:
# Per-group sample size; in this notebook it is only referenced by commented-out cells.
sample_num = 100_000

In [12]:
# Pick the labelling granularity: the commented alternatives also fold the
# print pattern (semi) or print pattern + fit (full) into the label.
#tf = get_label_groups_full(df)
#tf = get_label_groups_semi(df)
tf = get_label_groups_basic(df)
# One row per label group with the list of its image names and its size.
encoded_df = get_encoded_groups(tf)

In [13]:
# Attach each row's group size so that rare label groups can be dropped.
a = pd.merge(tf, encoded_df[["label_group", "len"]], on="label_group")
a = a[a["len"] > 10].reset_index(drop=True)
#a = a[a["len"] >= sample_num].reset_index(drop=True)
# Keep a shuffled 50% subsample. The fixed random_state makes the subsample
# (and every file derived from it) identical on each Restart & Run All.
# NOTE(review): after subsampling, some groups may again hold <= 10 images.
a = a.sample(frac=0.5, random_state=225).reset_index(drop=True)
#a = a.sample(frac=1).reset_index(drop=True)
del a["len"]
# Re-encode so label ids are contiguous over the groups that survived,
# then rebuild the per-group image lists.
tf = get_label_groups_basic(a)
encoded_df = get_encoded_groups(tf)

In [14]:
# Preview the relabelled per-image frame (the notebook display elides the middle rows).
tf

Unnamed: 0,image_name,color,fit,print,print_length,sub_category,comb,label_group
0,train/military_468201_jumpsuit.jpg,카키,노멀,무지,1,jumpsuit,"jumpsuit,카키",187
1,train/street_444471_t-shirt.jpg,베이지,루즈,무지,1,t-shirt,"t-shirt,베이지",298
2,train/country_802026_skirt.jpg,베이지,노멀,체크,1,skirt,"skirt,베이지",277
3,val/street_33882_t-shirt.jpg,레드,루즈,스트라이프,1,t-shirt,"t-shirt,레드",296
4,train/country_1104574_t-shirt.jpg,카키,노멀,무지,1,t-shirt,"t-shirt,카키",307
...,...,...,...,...,...,...,...,...
559917,train/avantgarde_63550_dress.jpg,레드,노멀,믹스,1,dress,"dress,레드",80
559918,train/resort_305343_pants.jpg,블랙,노멀,무지,1,pants,"pants,블랙",237
559919,train/street_1280155_shirt.jpg,화이트,루즈,스트라이프,1,shirt,"shirt,화이트",268
559920,train/street_1165548_skirt.jpg,화이트,루즈,도트,1,skirt,"skirt,화이트",289


In [15]:
# Preview the per-group frame: label id, list of image names, group size.
encoded_df

Unnamed: 0,label_group,image_group,len
0,0,"[train/manish_546453_best.jpg, train/modern_70...",948
1,1,"[train/street_1223332_best.jpg, val/street_111...",146
2,2,"[train/street_1058950_best.jpg, train/street_1...",51
3,3,"[train/street_1070648_best.jpg, train/country_...",252
4,4,"[train/modern_165778_best.jpg, val/street_1081...",29
...,...,...,...
347,347,"[train/sporty_353381_zipup.jpg, train/genderle...",13
348,348,"[train/genderless_943800_zipup.jpg, train/stre...",101
349,349,"[val/street_353355_zipup.jpg, train/street_130...",29
350,350,"[train/street_1060528_zipup.jpg, train/street_...",270


In [16]:
# label_ls = encoded_df.label_group.to_list()
# sampled_ls = [random.sample(x,sample_num) for x in encoded_df.image_group.values.tolist()] 
# sampled_df = pd.DataFrame({'label_group' : label_ls,
#                            'image_group' : sampled_ls})

In [17]:
# sampled_df['len'] = sampled_df['image_group'].apply(lambda x : len(x))

In [18]:
# encoded_df['label_group2'] = encoded_df.label_group.astype('str')
# # px.histogram(encoded_df, x="label_group2", y="len", marginal="violin", color_discrete_sequence=px.colors.sequential.Plasma)

In [19]:
# px.box(encoded_df, y='len', points="all")

In [20]:
# tf.to_csv("separ_meta.csv")

### Under-sampling

In [21]:
# encoded_df[(encoded_df.len<=105) & (encoded_df.len>=30)]

In [22]:
# Seed Python's RNG so that random.sample-based under-sampling is repeatable.
# `random.seed` must be *called*: assigning `random.seed = 225` would replace
# the function with an integer and leave the generator unseeded.
random.seed(225)

In [23]:
def reshape_df(df,args):
    """Cap every label group at ``args`` images by random under-sampling.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per label group with columns ``label_group`` and
        ``image_group``. ``image_group`` may hold either a list of image
        names or a single comma-joined string of names.
    args : int or int-like
        Maximum number of images to keep per group.

    Returns
    -------
    pandas.DataFrame
        Groups that already fit the cap come first (unchanged), followed by
        the groups that were sampled down with ``random.sample``. The
        ``image_group`` column always holds lists and ``len`` holds the
        resulting list sizes. The index is reset to 0..n-1.
    """
    params = int(args)
    encoded_df = df.copy()

    # Accept both list-valued groups (what get_encoded_groups produces) and
    # comma-joined strings; calling .split on a list would raise AttributeError.
    groups = [x.split(',') if isinstance(x, str) else list(x)
              for x in encoded_df['image_group']]

    # Decide from the actual group sizes rather than a possibly stale `len` column.
    over = pd.Series([len(g) > params for g in groups],
                     index=encoded_df.index, dtype=bool)

    kept_ls = [g for g in groups if len(g) <= params]
    #sampling
    sampled_ls = [random.sample(g, params) for g in groups if len(g) > params]

    #combine: untouched groups first, sampled groups after
    out = pd.concat([encoded_df[~over], encoded_df[over]], ignore_index=True)
    out['image_group'] = kept_ls + sampled_ls
    out['len'] = [len(g) for g in out['image_group']]

    sum_under = sum(len(g) for g in kept_ls)
    sum_over = len(sampled_ls) * params

    print(f' Train에 사용되는 그룹별 최대 이미지는 {params}개, 클래스는 {out.label_group.nunique()}개, 총 이미지 수는: {sum_over + sum_under}개 입니다.')

    return out

## train dataframe 생성

In [24]:
def get_train_df(df):
    """Flatten a per-group frame into one row per image.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per label group with columns ``label_group`` and
        ``image_group`` (a list of image names).

    Returns
    -------
    pandas.DataFrame
        Columns ``image_name`` and ``label_group``, one row per image, in the
        order the groups and their images appear in ``df``.
    """
    image_names, labels = [], []
    # Pair each label with its own image list row by row. Looking labels up
    # by using the label value as an index key only works when the index
    # happens to equal label_group, which is not true after re-ordering.
    for label, images in zip(df['label_group'], df['image_group']):
        image_names.extend(images)
        labels.extend([label] * len(images))

    train_df = pd.DataFrame({'image_name': image_names,
                             'label_group': labels})
    return train_df


In [25]:
# Under-sampling is currently disabled, so the per-image training frame is
# built from every image in `encoded_df`.
#reshaped_df = reshape_df(encoded_df,40000)
train_df = get_train_df(encoded_df)
train_df

Unnamed: 0,image_name,label_group
0,train/manish_546453_best.jpg,0
1,train/modern_709120_best.jpg,0
2,train/classic_746922_best.jpg,0
3,train/modern_1342796_best.jpg,0
4,train/street_1223318_best.jpg,0
...,...,...
559917,train/sporty_976385_zipup.jpg,351
559918,train/street_192044_zipup.jpg,351
559919,train/sporty_1051068_zipup.jpg,351
559920,train/sexy_164262_zipup.jpg,351


In [26]:
class visual_kfashion:
    """Helpers for eyeballing the images that belong to one label group."""

    @staticmethod
    def visualize(df, path, matrix_num, label_num):
        '''
        Plot up to matrix_num * matrix_num images of one label group.

        df : per-group frame with `label_group` and `image_group` columns,
             where `image_group` is the list of image file names of the group
        path : directory prefix; each file is read from f'{path}/{filename}'
        matrix_num : grid side length, e.g. 2 draws a 2*2 grid (4 images)
        label_num : the label_group id to display

        Also prints the group's `comb` string, which is read from the
        notebook-level `tf` frame.

        Raises KeyError when `df` has no row for `label_num`.
        '''
        print('※1장이 나올경우는 해당 label_group의 길이 수가 1이라는 것을 의미합니다.')
        # Select by value and take the first match; indexing the result with
        # label_num would only work when the row index equals the label id.
        matches = df.loc[df.label_group == label_num, 'image_group']
        if matches.empty:
            raise KeyError(label_num)
        smp_dt = matches.iloc[0]
        print()
        print(f'{label_num}번의 label_group의 이미지 수는 총 {len(smp_dt)}장 입니다.')
        fig = plt.figure(figsize=(20, 20))
        rows = matrix_num
        cols = matrix_num
        capacity = rows * cols
        i = 1

        print()
        print(tf.loc[tf.label_group == label_num, "comb"].unique()[0])

        for filename in smp_dt:
            # Stop once the grid is full instead of relying on add_subplot raising.
            if i > capacity:
                break
            img = cv2.imread(f'{path}/{filename}')
            # cv2.imread signals a missing/unreadable file by returning None;
            # report it and move on rather than silently ending the loop.
            if img is None:
                print(f'skipping unreadable image: {path}/{filename}')
                continue
            ax = fig.add_subplot(rows, cols, i)
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            ax.set_xlabel(filename)
            ax.set_xticks([])
            ax.set_yticks([])
            i += 1

In [None]:
# NOTE(review): `reshaped_df` is never assigned in the visible notebook (its
# only assignment is commented out), so this cell fails on a fresh kernel — confirm.
path = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
visual_kfashion.visualize(df=reshaped_df, path=path, matrix_num=4, label_num=70)

In [None]:
# NOTE(review): `reshaped_df` is never assigned in the visible notebook (its
# only assignment is commented out), so this cell fails on a fresh kernel — confirm.
path = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
visual_kfashion.visualize(df=reshaped_df, path=path, matrix_num=4, label_num=770)

In [None]:
# NOTE(review): `reshaped_df` is never assigned in the visible notebook (its
# only assignment is commented out), so this cell fails on a fresh kernel — confirm.
path = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
visual_kfashion.visualize(df=reshaped_df, path=path, matrix_num=4, label_num=46)

In [None]:
# NOTE(review): `reshaped_df` is never assigned in the visible notebook (its
# only assignment is commented out), so this cell fails on a fresh kernel — confirm.
path = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
visual_kfashion.visualize(df=reshaped_df, path=path, matrix_num=4, label_num=199)

In [None]:
# NOTE(review): `reshaped_df` is never assigned in the visible notebook (its
# only assignment is commented out), so this cell fails on a fresh kernel — confirm.
path = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
visual_kfashion.visualize(df=reshaped_df, path=path, matrix_num=4, label_num=1999)

In [None]:
# NOTE(review): `reshaped_df` is never assigned in the visible notebook (its
# only assignment is commented out), so this cell fails on a fresh kernel — confirm.
path = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
visual_kfashion.visualize(df=reshaped_df, path=path, matrix_num=4, label_num=2536)

In [None]:
# NOTE(review): `reshaped_df` is never assigned in the visible notebook (its
# only assignment is commented out), so this cell fails on a fresh kernel — confirm.
path = '/mnt/hdd1/wearly/compatibility_rec/data/images/'
visual_kfashion.visualize(df=reshaped_df, path=path, matrix_num=4, label_num=326)

## Validation setting

In [27]:
from sklearn.model_selection import StratifiedKFold

In [28]:
def stratify_df(df, n_folds=10, random_state=123, valid_fold=0):
    """Assign stratified fold ids and split off one fold for validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-image frame; must provide a ``label_group`` column, which is the
        stratification target.
    n_folds : int, default 10
        Number of StratifiedKFold splits.
    random_state : int, default 123
        Seed passed to StratifiedKFold (shuffling is always enabled).
    valid_fold : int, default 0
        Fold id whose rows form the validation frame.

    Returns
    -------
    (train, valid) : tuple of pandas.DataFrame
        Both carry an integer ``fold`` column and a reset 0..n-1 index.
        ``valid`` holds the rows of ``valid_fold``; ``train`` holds the rest.
    """
    folded = df.copy()
    folded['fold'] = -1
    # Resolve the column position by name instead of assuming it is last.
    fold_col = folded.columns.get_loc('fold')

    strat_kfold = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=True)

    # The second element yielded by split() is the held-out part of each
    # split; every row lands in exactly one held-out part, which becomes its fold id.
    for fold_id, (_, holdout_index) in enumerate(strat_kfold.split(folded.index, folded['label_group'])):
        folded.iloc[holdout_index, fold_col] = fold_id

    folded['fold'] = folded['fold'].astype('int')

    in_valid = folded['fold'] == valid_fold
    train = folded[~in_valid].reset_index(drop=True)
    valid = folded[in_valid].reset_index(drop=True)

    return train,valid

In [29]:
# `test` is the held-out fold (fold == 0) returned by stratify_df; `train` is every other fold.
train,test = stratify_df(train_df)



In [30]:
# Number of distinct label groups present in the training part.
train["label_group"].nunique()

352

In [31]:
# Report how many rows ended up in each part of the split.
print(f'train : {len(train)}, test : {len(test)}')

train : 503929, test : 55993


In [32]:
# Persist both parts; the row index is written as the first (unnamed) column.
train.to_csv("separ_train.csv")
test.to_csv("separ_test.csv")

In [33]:
import pandas as pd

In [4]:
# Read the saved training split back, using the first CSV column as the index.
# NOTE(review): this rebinds `tr`, which earlier held the raw training
# metadata loaded from './f_tr.csv' — consider a distinct name.
tr = pd.read_csv("separ_train.csv", index_col = 0)
tr

Unnamed: 0,image_name,label_group,fold
0,train/manish_546453_best.jpg,0,3
1,train/modern_709120_best.jpg,0,3
2,train/classic_746922_best.jpg,0,7
3,train/modern_1342796_best.jpg,0,3
4,train/street_1223318_best.jpg,0,1
...,...,...,...
503924,train/street_735225_zipup.jpg,351,7
503925,train/street_1177967_zipup.jpg,351,5
503926,train/genderless_1255580_zipup.jpg,351,6
503927,train/street_192044_zipup.jpg,351,8
