# 클래스 분포 확인

In [1]:
import pandas as pd

# Load the training metadata table and display it.
train_csv_path = "./data/train/train.csv"
df = pd.read_csv(train_csv_path)
print(df)

          id  gender   race  age                    path
0     000001  female  Asian   45  000001_female_Asian_45
1     000002  female  Asian   52  000002_female_Asian_52
2     000004    male  Asian   54    000004_male_Asian_54
3     000005  female  Asian   58  000005_female_Asian_58
4     000006  female  Asian   59  000006_female_Asian_59
...      ...     ...    ...  ...                     ...
2695  006954    male  Asian   19    006954_male_Asian_19
2696  006955    male  Asian   19    006955_male_Asian_19
2697  006956    male  Asian   19    006956_male_Asian_19
2698  006957    male  Asian   20    006957_male_Asian_20
2699  006959    male  Asian   19    006959_male_Asian_19

[2700 rows x 5 columns]


In [2]:
# Give every row a placeholder class id of 0.
df['label'] = 0
# Cast ages to int and store the result back into the column. The previous
# `==` only built a throwaway boolean Series and left the column unchanged.
df['age'] = df['age'].astype(int)
print(df)

          id  gender   race  age                    path  label
0     000001  female  Asian   45  000001_female_Asian_45      0
1     000002  female  Asian   52  000002_female_Asian_52      0
2     000004    male  Asian   54    000004_male_Asian_54      0
3     000005  female  Asian   58  000005_female_Asian_58      0
4     000006  female  Asian   59  000006_female_Asian_59      0
...      ...     ...    ...  ...                     ...    ...
2695  006954    male  Asian   19    006954_male_Asian_19      0
2696  006955    male  Asian   19    006955_male_Asian_19      0
2697  006956    male  Asian   19    006956_male_Asian_19      0
2698  006957    male  Asian   20    006957_male_Asian_20      0
2699  006959    male  Asian   19    006959_male_Asian_19      0

[2700 rows x 6 columns]


In [3]:
# Encode (gender, age band) as a single class id:
#   male   : age < 30 -> 0, 30 <= age < 60 -> 1, age >= 60 -> 2
#   female : age < 30 -> 3, 30 <= age < 60 -> 4, age >= 60 -> 5
age_band_masks = (
    df['age'] < 30,
    (df['age'] >= 30) & (df['age'] < 60),
    df['age'] >= 60,
)
for gender_offset, gender_name in ((0, 'male'), (3, 'female')):
    gender_mask = df['gender'] == gender_name
    for band_index, band_mask in enumerate(age_band_masks):
        df.loc[gender_mask & band_mask, 'label'] = gender_offset + band_index
print(df)

          id  gender   race  age                    path  label
0     000001  female  Asian   45  000001_female_Asian_45      4
1     000002  female  Asian   52  000002_female_Asian_52      4
2     000004    male  Asian   54    000004_male_Asian_54      1
3     000005  female  Asian   58  000005_female_Asian_58      4
4     000006  female  Asian   59  000006_female_Asian_59      4
...      ...     ...    ...  ...                     ...    ...
2695  006954    male  Asian   19    006954_male_Asian_19      0
2696  006955    male  Asian   19    006955_male_Asian_19      0
2697  006956    male  Asian   19    006956_male_Asian_19      0
2698  006957    male  Asian   20    006957_male_Asian_20      0
2699  006959    male  Asian   19    006959_male_Asian_19      0

[2700 rows x 6 columns]


In [4]:
# Add a helper column of ones, then count rows per class id.
df['count'] = 1
label_and_count = df[['label', 'count']].copy()
df_label = label_and_count.groupby('label').count()
print(df_label)

       count
label       
0        549
1        410
2         83
3        732
4        817
5        109


In [5]:
# Scalar lookup: number of rows whose class id is 0.
df_label.at[0, "count"]

549

In [6]:
# Keep only the columns needed for fold assignment: image path and class id.
fold_columns = ['path', 'label']
df_folds = df.loc[:, fold_columns].copy()
print(df_folds)


                        path  label
0     000001_female_Asian_45      4
1     000002_female_Asian_52      4
2       000004_male_Asian_54      1
3     000005_female_Asian_58      4
4     000006_female_Asian_59      4
...                      ...    ...
2695    006954_male_Asian_19      0
2696    006955_male_Asian_19      0
2697    006956_male_Asian_19      0
2698    006957_male_Asian_20      0
2699    006959_male_Asian_19      0

[2700 rows x 2 columns]


In [7]:
# Show the row(s) whose path matches one sample image.
sample_path_mask = df_folds['path'] == '000001_female_Asian_45'
print(df_folds.loc[sample_path_mask])

                     path  label
0  000001_female_Asian_45      4


In [8]:
# Preview the first ten image paths as a plain list.
first_ten_paths = df_folds['path'].head(10).tolist()
print(first_ten_paths)

['000001_female_Asian_45', '000002_female_Asian_52', '000004_male_Asian_54', '000005_female_Asian_58', '000006_female_Asian_59', '000007_female_Asian_58', '000008_female_Asian_58', '000009_female_Asian_56', '000010_female_Asian_58', '000012_male_Asian_57']


# Stratified Group K-Fold

In [9]:
from collections import defaultdict
import numpy as np
import random

# Configuration for the fold assignment.
seed = 2022      # seed for the shuffle of image paths
k = 5            # number of folds
labels_num = 6   # number of class ids

# Parallel lists of image paths and their class ids, plus a path -> class id lookup.
path_list = list(df_folds['path'])
label_list = list(df_folds['label'])
path_to_label = dict(zip(path_list, label_list))

# fold index -> per-class image counts; a zero vector is created on first access.
class_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))

def eval_class_counts_per_fold(image_path,fold,k):
    """Score how evenly classes would be spread if `image_path` joined `fold`.

    The image is counted in `fold` temporarily. For each class, the standard
    deviation across the `k` folds of (fold count / total count of that class)
    is computed, and the temporary count is removed again before returning.

    Args:
        image_path: key into the module-level `path_to_label` mapping.
        fold: index of the candidate fold.
        k: number of folds to compare.

    Returns:
        The mean of the per-class standard deviations.
    """
    label = path_to_label[image_path]
    candidate_counts = class_counts_per_fold[fold]

    # Tentatively place the image in the candidate fold.
    candidate_counts[label] += 1
    spread_per_class = [
        np.std([
            class_counts_per_fold[fold_index][class_number] / df_label.loc[class_number, 'count']
            for fold_index in range(k)
        ])
        for class_number in range(labels_num)
    ]
    # Undo the tentative placement so the shared counters are unchanged.
    candidate_counts[label] -= 1

    return np.mean(spread_per_class)


# fold index -> set of image paths assigned to that fold.
image_names_per_fold = defaultdict(set)

# Shuffle in place with a dedicated, seeded RNG.
random.Random(seed).shuffle(path_list)

# Greedy assignment: each image goes to the fold with the lowest score.
# `min` keeps the first fold on ties, i.e. the lowest fold index.
for image_path in path_list:
    best_fold = min(
        range(k),
        key=lambda candidate: eval_class_counts_per_fold(image_path, candidate, k),
        default=None,
    )
    class_counts_per_fold[best_fold][path_to_label[image_path]] += 1
    image_names_per_fold[best_fold].add(image_path)

all_image_names = set(path_list)

# One (train_paths, valid_paths) pair per fold: the fold's own images are the
# validation split and every other image is the training split.
# The lists are sorted because iteration order of a set of strings can change
# between interpreter runs (hash randomisation); sorting keeps the output
# reproducible for a fixed seed.
k_fold_train_test_list = []

for i in range(k):
    test_image_names = image_names_per_fold[i]
    train_image_names = all_image_names - test_image_names

    k_fold_train_test_list.append((sorted(train_image_names), sorted(test_image_names)))
    

In [10]:
# Report train/valid sizes for every fold that was built, instead of
# hard-coding exactly five print statements.
for fold_index, (train_names, valid_names) in enumerate(k_fold_train_test_list):
    print(f'fold{fold_index} train 개수 : {len(train_names)}, valid 개수 : {len(valid_names)}')

fold0 train 개수 : 2158, valid 개수 : 542
fold1 train 개수 : 2158, valid 개수 : 542
fold2 train 개수 : 2160, valid 개수 : 540
fold3 train 개수 : 2161, valid 개수 : 539
fold4 train 개수 : 2163, valid 개수 : 537


In [11]:
# Map fold index -> [train_paths, valid_paths] for every fold that was built,
# instead of hard-coding indices 0..4.
k_fold = {
    fold_index: list(split)
    for fold_index, split in enumerate(k_fold_train_test_list)
}

In [12]:
import json

# Write the fold assignment to disk as indented JSON.
with open('stratified_k_fold_dict.json', 'w') as f:
    f.write(json.dumps(k_fold, indent=4))

In [13]:
# Sanity check: count the validation images of each class in every fold.

# Sized with labels_num so it stays in step with the fold-assignment counters.
fold_check = defaultdict(lambda: np.zeros(labels_num))

for i in range(k):
    # Only the validation split is inspected; the training split is ignored.
    _, valid = k_fold[i]
    for item in valid:
        fold_check[i][path_to_label[item]] += 1
    # Per-class counts followed by the fold's total validation size.
    print(fold_check[i], fold_check[i].sum())

[110.  82.  17. 147. 164.  22.] 542.0
[110.  82.  17. 147. 164.  22.] 542.0
[110.  82.  17. 146. 163.  22.] 540.0
[110.  82.  16. 146. 163.  22.] 539.0
[109.  82.  16. 146. 163.  21.] 537.0
