In [1]:

import random
import pandas as pd
import numpy as np
import os
import cv2
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch.utils.data import Dataset, DataLoader,random_split
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import warnings
# Silence every warning for the rest of the session; both calls install a
# blanket 'ignore' filter.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# Move the working directory up one level so the relative paths used below
# (e.g. './train.csv') resolve against the parent directory.
# NOTE(review): this is relative to the current directory, so re-running this
# cell without restarting the kernel moves up one more level each time.
os.chdir('../')
# Settings for the run, gathered in one place.
CFG = dict(
    IMG_SIZE=128,         # side length passed to the resize transform
    EPOCHS=5,
    LEARNING_RATE=3e-4,
    BATCH_SIZE=64,        # batch size of both data loaders
    SEED=41,              # value handed to seed_everything
    TRAIN_RATE=0.5,       # fraction of the samples assigned to the training split
)
def seed_everything(seed):
    """Seed every random number generator used here for reproducible runs.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # must be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED'])  # fix the seeds

# Load the training table and keep an untouched copy alongside it.
all_df = pd.read_csv('./train.csv')
all_df_temp = all_df.copy()

# Distinct 'cat3' names in sorted order, plus the two lookup tables
# (name -> id and id -> name) derived from their positions.
lable_category_3 = sorted(set(all_df['cat3']))
label2num = dict(zip(lable_category_3, range(len(lable_category_3))))
num2label = dict(enumerate(lable_category_3))


def labeltonum(x,label2num):
    """Return the integer id that ``label2num`` assigns to ``x['cat3']``.

    Args:
        x: mapping-like row exposing a ``'cat3'`` entry.
        label2num: dict from category name to integer id.

    Raises:
        KeyError: if the row's category is not a key of ``label2num``.
    """
    category = x['cat3']
    return label2num[category]
# Overwrite the 'cat3' column with the integer id of each row's category.
all_df['cat3'] = all_df.apply(lambda row: labeltonum(row, label2num), axis=1)


# Count-based text features from the 'overview' column, capped at 4096
# vocabulary entries and converted to a dense matrix.
vectorizer = CountVectorizer(max_features=4096)
all_vectors = vectorizer.fit_transform(all_df['overview']).todense()


class CustomDataset(Dataset):
    """Dataset pairing an image file with a text feature vector and a label.

    Args:
        img_path_list: indexable collection of image file paths.
        text_vectors: indexable collection of per-sample text features; the
            entry at ``index`` is converted with ``torch.Tensor`` and flattened.
        label_list: indexable collection of labels; not read when ``infer``
            is True.
        transforms: callable invoked as ``transforms(image=image)['image']``,
            or ``None`` to return the decoded image unchanged.
        infer: when True, ``__getitem__`` returns no label.
    """

    def __init__(self, img_path_list, text_vectors, label_list, transforms, infer=False):
        self.img_path_list = img_path_list
        self.text_vectors = text_vectors
        self.label_list = label_list
        self.transforms = transforms
        self.infer = infer

    def __getitem__(self, index):
        """Return ``(image, text_tensor)`` or ``(image, text_tensor, label)``.

        Raises:
            FileNotFoundError: if the image at ``index`` cannot be read.
        """
        # Text features
        text_vector = self.text_vectors[index]

        # Image
        img_path = self.img_path_list[index]
        image = cv2.imread(img_path)
        # cv2.imread signals a missing or undecodable file by returning None
        # rather than raising; fail here with the offending path instead of
        # letting the None travel into the transforms or the collate step.
        if image is None:
            raise FileNotFoundError(
                f"cv2.imread could not read image at index {index}: {img_path!r}"
            )
        # NOTE(review): cv2.imread yields channels in BGR order; confirm that
        # the per-channel normalisation applied by `transforms` expects that.

        if self.transforms is not None:
            image = self.transforms(image=image)['image']

        text_tensor = torch.Tensor(text_vector).view(-1)

        # Label
        if self.infer:
            return image, text_tensor
        label = self.label_list[index]
        return image, text_tensor, label

    def __len__(self):
        """Number of samples, taken from the image path list."""
        return len(self.img_path_list)


def _build_transform():
    """Build the resize -> normalise -> to-tensor pipeline shared by both splits."""
    steps = [
        A.Resize(CFG['IMG_SIZE'], CFG['IMG_SIZE']),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
        ToTensorV2(),
    ]
    return A.Compose(steps)


# The train and test pipelines are configured identically; each gets its own
# Compose instance.
train_transform = _build_transform()

test_transform = _build_transform()

# One dataset over every row of the table, using the training transform.
all_dataset = CustomDataset(
    img_path_list=all_df['img_path'].values,
    text_vectors=all_vectors,
    label_list=all_df['cat3'].values,
    transforms=train_transform,
)

# Split sizes: TRAIN_RATE of the samples (rounded down) go to training and
# the remainder to validation.
dataset_size = len(all_dataset)
train_size = int(CFG['TRAIN_RATE'] * dataset_size)
validation_size = dataset_size - train_size

# Random, non-stratified split into the two subsets.
train_dataset, validation_dataset = random_split(
    all_dataset,
    [train_size, validation_size],
)

# Only the training loader shuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
    num_workers=6,
)
val_loader = DataLoader(
    validation_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=6,
)

def _subset_labels(subset):
    """Return the labels of a ``random_split`` subset without loading images.

    Reads the labels from the wrapped dataset's ``label_list`` at the subset's
    ``indices``. Iterating a DataLoader for this would decode and transform
    every image only to throw it away.
    """
    labels = subset.dataset.label_list
    return [labels[i] for i in subset.indices]


# Numeric labels of each split. The training labels come out in subset-index
# order rather than shuffled loader order; the code below only counts them.
train_label_list = _subset_labels(train_dataset)
val_label_list = _subset_labels(validation_dataset)

# Map the numeric ids back to category names.
train_name_list = [num2label[num] for num in train_label_list]
val_name_list = [num2label[num] for num in val_label_list]


temp = 1

In [2]:
import collections

# Zero-initialised counters keyed by category name. All three share the key
# order of lable_category_3, so a category missing from a split keeps a 0.
t_count_dict = dict.fromkeys(lable_category_3, 0)
v_count_dict = dict.fromkeys(lable_category_3, 0)
all_count_dict = dict.fromkeys(lable_category_3, 0)

In [3]:


# Overwrite the zero placeholders with the observed per-category counts:
# whole table, validation split and training split respectively.
all_count_dict.update(collections.Counter(all_df_temp['cat3'].tolist()))

v_count_dict.update(collections.Counter(val_name_list))

t_count_dict.update(collections.Counter(train_name_list))
    

In [4]:
# Category names and overall counts, in the dict's insertion order.
a_n = list(all_count_dict.keys())
a_c = list(all_count_dict.values())

# Validation and training counts, each in its own dict's insertion order.
v_c = list(v_count_dict.values())

t_c = list(t_count_dict.values())
    

In [5]:
# Per-category summary table: name, overall count, training count and
# validation count.
dataset_info_df = pd.DataFrame(
    {
        "카테고리 네임": a_n,
        "원본 수량": a_c,
        "학습 수량": t_c,
        "검증 수량": v_c,
    }
)
# dataset_info_df.to_csv('./data_info.csv')
dataset_info_df

Unnamed: 0,카테고리 네임,원본 수량,학습 수량,검증 수량
0,5일장,165,89,76
1,ATV,4,4,0
2,MTB,2,1,1
3,강,104,49,55
4,게스트하우스,83,42,41
...,...,...,...,...
123,헬스투어,15,7,8
124,헹글라이딩/패러글라이딩,9,4,5
125,호수,30,13,17
126,홈스테이,57,25,32


In [6]:
# Categories that ended up with no validation samples after the random split.
dataset_info_df.loc[dataset_info_df["검증 수량"].eq(0)]

Unnamed: 0,카테고리 네임,원본 수량,학습 수량,검증 수량
1,ATV,4,4,0
22,대중콘서트,2,2,0
46,백화점,4,4,0


In [7]:
# How many categories have no validation samples.
int(dataset_info_df["검증 수량"].eq(0).sum())

3