In [1]:
import random
import pandas as pd
import numpy as np
import os
import cv2


from sklearn.feature_extraction.text import CountVectorizer


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


from tqdm.auto import tqdm

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2


from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore')
import mlflow


# Move the working directory one level up; later cells read './train.csv'
# relative to it.
os.chdir('../')

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Experiment settings shared by the rest of the notebook.
CFG = dict(
    IMG_SIZE=128,
    EPOCHS=5,
    LEARNING_RATE=3e-4,
    BATCH_SIZE=64,
    SEED=41,
    TRAIN_RATE=0.9,
)
def seed_everything(seed):
    """Seed all random number generators used here and request deterministic cuDNN.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    reproducible runs.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag above, so keep it off.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED'])  # fix the random seeds

all_df = pd.read_csv('./train.csv')
# Copy taken before the 'cat3' column is overwritten with integer ids below.
all_df_temp = all_df.copy()

# The sorted unique 'cat3' values define the label <-> integer id mappings.
lable_category_3 = sorted(set(all_df['cat3']))
label2num = {name: idx for idx, name in enumerate(lable_category_3)}
num2label = dict(enumerate(lable_category_3))


def labeltonum(x,label2num):
    """Return the integer id for the 'cat3' value of one row.

    Args:
        x: Row-like object supporting ``x['cat3']`` (e.g. a DataFrame row).
        label2num: Mapping from 'cat3' value to integer id.

    Returns:
        The id stored in ``label2num`` for this row's 'cat3' value.

    Raises:
        KeyError: If the row's 'cat3' value is not a key of ``label2num``.
    """
    return label2num[x['cat3']]
# Overwrite each row's 'cat3' label with its integer id.
all_df['cat3'] = all_df.apply(labeltonum, axis=1, args=(label2num,))

# Bag-of-words counts over the 'overview' text, capped at 4096 features.
vectorizer = CountVectorizer(max_features=4096)
all_vectors = vectorizer.fit_transform(all_df['overview']).todense()

# Keep each row's dense count vector in the frame as a plain Python list.
all_df['text_v'] = all_vectors.tolist()

class CustomDataset(Dataset):
    """Dataset pairing an image with its text count vector and, optionally, a label.

    Each item is ``(image, text_vector, label)``, or ``(image, text_vector)``
    when ``infer`` is true.

    Args:
        df: DataFrame with 'img_path' and 'text_v' columns. A 'cat3' column is
            required unless ``infer`` is true.
        transforms: Callable invoked as ``transforms(image=image)['image']``,
            or ``None`` to return the decoded image array unchanged.
        infer: When true, items carry no label.
    """

    def __init__(self, df, transforms, infer=False):
        self.img_path_list = df['img_path'].to_list()
        self.text_vectors = df['text_v'].to_list()
        # Labels are never returned in inference mode, so a frame without a
        # 'cat3' column is accepted there instead of raising KeyError.
        if infer and 'cat3' not in df.columns:
            self.label_list = None
        else:
            self.label_list = df['cat3'].to_list()

        self.transforms = transforms
        self.infer = infer

    def __getitem__(self, index):
        """Load and return the sample at ``index``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        # Text: flat float32 tensor of the stored count vector.
        text_vector = torch.tensor(self.text_vectors[index], dtype=torch.float32).view(-1)

        # Image
        img_path = self.img_path_list[index]
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals a missing or unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV decodes to BGR; the per-channel mean/std used by the
        # transforms in this notebook are listed in RGB order.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms is not None:
            image = self.transforms(image=image)['image']

        if self.infer:
            return image, text_vector
        return image, text_vector, self.label_list[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.img_path_list)


def _build_image_transform():
    """Build the resize -> normalize -> tensor pipeline used for both splits."""
    side = CFG['IMG_SIZE']
    steps = [
        A.Resize(side, side),
        # Per-channel mean/std (the widely used ImageNet statistics).
        A.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
            max_pixel_value=255.0,
            always_apply=False,
            p=1.0,
        ),
        ToTensorV2(),
    ]
    return A.Compose(steps)


# Training and evaluation use the same deterministic preprocessing; each gets
# its own pipeline object.
train_transform = _build_image_transform()

test_transform = _build_image_transform()


def split_df(df,train_rate,min_num,label_column):
    """Split ``df`` into train and validation frames, class by class.

    For every distinct value of ``label_column`` (in sorted order) the first
    ``int(n * train_rate)`` rows of that class, in their order within ``df``,
    go to the training frame and the remaining rows go to the validation
    frame. Classes for which ``n * train_rate > min_num`` does not hold are
    left out of both frames.

    Rows are selected by position, so the result is correct for any index
    (non-default or containing duplicates), not only a 0..n-1 RangeIndex.

    Args:
        df: Source DataFrame.
        train_rate: Fraction of each class assigned to the training frame.
        min_num: Threshold that ``n * train_rate`` must exceed for a class to
            be included.
        label_column: Name of the column holding the class label.

    Returns:
        Tuple ``(train_df, val_df)``.
    """
    labels = df[label_column]

    train_positions = []
    val_positions = []
    for label in sorted(set(labels)):
        # Positional row numbers of this class, in df order.
        # (Shuffle `positions` here if a random split is wanted.)
        positions = np.flatnonzero((labels == label).to_numpy()).tolist()
        n_rows = len(positions)

        if n_rows * train_rate > min_num:
            cut = int(n_rows * train_rate)
            train_positions.extend(positions[:cut])
            val_positions.extend(positions[cut:])

    # Positions pair with iloc; the original passed index *labels* to iloc,
    # which is only right when labels and positions coincide.
    train_df = df.iloc[train_positions]
    val_df = df.iloc[val_positions]

    return train_df,val_df
    
# Per-class train/validation split on the encoded 'cat3' label.
train_df, val_df = split_df(
    df=all_df,
    train_rate=CFG['TRAIN_RATE'],
    min_num=1,
    label_column="cat3",
)


In [2]:
# Display the training frame returned by split_df.
train_df

Unnamed: 0,id,img_path,overview,cat1,cat2,cat3,text_v
40,TRAIN_00040,./image/train/TRAIN_00040.jpg,"전라북도 익산시 금마면에서 매월 2, 7, 12, 17, 22, 27일에 개설되는 ...",쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
280,TRAIN_00280,./image/train/TRAIN_00280.jpg,"복흥지역은 내장산 국립공원과 인접하여 가을이면 단풍을 즐길 수 있으며, 10월 중에...",쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
430,TRAIN_00430,./image/train/TRAIN_00430.jpg,곡성기차마을 전통시장은 1956년 개설된 장옥형의 중형시장으로 매월 3·8일마다 ...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
460,TRAIN_00460,./image/train/TRAIN_00460.jpg,옛날 시골장 정취는 없으나 아직도 시골 아낙네나 할머니들이 남새밭에서 재배한 무공해...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
759,TRAIN_00759,./image/train/TRAIN_00759.jpg,1830년대 안동 지역에는 여러 개의 오일장이 있었다. 구체적으로는 부내장과 신당장...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
14713,TRAIN_14713,./image/train/TRAIN_14713.jpg,두릅나무과에 딸린 낙엽활엽교목으로 우리나라를 비롯하여 중국 일본 등지에 널리 분포되...,자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14803,TRAIN_14803,./image/train/TRAIN_14803.jpg,미탄면에서 42번 국도를 타고 정선방면으로 2.8㎞ 이동한 후 백운삼거리에서 우회전...,자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15018,TRAIN_15018,./image/train/TRAIN_15018.jpg,"명진리 마을 앞 들 가운데 서 있는 이 나무는 높이 14m, 둘레 7.7m, 수관은...",자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15037,TRAIN_15037,./image/train/TRAIN_15037.jpg,"* 단종 유배생활의 벗, 영월의 관음송 *<br /><br />한강 상류지역인 영월...",자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the training rows by their 'cat3' label.
temp_df = train_df.copy()
y = train_df['cat3']

ros = RandomOverSampler(random_state=0)
# NOTE: fit_resample returns a tuple, so `sampleing_df` is a tuple rather than
# a single DataFrame; element 0 is the resampled frame, which later cells
# access as `sampleing_df[0]`.
sampleing_df = ros.fit_resample(X=temp_df, y=y)
sampleing_df

(                 id                       img_path  \
 0       TRAIN_00040  ./image/train/TRAIN_00040.jpg   
 1       TRAIN_00280  ./image/train/TRAIN_00280.jpg   
 2       TRAIN_00430  ./image/train/TRAIN_00430.jpg   
 3       TRAIN_00460  ./image/train/TRAIN_00460.jpg   
 4       TRAIN_00759  ./image/train/TRAIN_00759.jpg   
 ...             ...                            ...   
 396027  TRAIN_08996  ./image/train/TRAIN_08996.jpg   
 396028  TRAIN_02139  ./image/train/TRAIN_02139.jpg   
 396029  TRAIN_06290  ./image/train/TRAIN_06290.jpg   
 396030  TRAIN_09183  ./image/train/TRAIN_09183.jpg   
 396031  TRAIN_05083  ./image/train/TRAIN_05083.jpg   
 
                                                  overview cat1  cat2  cat3  \
 0       전라북도 익산시 금마면에서 매월 2, 7, 12, 17, 22, 27일에 개설되는 ...   쇼핑    쇼핑     0   
 1       복흥지역은 내장산 국립공원과 인접하여 가을이면 단풍을 즐길 수 있으며, 10월 중에...   쇼핑    쇼핑     0   
 2       곡성기차마을 전통시장은 1956년 개설된 장옥형의 중형시장으로  매월 3·8일마다 ...   쇼핑    쇼핑     0   
 3       옛날 시골장 정취는 없으

In [23]:
# Non-null row counts per 'cat3' class in the training frame.
train_df.groupby('cat3').count()

Unnamed: 0_level_0,id,img_path,overview,cat1,cat2,text_v
cat3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,148,148,148,148,148,148
1,3,3,3,3,3,3
2,1,1,1,1,1,1
3,93,93,93,93,93,93
4,74,74,74,74,74,74
...,...,...,...,...,...,...
123,13,13,13,13,13,13
124,8,8,8,8,8,8
125,27,27,27,27,27,27
126,51,51,51,51,51,51


In [24]:
# `sampleing_df` is the tuple returned by fit_resample; element 0 is the
# resampled frame. Show its non-null row counts per 'cat3' class.
sampleing_df[0].groupby('cat3').count()

Unnamed: 0_level_0,id,img_path,overview,cat1,cat2,text_v
cat3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3094,3094,3094,3094,3094,3094
1,3094,3094,3094,3094,3094,3094
2,3094,3094,3094,3094,3094,3094
3,3094,3094,3094,3094,3094,3094
4,3094,3094,3094,3094,3094,3094
...,...,...,...,...,...,...
123,3094,3094,3094,3094,3094,3094
124,3094,3094,3094,3094,3094,3094
125,3094,3094,3094,3094,3094,3094
126,3094,3094,3094,3094,3094,3094
