In [1]:
import random
import pandas as pd
import numpy as np
import os
import cv2


from sklearn.feature_extraction.text import CountVectorizer


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


from tqdm.auto import tqdm

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2


from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore')
import mlflow


os.chdir('../')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
CFG = {
    'IMG_SIZE':128,
    'EPOCHS':5,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':64,
    'SEED':41,
    'TRAIN_RATE':0.9,
}
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

all_df = pd.read_csv('./train.csv')
all_df_temp = all_df.copy()

lable_category_3 = list(set(list(all_df['cat3'])))
lable_category_3.sort()
label2num = {x:i for i,x in enumerate(lable_category_3)}
num2label = {i:x for i,x in enumerate(lable_category_3)}


def labeltonum(x,label2num):
    num = label2num[x['cat3']]
    return num
all_df['cat3'] = all_df.apply(labeltonum,args=(label2num,),axis=1)


vectorizer = CountVectorizer(max_features=4096)
all_vectors = vectorizer.fit_transform(all_df['overview'])
all_vectors = all_vectors.todense()

all_df['text_v'] = all_vectors.tolist()

class CustomDataset(Dataset):
    def __init__(self, df, transforms, infer=False):
        self.img_path_list = df['img_path'].to_list()
        self.text_vectors = df['text_v'].to_list()
        self.label_list = df['cat3'].to_list()
        
        
        self.transforms = transforms
        self.infer = infer
        
    def __getitem__(self, index):
        # NLP

        text_vector = self.text_vectors[index]
        
        # Image
        img_path = self.img_path_list[index]
        image = cv2.imread(img_path)
        
        if self.transforms is not None:
            image = self.transforms(image=image)['image']
        
        # Label
        if self.infer:
            return image, torch.Tensor(text_vector).view(-1)
        else:
            label = self.label_list[index]
            return image, torch.Tensor(text_vector).view(-1), label
        
    def __len__(self):
        return len(self.img_path_list)


train_transform = A.Compose([
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

test_transform = A.Compose([
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])


def split_df(df,train_rate,min_num,label_column):

    label_list = list(set(list(df[label_column])))
    label_list.sort()


    ## 각 항목별 df의 index가져옴
    train_index_list = []
    val_index_list = []
    for label in label_list:
        index_list = list(df[df[label_column]==label].index)
        ## 필요하다면 여기서 인덱스 리스트를 셔플해도 됨
        
        index_len = len(list(df[df[label_column]==label].index))

        if index_len*train_rate > min_num:
            train_index_list = train_index_list + index_list[:int(index_len*train_rate)]
            val_index_list = val_index_list + index_list[int(index_len*train_rate):]

    train_df = df.iloc[train_index_list]

    val_df = df.iloc[val_index_list]

    return train_df,val_df
    
train_df,val_df = split_df(all_df,CFG['TRAIN_RATE'],1,"cat3")


In [2]:
all_df

Unnamed: 0,id,img_path,overview,cat1,cat2,cat3,text_v
0,TRAIN_00000,./image/train/TRAIN_00000.jpg,소안항은 조용한 섬으로 인근해안이 청정해역으로 일찍이 김 양식을 해서 높은 소득을 ...,자연,자연관광지,120,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,TRAIN_00001,./image/train/TRAIN_00001.jpg,경기도 이천시 모가면에 있는 골프장으로 대중제 18홀이다. 회원제로 개장을 했다가 ...,레포츠,육상 레포츠,8,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,TRAIN_00002,./image/train/TRAIN_00002.jpg,금오산성숯불갈비는 한우고기만을 전문적으로 취급하고 사용하는 부식 자재 또한 유기농법...,음식,음식점,118,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,TRAIN_00003,./image/train/TRAIN_00003.jpg,철판 위에서 요리하는 안동찜닭을 맛볼 수 있는 곳이다. 경상북도 안동시에 있는 한식...,음식,음식점,118,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,TRAIN_00004,./image/train/TRAIN_00004.jpg,※ 영업시간 10:30 ~ 20:30\n\n3대에 걸쳐 아귀만을 전문으로 취급하는 ...,음식,음식점,118,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
16981,TRAIN_16981,./image/train/TRAIN_16981.jpg,해발 12000m에 자리한 식담겸 카페점문점이다.<br>곤드레밥과 감자전을 판매하고...,음식,음식점,118,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16982,TRAIN_16982,./image/train/TRAIN_16982.jpg,설악힐호텔은 동해고속도로 속초톨게이트에서 멀지 않은 관광로 변에 있다. 속초의 대표...,숙박,숙박시설,31,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16983,TRAIN_16983,./image/train/TRAIN_16983.jpg,충남 서산시 중심가에 위치한 줌모텔은 프라이버스가 보장되는 조용한 공간으로 가치가 ...,숙박,숙박시설,31,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16984,TRAIN_16984,./image/train/TRAIN_16984.jpg,토토큰바위캠핑장은 경기도 가평지역 내에서도 청정지역으로 손꼽히는 지역으로 주변에 화...,레포츠,육상 레포츠,73,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [3]:
train_df

Unnamed: 0,id,img_path,overview,cat1,cat2,cat3,text_v
40,TRAIN_00040,./image/train/TRAIN_00040.jpg,"전라북도 익산시 금마면에서 매월 2, 7, 12, 17, 22, 27일에 개설되는 ...",쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
280,TRAIN_00280,./image/train/TRAIN_00280.jpg,"복흥지역은 내장산 국립공원과 인접하여 가을이면 단풍을 즐길 수 있으며, 10월 중에...",쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
430,TRAIN_00430,./image/train/TRAIN_00430.jpg,곡성기차마을 전통시장은 1956년 개설된 장옥형의 중형시장으로 매월 3·8일마다 ...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
460,TRAIN_00460,./image/train/TRAIN_00460.jpg,옛날 시골장 정취는 없으나 아직도 시골 아낙네나 할머니들이 남새밭에서 재배한 무공해...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
759,TRAIN_00759,./image/train/TRAIN_00759.jpg,1830년대 안동 지역에는 여러 개의 오일장이 있었다. 구체적으로는 부내장과 신당장...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
14713,TRAIN_14713,./image/train/TRAIN_14713.jpg,두릅나무과에 딸린 낙엽활엽교목으로 우리나라를 비롯하여 중국 일본 등지에 널리 분포되...,자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
14803,TRAIN_14803,./image/train/TRAIN_14803.jpg,미탄면에서 42번 국도를 타고 정선방면으로 2.8㎞ 이동한 후 백운삼거리에서 우회전...,자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15018,TRAIN_15018,./image/train/TRAIN_15018.jpg,"명진리 마을 앞 들 가운데 서 있는 이 나무는 높이 14m, 둘레 7.7m, 수관은...",자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
15037,TRAIN_15037,./image/train/TRAIN_15037.jpg,"* 단종 유배생활의 벗, 영월의 관음송 *<br /><br />한강 상류지역인 영월...",자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
temp = list(train_df.groupby('cat3').size())
temp.sort()
temp[-3]

706

In [5]:
import random
import pickle
import re

wordnet = {}
with open("./wordnet.pickle", "rb") as f:
	wordnet = pickle.load(f)


# 한글만 남기고 나머지는 삭제
def get_only_hangul(line):
	parseText= re.compile('/ ^[ㄱ-ㅎㅏ-ㅣ가-힣]*$/').sub('',line)

	return parseText



########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################
def synonym_replacement(words, n):
	new_words = words.copy()
	random_word_list = list(set([word for word in words]))
	random.shuffle(random_word_list)
	num_replaced = 0
	for random_word in random_word_list:
		synonyms = get_synonyms(random_word)
		if len(synonyms) >= 1:
			synonym = random.choice(list(synonyms))
			new_words = [synonym if word == random_word else word for word in new_words]
			num_replaced += 1
		if num_replaced >= n:
			break

	if len(new_words) != 0:
		sentence = ' '.join(new_words)
		new_words = sentence.split(" ")

	else:
		new_words = ""

	return new_words


def get_synonyms(word):
	synomyms = []

	try:
		for syn in wordnet[word]:
			for s in syn:
				synomyms.append(s)
	except:
		pass

	return synomyms

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################
def random_deletion(words, p):
	if len(words) == 1:
		return words

	new_words = []
	for word in words:
		r = random.uniform(0, 1)
		if r > p:
			new_words.append(word)

	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(words, n):
	new_words = words.copy()
	for _ in range(n):
		new_words = swap_word(new_words)

	return new_words

def swap_word(new_words):
	random_idx_1 = random.randint(0, len(new_words)-1)
	random_idx_2 = random_idx_1
	counter = 0

	while random_idx_2 == random_idx_1:
		random_idx_2 = random.randint(0, len(new_words)-1)
		counter += 1
		if counter > 3:
			return new_words

	new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
	return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################
def random_insertion(words, n):
	new_words = words.copy()
	for _ in range(n):
		add_word(new_words)
	
	return new_words


def add_word(new_words):
	synonyms = []
	counter = 0
	while len(synonyms) < 1:
		if len(new_words) >= 1:
			random_word = new_words[random.randint(0, len(new_words)-1)]
			synonyms = get_synonyms(random_word)
			counter += 1
		else:
			random_word = ""

		if counter >= 10:
			return
		
	random_synonym = synonyms[0]
	random_idx = random.randint(0, len(new_words)-1)
	new_words.insert(random_idx, random_synonym)



def EDA(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
	sentence = get_only_hangul(sentence)
	words = sentence.split(' ')
	words = [word for word in words if word is not ""]
	num_words = len(words)

	augmented_sentences = []
	num_new_per_technique = int(num_aug/4) + 1

	n_sr = max(1, int(alpha_sr*num_words))
	n_ri = max(1, int(alpha_ri*num_words))
	n_rs = max(1, int(alpha_rs*num_words))

	# sr
	for _ in range(num_new_per_technique):
		a_words = synonym_replacement(words, n_sr)
		augmented_sentences.append(' '.join(a_words))

	# ri
	for _ in range(num_new_per_technique):
		a_words = random_insertion(words, n_ri)
		augmented_sentences.append(' '.join(a_words))

	# rs
	for _ in range(num_new_per_technique):
		a_words = random_swap(words, n_rs)
		augmented_sentences.append(" ".join(a_words))

	# rd
	for _ in range(num_new_per_technique):
		a_words = random_deletion(words, p_rd)
		augmented_sentences.append(" ".join(a_words))

	augmented_sentences = [get_only_hangul(sentence) for sentence in augmented_sentences]
	random.shuffle(augmented_sentences)

	if num_aug >= 1:
		augmented_sentences = augmented_sentences[:num_aug]
	else:
		keep_prob = num_aug / len(augmented_sentences)
		augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

	augmented_sentences.append(sentence)

	return augmented_sentences

('집가고싶다', False)

In [7]:

def while_EDA(sentence):
    count = 0
    while count < 100:
        aug_list = EDA(sentence=sentence,num_aug=10)
        for aug in aug_list:
            if aug != sentence:
                return aug
        count +=1
    return aug
# while_EDA("제가 우울감을 느낀지는 오래됐는데 점점 개선되고 있다고 느껴요")
# while_EDA("집가고싶다")

def make_aug_data_df(df,index_list,target_quantity):
    ## df 칼럼 복사 data frame
    result_df = df.iloc[0:0]
    
    ## 순차적으로 데이터를 증강함
    for i in range(0,target_quantity - len(index_list)):
        add_sentance = while_EDA(df.iloc[index_list[i%len(index_list)]]['overview'])
        temp = df.iloc[index_list[i%len(index_list)]]
        temp['overview'] = add_sentance
        result_df.loc[len(result_df)+1] = temp
    # df_temp.append(dict(temp),ignore_index=True)

    return result_df




rank_number = 5
df = train_df.copy()
label_name = "cat3"


def custom_oversampling(df,label_name,rank_number):
    df = df.reset_index(drop=True)
    label_len = list(df.groupby(label_name).size())

    ## 모든 데이터가 증강할 수량
    target_quantity = sorted(label_len,reverse=True)[rank_number]

    label_list = list(set(list(df[label_name])))

    index_dict = {label_name:[] for label_name in label_list}

    for i,row in enumerate(df[label_name]):
        index_dict[row].append(i)



    aug_df = df.iloc[0:0]
    for key,item in index_dict.items():
        if len(item) < target_quantity:
            aug_ = make_aug_data_df(df,item,target_quantity)
            aug_df = pd.concat([aug_df,aug_])

    df = pd.concat([df,aug_df])

    return df


# df_temp = df.iloc[0:0]
# df_temp = df_temp.append(df.iloc[1:10])
# df_temp



## 내가 정한 우선순위 높은거 기준으로 데이터를 리샘플링 한다


Unnamed: 0,id,img_path,overview,cat1,cat2,cat3,text_v
1,TRAIN_00040,./image/train/TRAIN_00040.jpg,"전라북도 익산시 금마면에서 매월 2, 7, 17, 22, 시작은 정확하게 알려지지 ...",쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,TRAIN_00280,./image/train/TRAIN_00280.jpg,"복흥지역은 내장산 국립공원과 인접하여 가을이면 들려 즐길 개최되면 있으며, 10월 ...",쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,TRAIN_00430,./image/train/TRAIN_00430.jpg,곡성기차마을 전통시장은 1956년 개설된 장옥형의 중형시장으로 매월 3·8일마다 열...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,TRAIN_00460,./image/train/TRAIN_00460.jpg,날 시골장 정취는 없으나 아직도 시골 아낙네나 할머니들이 남새밭에서 재배한 무공해작...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,TRAIN_00759,./image/train/TRAIN_00759.jpg,1830년대 안동 지역에는 여러 개의 오일장이 있었다. 구체적으로는 부내장과 신당장...,쇼핑,쇼핑,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
362,TRAIN_08909,./image/train/TRAIN_08909.jpg,평야 지대로는 김제시의 대표적인 들(野)로 불리우는 만경들이 위치해 있는 곳으로 1...,자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
363,TRAIN_08996,./image/train/TRAIN_08996.jpg,삼척시내에서 울진방향으로 약10분 가량을 가다보면 7번국도변을 따라 오래된 벚나무에...,자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
364,TRAIN_09183,./image/train/TRAIN_09183.jpg,"회화나무는 콩과에 속하는 나무로, 원산지이며 우리나라에 전해지면서 주로 마을 가까운...",자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
365,TRAIN_09310,./image/train/TRAIN_09310.jpg,양재리 2구에 사는 모수택 소유인 수정재 뒷편에 있는 동산의 소나무 숲 속에 두그루...,자연,관광자원,127,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
aug_df.groupby('cat3').size()

cat3
0      297
1      442
2      444
3      352
4      371
      ... 
123    432
124    437
125    418
126    394
127    366
Length: 122, dtype: int64

In [9]:
df.groupby('cat3').size()

cat3
0      148
1        3
2        1
3       93
4       74
      ... 
123     13
124      8
125     27
126     51
127     79
Length: 128, dtype: int64

In [10]:
aug_df.to_csv('./au.csv')