In [1]:
import pandas as pd
import random
import os
import numpy as np
from PIL import Image
from torchvision import transforms
from collections import Counter

In [None]:
# Load the training table; later cells read its 'id', 'overview', 'cat3' and 'img_path' columns.
train = pd.read_csv('train.csv')

In [None]:
# Load the word lists used for text augmentation: adjectives, adverbs and a synonym table.
# (The files are shared on Google Drive - https://drive.google.com/drive/folders/1U6Y6Zv_PxZXsXLgLlgzu6smZ_e_b2mpS?usp=sharing)

adj_ls = pd.read_csv('adjective.csv')['adjective'].to_list()
ad_ls = pd.read_csv('adverb.csv')['adverb'].to_list()

# Parse sim_data.csv once and take both columns from the same frame.
sim_data = pd.read_csv('sim_data.csv')
main_token = sim_data['token'].to_list()
sub_token = sim_data['sim_token'].to_list()

# token -> synonym string (aug_simul splits the value on '/' to get the alternatives).
# If a token appears on several rows, the last row wins.
sim_dic = dict(zip(main_token, sub_token))

In [None]:
# Adjective augmentation: randomly insert adjectives in front of particle-marked words.

def aug_adj(senten):
    """Insert random adjectives from ``adj_ls`` in front of particle-marked words.

    The sentence is split on single spaces. Every word whose last character is
    one of the particles 은/는/이/가/을/를 is a candidate. Between 1 and
    ``len(candidates)`` draws are made (with replacement); each draw puts one
    random adjective directly in front of the drawn word, so a word drawn
    several times collects several adjectives.

    Insertion is done by token position rather than by substring replacement,
    so only the drawn occurrence is touched and longer words that merely
    contain the drawn word are left intact.

    Args:
        senten: the sentence to augment (str).

    Returns:
        The augmented sentence. The input is returned unchanged when it has
        no candidate word. Original spacing is preserved.
    """
    tokens = senten.split(' ')

    # Positions (not strings) of the candidate words; '' never matches endswith.
    targets = [pos for pos, tok in enumerate(tokens)
               if tok.endswith(('은', '는', '이', '가', '을', '를'))]
    if not targets:
        return senten

    # position -> adjectives to place before that word, in draw order.
    inserts = {}
    for _ in range(random.randint(1, len(targets))):
        pos = random.choice(targets)
        inserts.setdefault(pos, []).append(random.choice(adj_ls))

    augmented = []
    for pos, tok in enumerate(tokens):
        augmented.extend(inserts.get(pos, ()))
        augmented.append(tok)

    # split(' ') keeps empty strings for repeated spaces, so join restores them.
    return ' '.join(augmented)

In [None]:
# Adverb augmentation: randomly insert adverbs in front of sentence-final words.
def aug_ad(senten):
    """Insert random adverbs from ``ad_ls`` in front of words that end in '.'.

    The sentence is split on single spaces. Every word whose last character is
    '.' is a candidate. Between 1 and ``len(candidates)`` draws are made (with
    replacement); each draw puts one random adverb directly in front of the
    drawn word, so a word drawn several times collects several adverbs.

    Insertion is done by token position rather than by substring replacement,
    so only the drawn occurrence is touched and longer words that merely
    contain the drawn word are left intact.

    Args:
        senten: the sentence to augment (str).

    Returns:
        The augmented sentence. The input is returned unchanged when it has
        no candidate word. Original spacing is preserved.
    """
    tokens = senten.split(' ')

    # Positions (not strings) of the candidate words; '' never matches endswith.
    targets = [pos for pos, tok in enumerate(tokens) if tok.endswith('.')]
    if not targets:
        return senten

    # position -> adverbs to place before that word, in draw order.
    inserts = {}
    for _ in range(random.randint(1, len(targets))):
        pos = random.choice(targets)
        inserts.setdefault(pos, []).append(random.choice(ad_ls))

    augmented = []
    for pos, tok in enumerate(tokens):
        augmented.extend(inserts.get(pos, ()))
        augmented.append(tok)

    # split(' ') keeps empty strings for repeated spaces, so join restores them.
    return ' '.join(augmented)

In [None]:
# Synonym augmentation: swap words of the sentence for one of their synonyms.
def aug_simul(senten):
    """Replace some words of ``senten`` with synonyms looked up in ``sim_dic``.

    The sentence is split on single spaces. Words that are keys of ``sim_dic``
    are collected (one entry per occurrence). A random number of them, between
    1 and all, is replaced. For each replacement the first remaining occurrence
    of the chosen word is overwritten with one of the '/'-separated alternatives
    stored for it.

    Args:
        senten: the sentence to augment (str).

    Returns:
        The words re-joined with single spaces; identical to the input when no
        word is a key of ``sim_dic``.
    """
    words = senten.split(' ')
    pending = [word for word in words if word in sim_dic]

    if pending:
        rounds = random.randint(1, len(pending))
        for _ in range(rounds):
            pick = random.randint(0, len(pending) - 1)
            # Take the chosen word out of the pool so it is not drawn again.
            where = words.index(pending.pop(pick))
            options = sim_dic[words[where]].split('/')
            if len(options) > 1:
                words[where] = options[random.randint(0, len(options) - 1)]
            else:
                words[where] = options[0]

    return ' '.join(words)

In [None]:
# Image augmentation


def img_aug(img) :
    """Return a colour-jittered, rotated, cropped and noise-patched copy of ``img``.

    Steps, in order:
      1. random brightness/contrast/saturation/hue jitter,
      2. random rotation of up to +/-90 degrees (uncovered corners are filled
         with 0, the transform's default fill),
      3. random crop to 80% of the original height and width,
      4. a random rectangle inside the top-left 40% x 40% region of the
         original size is overwritten with uniform random pixel values.

    Args:
        img: a PIL image. Non-RGB images are converted to RGB first so that
            the 3-channel noise patch can always be written.

    Returns:
        A new PIL image; ``img`` itself is not modified.

    Raises:
        ValueError: if either side of ``img`` is shorter than 5 pixels (two
            distinct patch edges cannot be drawn from the 40% range).
    """
    if img.mode != 'RGB':
        img = img.convert('RGB')

    color_jitter = transforms.ColorJitter(
        brightness=0.6,
        contrast=0.6,
        saturation=0.6,
        hue=0.2
        )
    # The former fillcolor=0 keyword is no longer accepted by current
    # torchvision; 0 is the default fill, so it is simply omitted.
    rotate = transforms.RandomAffine(degrees=90)

    width, height = img.size  # PIL reports (width, height)
    crop_h, crop_w = int(height * 0.8), int(width * 0.8)
    patch_max_row, patch_max_col = int(height * 0.4), int(width * 0.4)

    re_img = color_jitter(img)
    re_img = rotate(re_img)
    re_img = transforms.RandomCrop(size=(crop_h, crop_w))(re_img)

    imgArray_aug = np.array(re_img)  # writable copy of the pixel data

    # Two distinct coordinates per axis give a patch at least 1 pixel wide/high.
    top, bottom = sorted(random.sample(range(patch_max_row), 2))
    left, right = sorted(random.sample(range(patch_max_col), 2))

    noise = np.random.randint(0, 256, size=(bottom - top, right - left, 3), dtype=np.uint8)
    imgArray_aug[top:bottom, left:right, :3] = noise

    return Image.fromarray(imgArray_aug)

In [None]:
# Create the folder for the augmented images. makedirs also creates 'image'
# when it is missing, and exist_ok lets this cell be re-run without failing.
os.makedirs('image/val', exist_ok=True)

sam_overview = train['overview'].to_list()
sam_label = train['cat3'].to_list()
sam_img = train['img_path'].to_list()


# Build the first augmented sample from row 0; the next cell appends the rest to val_set.
sample = sam_overview[0]
label = sam_label[0]

new_sample = aug_adj(sample)
new_sample = aug_ad(new_sample)
new_sample = aug_simul(new_sample)

save_path = 'image/val/val_1.jpg'
# Close the source file as soon as the augmented copy has been produced.
with Image.open(sam_img[0]) as img:
    new_img = img_aug(img)
new_img.save(save_path,'JPEG')

val_set = pd.DataFrame({'img_path' : [save_path], 'overview' : [new_sample], 'cat3' : [label]})

In [None]:
# Generate 4999 more augmented samples and append them to the one already in
# val_set, for 5000 augmented rows in total.
new_rows = []
for i in range(4999) :
    idx = random.randint(0,len(sam_overview)-1)
    sample = sam_overview[idx]
    label = sam_label[idx]

    new_sample = aug_adj(sample)
    new_sample = aug_ad(new_sample)
    new_sample = aug_simul(new_sample)

    # Close each source file once its augmented copy exists.
    with Image.open(sam_img[idx]) as img:
        new_img = img_aug(img)
    # val_1.jpg is the sample created earlier, so numbering continues at 2.
    save_path = 'image/val/val_{}.jpg'.format(i+2)
    new_img.save(save_path,'JPEG')

    new_rows.append({'img_path' : save_path, 'overview' : new_sample, 'cat3' : label})

# One concat instead of one per row: avoids re-copying the growing frame on every
# iteration, and ignore_index gives the rows a unique 0..n-1 index instead of all 0.
val_set = pd.concat([val_set, pd.DataFrame(new_rows)], axis=0, ignore_index=True)

In [None]:
# Write the augmented validation set to disk.
# NOTE(review): to_csv is called without index=False, so the frame's index is written
# as an unnamed first column; the train(...).csv exports in this notebook pass
# index=False - confirm whether the index column is wanted here.
val_set.to_csv('val_set.csv')

In [None]:
# Ease class imbalance by adding augmented text samples to under-represented labels.
def text_aug(label_ls, train_set, num, adj_aug=True, ad_aug=True, sim_aug=True, last_id=16985):
    """Top up every label in ``label_ls`` to at least ``num`` rows with augmented text.

    For each label that has fewer than ``num`` rows in ``train_set``, existing
    'overview' texts of that label are drawn at random, passed through the
    enabled augmenters and added as new rows until the label has ``num`` rows.
    Rows generated earlier for the same label are part of the pool that later
    draws are taken from, so augmentations can stack.

    Args:
        label_ls: labels (values of the 'cat3' column) to balance. Repeated
            labels are processed once; labels absent from ``train_set`` are
            skipped because there is nothing to draw from.
        train_set: DataFrame with at least 'id', 'overview' and 'cat3'
            columns. It is not modified.
        num: minimum number of rows every label should end up with.
        adj_aug: apply ``aug_adj`` to each drawn text.
        ad_aug: apply ``aug_ad`` to each drawn text.
        sim_aug: apply ``aug_simul`` to each drawn text.
        last_id: numeric suffix of the last id already in use. New rows get
            the ids 'TRAIN_{last_id + 1}', 'TRAIN_{last_id + 2}', ... The
            default is the constant this function previously hard-coded.

    Returns:
        ``train_set`` itself when no label needed rows, otherwise a new
        DataFrame with the generated rows appended and a fresh 0..n-1 index.
    """
    next_id = last_id
    new_rows = []

    # dict.fromkeys de-duplicates while keeping the caller's label order.
    for label in dict.fromkeys(label_ls):
        pool = train_set.loc[train_set['cat3'] == label, 'overview'].to_list()
        if not pool:
            continue

        # range() is empty when the label already has num or more rows.
        for _ in range(num - len(pool)):
            sample = pool[random.randint(0, len(pool) - 1)]
            if adj_aug:
                sample = aug_adj(sample)
            if ad_aug:
                sample = aug_ad(sample)
            if sim_aug:
                sample = aug_simul(sample)

            # Generated texts join the pool for the following draws.
            pool.append(sample)

            # The id counter is kept separate from the per-label row count so
            # that ids stay unique across labels.
            next_id += 1
            new_rows.append({'id': 'TRAIN_{}'.format(next_id), 'overview': sample, 'cat3': label})

    if not new_rows:
        return train_set

    # Single concat at the end instead of one per generated row.
    return pd.concat([train_set, pd.DataFrame(new_rows)], axis=0, ignore_index=True)

In [None]:
# Keep only the columns text_aug works with and list every label once.
train_set = train[['id', 'overview','cat3']]
label_ls = list(train_set['cat3'].unique())


def _augment_and_save(min_count, out_path, adj, ad, sim):
    """Balance train_set with text_aug, write it to out_path and return the frame."""
    balanced = text_aug(label_ls, train_set, min_count, adj, ad, sim)
    balanced.to_csv(out_path, index=False)
    return balanced


# adjectives only / at least 50 rows per label
data1 = _augment_and_save(50, 'train(adj,50).csv', True, False, False)

# adverbs only / at least 50 rows per label
data2 = _augment_and_save(50, 'train(ad,50).csv', False, True, False)

# synonyms only / at least 50 rows per label
data3 = _augment_and_save(50, 'train(sim,50).csv', False, False, True)

# adjectives + adverbs / at least 50 rows per label
data4 = _augment_and_save(50, 'train(adj,ad,50).csv', True, True, False)

# adjectives + adverbs + synonyms / at least 100 rows per label
# NOTE(review): the file name says 50 although the threshold passed is 100 - confirm which is intended.
data5 = _augment_and_save(100, 'train(adj,ad,sim,50).csv', True, True, True)