In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read their input files from there
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training spreadsheet from the mounted Drive
import pandas as pd
data = pd.read_excel('/content/drive/MyDrive/데캡디/감성대화/Training/train.xlsx')

# Keep only the two columns used below ('감정_대분류' and '사람문장1')
df = data[ ['감정_대분류', '사람문장1'] ]

df.head()

In [None]:
# Put the sentence column first and give both columns English names (content, label)
column_names = {'사람문장1': 'content', '감정_대분류': 'label'}
new_data = df[list(column_names)].rename(columns=column_names)
new_data.head()

In [None]:
import random
import pickle
import re

# Synonym lookup table read by get_synonyms(); starts empty and is replaced by the unpickled object.
# NOTE(review): pickle.load can execute arbitrary code - only load a wordnet.pickle you trust.
wordnet = {}
with open("/content/drive/MyDrive/데캡디/wordnet.pickle", "rb") as f:
	wordnet = pickle.load(f)
 

In [None]:
# Keep only Hangul characters (and spaces); delete everything else
def get_only_hangul(line):
    """Return ``line`` with every character that is not Hangul or a space removed.

    The previous pattern, '/ ^[ㄱ-ㅎㅏ-ㅣ가-힣]*$/', was written like a
    JavaScript regex literal. In Python the slashes are ordinary characters
    and ``^`` cannot match after them, so the pattern never matched and the
    input was returned unchanged.

    Spaces are preserved because EDA() tokenises the cleaned text on ' '.

    Args:
        line: the text to clean (str).

    Returns:
        The cleaned string; may contain consecutive spaces where removed
        tokens used to be.
    """
    # Negated character class: anything outside space / Hangul jamo / Hangul syllables.
    return re.sub('[^ ㄱ-ㅎㅏ-ㅣ가-힣]', '', line)



########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################
def synonym_replacement(words, n):
    """Return a copy of ``words`` with distinct words replaced by synonyms.

    Distinct words are visited in random order. For each word that has at
    least one synonym (per get_synonyms), every occurrence of it is replaced
    by one randomly chosen synonym. The loop stops once ``n`` words have been
    replaced. A synonym that contains spaces is split afterwards so the result
    stays one token per element.

    Args:
        words: list of tokens; it is not modified.
        n: number of distinct words to replace before stopping.

    Returns:
        A new list of tokens. Empty input yields an empty list (previously an
        empty string, which was inconsistent with the list returned otherwise).
    """
    new_words = words.copy()
    # Deduplicate in first-seen order instead of through a set: set ordering of
    # strings changes between interpreter runs, which would defeat random.seed().
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if synonyms:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    if not new_words:
        return []

    # Re-split so a multi-word synonym becomes several tokens.
    return ' '.join(new_words).split(' ')


def get_synonyms(word):
    """Collect the synonyms stored for ``word`` in the module-level ``wordnet``.

    Every entry of ``wordnet[word]`` is iterated and its items are appended
    in order.

    Args:
        word: the token to look up.

    Returns:
        A list of the collected items; empty when ``word`` has no entry.
    """
    synonyms = []
    try:
        for entry in wordnet[word]:
            for item in entry:
                synonyms.append(item)
    except (KeyError, TypeError):
        # KeyError: the word is not in wordnet.
        # TypeError: unhashable word or an entry that cannot be iterated.
        # The former bare ``except`` also hid real failures, e.g. a NameError
        # when the wordnet cell had not been run.
        pass

    return synonyms

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################
def random_deletion(words, p):
    """Drop each word of ``words`` independently with probability ``p``.

    Args:
        words: list of tokens; it is not modified.
        p: deletion probability; a word is kept when a uniform draw from
            [0, 1] is greater than ``p``.

    Returns:
        ``words`` itself when it has at most one element, otherwise a new
        list of the kept words. If every word was dropped, a list holding one
        randomly chosen original word is returned instead.
    """
    # Nothing to delete from an empty or single-word sentence. The empty case
    # used to fall through to random.randint(0, -1) and raise ValueError.
    if len(words) <= 1:
        return words

    kept = [word for word in words if random.uniform(0, 1) > p]
    if kept:
        return kept

    # Everything was deleted: fall back to a single random word.
    return [words[random.randint(0, len(words) - 1)]]

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(words, n):
    """Return a copy of ``words`` after passing it through swap_word() ``n`` times.

    Args:
        words: list of tokens; the caller's list is left untouched.
        n: how many times swap_word() is applied.
    """
    shuffled = words.copy()
    for _round in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    A first index is drawn, then up to four draws are made for a second,
    different index. If all of them collide with the first index the list is
    left unchanged.

    Args:
        new_words: list of tokens; modified in place.

    Returns:
        The same list object that was passed in.
    """
    # Fewer than two words: nothing to swap. An empty list previously raised
    # ValueError from random.randint(0, -1).
    if len(new_words) < 2:
        return new_words

    last = len(new_words) - 1
    first_idx = random.randint(0, last)
    # The old counter check returned right after the fourth draw even when
    # that draw had produced a distinct index; every draw now counts.
    for _ in range(4):
        second_idx = random.randint(0, last)
        if second_idx != first_idx:
            new_words[first_idx], new_words[second_idx] = new_words[second_idx], new_words[first_idx]
            break

    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################
def random_insertion(words, n):
    """Return a copy of ``words`` after calling add_word() on the copy ``n`` times.

    Args:
        words: list of tokens; the caller's list is left untouched.
        n: how many times add_word() is invoked.
    """
    augmented = words.copy()
    for _round in range(n):
        add_word(augmented)
    return augmented


def add_word(new_words):
    """Insert a synonym of a randomly chosen word into ``new_words`` in place.

    Up to 10 random words are tried. The first synonym returned by
    get_synonyms() for the first word that has any is inserted at a random
    existing position. If no tried word has a synonym the list is unchanged.

    Args:
        new_words: list of tokens; modified in place.

    Returns:
        None.
    """
    # Nothing to pick from. An empty list used to loop forever because the
    # retry counter only advanced when the list was non-empty.
    if not new_words:
        return

    last = len(new_words) - 1
    synonyms = []
    # The old counter check returned right after the 10th lookup even when it
    # had found a synonym; a hit on the last attempt is now used.
    for _ in range(10):
        random_word = new_words[random.randint(0, last)]
        synonyms = get_synonyms(random_word)
        if synonyms:
            break
    else:
        return

    # Index is drawn from the existing positions, so the synonym is never
    # appended after the last word.
    random_idx = random.randint(0, last)
    new_words.insert(random_idx, synonyms[0])

In [None]:
def EDA(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=5):
    """Produce augmented variants of ``sentence`` with four EDA techniques.

    The sentence is cleaned with get_only_hangul() and split on spaces. Random
    deletion, synonym replacement, random insertion and random swap are each
    applied ``int(num_aug / 4) + 1`` times. The results are cleaned again,
    shuffled and reduced to ``num_aug`` items.

    Args:
        sentence: input text.
        alpha_sr: fraction of the word count used as the number of synonym
            replacements (at least 1).
        alpha_ri: same, for the number of random insertions.
        alpha_rs: same, for the number of random swaps.
        p_rd: per-word deletion probability for random deletion.
        num_aug: number of augmented sentences to keep. When below 1, each
            candidate is kept with probability ``num_aug / number_of_candidates``.

    Returns:
        A list of augmented sentences with the cleaned original sentence as
        the last element. A sentence without any tokens yields only the
        cleaned sentence.
    """
    sentence = get_only_hangul(sentence)
    # Truthiness instead of `word is not ""`: identity comparison with a
    # literal is implementation-dependent and a SyntaxWarning on Python 3.8+.
    words = [word for word in sentence.split(' ') if word]
    num_words = len(words)

    # No tokens means there is nothing the techniques below could work on.
    if num_words == 0:
        return [sentence]

    num_new_per_technique = int(num_aug / 4) + 1

    n_sr = max(1, int(alpha_sr * num_words))
    n_ri = max(1, int(alpha_ri * num_words))
    n_rs = max(1, int(alpha_rs * num_words))

    # Same order as before: deletion, synonym replacement, insertion, swap.
    techniques = (
        lambda: random_deletion(words, p_rd),
        lambda: synonym_replacement(words, n_sr),
        lambda: random_insertion(words, n_ri),
        lambda: random_swap(words, n_rs),
    )

    augmented_sentences = []
    for technique in techniques:
        for _ in range(num_new_per_technique):
            augmented_sentences.append(' '.join(technique()))

    augmented_sentences = [get_only_hangul(candidate) for candidate in augmented_sentences]
    random.shuffle(augmented_sentences)

    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    # The cleaned original always goes last.
    augmented_sentences.append(sentence)

    return augmented_sentences

In [None]:
# One subset of new_data per emotion label, in the fixed order used by the cells below
df_1, df_2, df_3, df_4, df_5, df_6 = (
    new_data[new_data['label'] == emotion]
    for emotion in ('분노', '슬픔', '불안', '상처', '당황', '기쁨')
)

In [None]:
# Seed the stdlib RNG so the random draws made inside EDA() are repeatable
# when this cell is re-run.
random.seed(42)

# Augment every sentence of each subset; each element of the resulting Series
# is the list returned by EDA() for that sentence.
df_1_aug = df_1['content'].apply(EDA)
df_2_aug = df_2['content'].apply(EDA)
df_3_aug = df_3['content'].apply(EDA)
df_4_aug = df_4['content'].apply(EDA)
df_5_aug = df_5['content'].apply(EDA)
df_6_aug = df_6['content'].apply(EDA)

In [None]:
# Flatten each Series of per-sentence lists into one flat list of sentences.
# Iterating directly replaces six copy-pasted nested loops that looked every
# element up through .iloc[i][j]; element order is unchanged.
aug_1 = [text for variants in df_1_aug for text in variants]
aug_2 = [text for variants in df_2_aug for text in variants]
aug_3 = [text for variants in df_3_aug for text in variants]
aug_4 = [text for variants in df_4_aug for text in variants]
aug_5 = [text for variants in df_5_aug for text in variants]
aug_6 = [text for variants in df_6_aug for text in variants]

In [None]:
# Wrap each flat sentence list in a DataFrame and attach its emotion label
(df_augmentation_1, df_augmentation_2, df_augmentation_3,
 df_augmentation_4, df_augmentation_5, df_augmentation_6) = (
    pd.DataFrame({'content': sentences, 'label': emotion})
    for sentences, emotion in (
        (aug_1, '분노'),
        (aug_2, '슬픔'),
        (aug_3, '불안'),
        (aug_4, '상처'),
        (aug_5, '당황'),
        (aug_6, '기쁨'),
    )
)


In [None]:
# Preview the first six rows of the first augmented subset
df_augmentation_1[:6]

In [None]:
# Stack the six per-emotion frames. ignore_index=True gives the result a fresh
# 0..N-1 index; without it every frame's own 0-based labels were repeated, so
# label-based lookups matched several rows.
# Note: this rebinds `data`, which earlier held the raw spreadsheet.
data = pd.concat(
    [df_augmentation_1, df_augmentation_2, df_augmentation_3,
     df_augmentation_4, df_augmentation_5, df_augmentation_6],
    ignore_index=True,
)

In [None]:
# Display the combined augmented dataset
data

In [None]:
# Shuffle all rows; random_state pins the permutation so the same input
# frame is always shuffled into the same order.
data = data.sample(frac=1, random_state=42)

In [None]:
# Count the rows per label in the augmented dataset
data['label'].value_counts()