# PATH & Hyperparameter Setting

In [None]:
import os
import pandas as pd
import shutil
import torch
from torch import nn
import torchaudio


# Make /root/ the working directory so the relative paths used below
# (e.g. './HappynJoy') resolve from there.
os.chdir('/root/')

In [None]:
## Setting parameters
max_len = 256          # token length every transcript is padded/truncated to by the dataset
batch_size = 256       # batch size handed to both DataLoaders
warmup_ratio = 0.1     # NOTE(review): not referenced by the visible cells
num_epochs = 200       # NOTE(review): the training loop iterates over `epochs`, not this value
max_grad_norm = 1      # NOTE(review): not referenced by the visible cells (no gradient clipping is applied)
log_interval = 200     # NOTE(review): not referenced by the visible cells
learning_rate =  0.0005  # NOTE(review): the optimizer below is created with a literal lr instead
# Prefer the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
device

In [None]:
!git clone https://github.com/jo1132/HappynJoy.git

# 데이터 전처리
## 음성 및 텍스트파일 한곳에 몰아넣기

In [None]:
# Root of the repository cloned above, relative to the working directory.
PATH = './HappynJoy'

In [None]:
# Transfer one file (current path, destination path).
# NOTE(review): despite the name, this calls shutil.move, so the file no longer
# exists at cur_path afterwards -- confirm that moving (not copying) is intended.
def CopyFile(cur_path, copy_path):
    """Move the file at ``cur_path`` to ``copy_path`` via ``shutil.move``."""
    shutil.move(cur_path, copy_path)
    

# Recursively scan a directory (directory to scan, destination directory,
# tag for the files' kind: wav, EDA, ...).
def SearchFiles(path, copy_path, tagname):
    """Hand every file found under ``path`` to ``CopyFile``.

    Sub-directories are descended into recursively. Each file is sent to
    ``copy_path`` under the name ``<tagname>_<original file name>``, so files
    that share a name in different sub-directories map to the same destination.
    """
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)

        if not os.path.isdir(entry_path):
            CopyFile(entry_path, os.path.join(copy_path, f"{tagname}_{entry}"))
            continue

        SearchFiles(entry_path, copy_path, tagname)


In [None]:
# Destination directory that collects every dataset file in one place.
COPY_PATH = os.path.join(PATH, "TOTAL")
os.makedirs(COPY_PATH, exist_ok=True)

# Flatten each top-level folder of both corpora into COPY_PATH; the folder
# name becomes the tag that prefixes every gathered file.
for corpus in ("KEMDy19", "KEMDy20"):
    corpus_root = os.path.join(PATH, corpus)
    for tag in os.listdir(corpus_root):
        SearchFiles(os.path.join(corpus_root, tag), COPY_PATH, tag)

## DataFrame으로 정리하기

### ["KEMDy19", "KEMDy20"] 합치기

In [None]:
# DataFrame setup: reduce one raw annotation table to the common four columns.
def cut_df(df, filter_cols):
    """Return a cleaned copy of a raw annotation table.

    Steps, in order:
      1. Drop the first row.
         (presumably a second header line in the CSV -- TODO confirm)
      2. Collect, per row, the values of every column whose name starts with
         ``"Eval"`` into one array and store it in ``'Total Evaluation'``.
      3. Index the frame by ``'Segment ID'`` (index name ``'index'``).
      4. Keep only ``filter_cols`` and rename them to
         ``['Segment ID', 'Emotion', 'Valence', 'Arousal']``.

    Args:
        df: Raw annotation frame. It is not modified.
        filter_cols: Four column names to keep, in the order segment id,
            emotion, valence, arousal.

    Returns:
        A new DataFrame with the four renamed columns; each ``'Emotion'`` cell
        holds a numpy array when ``'Total Evaluation'`` is among
        ``filter_cols``.
    """
    # Local import: numpy is not imported at file level.
    import numpy as np

    # Work on an explicit copy so the column writes below neither touch the
    # caller's frame nor rely on chained assignment into a slice.
    df = df.iloc[1:].copy()

    eval_cols = [col for col in df.columns if str(col).startswith("Eval")]

    # Fill an object array one element at a time so that every cell holds a
    # whole row of evaluator values instead of being expanded to 2-D.
    votes = df[eval_cols].to_numpy()
    per_row = np.empty(len(df), dtype=object)
    for pos in range(len(df)):
        per_row[pos] = votes[pos]

    df['Total Evaluation'] = per_row
    df['index'] = df['Segment ID']
    df = df.set_index(['index'])
    df = df[filter_cols]
    df.columns = ['Segment ID', 'Emotion', 'Valence', 'Arousal']

    return df

# Merge the two KEMDy19 annotation rows that describe the same segment.
def merge_data(F, M, neutral=True):
    """Combine two annotation rows into a single one-row DataFrame.

    Valence and arousal are averaged after conversion to ``float``; the
    emotion arrays of both rows are concatenated (``F`` first) into one list.

    Args:
        F: First row; must provide ``'Segment ID'``, ``'Emotion'``,
            ``'Valence'`` and ``'Arousal'``.
            (presumably the female-side annotation -- TODO confirm)
        M: Second row with the same fields.
            (presumably the male-side annotation -- TODO confirm)
        neutral: Unused.

    Returns:
        A one-row DataFrame with columns ``Segment ID``, ``Emotion``,
        ``Valence``, ``Arousal``; an empty frame with those columns when the
        two segment ids differ (a message is printed in that case).
    """
    out_cols = ['Segment ID', 'Emotion', 'Valence', 'Arousal']
    segment_id = F['Segment ID']

    if segment_id != M['Segment ID']:
        print('ID diffrent Error', F['Segment ID'], M['Segment ID'])
        return pd.DataFrame(index=out_cols).T

    mean_valence = (float(F['Valence']) + float(M['Valence'])) / 2
    mean_arousal = (float(F['Arousal']) + float(M['Arousal'])) / 2
    votes = F['Emotion'].tolist() + M['Emotion'].tolist()

    merged = pd.DataFrame(
        [segment_id, votes, mean_valence, mean_arousal],
        index=out_cols,
    )
    return merged.T

# Turn a collection of emotion names into a normalised vote distribution.
def Emotion_Setting(x):
    """Return the share of votes each emotion class received.

    Args:
        x: Iterable of emotion names; every name must be one of ``fear``,
            ``surprise``, ``angry``, ``sad``, ``neutral``, ``happy``,
            ``disgust``.

    Returns:
        A list of seven floats ordered by class index (fear=0 ... disgust=6)
        that sums to 1.

    Raises:
        KeyError: If ``x`` contains a name outside the seven classes.
        ZeroDivisionError: If ``x`` is empty.
    """
    class_names = ('fear', 'surprise', 'angry', 'sad', 'neutral', 'happy', 'disgust')
    class_index = {name: idx for idx, name in enumerate(class_names)}

    tally = [0] * len(class_index)
    for vote in x:
        tally[class_index[vote]] += 1

    n_votes = sum(tally)
    return [hits / n_votes for hits in tally]

In [None]:
# Build one annotation table from the files gathered in COPY_PATH.
cols1 = ['Segment ID', 'Total Evaluation', ' .1', ' .2']
# The *_res files keep valence/arousal under auto-generated column names.
cols2 = cols1.copy()
cols2[2] = 'Unnamed: 11'
cols2[3] = 'Unnamed: 12'

# Collect the pieces first and concatenate once; calling pd.concat inside the
# loop re-copies the growing frame on every row.
frames = []

for session in range(1, 3+1):
    # Read each csv file, then trim and rename its columns.
    df1 = cut_df(pd.read_csv(os.path.join(COPY_PATH, "annotation_Sess{0:02d}_eval.csv".format(session))), cols1)
    df2 = cut_df(pd.read_csv(os.path.join(COPY_PATH, "annotation_Session{0:02d}_F_res.csv".format(session))), cols2)
    # The second table has to be the other speaker's file: the original read
    # the "_F_" file twice, so every row was merged with itself.
    # NOTE(review): assumes the counterpart file uses the same name with "_M_" -- confirm.
    df3 = cut_df(pd.read_csv(os.path.join(COPY_PATH, "annotation_Session{0:02d}_M_res.csv".format(session))), cols2)

    # Pair the rows of both tables by position and merge them.
    for row in range(len(df2)):
        frames.append(merge_data(df2.iloc[row], df3.iloc[row]))
    frames.append(df1)

df = pd.concat(frames, axis=0)

# Replace each list of emotion names with its normalised vote distribution.
df['Emotion'] = df['Emotion'].apply(Emotion_Setting)
df = df.sort_values(by=['Segment ID'])
# Discard the old index and renumber the rows from 0.
df = df.reset_index(drop=True)
df.head(10)

### Label 속성 만들기

In [None]:
# Label of a sample = index of the first strictly largest share in its emotion
# distribution (stays -1 when no share is above zero).
label = []
for distribution in df['Emotion'].values:
    best_idx, best_share = -1, 0
    for idx, share in enumerate(distribution):
        if share > best_share:
            best_idx, best_share = idx, share
    label.append(best_idx)

df['label'] = label
df.head(10)

### 감정 데이터 분포 확인

In [None]:
import matplotlib.pyplot as plt

# Count the samples per dominant emotion index (index 0 when no share is above zero).
count = {}
for distribution in df['Emotion']:
    top_idx, top_share = 0, 0
    for idx, share in enumerate(distribution):
        if share > top_share:
            top_idx, top_share = idx, share
    count[top_idx] = count.get(top_idx, 0) + 1
print(count)

# Class index -> emotion name.
labeling = dict(enumerate(
    ['fear', 'surprise', 'angry', 'sad', 'neutral', 'happy', 'disgust']
))

# Plot the counts against the emotion names, in first-seen order.
keys = [labeling[idx] for idx in count]
plt.plot(keys, count.values())

### Script 붙이기

In [None]:
def Read_txt(path):
    """Return the text content of ``path``.

    The file is decoded as CP949 first; if that fails with a decoding error it
    is read again as UTF-8.

    Args:
        path: Path of the text file.

    Returns:
        The file content, or ``''`` when ``path`` is not an existing file
        (a message is printed in that case).

    Raises:
        UnicodeDecodeError: If the file is valid in neither encoding.
    """
    if not os.path.isfile(path):
        print('No file', path)
        return ''

    try:
        with open(path, 'rt', encoding='CP949') as file:
            return file.read()
    except UnicodeDecodeError:
        # Only a decoding failure justifies the retry; anything else
        # (KeyboardInterrupt, permission errors, ...) must propagate.
        with open(path, 'rt', encoding='UTF-8') as file:
            return file.read()

In [None]:
# Attach the transcript of every segment.
# Rows whose 'Segment ID' is not a string have no usable file name: print the
# offending value and leave the row out. Filtering with a mask avoids dropping
# rows while iterating over positions that were computed beforehand.
has_text_id = df['Segment ID'].map(lambda seg_id: isinstance(seg_id, str)).astype(bool)

for bad_id in df.loc[~has_text_id, 'Segment ID']:
    print(bad_id)

# Explicit copy so the new column is written to an independent frame rather
# than through chained assignment.
df = df.loc[has_text_id].copy()
df['Script'] = [
    Read_txt(os.path.join(COPY_PATH, "wav_" + seg_id + '.txt'))
    for seg_id in df['Segment ID']
]

df.to_csv(os.path.join(PATH, 'merged_data.csv'), encoding="utf-8-sig", index=False)
df.head(20)

# Generate Dataset

## Import Audios

In [None]:

# Load the waveform of every segment, resampled to the rate of the wav2vec2 bundle.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
target_rate = bundle.sample_rate

wave_arr = []
for segment_id in df['Segment ID']:
    audio_path = os.path.join(COPY_PATH, 'wav_' + segment_id + '.wav')
    waveform, sample_rate = torchaudio.load(audio_path)

    # Only resample files whose rate differs from the bundle's.
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

    wave_arr.append(waveform)

# One waveform tensor per row, in row order.
df['Audio'] = wave_arr

df.tail()

## Text Tokenizing

In [None]:
from transformers import ElectraTokenizer
class KEMDyDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields tokenised transcripts with their targets.

    The text is read from the first column of the frame and the target from
    the third column (the caller passes ``['Script', 'Audio', 'Emotion']``).
    """

    def __init__(self, dataframe, max_length):
        """Store the frame and load the KoELECTRA tokenizer.

        Args:
            dataframe: Source frame; rows containing any missing value are
                discarded.
            max_length: Token length every text is padded/truncated to.
        """
        self.max_length = max_length
        # Drop rows that contain a missing value.
        self.dataset = dataframe.dropna(axis=0)
        self.tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

    def __len__(self):
        """Return the number of usable rows."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, y)`` for the row at position ``idx``.

        ``y`` is returned as a float32 tensor so that the default collate
        function stacks a batch into one ``(batch, n_values)`` tensor; a plain
        Python list would instead be collated element-wise into a list of
        per-position tensors.
        """
        text = self.dataset.iloc[idx, 0]
        target = self.dataset.iloc[idx, 2]

        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
        )

        # The tokenizer returns a batch of one; strip the batch dimension.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]
        y = torch.as_tensor(target, dtype=torch.float32)

        return input_ids, attention_mask, y

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Keep the transcript, the waveform and the emotion distribution, then hold
# out 20% of the rows with a fixed seed.
data = df.loc[:, ['Script', 'Audio', 'Emotion']]

train, test = train_test_split(data, shuffle=True, random_state=123, train_size=0.8)
print(len(train), len(test))

## Generate Dataset

In [None]:
# Wrap each split in a KEMDyDataset; max_len is the token length every
# transcript is padded/truncated to.
train_dataset = KEMDyDataset(train, max_len)
test_dataset = KEMDyDataset(test, max_len)

In [None]:
# Sanity check: display the first (input_ids, attention_mask, y) sample.
test_dataset[0]

In [None]:
# Reshuffle the training samples every epoch so batches differ between epochs;
# the evaluation loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

# KElectra

## Import Libs

In [None]:
import torch
from torch import nn

## Hyperparameter init
- 데이터셋에서 가장 긴 문장의 길이를 수용하기 위해 max_len을 256으로 설정했으며, 이때 모든 단어가 embedding될 수 있었다.

In [None]:
hidden_size = 768  # input width of the classifier's linear layer
num_classes = 7  # one output per emotion class
# Dropout probability handed to the classifier.
# NOTE(review): 0.0005 looks like a learning-rate value rather than a dropout rate -- confirm.
dr_rate = 0.0005
epochs = 10  # number of passes made by the training loop below

# Add Classifier

In [None]:
class KoELECTRAClassifier(nn.Module):
    """Linear classification head on top of a KoELECTRA encoder.

    The hidden state of the first token of every sequence is (optionally)
    passed through dropout and then projected to ``num_classes`` outputs.
    """

    def __init__(self,
                 koelectra,
                 hidden_size=768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        """Build the head.

        Args:
            koelectra: Encoder callable as
                ``koelectra(input_ids=..., attention_mask=...)`` whose output
                exposes the per-token hidden states at index 0.
            hidden_size: Width of the encoder's hidden states.
            num_classes: Number of outputs of the linear layer.
            dr_rate: Dropout probability; a falsy value disables dropout.
            params: Unused.
        """
        super().__init__()
        self.koelectra = koelectra
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        # Always define the attribute so forward() needs no branching.
        self.dropout = nn.Dropout(p=dr_rate) if dr_rate else nn.Identity()

    def forward(self, token_ids, attention_mask):
        """Return a ``(batch, num_classes)`` tensor for a batch of token ids."""
        encoder_out = self.koelectra(input_ids=token_ids, attention_mask=attention_mask)
        # The encoder returns an output container, not a tensor: take the
        # per-token hidden states (index 0) and keep the first token's vector
        # as the summary of each sequence.
        first_token_state = encoder_out[0][:, 0]
        return self.classifier(self.dropout(first_token_state))

In [None]:
from transformers import ElectraModel
from torch.nn import MSELoss

koelectra = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")  # KoELECTRA-Base-v3

# The model is left on its default device here; it is not moved to `device`.
model = KoELECTRAClassifier(koelectra, hidden_size, num_classes, dr_rate)

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# NOTE(review): eps and weight_decay are set to what are believed to be the
# defaults of the previous optimizer so the update rule stays the same -- confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, eps=1e-6, weight_decay=0.0)
    


In [None]:
from tqdm.notebook import tqdm

# Per-epoch history of the summed batch loss and the training accuracy.
losses = []
accuracies = []

# MSELoss is a module: instantiate it once, then call the instance on tensors.
loss_fn = MSELoss()
# Feed batches on whatever device the model's parameters currently live on.
model_device = next(model.parameters()).device

for epoch in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for token_ids, attention_masks_batch, y_batch in tqdm(train_dataloader):
        # A dataset that returns plain lists is collated into a list of
        # per-class tensors; stack those back into a (batch, classes) tensor.
        if isinstance(y_batch, (list, tuple)):
            y_batch = torch.stack(list(y_batch), dim=1)

        token_ids = token_ids.long().to(model_device)
        attention_masks_batch = attention_masks_batch.long().to(model_device)
        y_batch = y_batch.float().to(model_device)

        optimizer.zero_grad()
        y_pred = model(token_ids, attention_masks_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # The targets are distributions, so a prediction counts as correct
        # when its strongest class matches the target's strongest class.
        predicted = y_pred.argmax(dim=1)
        correct += (predicted == y_batch.argmax(dim=1)).sum().item()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct / total)

    # Guard against an empty loader.
    epoch_accuracy = correct / total if total else 0.0
    losses.append(total_loss)
    accuracies.append(epoch_accuracy)
    print("Train Loss:", total_loss, "Accuracy:", epoch_accuracy)