In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
!pip install transformers
from transformers import BertTokenizer
from transformers import BertModel
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')
from IPython.display import clear_output 
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', None)

# !pip install textaugment
# from textaugment import Translate
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# from textaugment import EDA
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import lightgbm as lgb
clear_output()

In [None]:
# Directory that holds train.csv, test.csv and sample_submission.csv.
# The commented-out line below is an alternative Google Drive location.
# path='/content/drive/MyDrive/ColabNotebooks/데이콘/월간 데이콘 발화자의 감정인식 AI 경진대회'
path='.'

In [None]:
# Load both splits from `path`. Later cells read the Speaker, Utterance and
# Dialogue_ID columns from both frames and Target from `train`.
train = pd.read_csv(path+'/train.csv')
test = pd.read_csv(path+'/test.csv')

In [None]:
def mark_streak(df):
    """Label runs of consecutive rows spoken by the same speaker, in place.

    Two integer columns are added to (or overwritten in) ``df``:

    * ``mark_streak`` - 0-based id of the run the row belongs to; the id grows
      by one every time ``Speaker`` differs from the row directly above.
    * ``same`` - 1 when the row shares its ``Speaker`` with the row directly
      above or directly below, otherwise 0.

    Rows are compared in their current order, independent of the index labels.

    NOTE(review): only ``Speaker`` is compared, so a run can continue across a
    dialogue boundary when the same speaker closes one dialogue and opens the
    next - confirm whether that is intended.

    Args:
        df: DataFrame with a ``Speaker`` column.

    Returns:
        None; ``df`` is modified in place.
    """
    speaker = df['Speaker']
    # True where a row opens a new run; the first row always compares unequal
    # to the missing value produced by shift().
    new_run = speaker.ne(speaker.shift()).to_numpy()
    continues = ~new_run

    # A row also counts as "same" when the row below continues its run.
    touches_next = np.zeros_like(continues)
    touches_next[:-1] = continues[1:]

    df['mark_streak'] = new_run.cumsum() - 1
    df['same'] = (continues | touches_next).astype(int)
# Annotate both splits in place with speaker-run ids (the augmented frames
# that used to be processed here are no longer part of the notebook).
for frame in (train, test):
    mark_streak(frame)

In [None]:
# Merge every utterance of one uninterrupted speaker run into a single text
# and broadcast it to all rows of that run.
# The pieces are joined with a space: the previous transform('sum') glued them
# together with no separator, fusing the last word of one utterance with the
# first word of the next.
# Assumes every Utterance value is a string.
for i in [train, test]:
    i['sumed_Utterance']=i.groupby("mark_streak")['Utterance'].transform(' '.join)

In [None]:
# Merge everything one speaker says within one dialogue into a single text and
# broadcast it to all of that speaker's rows in the dialogue.
# The pieces are joined with a space: the previous transform('sum') glued them
# together with no separator, fusing words across utterance boundaries.
# Assumes every Utterance value is a string.
for i in [train, test]:
    i['Dialogue_Utterance']=i.groupby(["Dialogue_ID",'Speaker'])['Utterance'].transform(' '.join)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Notebook-wide settings. The cells in this notebook read BATCH_SIZE (loader
# construction) and SEED (seeding); EPOCHS and LEARNING_RATE are not referenced
# by any cell shown here.
CFG = dict(
    EPOCHS=3,
    LEARNING_RATE=1e-5,
    BATCH_SIZE=8,
    SEED=41,
)

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic behaviour.

    Args:
        seed: integer seed shared by all generators.

    Returns:
        None.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # fix the random seeds

In [None]:
# Tokenizer of the "tae898/emoberta-large" checkpoint. CustomDataset picks up
# this module-level name when it is constructed.
tokenizers = AutoTokenizer.from_pretrained("tae898/emoberta-large")

Downloading:   0%|          | 0.00/408 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [None]:
# Sequence-classification model from the same checkpoint; this notebook only
# runs it in inference mode to produce per-class probabilities.
# NOTE(review): make_ml_dataset labels the 7 probability columns
# neutral, joy, surprise, anger, sadness, disgust, fear in that order -
# presumably this matches the checkpoint's label order; confirm against
# model.config.id2label.
model = AutoModelForSequenceClassification.from_pretrained("tae898/emoberta-large")

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one text column of a DataFrame on access.

    Every item is ``(input_ids, attention_mask)``; in ``"train"`` mode the
    row's ``Target`` value is appended as a third element.
    """

    def __init__(self, data, mode = "train", col= 'Utterance', tokenizer=None, max_length=512):
        """Store the frame and tokenization settings.

        Args:
            data: DataFrame holding the text column ``col`` (and ``Target``
                when ``mode == "train"``).
            mode: ``"train"`` to also return the label; any other value
                returns the model inputs only.
            col: name of the text column to tokenize. Any column of ``data``
                is accepted.
            tokenizer: callable tokenizer; when omitted the notebook-level
                ``tokenizers`` object is used.
            max_length: length every sequence is padded / truncated to.
        """
        self.dataset = data
        self.tokenizer = tokenizers if tokenizer is None else tokenizer
        self.mode = mode
        self.col = col
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize row number ``idx`` (a position, not an index label).

        Raises:
            KeyError: if ``col`` (or ``Target`` in train mode) is not a column
                of the wrapped frame.
        """
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with index labels for a default RangeIndex.
        text = self.dataset[self.col].iloc[idx]
        inputs = self.tokenizer(text, padding='max_length', max_length = self.max_length, truncation=True, return_tensors="pt", add_special_tokens=True)
        # The tokenizer output is indexed per sequence; take the only one.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        if self.mode == "train":
            y = self.dataset['Target'].iloc[idx]
            return input_ids, attention_mask, y
        return input_ids, attention_mask

In [None]:
def NL_data(train):
    """Build one non-shuffled DataLoader per text view of ``train``.

    The views are, in order, the ``Utterance``, ``sumed_Utterance`` and
    ``Dialogue_Utterance`` columns. Every dataset is created in ``"test"``
    mode, so batches carry model inputs only, and the batch size comes from
    ``CFG['BATCH_SIZE']``.

    Args:
        train: DataFrame containing the three text columns.

    Returns:
        Tuple ``(dataloader1, dataloader2, dataloader3)`` in the order above.
    """
    loaders = []
    for text_col in ('Utterance', 'sumed_Utterance', 'Dialogue_Utterance'):
        view = CustomDataset(train, mode = "test", col=text_col)
        loaders.append(
            torch.utils.data.DataLoader(view, batch_size=CFG['BATCH_SIZE'], shuffle=False)
        )
    return tuple(loaders)

In [None]:
def inference(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect class probabilities.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` whose
            output exposes a ``logits`` attribute.
        test_loader: iterable yielding ``(input_ids, attention_mask)`` batches.
        device: device the model and each batch are moved to.

    Returns:
        List with one entry per sample, in loader order; each entry is the
        list of softmax probabilities computed from that sample's logits.

        Earlier revisions also pushed the arg-max label ids of every batch
        into this same list, which callers had to filter out again; only the
        probability rows are returned now.
    """
    model.to(device)
    model.eval()

    test_predict = []
    # Inference only: skip autograd bookkeeping so no graph is kept per batch.
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(test_loader):
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)
            y_pred = model(input_id, mask)
            probs = nn.functional.softmax(y_pred.logits, dim=-1)
            test_predict.extend(probs.cpu().numpy().tolist())

    print('Done.')
    return test_predict

In [None]:
# Three loaders per frame, in NL_data order:
# Utterance, sumed_Utterance, Dialogue_Utterance.
train_loader = []
for frame in [train]:
    train_loader.extend(NL_data(frame))

test_loader = []
for frame in [test]:
    test_loader.extend(NL_data(frame))

In [None]:
# One raw prediction list per loader, kept in loader order.
train_preds = [inference(model, loader, device) for loader in train_loader]

test_preds = [inference(model, loader, device) for loader in test_loader]

  0%|          | 0/1249 [00:00<?, ?it/s]

Done.


  0%|          | 0/1249 [00:00<?, ?it/s]

Done.


  0%|          | 0/1249 [00:00<?, ?it/s]

Done.


  0%|          | 0/327 [00:00<?, ?it/s]

Done.


  0%|          | 0/327 [00:00<?, ?it/s]

Done.


  0%|          | 0/327 [00:00<?, ?it/s]

Done.


In [None]:
def erase_list(array):
    """Keep only the entries of ``array`` that are plain ``list`` objects.

    Args:
        array: iterable of mixed entries (e.g. label ids and probability rows).

    Returns:
        ``np.ndarray`` built from the retained lists, in their original order.
    """
    return np.array([entry for entry in array if type(entry) == list])

# Reduce every raw prediction list to an array of its probability rows.
train_pred = [erase_list(raw) for raw in train_preds]

test_pred = [erase_list(raw) for raw in test_preds]

In [None]:
# Re-read the raw CSVs. This rebinds `train` / `test` to fresh frames without
# the helper columns added earlier (mark_streak, same, sumed_Utterance,
# Dialogue_Utterance); the loaders built before keep the previous frames.
train = pd.read_csv(path+'/train.csv')
test = pd.read_csv(path+'/test.csv')

In [None]:
# Replace each emotion name in Target by its integer class id (its position
# in this list).
for code, emotion in enumerate(['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']):
    train.loc[train['Target']==emotion, 'Target']=code

In [None]:
# Number of rows sharing the row's Dialogue_ID, broadcast to every row.
for frame in (train, test):
    frame['Dialogue_length'] = frame.groupby("Dialogue_ID")['Dialogue_ID'].transform('count')

In [None]:
def text_split(x):
    """Return the number of whitespace-separated tokens in the string ``x``."""
    return len(x.split())

# Whitespace-token count of every utterance.
for frame in (train, test):
    frame['Utterance_length'] = frame['Utterance'].apply(text_split)

In [None]:
def next_Utterance(df):
    """Add the following row's emotion scores as ``next_<emotion>`` columns.

    For each of the seven emotion columns, ``next_<emotion>`` holds the value
    from the row directly below when that row has the same ``Dialogue_ID``;
    otherwise (different dialogue, or last row) it is NaN. Rows are taken in
    their current order, independent of the index labels.

    Args:
        df: DataFrame with ``Dialogue_ID`` and the seven emotion columns.

    Returns:
        None; ``df`` is modified in place.
    """
    emotions = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']
    dialogue = df['Dialogue_ID']
    # True where the row directly below belongs to the same dialogue.
    has_next = (dialogue == dialogue.shift(-1)).to_numpy()
    for emotion in emotions:
        # Pull every value up by one row, then blank the rows whose neighbour
        # is missing or belongs to another dialogue.
        df[f'next_{emotion}'] = df[emotion].shift(-1).where(has_next)

def previous_Utterance(df):
    """Add the preceding row's emotion scores as ``previous_<emotion>`` columns.

    For each of the seven emotion columns, ``previous_<emotion>`` holds the
    value from the row directly above when that row has the same
    ``Dialogue_ID``; otherwise (different dialogue, or first row) it is NaN.
    Rows are taken in their current order, independent of the index labels.

    Args:
        df: DataFrame with ``Dialogue_ID`` and the seven emotion columns.

    Returns:
        None; ``df`` is modified in place.
    """
    emotions = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']
    dialogue = df['Dialogue_ID']
    # True where the row directly above belongs to the same dialogue.
    has_previous = (dialogue == dialogue.shift(1)).to_numpy()
    for emotion in emotions:
        # Push every value down by one row, then blank the rows whose
        # neighbour is missing or belongs to another dialogue.
        df[f'previous_{emotion}'] = df[emotion].shift(1).where(has_previous)

In [None]:
def fill_na(df):
    """Fill missing neighbour scores with an all-neutral default.

    Missing values in ``next_<emotion>`` / ``previous_<emotion>`` become 1 for
    ``neutral`` and 0 for every other emotion. Other columns are left alone.

    Args:
        df: DataFrame that may contain the neighbour-score columns.

    Returns:
        A new DataFrame with the defaults applied; ``df`` is not modified.
    """
    emotions = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']
    defaults = {
        f'{side}_{emotion}': (1 if emotion == 'neutral' else 0)
        for side in ('next', 'previous')
        for emotion in emotions
    }
    return df.fillna(defaults)

In [None]:
def ranking(df):
    """Add, per score group, the class ids ordered from highest to lowest score.

    Four groups of seven emotion columns are ranked row by row: the plain
    columns and those prefixed ``sumed_``, ``next_`` and ``previous_``. For
    each group seven columns ``<prefix>1st`` ... ``<prefix>7rd`` are written,
    holding the class id (0=neutral ... 6=fear, by position in the emotion
    list) with the highest, second highest, ... lowest score. Equal scores
    keep the lower class id first.

    The whole frame is ranked at once with a stable argsort instead of
    assigning cell by cell, so no progress bar is shown.

    Args:
        df: DataFrame containing all 28 score columns. Scores are presumably
            free of NaN at this point (fill_na runs first in make_ml_dataset).

    Returns:
        ``df`` itself, with the 28 rank columns added in place (float64).
    """
    emotions = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']
    # Column suffixes are kept exactly as before ('4rd', '5rd', ...): they are
    # feature names downstream code may depend on.
    places = ['1st', '2nd', '3rd', '4rd', '5rd', '6rd', '7rd']

    for prefix in ['', 'sumed_', 'next_', 'previous_']:
        scores = df[[prefix + emotion for emotion in emotions]].to_numpy(dtype=float)
        # Negating turns the ascending argsort into a descending one; the
        # stable kind keeps tied scores in class-id order.
        order = np.argsort(-scores, axis=1, kind='stable')
        for position, place in enumerate(places):
            df[prefix + place] = order[:, position].astype(float)
    return df

In [None]:
def make_ml_dataset(train_preds1, train_preds2, test_preds1, test_preds2):
    """Assemble the tabular feature frames from the predicted probabilities.

    Each output frame holds the per-utterance probabilities (``*_preds1``) and
    the speaker-run probabilities (``*_preds2``, prefixed ``sumed_``),
    ``Dialogue_length`` and ``Utterance_length`` copied from the notebook-level
    ``train`` / ``test`` frames, the neighbouring rows' scores
    (``next_*`` / ``previous_*``, with gaps filled by ``fill_na``) and the rank
    columns produced by ``ranking``. The train frame additionally carries
    ``Target``. ``Dialogue_ID`` is only used to find neighbours and is dropped.

    Args:
        train_preds1: 7-column probabilities for the train utterances.
        train_preds2: 7-column probabilities for the train speaker runs.
        test_preds1: 7-column probabilities for the test utterances.
        test_preds2: 7-column probabilities for the test speaker runs.

    Returns:
        Tuple ``(train_EDA, test_EDA)``.
    """
    emotions = ['neutral', 'joy', 'surprise', 'anger','sadness','disgust','fear']
    sumed = [f'sumed_{name}' for name in emotions]

    def _probability_frame(single, merged):
        # Side-by-side frame of the two probability blocks.
        return pd.concat(
            [pd.DataFrame(single, columns=emotions), pd.DataFrame(merged, columns=sumed)],
            axis = 1,
        )

    train_EDA = _probability_frame(train_preds1, train_preds2)
    test_EDA = _probability_frame(test_preds1, test_preds2)

    # Meta columns are copied from the notebook-level frames (index-aligned).
    for meta in ('Dialogue_length', 'Utterance_length', 'Dialogue_ID'):
        train_EDA[meta] = train[meta]
        test_EDA[meta] = test[meta]
    train_EDA['Target'] = train['Target']
    train_EDA = train_EDA.reset_index(drop=True)

    # Neighbour scores are added in place; "next" columns come first.
    for frame in (train_EDA, test_EDA):
        next_Utterance(frame)
    for frame in (train_EDA, test_EDA):
        previous_Utterance(frame)

    train_EDA = fill_na(train_EDA.drop(['Dialogue_ID'], axis=1))
    test_EDA = fill_na(test_EDA.drop(['Dialogue_ID'], axis=1))

    train_EDA = ranking(train_EDA)
    test_EDA = ranking(test_EDA)

    return train_EDA, test_EDA

In [None]:
# Index 0 holds the Utterance-view probabilities and index 1 the
# sumed_Utterance view; index 2 (Dialogue_Utterance) is not used.
train_preds1, train_preds2 = train_pred[0], train_pred[1]

test_preds1, test_preds2 = test_pred[0], test_pred[1]

In [None]:
# Build the tabular feature frames for the gradient-boosting stage.
train_EDA, test_EDA=make_ml_dataset(train_preds1, train_preds2, test_preds1, test_preds2)

  0%|          | 0/9989 [00:00<?, ?it/s]

  0%|          | 0/2610 [00:00<?, ?it/s]

In [None]:
# Cast the label column to an integer dtype (the label-encoding cell wrote the
# class ids into what was originally a string column).
train_EDA = train_EDA.astype({'Target':'int'})

In [None]:
# Split the training frame into the label vector and the feature matrix.
y_train = train_EDA['Target']
X_train = train_EDA.drop(columns=['Target'])

In [None]:
# LightGBM multiclass classifier trained on the stacked features.
# The trailing 0.6939 is presumably the score reached with these settings -
# confirm.
# NOTE(review): this rebinds `model`, which held the transformer used by
# inference() in earlier cells; re-running those cells afterwards without
# reloading the transformer would hand them the LightGBM estimator.
param = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_leaves': 281,
    'max_depth': 2,
    'learning_rate': 0.025858084365768752,
    'n_estimators': 1898,
    'min_child_samples': 14,
    'subsample': 0.933450842332073,
    'random_state': 41,
}  # 0.6939
model = lgb.LGBMClassifier(**param).fit(X_train, y_train)
preds = model.predict(test_EDA)

In [None]:
# Map each predicted class id back to its emotion name.
emo=['neutral', 'joy', 'surprise', 'anger','sadness','disgust','fear']
pred = [emo[label] for label in preds]

In [None]:
# Put the predicted emotion names into the sample submission's Target column
# and preview the first ten rows.
submit = pd.read_csv(path+'/sample_submission.csv')
submit['Target'] = pred
submit.head(10)

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy
5,TEST_0005,joy
6,TEST_0006,joy
7,TEST_0007,joy
8,TEST_0008,joy
9,TEST_0009,joy


In [None]:
# submit.to_csv('submit.csv', index=False)
# Write the submission into `path`, without the DataFrame index column.
submit.to_csv(path+'/submit.csv', index=False)