In [19]:
import os
import random
import pandas as pd
import numpy as np
import warnings
from typing import Optional
from tqdm import tqdm as tq
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
from transformers import logging
import sentencepiece
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="2,3"
warnings.filterwarnings(action='ignore')
logging.set_verbosity_error()

# 임시 토크나이저
from konlpy.tag import Okt, Komoran

# Configuration
CFG = {
    'SEED':1203,
    'EPOCHS':10,
    'LEARNING_RATE':0.001,
    'BATCH_SIZE':256,
    'PLM':"klue/roberta-large",
    'MAX_LEN':64,
}

In [20]:
os.listdir('open')

['sample_submission.csv', 'test.csv', 'train.csv']

In [21]:
# load data
folder = os.getcwd() + '/open'
train = 'train.csv'
test  = 'test.csv'
submit = 'sample_submission.csv'
train = pd.read_csv(folder + '/' + train)
train, valid = train_test_split(train, test_size=0.2, random_state=CFG['SEED'])

test = pd.read_csv(folder + '/' + test)
submit = pd.read_csv(folder + '/' + submit)
print(f"1. num train : {len(train)}\n")
print(f"2. null train check : \n{train.isnull().sum()}\n")
print(f"3. unique labels : \n{train['유형'].unique()}\n\
{[len(train[train['유형']==train['유형'].unique()[idx]]) for idx in range(len(train['유형'].unique()))]}\n\
{train['극성'].unique()}\n\
{[len(train[train['극성']==train['극성'].unique()[idx]]) for idx in range(len(train['극성'].unique()))]}\n\
{train['시제'].unique()}\n\
{[len(train[train['시제']==train['시제'].unique()[idx]]) for idx in range(len(train['시제'].unique()))]}\n\
{train['확실성'].unique()}\n\
{[len(train[train['확실성']==train['확실성'].unique()[idx]]) for idx in range(len(train['확실성'].unique()))]}\n"
)

1. num train : 13232

2. null train check : 
ID       0
문장       0
유형       0
극성       0
시제       0
확실성      0
label    0
dtype: int64

3. unique labels : 
['추론형' '사실형' '대화형' '예측형']
[1709, 10852, 463, 208]
['긍정' '부정' '미정']
[12627, 464, 141]
['현재' '미래' '과거']
[5493, 1339, 6400]
['확실' '불확실']
[12142, 1090]



### OpenNMT: Open-Source Toolkit for Neural Machine Translation  
- https://arxiv.org/abs/1701.02810  


In [22]:
from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
ko_to_en = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-ko-en")
en_to_ko = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")

In [23]:
from transformers import AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer

ko_to_en_tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-ko-en")
en_to_ko_tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-ko")

In [25]:
src_text = [
    "왜이렇게 번역 성능이 안좋은거야"
]
translated = en_to_ko.generate(**en_to_ko_tok(src_text, return_tensors="pt", padding=True))

for t in translated:
    print( en_to_ko_tok.decode(t, skip_special_tokens=True) )




In [27]:
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
print(pipe("번역기가 성능이 안좋네"))

[{'translation_text': "The translator doesn't work very well."}]


In [34]:
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-en-ko")
print(pipe("what the fuck"))

[{'translation_text': 'ok 잘 얼굴'}]


In [None]:
korean_source = "자고 싶어"
eng_translated = ko_to_en.generate(**ko_to_en_tok(korean_source, return_tensors="pt", padding=True))
[ko_to_en_tok.decode(t, skip_special_tokens=True) for t in eng_translated]

In [None]:
src_text = "자고 싶어"

translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

In [None]:
from transformers import pipeline
ko_to_en = pipeline("text2text-generation", model = "circulus/kobart-trans-ko-en-v2")
en_to_ko = pipeline("text2text-generation", model = "circulus/kobart-trans-en-ko-v2")

In [17]:
from transformers import pipeline
ko_to_en = pipeline("text2text-generation", model = "Helsinki-NLP/opus-mt-ko-en")
# en_to_ko = pipeline("text2text-generation", model = "Helsinki-NLP/opus-mt-en-ko")

In [18]:
sample_text = train['문장'][90]
print(sample_text)

first_translate = ko_to_en(sample_text, max_length = 500)[0]['generated_text']
print(first_translate)
back_translate = en_to_ko(first_translate, max_length = 500)[0]['generated_text']
print(back_translate)

NameError: name 'train' is not defined

In [38]:
predict_words = [
    "전망", "예보", "예측", "관측", "예정", "예상", "계획"
]

In [None]:
back_translate

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

src_text = [
    "2, 4, 6 등은 짝수이다.",
    "네."
]

model_name = "circulus/kobart-trans-ko-en-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

# for t in translated:
#     print( tokenizer.decode(t, skip_special_tokens=True) )

# expected output:
#     2, 4, and 6 are even.
#     Yeah.

tokenizer.encode(src_text)

In [None]:
src_text = "자고 싶어."
tokens = tokenizer(src_text)
model(
    input_ids=torch.tensor(tokens['input_ids']).reshape(1, -1), 
    attention_mask=torch.tensor(tokens['attention_mask']).reshape(1, -1))

In [None]:
model

In [None]:
from transformers import MarianMTModel, MarianTokenizer

src_text = [
    "2, 4, and 6 are even.",
    "Yeah."
]

model_name = "circulus/kobart-trans-en-ko-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print( tokenizer.decode(t, skip_special_tokens=True) )

# expected output:
#     2, 4, and 6 are even.
#     Yeah.

In [None]:
src_text = [
    "2, 4, 6 등은 짝수이다.",
    "네."
]
translated = ko_to_en.generate(**ko_to_en_tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print( ko_to_en_tokenizer.decode(t, skip_special_tokens=True) )
