**KoNLPy 형태소 분석기**

In [None]:
!pip install konlpy

In [None]:
from konlpy.tag import Okt

# Run the Okt analyzer on one sample sentence: morpheme split, then POS tagging.
okt = Okt()
sample_sentence = "한국어 단어는 형태소들로 구성되어 있다."
print("Okt 형태소 분석: ", okt.morphs(sample_sentence))
print("Okt 품사 태깅: ", okt.pos(sample_sentence))

In [None]:
from konlpy.tag import Kkma

# Run the Kkma analyzer on the same sample sentence for comparison with Okt.
kkma = Kkma()
sample_sentence = "한국어 단어는 형태소들로 구성되어 있다."
print("Kkma 형태소 분석: ", kkma.morphs(sample_sentence))
print("Kkma 품사 태깅: ", kkma.pos(sample_sentence))

**네이버 영화 리뷰 감성분석**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter

In [None]:
# Download the NSMC train and test rating files into the working directory.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

In [None]:
# Load the downloaded files into DataFrames.
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [None]:
# Report the number of training rows and preview the first five.
print ('Train data len :',len(train_data))
train_data[:5]

In [None]:
# Report the number of test rows and preview the first five.
print ('Test data len :',len(test_data))
test_data[:5]

In [None]:
# Drop rows whose 'document' text duplicates an earlier row (modifies train_data in place).
train_data.drop_duplicates(subset=['document'], inplace=True)

In [None]:
# Count rows per 'label' value.
train_data.groupby('label').size()

In [None]:
# Strip every character that is not Hangul (jamo or syllable) or a space.
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, which would leave the text
# unchanged.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎ ㅏ-ㅣ 가-힣 ]", "", regex=True)
train_data[:5]

In [None]:
# Remove leading spaces so that whitespace-only reviews become empty strings
# (regex=True is required for the '^ +' pattern on pandas >= 2.0).
train_data['document'] = train_data['document'].str.replace('^ +', "", regex=True)
# Mark empty reviews as missing. The result is assigned back instead of using
# inplace=True on the column selection, which does not update the DataFrame
# under pandas Copy-on-Write.
train_data['document'] = train_data['document'].replace('', np.nan)
print(train_data.isnull().sum())

In [None]:
# Preview up to five rows whose 'document' is null.
train_data.loc[train_data.document.isnull()][:5]

In [None]:
# Drop every row that contains a null value, then report how many rows remain.
train_data = train_data.dropna(how='any')
print(len(train_data))

In [None]:
# Tokens filtered out of every tokenized review in the tokenization cells.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [None]:
# Tokenize every training review with Okt (stem=True) and drop stopwords.
okt = Okt()
X_train = [
    [token for token in okt.morphs(sentence, stem=True) if token not in stopwords]
    for sentence in tqdm(train_data['document'])
]
print(X_train[:3])

In [None]:
# Tokenize every test review with Okt (stem=True) and drop stopwords.
# Each entry is passed through str() first — presumably to tolerate
# non-string (e.g. missing) documents; confirm against the raw test file.
X_test = [
    [token for token in okt.morphs(str(sentence), stem=True) if token not in stopwords]
    for sentence in tqdm(test_data['document'])
]

In [None]:
from collections import Counter
def _ragged_array(sequences):
    """Pack variable-length lists into a 1-D object array, one list per slot."""
    packed = np.empty(len(sequences), dtype=object)
    # Element-wise assignment keeps each list intact even when every list
    # happens to have the same length (no accidental 2-D array).
    for idx, seq in enumerate(sequences):
        packed[idx] = seq
    return packed


def tokenize(x_train, y_train, x_val, y_val, max_vocab=10000):
    """Build a frequency-ranked vocabulary and integer-encode both splits.

    Args:
        x_train: iterable of token lists used to build the vocabulary.
        y_train: labels for ``x_train``; returned as a NumPy array.
        x_val: iterable of token lists encoded with the training vocabulary.
        y_val: labels for ``x_val``; returned as a NumPy array.
        max_vocab: number of most frequent training tokens kept in the
            vocabulary (defaults to 10000).

    Returns:
        Tuple ``(train_ids, y_train, val_ids, y_val, vocab)`` where
        ``train_ids`` / ``val_ids`` are 1-D object arrays holding one list of
        token ids per sentence, and ``vocab`` maps token -> id. Ids start at 1
        (most frequent token); tokens outside the vocabulary are dropped.
    """
    corpus = Counter(word for sent in x_train for word in sent)
    # Rank tokens by frequency; ties keep first-seen order (stable sort).
    ranked = sorted(corpus, key=corpus.get, reverse=True)[:max_vocab]
    onehot_dict = {word: rank + 1 for rank, word in enumerate(ranked)}

    def encode(sentences):
        return [[onehot_dict[word] for word in sent if word in onehot_dict]
                for sent in sentences]

    # The encoded sentences have different lengths. np.array() on such ragged
    # input raises ValueError on NumPy >= 1.24, so pack them explicitly.
    return (_ragged_array(encode(x_train)), np.array(y_train),
            _ragged_array(encode(x_val)), np.array(y_val), onehot_dict)

In [None]:
# Build the vocabulary from the training tokens and integer-encode both splits.
x_train, y_train, x_test, y_test, vocab = tokenize(X_train, train_data['label'], X_test, test_data['label'])

In [None]:
# Inspect the distribution of encoded review lengths: histogram, then summary statistics.
rev_len = [len(i) for i in x_train]
pd.Series(rev_len).hist()
plt.show()
pd.Series(rev_len).describe()

In [None]:
def padding_(sentences, seq_len):
    """Return a (len(sentences), seq_len) int array of left-padded token ids.

    Sequences shorter than ``seq_len`` are right-aligned with zeros in front;
    longer ones keep only their first ``seq_len`` ids. Empty sequences stay
    all-zero.
    """
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for row, review in enumerate(sentences):
        n_tokens = len(review)
        if n_tokens == 0:
            continue
        kept = np.array(review)[:seq_len]
        # A negative start beyond the row width clamps to column 0, so an
        # over-long review fills the whole row.
        features[row, -n_tokens:] = kept
    return features

# Left-pad or truncate every encoded review to 50 token ids.
x_train_pad = padding_(x_train,50)
x_test_pad = padding_(x_test,50)

In [None]:
# Create Tensor datasets.
# They are bound to new names instead of rebinding train_data / test_data:
# those names hold the DataFrames that the tokenize cell indexes by 'label',
# so overwriting them would break re-running that cell.
train_dataset = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
test_dataset = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))

# Mini-batch size shared by both loaders.
batch_size = 50

# Shuffle only the training data; keep the test order fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [None]:
class GRU_model(nn.Module):
    """Embedding -> GRU -> linear classifier over the last time step.

    Args:
        n_layers: number of stacked GRU layers.
        hidden_dim: size of the GRU hidden state.
        n_vocab: number of rows in the embedding table.
        embed_dim: embedding vector size.
        n_classes: number of output logits.
        device: fallback device for ``_init_state`` when it is called
            without an explicit device.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, device):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.device = device

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Return logits of shape (batch, n_classes) for a batch of token ids."""
        embedded = self.embed(x)
        # Create the zero initial state on the input's device and dtype so it
        # always matches the tensors the GRU receives, even if the module was
        # moved after construction.
        h_0 = self._init_state(batch_size=x.size(0),
                               device=embedded.device, dtype=embedded.dtype)
        # batch_first=True: output is (batch, seq_len, hidden_dim).
        output, _ = self.gru(embedded, h_0)
        # Keep only the hidden state of the last time step: (batch, hidden_dim).
        h_t = output[:, -1, :]
        # (batch, hidden_dim) -> (batch, n_classes)
        return self.out(h_t)

    def _init_state(self, batch_size, device=None, dtype=None):
        """Return a zero hidden state of shape (n_layers, batch_size, hidden_dim).

        Falls back to the device given at construction when ``device`` is None.
        """
        if device is None:
            device = self.device
        return torch.zeros(self.n_layers, batch_size, self.hidden_dim,
                           device=device, dtype=dtype)

In [None]:
# Select the compute device once: GPU when CUDA is available, otherwise CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

In [None]:
# Hyperparameters for the GRU classifier.
n_layers = 1
vocab_size = len(vocab) + 1  # extra 1 for <pad>
hidden_dim = 128
embed_dim = 100
n_classes = 2

# Build the model and move its parameters to the selected device.
model = GRU_model(n_layers, hidden_dim, vocab_size, embed_dim, n_classes, device).to(device)

In [None]:
def train(model, criterion, optimizer, data_loader):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        model: module to optimize; batches are moved to the device of its
            parameters, so no notebook-level ``device`` global is needed.
        criterion: loss function called as ``criterion(logits, targets)``.
        optimizer: optimizer stepping ``model``'s parameters.
        data_loader: iterable of ``(x, y)`` batches exposing ``.dataset``.

    Returns:
        Sum of batch losses weighted by batch size, divided by
        ``len(data_loader.dataset)``.
    """
    model.train()
    device = next(model.parameters()).device
    running_loss = 0.0
    for x, y in data_loader:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch is averaged correctly.
        running_loss += loss.item() * x.size(0)

    return running_loss / len(data_loader.dataset)

def evaluate(model, data_loader):
    """Return the classification accuracy (in percent) of ``model`` on a loader.

    Args:
        model: module producing ``(batch, n_classes)`` logits; batches are
            moved to the device of its parameters.
        data_loader: iterable of ``(x, y)`` batches exposing ``.dataset``.

    Returns:
        ``100 * correct / len(data_loader.dataset)`` as a 0-dim tensor.
    """
    model.eval()
    device = next(model.parameters()).device
    corrects = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            predicted = model(x).argmax(dim=1)
            corrects += (predicted.view(y.size()) == y).sum()

    return 100.0 * corrects / len(data_loader.dataset)

In [None]:
# Training configuration.
num_epochs = 10
lr = 0.001

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Train for num_epochs epochs, measuring test-set accuracy after each one.
for e in range(1, num_epochs+1):
    train_loss = train(model, criterion, optimizer, train_loader)
    test_accuracy = evaluate(model, test_loader)

    print("[Epoch: %d] train loss : %5.2f | test accuracy : %5.2f" % (e, train_loss, test_accuracy))

**BERT 기반 감성분석**

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import re
import urllib.request
import numpy as np

# Download the NSMC train and test rating files again so this section can run on its own.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

In [None]:
# Reload both files into fresh DataFrames.
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [None]:
# Clean the training reviews.
# regex=True is passed explicitly because pandas >= 2.0 treats str.replace
# patterns as literal text by default. The empty-string replacement is assigned
# back instead of using inplace=True on a column selection, which does not
# update the DataFrame under Copy-on-Write.
train_data.drop_duplicates(subset=['document'], inplace=True)  # drop duplicate reviews
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)  # keep only Hangul and spaces
train_data['document'] = train_data['document'].str.replace('^ +', "", regex=True)  # strip leading spaces; whitespace-only reviews become ''
train_data['document'] = train_data['document'].replace('', np.nan)  # mark empty reviews as missing
train_data = train_data.dropna(how='any')  # drop rows with null values

In [None]:
# Clean the test reviews the same way as the training reviews.
# regex=True is passed explicitly because pandas >= 2.0 treats str.replace
# patterns as literal text by default. The empty-string replacement is assigned
# back instead of using inplace=True on a column selection, which does not
# update the DataFrame under Copy-on-Write.
test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)  # keep only Hangul and spaces
test_data['document'] = test_data['document'].str.replace('^ +', "", regex=True)  # strip leading spaces; whitespace-only reviews become ''
test_data['document'] = test_data['document'].replace('', np.nan)  # mark empty reviews as missing
test_data = test_data.dropna(how='any')  # drop rows with null values

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the "klue/bert-base" checkpoint with a 2-label classification head, plus its tokenizer.
model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [None]:
# Use only the first 10,000 rows of each split.
sampled_train_data = train_data[:10000]
sampled_test_data = test_data[:10000]

# Tokenize the reviews into padded, truncated PyTorch tensors with special tokens added.
# NOTE(review): padding='max_length' is used without an explicit max_length, so the
# padded width comes from the tokenizer's configuration — confirm this is intended.
train_tokens = tokenizer(list(sampled_train_data['document']), padding='max_length', truncation=True, return_tensors='pt', add_special_tokens=True)
test_tokens = tokenizer(list(sampled_test_data['document']), padding='max_length', truncation=True, return_tensors='pt', add_special_tokens=True)
# Show the token strings of the first encoded training review as a sanity check.
print(tokenizer.convert_ids_to_tokens(train_tokens['input_ids'][0]))

In [None]:
import torch

class BERTDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping of field name -> indexable batch, and ``labels``
    is an indexable sequence; sample ``i`` holds the ``i``-th entry of every
    field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized reviews and their labels into datasets for the Trainer.
train_dataset = BERTDataset(train_tokens, list(sampled_train_data['label']))
test_dataset = BERTDataset(test_tokens, list(sampled_test_data['label']))

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Fine-tuning configuration passed to the Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=10,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=250,               # logging interval, in steps
)

def compute_metrics(pred):
    """Return ``{'accuracy': ...}`` for a prediction object.

    ``pred.predictions`` is reduced with ``argmax`` over its last axis and
    compared against ``pred.label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Assemble the Trainer from the model, arguments, datasets and metric function.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=compute_metrics      # additional evaluation metrics
)

# Start fine-tuning.
trainer.train()

In [None]:
# Run the Trainer's evaluation pass and display the returned metrics.
trainer.evaluate()