<a href="https://colab.research.google.com/github/imstaHub/hanghae99/blob/master/week3_advanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Week3 Advanced Homework

In [None]:
# Install third-party packages into this notebook's environment
# (kagglehub is used below to download the dataset).
!pip install tqdm boto3 requests regex sentencepiece sacremoses kagglehub

In [None]:
import torch

# Device selection: prefer Apple MPS, then CUDA, otherwise fall back to CPU.
# torch.cuda.is_available() checks that a usable GPU and driver are actually
# present; torch.backends.cuda.is_built() only reports whether this PyTorch
# build was compiled with CUDA support, so it can be True on a machine that
# has no GPU at all.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

In [None]:
def accuracy(model, dataloader, device, ignore_index=None):
  """Return the fraction of labels that `model` predicts correctly.

  Args:
    model: callable returning class scores whose last dimension indexes the
      classes. It is called as `model(inputs)`, or as
      `model(inputs, *extras)` when a batch carries extra tensors.
    dataloader: iterable of batches shaped `(inputs, labels)` or
      `(inputs, <extra tensors...>, labels)`, e.g. `(inputs, masks, labels)`.
    device: torch device every batch tensor is moved to.
    ignore_index: optional label value (such as a padding label) that is
      left out of both the correct count and the total.

  Returns:
    The accuracy as a float in [0, 1]; 0.0 when no label was counted.
  """
  cnt = 0
  acc = 0

  # Evaluation only, so no gradients need to be tracked.
  with torch.no_grad():
    for data in dataloader:
      # First element is the model input, last is the target, anything in
      # between (e.g. an attention mask) is forwarded to the model as-is.
      inputs, *extras, labels = data
      inputs, labels = inputs.to(device), labels.to(device)
      extras = [extra.to(device) for extra in extras]

      preds = model(inputs, *extras)
      preds = torch.argmax(preds, dim=-1)

      if ignore_index is None:
        counted = torch.ones_like(labels, dtype=torch.bool)
      else:
        counted = labels != ignore_index

      # Count individual labels rather than batch rows so that per-token
      # label tensors are scored correctly as well.
      cnt += counted.sum().item()
      acc += ((labels == preds) & counted).sum().item()

  return acc / cnt if cnt else 0.0

## [MY CODE] dataset download

In [None]:
import kagglehub

# Fetch the latest version of the NER dataset through kagglehub and keep the
# location it reports for the downloaded files.
dataset_handle = "debasisdotcom/name-entity-recognition-ner-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import pandas as pd
import os

# Read every CSV file in the downloaded dataset directory and stack them.
# - Only *.csv entries are read, so stray files in the directory are skipped.
# - os.listdir() returns names in arbitrary order; sorting keeps the row
#   order of the combined frame the same from run to run.
csv_list = sorted(
    name for name in os.listdir(path) if name.lower().endswith('.csv')
)
data_list = []
for data_file in csv_list:
    tmp_path = os.path.join(path, data_file)
    print(tmp_path)
    data_list.append(pd.read_csv(tmp_path, encoding='unicode_escape'))

# ignore_index gives the combined frame one continuous row index instead of
# repeating each file's own 0..n-1 index.
raw_df = pd.concat(data_list, ignore_index=True)
raw_df.head()

## [MY CODE] data preprocessing

In [None]:
import matplotlib.pyplot as plt

# Column names used throughout the notebook.
sentence_col = 'Sentence #'
word_col = 'Word'
tag_col = 'Tag'

# Work on a copy of the raw frame and forward-fill the sentence label so
# that rows with a missing label take the most recent label above them.
df = raw_df.assign(**{sentence_col: raw_df[sentence_col].ffill()})

# Number of non-null words per sentence.
gdf = df.groupby(sentence_col, as_index=False)[word_col].count()
print('문장 개수:', len(gdf))
print('최대 문장 길이:', gdf[word_col].max())

# Distribution of sentence lengths.
plt.hist(gdf[word_col], bins=100)
plt.title('sentence length distribution')
plt.show()

In [None]:
## Drop sentences whose tags are all 'O'
def remove_o_sentence(ser):
    """Return 'remove' when every value in `ser` equals 'O', otherwise ''."""
    return 'remove' if (ser == 'O').all() else ''

# Sentences in which every tag is 'O'.
gdf = df.groupby(sentence_col)[tag_col].apply(remove_o_sentence).reset_index()
remove_sentence = gdf.loc[gdf[tag_col] == 'remove', sentence_col].tolist()
print('###########################################')
print('(O 문장) 제거 수:', len(remove_sentence))
print('(O 문장) 제거 대상 예시:', remove_sentence[:10])

## Sentences that contain a NaN in the word column.
nan_word_rows = df[word_col].isna()
remove_sentence2 = df.loc[nan_word_rows, sentence_col].unique().tolist()
print('###########################################')
print('(nan 포함 문장) 제거 수:', len(remove_sentence2))
print('(nan 포함 문장) 제거 대상 예시:', remove_sentence2)

# Merge both removal lists (order-preserving, duplicates dropped) and keep
# only the rows of the remaining sentences.
remove_sentence = list(dict.fromkeys([*remove_sentence, *remove_sentence2]))
keep_rows = ~df[sentence_col].isin(remove_sentence)
df = df[keep_rows].reset_index(drop=True)

## Re-check the data after filtering.
gdf = df.groupby(sentence_col, as_index=False)[word_col].count()
print('###########################################')
print('(전처리 후)문장 개수:', len(gdf))
print('(전처리 후)최대 문장 길이:', gdf[word_col].max())

plt.hist(gdf[word_col], bins=100)
plt.title('sentence length distribution(after preprocessing)')
plt.show()

In [None]:
## Build the tag <-> integer label mappings.
tag_list = list(df[tag_col].unique())
print('tag 수:', len(tag_list))
print(tag_list)

# Index 0 is reserved for padding: elsewhere in this notebook label sequences
# are padded with 0 and the loss is created with ignore_index=0, so handing
# index 0 to a real tag would silently exclude that tag from training.
PAD_TAG = '[PAD]'
tag_info = {PAD_TAG: 0}    # tag -> integer label
label_info = {0: PAD_TAG}  # integer label -> tag
for i, tag in enumerate(tag_list, start=1):
    tag_info[tag] = i
    label_info[i] = tag

tag_info

In [None]:
# Display the integer label -> tag mapping.
label_info

In [None]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint, loaded via torch.hub.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'distilbert-base-uncased')

# 'Token' holds, for every word, the id at position 1 of the word's encoding
# (position 0 is presumably the leading special token - TODO confirm).
# Each distinct word is encoded once and the result mapped back onto the
# rows, instead of calling the tokenizer again for every repeated word.
first_token_id = {
    word: tokenizer(word).input_ids[1] for word in df[word_col].unique()
}
df['Token'] = df[word_col].map(first_token_id)
# Integer label for every tag.
df['Label'] = [tag_info[x] for x in df[tag_col]]

## [MY CODE] data를 하나의 row로 변환 및 train, test 분할

In [None]:
# Collapse the per-word rows into one row per sentence: each of the three
# columns becomes the list of that sentence's values.
per_sentence = df.groupby(by=[sentence_col])
df_list = [per_sentence[col].apply(list) for col in [word_col, 'Token', 'Label']]

rst_df = pd.concat(df_list, axis=1).reset_index()
input_df = rst_df[[word_col, 'Token', 'Label']]
input_df.head()


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (one row per sentence) as the test set; the fixed
# random_state keeps the split the same on every run.
train_df,test_df = train_test_split(input_df, test_size = 0.2, random_state = 42)

## [MY CODE] Dataset으로 만들기

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torch.nn.utils.rnn import pad_sequence

# Wraps a DataFrame so it can be consumed as a torch Dataset.
class NER_Dataset(Dataset):
    """Dataset over the rows of a DataFrame that has 'Word' and 'Label' columns."""

    def __init__(self, df):
        # One dict per row, keyed by column name.
        self.data = df.to_dict(orient='records')

    def __len__(self):
        """Return the number of stored rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ('Word', 'Label') values of row `idx`."""
        record = self.data[idx]
        return record['Word'], record['Label']

# Pads the label lists by hand so that, inside the collate function, the
# label tensor has the same shape as the input tensor.
def pad_labels(labels, max_length, pad_value=0):
    """Return the label lists as one long tensor with rows of equal length.

    Args:
        labels: sequence of per-sentence label lists (ints).
        max_length: length every row is brought to.
        pad_value: value appended to rows shorter than `max_length`.

    Returns:
        A `torch.long` tensor with one row of `max_length` entries per label
        list. Shorter lists are right-padded with `pad_value`; longer lists
        are cut to their first `max_length` entries, because rows of unequal
        length cannot be turned into a tensor.
    """
    padded_labels = [
        list(label[:max_length]) + [pad_value] * (max_length - len(label))
        for label in labels
    ]
    return torch.tensor(padded_labels, dtype=torch.long)

def collate_fn(batch, max_len=104):
    """Turn a list of (words, word_labels) pairs into padded batch tensors.

    Uses the module-level `tokenizer` and `pad_labels`. The tokenizer must be
    a "fast" tokenizer, because `word_ids()` is needed to map tokens back to
    the words they came from.

    Args:
        batch: list of `(words, labels)` pairs, where `words` is a list of
            strings and `labels` holds one integer label per word.
        max_len: length every sequence is padded or truncated to.

    Returns:
        `(input_ids, masks, labels)`, each of shape `(len(batch), max_len)`.
    """
    texts = [row[0] for row in batch]
    word_labels = [row[1] for row in batch]

    # The sentences are already split into words; encode the whole batch in
    # one call, padded/truncated to a common length.
    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        is_split_into_words=True,
        return_tensors='pt',
    )

    # Labels are given per word, but the model predicts per token. Special
    # tokens shift every position and a word may be split into several
    # sub-word tokens, so copying the word labels positionally would attach
    # them to the wrong tokens. Instead every token takes the label of the
    # word it belongs to; tokens without a word (special and padding tokens)
    # get 0, the padding label that the loss ignores.
    token_labels = []
    for i, sentence_labels in enumerate(word_labels):
        token_labels.append([
            0 if word_idx is None else sentence_labels[word_idx]
            for word_idx in encodings.word_ids(batch_index=i)
        ])

    labels = pad_labels(token_labels, max_len)

    return encodings.input_ids, encodings.attention_mask, labels


# Wrap each split in a Dataset, then in a DataLoader that batches with
# collate_fn; only the training loader shuffles.
ds_train = NER_Dataset(df=train_df)
ds_test = NER_Dataset(df=test_df)

train_loader = DataLoader(ds_train, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [None]:
# Pull a single batch from the training loader to inspect it.
data = next(iter(train_loader))
inputs, masks, labels = data

inputs

In [None]:
# Display the label tensor of the inspected batch.
labels

In [None]:
# Report the shapes of the inspected batch (fixes the stray '"' that stood
# in for ':' after "mask shape").
print(f'input shape: {inputs.shape} | mask shape: {masks.shape} | label shape: {labels.shape}')

## [LOG] 모델 입력,출력 tensor의 shape 확인

## [MY CODE] Model 정의

In [None]:
# Load the 'distilbert-base-uncased' model through torch.hub and display it.
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
model

In [None]:
from torch import nn
from torch.optim import Adam


class NerClassifier(nn.Module):
    """DistilBERT encoder followed by a linear layer over its hidden states."""

    def __init__(self, num_labels):
        super().__init__()
        self.num_labels = num_labels

        # Encoder loaded through torch.hub.
        self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
        # Maps a hidden state of size hidden_size to one score per label.
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, x, mask):
        """Return the classifier scores for the encoder's last hidden state.

        `x` is passed to the encoder as `input_ids` and `mask` as
        `attention_mask`.
        """
        encoded = self.encoder(input_ids=x, attention_mask=mask)
        hidden = encoded['last_hidden_state']
        return self.classifier(hidden)


# Number of output classes = number of entries in the label mapping.
num_labels = len(label_info)
model = NerClassifier(num_labels)

# Freeze the encoder so that only the classifier layer is trained.
model.encoder.requires_grad_(False)

model = model.to(device)
# Multi-class loss; targets equal to 0 are ignored.
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

lr = 0.001
optimizer = Adam(model.parameters(), lr=lr)

## [MY CODE] model 학습

In [None]:
import time

n_epochs = 20

# Average training loss of every epoch, in order.
loss_list = []
for epoch in range(n_epochs):
  model.train()
  total_loss = 0.

  start = time.time()
  for batch in train_loader:
    inputs, masks, labels = (tensor.to(device) for tensor in batch)
    labels = labels.long()

    model.zero_grad()
    logits = model(inputs, masks)  # batch x seq_len x num_labels
    # Flatten the batch and sequence dimensions so the loss sees one row of
    # scores and one target per position.
    loss = loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  end = time.time() - start  # seconds spent on this epoch
  average_loss = total_loss/len(train_loader)
  loss_list.append(average_loss)
  print(f"Epoch {epoch:3d} | Time : {end} | Train Loss: {total_loss} | Average Train Loss: {average_loss}")


In [None]:
import numpy as np
from matplotlib import pyplot as plt

# Plot the average training loss recorded for each epoch (x = epoch index).
x = np.arange(len(loss_list))

plt.plot(x, loss_list, label='loss')
plt.show()

## [LOG] 학습결과: loss가 조금씩 내려가고 있다.

In [None]:
# Original accuracy evaluation code (kept commented out)
# with torch.no_grad():
#   model.eval()
#   train_acc = accuracy(model, train_loader, device)
#   test_acc = accuracy(model, test_loader, device)
#   print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")

## [FEEDBACK] accuracy를 어떻게 비교해야 할 지 모르겠습니다.