In [1]:
# Install HuggingFace transformers, pinned to the version this notebook was
# last run with (see the captured install log) so a fresh environment resolves
# the same API. `%pip` installs into the environment of the running kernel.
%pip install transformers==4.3.3


Collecting transformers
  Downloading transformers-4.3.3-py3-none-any.whl (1.9 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.1-cp38-cp38-win_amd64.whl (2.0 MB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sacremoses-0.0.43-py3-none-any.whl size=893262 sha256=8db64d7367ee61b99d7c82e7f05a5f2219e6376a8e4489eabf9bb19c12000a4d
  Stored in directory: c:\users\dongh\appdata\local\pip\cache\wheels\7b\78\f4\27d43a65043e1b75dbddaa421b573eddc67e712be4b1c80677
Successfully built sacremoses
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.43 tokenizers-0.10.1 transformers-4.3.3


In [None]:
# Install the PyTorch stack this notebook was developed against.
# `===` is pip's arbitrary-equality operator and is kept as in the original
# command so that only the exact "1.7.1" / "0.8.2" / "0.7.2" builds match.
# `%pip` installs into the environment of the running kernel.
%pip install torch===1.7.1 torchvision===0.8.2 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
import gc
import os

import numpy as np
import pandas as pd
import torch
import wget
from sklearn.model_selection import KFold
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import AdamW, AutoTokenizer, ElectraForSequenceClassification

In [None]:
# Source files: the NSMC test/train splits and the preprocessed hotel reviews.
DATA_URLS = [
    'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt',
    'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt',
    'https://raw.githubusercontent.com/donghyun-daniel/4_party_project/master/Review_crawler/csv_output/Preprocessing_review_pos_neg.csv',
]

# wget.download saves a renamed copy when the target file already exists, so
# skip files that are already present to keep re-runs of this cell idempotent.
for url in DATA_URLS:
    filename = url.rsplit('/', 1)[-1]
    if not os.path.exists(filename):
        wget.download(url, out=filename)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when the model is moved to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class NSMCDataset(Dataset):
    """Tab-separated review dataset that is tokenized one row at a time.

    The file is read with pandas; rows containing NaN and rows whose
    ``document`` text is duplicated are dropped. Each item is produced by
    taking the 2nd and 3rd columns of a row (by position) as the review text
    and its label.

    Args:
        csv_file: Path to a tab-separated file with a header row that includes
            a ``document`` column.
        tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        max_length: Fixed sequence length; inputs are truncated or padded to it.
    """

    def __init__(self, csv_file,
                 tokenizer_name="monologg/koelectra-small-v2-discriminator",
                 max_length=256):
        # Some rows contain NaN values; drop them before anything else.
        frame = pd.read_csv(csv_file, sep='\t').dropna(axis=0)
        # Remove rows with duplicated review text.
        self.dataset = frame.drop_duplicates(subset=['document'])
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

        print(self.dataset.describe())

    def __len__(self):
        """Return the number of rows left after cleaning."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for row ``idx``.

        ``input_ids`` and ``attention_mask`` are the first (only) row of the
        tensors returned by the tokenizer; ``label`` is a plain ``int``.
        """
        text, y = self.dataset.iloc[idx, 1:3].values

        inputs = self.tokenizer(
            # Coerce to str so a review that pandas parsed as a number is
            # still accepted by the tokenizer.
            str(text),
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            add_special_tokens=True
            )

        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        # int() so that labels parsed as floats still collate to an integer
        # tensor, which F.cross_entropy needs for class-index targets.
        return input_ids, attention_mask, int(y)


    
def data_concat(naverTrainData, naverTestData, hotelData, trainRate=0.5, seed=2000,
                trainOut='sampleTrain.txt', testOut='sampleTest.txt'):
    """Merge the hotel reviews into the NSMC train/test files.

    The hotel reviews are renamed to the ``id`` / ``document`` / ``label``
    layout, shuffled, and split into a train part and a test part. The train
    part is appended to the NSMC train data and the test part to the NSMC test
    data; both results are written as tab-separated files.

    Example:
        data_concat("ratings_train.txt", "ratings_test.txt", "Preprocessing_review_pos_neg.csv")

    Args:
        naverTrainData: Path to the tab-separated NSMC training file.
        naverTestData: Path to the tab-separated NSMC test file.
        hotelData: Path to the hotel review CSV with ``Text`` and ``Label`` columns.
        trainRate: Fraction of the hotel reviews that goes to the train file.
        seed: Seed for both the shuffle and the split, so repeated calls
            produce identical output files.
        trainOut: Output path of the merged training file.
        testOut: Output path of the merged test file.
    """
    data1 = pd.read_table(naverTrainData)
    data2 = pd.read_table(naverTestData)
    data3 = pd.read_csv(hotelData)

    # Selecting the two needed columns also discards extras such as `len_text`
    # without failing when such a column is absent.
    hotel = data3[['Text', 'Label']].rename(columns={'Text': 'document', 'Label': 'label'})
    # Every hotel row gets the same placeholder id.
    hotel.insert(0, 'id', 1)
    # Seeded shuffle: an unseeded one would make the seeded split below pick
    # different rows on every call.
    hotel = hotel.sample(frac=1, random_state=seed).reset_index(drop=True)

    hotelTrain = hotel.sample(frac=trainRate, random_state=seed)
    hotelTest = hotel.drop(hotelTrain.index)

    trainData = pd.concat([data1, hotelTrain], ignore_index=True)
    testData = pd.concat([data2, hotelTest], ignore_index=True)

    trainData.to_csv(trainOut, sep='\t', index=False)
    testData.to_csv(testOut, sep='\t', index=False)

In [None]:
# Write the merged train/test files consumed by the dataset cell.
data_concat(
    naverTrainData="ratings_train.txt",
    naverTestData="ratings_test.txt",
    hotelData="Preprocessing_review_pos_neg.csv",
)

In [None]:
# Build the train and test datasets (in that order) from the merged files.
train_dataset, test_dataset = (
    NSMCDataset(path) for path in ("sampleTrain.txt", "sampleTest.txt")
)

In [None]:
# Load the pretrained KoELECTRA-small v2 discriminator with a
# sequence-classification head, then move it onto the selected device.
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-small-v2-discriminator"
)
model = model.to(device)

In [None]:
# Resume from a previously saved checkpoint when one exists. On a fresh run
# there is no "model.pt" yet, so keep the pretrained weights instead of raising
# (the original note said to simply ignore the error from this cell).
if os.path.exists("model.pt"):
    # map_location lets a checkpoint saved on another device load onto `device`.
    model.load_state_dict(torch.load("model.pt", map_location=device))
else:
    print("model.pt not found - starting from the pretrained weights")



In [None]:
model

In [None]:
# Guard against running out of GPU memory: collect unreferenced Python objects,
# then release the cached memory held by PyTorch's CUDA allocator.
gc.collect()
torch.cuda.empty_cache()

In [None]:
epochs = 3
batch_size = 64
n_folds = 10
# Fixed seed so the fold assignment is identical on every run.
fold_seed = 2000

optimizer = AdamW(model.parameters(), lr=1e-5)

# 10-fold split of the training set.
# NOTE: the model and optimizer are created once and shared by all folds, so
# weights carry over from fold to fold; the per-fold validation scores are
# therefore a progress signal, not independent cross-validation estimates.
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=fold_seed)

# Evaluation order does not affect accuracy, so the test set is not shuffled.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

kfold_losses = []
kfold_acc = []
kfold_val_acc = []


def evaluate_accuracy(loader):
    """Return the model's accuracy over `loader` without updating weights."""
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for ids, masks, labels in loader:
            labels = labels.to(device)
            logits = model(ids.to(device), attention_mask=masks.to(device))[0]
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            n_seen += len(labels)
    return n_correct / n_seen if n_seen else float("nan")


for fold, (train_ids, val_ids) in enumerate(kfold.split(train_dataset)):
    print('----------------------------------')
    print(fold)

    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_ids)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
    val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)

    losses = []
    accuracies = []

    for i in range(epochs):
        # total_loss is the SUM of batch losses seen so far in this epoch.
        total_loss = 0.0
        correct = 0
        total = 0
        batches = 0

        # evaluate_accuracy() leaves the model in eval mode, so switch back.
        model.train()

        for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
            loss = F.cross_entropy(y_pred, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            _, predicted = torch.max(y_pred, 1)
            # .item() keeps the counter a plain int rather than a device tensor.
            correct += (predicted == y_batch).sum().item()
            total += len(y_batch)

            batches += 1
            if batches % 100 == 0:
                print("Batch Loss:", total_loss, "Accuracy:", correct / total)

        losses.append(total_loss)
        accuracies.append(correct / total)
        print("Train Loss:", total_loss, "Accuracy:", correct / total)

    # Score the held-out part of this fold; it was previously built but unused.
    val_acc = evaluate_accuracy(val_loader)
    print("Validation Accuracy:", val_acc)

    loss_avg = sum(losses) / len(losses)
    acc_avg = sum(accuracies) / len(accuracies)
    kfold_losses.append(loss_avg)
    kfold_acc.append(acc_avg)
    kfold_val_acc.append(val_acc)

    print('----------------------------------')

In [None]:
# Per-fold results from the training loop: for each fold, the epoch-averaged
# summed training loss and the epoch-averaged training accuracy.
kfold_losses, kfold_acc

In [None]:
# Measure accuracy on the test set.
model.eval()

test_correct = 0
test_total = 0

# Gradients are not needed for evaluation; no_grad avoids building an autograd
# graph for every batch and so lowers memory use.
with torch.no_grad():
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        _, predicted = torch.max(y_pred, 1)
        # .item() keeps the counter a plain int rather than a device tensor.
        test_correct += (predicted == y_batch).sum().item()
        test_total += len(y_batch)

# Guard the division so an empty loader reports NaN instead of raising.
print("Accuracy:", test_correct / test_total if test_total else float("nan"))

In [None]:
# Persist only the weights (state dict) so a later run can reload them.
torch.save(obj=model.state_dict(), f="model.pt")

## 여기까지 (아래 셀들은 중간 값 확인용 스크래치 셀)

In [None]:
y_pred

In [None]:
# Scratch: for whatever `y_pred` is left over from the last loop that ran, take
# the index of the largest value along dim 1 (the max values go to `_`).
_, pre=torch.max(y_pred, 1)

In [None]:
pre

In [None]:
train_dataset

In [None]:
train_loader

In [None]:
input_ids_batch

In [None]:
attention_masks_batch

In [None]:
y_batch

In [None]:
# Scratch: pull a single batch out of train_loader for inspection and stop;
# the three loop variables keep that first batch after the break.
for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    break

In [None]:
input_ids_batch

In [None]:
attention_masks_batch

In [None]:
y_batch

# 정확도를 올릴 방법

- 어근 치환 ***
- k-fold cross-validation

- pretrain 모델 변경 ***

- fine tuning
// - 데이터 추가 영화, 호텔, 쇼핑 데이터




In [None]:
# Scratch list holding the integers 1 through 10.
data = list(range(1, 11))