In [1]:
import pickle
# Load the pickled training and test sets.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('train_label.pkl', 'rb') as train_file:
    training_set = pickle.load(train_file)
with open('test_set.pkl', 'rb') as test_file:
    test_set = pickle.load(test_file)

In [2]:
# Store the labels as floats; the training cell later converts them to integer class ids.
label_as_float = training_set['label'].astype(float)
training_set['label'] = label_as_float

In [3]:
import transformers
# Display the installed transformers version.
transformers.__version__

'4.36.0.dev0'

In [4]:
import os
# Set before torch is imported in the next cell. Per the PyTorch MPS documentation,
# a ratio of 0.0 disables the allocator's high-watermark memory limit.
os.environ.update({'PYTORCH_MPS_HIGH_WATERMARK_RATIO': '0.0'})

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm

# model_path = 'kakaobrain/kogpt'
# revision = 'KoGPT6B-ryan1.5b-float16'
# tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision,
#             bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]') 
# model = AutoModelForSequenceClassification.from_pretrained(model_path, revision=revision,
#             num_labels=2,  pad_token_id=tokenizer.eos_token_id, torch_dtype='auto', low_cpu_mem_usage=True).to('mps')
# classifier = pipeline(
#     "sentiment-analysis",
#     tokenizer=tokenizer, 
#     model=model, 
#     return_all_scores=True 
# )

# Load the checkpoint's tokenizer and the model with a 2-label sequence-classification head.
model_path = 'skt/ko-gpt-trinity-1.2B-v0.5'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
model = model.to('mps')

# Wrap both in a pipeline; the rest of the notebook reaches the network via classifier.model.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)
# Review texts and labels for the training and test splits.
X_training = training_set['no_tag_review'].tolist()
Y_training = training_set['label'].tolist()
X_test = test_set['no_tag_review'].tolist()
Y_test = test_set['human_label'].tolist()


def _encode_on_mps(texts):
    """Tokenize `texts` (padded, truncated at 512 tokens) and move every tensor to 'mps'."""
    encoded = tokenizer(texts, truncation=True, padding=True, return_tensors='pt', max_length=512)
    return {name: tensor.to('mps') for name, tensor in encoded.items()}


train_encodings = _encode_on_mps(X_training)
test_encodings = _encode_on_mps(X_test)

# Labels become int64 class ids on the same device as the inputs.
Y_training_tensor = torch.tensor(Y_training).long().to('mps')
Y_test_tensor = torch.tensor(Y_test).long().to('mps')

train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    Y_training_tensor,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    Y_test_tensor,
)

# Only the training batches are shuffled; the test order is kept as loaded.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer and loss function.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW. eps and
# weight_decay are passed explicitly because the torch defaults (1e-8, 1e-2) differ from
# the transformers.AdamW defaults (1e-6, 0.0) this notebook previously relied on.
optimizer = torch.optim.AdamW(classifier.model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()

classifier.model.train()
for epoch in range(4):  # run 4 epochs; change as needed
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = classifier.model(input_ids, attention_mask=attention_mask)
        # The loss is evaluated on CPU copies of the logits and labels. Autograd carries
        # the gradient back through .cpu() to the model's device, so the scalar loss does
        # not need to be moved back before backward().
        loss = loss_fn(outputs.logits.cpu(), labels.cpu())
        loss.backward()
        optimizer.step()
    # Reports the loss of the epoch's final batch only, not an epoch average.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/ko-gpt-trinity-1.2B-v0.5 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1, Loss: 0.14459311962127686


  0%|          | 0/94 [00:00<?, ?it/s]

Epoch 2, Loss: 0.023904554545879364


  0%|          | 0/94 [00:00<?, ?it/s]

Epoch 3, Loss: 0.05988304689526558


  0%|          | 0/94 [00:00<?, ?it/s]

Epoch 4, Loss: 0.028891177847981453


In [7]:
# Save the fine-tuned model and the tokenizer into one directory named after the checkpoint.
save_dir = f"./classification_model/{model_path}/"
classifier.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./classification_model/skt/ko-gpt-trinity-1.2B-v0.5/tokenizer_config.json',
 './classification_model/skt/ko-gpt-trinity-1.2B-v0.5/special_tokens_map.json',
 './classification_model/skt/ko-gpt-trinity-1.2B-v0.5/vocab.json',
 './classification_model/skt/ko-gpt-trinity-1.2B-v0.5/merges.txt',
 './classification_model/skt/ko-gpt-trinity-1.2B-v0.5/added_tokens.json',
 './classification_model/skt/ko-gpt-trinity-1.2B-v0.5/tokenizer.json')

In [8]:
import numpy as np

def accuracy(y_true, y_pred):
    """Return the fraction of positions at which `y_true` and `y_pred` are equal.

    The arguments are compared element-wise with `==`; the comparison result must
    support `.sum().item()` (as NumPy arrays and torch tensors do).
    """
    n_matching = (y_true == y_pred).sum().item()
    n_total = len(y_true)
    return n_matching / n_total

def precision(y_true, y_pred):
    """Return the precision TP / (TP + FP) for binary 0/1 labels.

    Args:
        y_true: array-like of 0/1 ground-truth labels supporting element-wise
            arithmetic and `.sum().item()` (e.g. a NumPy array).
        y_pred: array-like of 0/1 predicted labels with the same shape.

    Returns:
        The precision as a float. When nothing was predicted positive
        (TP + FP == 0) the result is 0.0 rather than a ZeroDivisionError.
    """
    tp = (y_true * y_pred).sum().item()
    fp = ((1 - y_true) * y_pred).sum().item()
    predicted_positives = tp + fp
    if predicted_positives == 0:
        # Precision is undefined without positive predictions; report 0.0.
        return 0.0
    return tp / predicted_positives

def recall(y_true, y_pred):
    """Return the recall TP / (TP + FN) for binary 0/1 labels.

    Args:
        y_true: array-like of 0/1 ground-truth labels supporting element-wise
            arithmetic and `.sum().item()` (e.g. a NumPy array).
        y_pred: array-like of 0/1 predicted labels with the same shape.

    Returns:
        The recall as a float. When there are no actual positives
        (TP + FN == 0) the result is 0.0 rather than a ZeroDivisionError.
    """
    tp = (y_true * y_pred).sum().item()
    fn = (y_true * (1 - y_pred)).sum().item()
    actual_positives = tp + fn
    if actual_positives == 0:
        # Recall is undefined without positive ground-truth labels; report 0.0.
        return 0.0
    return tp / actual_positives

def f1_score(y_true, y_pred):
    """Return the F1 score (harmonic mean of precision and recall) for binary 0/1 labels.

    The score is computed directly from the confusion counts as
    2*TP / (2*TP + FP + FN), which is algebraically equal to 2*p*r / (p + r)
    whenever precision p and recall r are both defined.

    Args:
        y_true: array-like of 0/1 ground-truth labels supporting element-wise
            arithmetic and `.sum().item()` (e.g. a NumPy array).
        y_pred: array-like of 0/1 predicted labels with the same shape.

    Returns:
        The F1 score as a float. When there are no true positives, false
        positives or false negatives at all, the result is 0.0 rather than a
        ZeroDivisionError.
    """
    tp = (y_true * y_pred).sum().item()
    fp = ((1 - y_true) * y_pred).sum().item()
    fn = (y_true * (1 - y_pred)).sum().item()
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # No positives in either the labels or the predictions: F1 is undefined.
        return 0.0
    return 2 * tp / denominator


# Put the model in evaluation mode.
classifier.model.eval()

correct_predictions = 0
total_predictions = 0
y_pred = []

# Prediction loop: collect the predicted class ids and count matches with the labels.
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to('mps'), attention_mask.to('mps'), labels.to('mps')

        outputs = classifier.model(input_ids, attention_mask=attention_mask)
        # The index of the largest logit is the predicted class id (0 or 1).
        predicted = outputs.logits.argmax(dim=1)

        # Convert the whole batch at once instead of indexing a Python list with
        # each tensor element, and compare on-device instead of rebuilding a tensor.
        y_pred.extend(predicted.tolist())
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

y_true = np.array(Y_test.copy())
y_pred = np.array(y_pred)
print(correct_predictions / total_predictions)
print(f"Test Accuracy: {accuracy(y_true, y_pred) * 100:.2f}%")
print(f"Precision: {precision(y_true, y_pred)* 100:.2f}%")
print(f"Recall: {recall(y_true, y_pred)* 100:.2f}%")
print(f"F1 Score: {f1_score(y_true, y_pred)* 100:.2f}%")

  0%|          | 0/14 [00:00<?, ?it/s]

0.9282511210762332
Test Accuracy: 92.83%
Precision: 91.00%
Recall: 92.86%
F1 Score: 91.92%


In [34]:
import pickle
# Load the pickled data that is scored in the cells below.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./long_bads_1001.pkl', 'rb') as real_file:
    real_data = pickle.load(real_file)

In [58]:
x_real = real_data['no_tag_review'].tolist()
real_encodings = tokenizer(x_real, truncation=True, padding=True, return_tensors='pt', max_length=512)
# Keep the encoded corpus in host memory as a plain dict of tensors. The inference
# loop calls .to('mps') on every batch, so copying the full token matrices to the
# device up front only duplicated them in device memory.
real_encodings = {key: val for key, val in real_encodings.items()}
real_dataset = TensorDataset(real_encodings['input_ids'], real_encodings['attention_mask'])
real_loader = DataLoader(real_dataset, batch_size=16)


In [86]:
# Number of review texts queued for scoring.
len(x_real)

548538

In [87]:
classifier.model.to('mps')
classifier.model.eval()

real_pred = []
real_prob = []

# Inference loop: predicted class id and its softmax probability for every row.
with torch.no_grad():
    for batch in tqdm(real_loader):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to('mps'), attention_mask.to('mps')

        outputs = classifier.model(input_ids, attention_mask=attention_mask)
        # The index of the largest logit is the predicted class id (0 or 1).
        predicted = outputs.logits.argmax(dim=1)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Convert each batch in bulk rather than indexing a Python list with every
        # individual tensor element.
        real_pred.extend(predicted.tolist())
        # Confidence score = probability of the most likely class.
        real_prob.extend(probs.max(dim=-1).values.tolist())

real_data['updated_pred_label'] = real_pred
real_data['updated_pred_score'] = real_prob

with open('./long_bads_1106.pkl', 'wb') as f:
    pickle.dump(real_data, f)

  0%|          | 0/34284 [00:00<?, ?it/s]

In [2]:
import pickle
# Reload the scored data written by the inference cell, plus a second pickled dataset.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./long_bads_1106.pkl', 'rb') as scored_file:
    real_data = pickle.load(scored_file)
with open('data_0930.pkl', 'rb') as data_file:
    df = pickle.load(data_file)

In [3]:
# Shape of the subset of rows whose updated prediction is label 0.
is_label_zero = real_data['updated_pred_label'] == 0
real_data[is_label_zero].shape

(377557, 18)

In [4]:
# With the existing engine, 31.1% were positive but were being wrongly shown as negative.
is_label_one = real_data['updated_pred_label'] == 1
real_data[is_label_one].shape[0] / real_data.shape[0]

0.3117031089915375

In [12]:
# Display the updated predicted-label column.
real_data['updated_pred_label']

0         0
1         0
2         0
3         0
4         1
         ..
548533    0
548534    0
548535    1
548536    1
548537    1
Name: updated_pred_label, Length: 548538, dtype: int64

In [15]:
# Scratch check: (1 - 10/100) as a percentage, rounded to 2 decimals and converted to a string.
str(round((1 - 10 / 100) * 100, 2))

'90.0'