In [1]:
# 메모리내 변수 제거

all = [var for var in globals() if var[0] != "_"]   # globals() 목록의 첫글자가 _ 로 시작하지 않는 자료의 리스트만 가져와서
for var in all:
    del globals()[var]

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from pytorch_transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import Adam, AdamW
import torch.nn.functional as F

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


### 데이터 불러오기 및 전처리

In [3]:
# mbti_df = pd.read_csv('C:/Users/user/Desktop/MBTI_2.0/model/mbti_data.csv')
mbti_df = pd.read_csv('C:/Users/user/Desktop/MBTI_2.0/model/mbti_data_add.csv', encoding='cp949')

mbti_df.head()

Unnamed: 0,mbti,first_comment,second_comment,third_comment
0,N,어떠한 과정을 거쳐 변화되었는지 주변에 마법사가 생겼는지 찾아본다,피 안빨면 같이 살게 해준다,머리에 물을 준다!
1,N,1.망했,2.망했,3.망했
2,N,1 설마 진짜 기타가 젤리로 변하진 않았을거라고 생각하고 가족들한테 이 장난 누가 ...,2 고민해본다,3 사진을 찍은 뒤 살살 당겨보고 괜찮을것같으면 뽑는다
3,N,1. 연주해본다,2. 나를 괴롭히지 않는다는 계약서를 쓴다,3. 물은 어떻게 주지 고민한다
4,S,1.먹는다,2.싫다,3.자른다


In [4]:
#################################################### 전처리 작업 - 숫자 및 특수 기호 제거

mbti_df['first_comment']=mbti_df['first_comment'].str.replace(r"[0-9]","")
mbti_df["first_comment"] = mbti_df["first_comment"].str.replace(pat = r'[^\w]', repl=r'', regex=True)

mbti_df['second_comment']=mbti_df['second_comment'].str.replace(r"[0-9]","")
mbti_df["second_comment"] = mbti_df["second_comment"].str.replace(pat = r'[^\w]', repl=r'', regex=True)

mbti_df['third_comment']=mbti_df['third_comment'].str.replace(r"[0-9]","")
mbti_df["third_comment"] = mbti_df["third_comment"].str.replace(pat = r'[^\w]', repl=r'', regex=True)

In [5]:
# X1 = mbti_df['first_comment']
# X2 = mbti_df['second_comment']
# X3 = mbti_df['third_comment']
# y = mbti_df['mbti']

In [6]:
#################################################### label을 숫자로 변환

le = preprocessing.LabelEncoder()
mbti_df['mbti'] = le.fit_transform(mbti_df['mbti'])

In [7]:
mbti_df.head()

Unnamed: 0,mbti,first_comment,second_comment,third_comment
0,0,어떠한과정을거쳐변화되었는지주변에마법사가생겼는지찾아본다,피안빨면같이살게해준다,머리에물을준다
1,0,1망했,2망했,3망했
2,0,1설마진짜기타가젤리로변하진않았을거라고생각하고가족들한테이장난누가친거냐고물어본다,2고민해본다,3사진을찍은뒤살살당겨보고괜찮을것같으면뽑는다
3,0,1연주해본다,2나를괴롭히지않는다는계약서를쓴다,3물은어떻게주지고민한다
4,1,1먹는다,2싫다,3자른다


In [8]:
mbti_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934 entries, 0 to 933
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   mbti            934 non-null    int32 
 1   first_comment   934 non-null    object
 2   second_comment  932 non-null    object
 3   third_comment   934 non-null    object
dtypes: int32(1), object(3)
memory usage: 25.7+ KB


In [9]:
mbti_df['mbti'].value_counts()

mbti
1    473
0    461
Name: count, dtype: int64

In [10]:
device = torch.device("cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [11]:
#################################################### 토큰화 작업
tokenized_texts = [tokenizer.tokenize(comment) for comment in mbti_df['first_comment']]

# 문장 최대 시퀀스를 설정해 정수 인코딩 및 제로 패딩 수행
max_len = max(len(item) for item in tokenized_texts)
print(max_len)

input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post") # 문서 뒤에 0을 채우기 위해 padding="post" 지정
input_ids = torch.tensor(input_ids)
labels = torch.tensor(mbti_df['mbti'])

52


In [12]:
print(tokenized_texts)
print(input_ids)
print(labels)

[['어', '##떠', '##한', '##과', '##정을', '##거', '##쳐', '##변', '##화', '##되', '##었', '##는', '##지', '##주', '##변', '##에', '##마', '##법', '##사가', '##생', '##겼', '##는', '##지', '##찾', '##아', '##본', '##다'], ['1', '##망', '##했'], ['1', '##설', '##마', '##진', '##짜', '##기', '##타', '##가', '##젤', '##리로', '##변', '##하', '##진', '##않', '##았', '##을', '##거', '##라고', '##생', '##각', '##하고', '##가', '##족', '##들', '##한', '##테', '##이', '##장', '##난', '##누', '##가', '##친', '##거', '##냐', '##고', '##물', '##어', '##본', '##다'], ['1', '##연', '##주', '##해', '##본', '##다'], ['1', '##먹', '##는다'], ['1일', '##단', '##다', '##시', '##돌', '##아', '##올', '##수', '##잇', '##으', '##니', '##모', '##셔', '##둔', '##다'], ['1', '##어', '##쩔', '##수', '##없', '##다'], ['[UNK]'], ['1', '##줄', '##도', '##젤', '##리', '##인', '##지', '##먹', '##어', '##본', '##다'], ['1', '##뭔', '##소', '##리', '##지'], ['1', '##먹', '##었'], ['먹', '##었다'], ['다시', '##기', '##타', '##로', '##되', '##돌', '##릴', '##방', '##법을', '##찾', '##아', '##본', '##다'], ['[UNK]'], ['1', '##딱', '##한', '##입', '##만', '#

In [13]:
#################################################### train/test/val split
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
# train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels, test_size=0.1, random_state=42)

print(len(train_inputs), len(test_inputs))
# print(len(train_inputs), len(test_inputs), len(val_inputs))

747 187


In [14]:
train_data = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)

test_data = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)

In [15]:
#################################################### 학습 진행
epochs = 25
# optimizer = Adam(model.parameters(), lr=1e-5)
# optimizer = AdamW(model.parameters(), lr=2e-5)
optimizer = AdamW(model.parameters(), lr=2e-5, eps = 1e-8)
# 1e-8, 2e-5, 1e-5 / eps : 1e-8

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

best_val_accuracy = 0.0
patience = 5
# early stopping - 몇 번의 에폭 동안 성능 향상이 없으면 중지할 것인지 지정

for epoch in range(epochs):
    model.train()
    total_correct, total_samples, total_loss = 0,0,0
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        inputs, labels = batch
        outputs = model(inputs, labels=labels.to(device).long())
        loss, logits = outputs

        total_loss += loss.item()
        
        predicted = torch.argmax(logits, 1)
        # print(predicted)
        # print(labels)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

        loss.backward()

        optimizer.step()

        scheduler.step()
        
        model.zero_grad()

    epoch_loss = total_loss / len(train_dataloader)
    epoch_accuracy = total_correct / total_samples

    print(f"train - epoch [{epoch+1}/{epochs}] - Loss: {epoch_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")

    # evaluate
    model.eval()
    val_loss, val_correct, val_samples = 0,0,0
    for test_batch in test_dataloader:
        test_inputs, test_labels = test_batch
        with torch.no_grad():
            test_outputs = model(test_inputs, labels=test_labels.to(device).long())
            
        t_loss, t_logits = test_outputs

        val_loss += t_loss.item()

        t_predicted = torch.argmax(t_logits, 1)
        val_correct += (t_predicted == test_labels).sum().item()
        val_samples += test_labels.size(0)
        
    val_loss /= len(test_dataloader)
    val_accuracy = val_correct / val_samples

    print(f"validation - epoch [{epoch+1}/{epochs}] -  Loss: {val_loss:.4f} - Accuracy: {val_accuracy:.4f}")

    # early stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        patience_cnt = 0

        model_name = f'best_model_{best_val_accuracy:.4f}.pth'
        torch.save(model.state_dict(), model_name)
        # torch.save(model, model_name)
    else:
        patience_cnt += 1
        
    if patience_cnt >= patience:
        print('Early stopping')
        break


train - epoch [1/25] - Loss: 0.6519 - Accuracy: 0.6064
validation - epoch [1/25] -  Loss: 0.6510 - Accuracy: 0.6471
train - epoch [2/25] - Loss: 0.6082 - Accuracy: 0.6760
validation - epoch [2/25] -  Loss: 0.6385 - Accuracy: 0.6524
train - epoch [3/25] - Loss: 0.5941 - Accuracy: 0.7068
validation - epoch [3/25] -  Loss: 0.6568 - Accuracy: 0.6310
train - epoch [4/25] - Loss: 0.5702 - Accuracy: 0.7189
validation - epoch [4/25] -  Loss: 0.6660 - Accuracy: 0.6417
train - epoch [5/25] - Loss: 0.5062 - Accuracy: 0.7537
validation - epoch [5/25] -  Loss: 0.6677 - Accuracy: 0.6952
train - epoch [6/25] - Loss: 0.4850 - Accuracy: 0.7617
validation - epoch [6/25] -  Loss: 0.7796 - Accuracy: 0.6096
train - epoch [7/25] - Loss: 0.4766 - Accuracy: 0.7537
validation - epoch [7/25] -  Loss: 0.6827 - Accuracy: 0.6845
train - epoch [8/25] - Loss: 0.4294 - Accuracy: 0.7992
validation - epoch [8/25] -  Loss: 0.7461 - Accuracy: 0.7166
train - epoch [9/25] - Loss: 0.4047 - Accuracy: 0.8086
validation - epoc

In [16]:
# #################################################### 학습 진행
# epochs = 100
# optimizer = Adam(model.parameters(), lr=2e-5)
# # 1e-8

# patience = 3
# # early stopping - 몇 번의 에폭 동안 성능 향상이 없으면 중지할 것인지 지정

# for epoch in range(epochs):
#     model.train()
#     total_correct = 0
#     total_samples = 0
#     total_loss = 0
#     for step, batch in enumerate(train_dataloader):
#         optimizer.zero_grad()
#         inputs, labels = batch
#         outputs = model(inputs, labels=labels.to(device).long())
#         loss, logits = outputs

#         loss.backward()

#         optimizer.step()

#         total_loss += loss.item()
#         predicted = torch.argmax(logits, 1)
#         # print(predicted)
#         # print(labels)
#         total_correct += (predicted == labels).sum().item()
#         total_samples += labels.size(0)

#     epoch_loss = total_loss / len(train_dataloader)
#     epoch_accuracy = total_correct / total_samples

#     print(f"Epoch [{epoch+1}/{epochs}] - Loss: {epoch_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")

In [17]:
# #################################################### 학습 진행
# epochs = 4
# optimizer = Adam(model.parameters(), lr=2e-5)
# total_loss = 0
# total_len = 0
# total_accuracy = 0

# for epoch in range(epochs):
#     model.train()
#     for step, batch in enumerate(train_dataloader):
#         optimizer.zero_grad()
#         inputs, labels = batch
#         outputs = model(inputs, labels=labels)
#         loss, logits = outputs

#         # 예측 결과
#         pred = torch.argmax(F.softmax(logits), dim=1)
#         # print('pred',pred)
#         # print('real_label', labels)

#         # 실제 레이블과 예측 값 비교
#         accuracy = accuracy_score(labels, pred)
#         total_accuracy += accuracy

#         total_len += len(labels)
#         total_loss += loss.item()

#         # gradient 연산
#         loss.backward()
#         # parameter update
#         optimizer.step()
#         # gradient 초기화
#         model.zero_grad()

#     print('[Epoch {}] -> Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, total_loss/len(train_dataloader), total_accuracy/len(train_dataloader)))


In [18]:
# # Training loop
# optimizer = AdamW(model.parameters(), lr=2e-5)
# for epoch in range(epochs):
#     model.train()
#     for batch in train_dataloader:
#         optimizer.zero_grad()
#         inputs, labels = batch
#         outputs = model(inputs, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

# # Evaluation
# model.eval()
# with torch.no_grad():
#     # Evaluate on validation set or test set
#     # ...

# # Prediction
# new_comments = ["A new comment.", "Another new comment."]
# tokenized_new_comments = [tokenizer.tokenize(comment) for comment in new_comments]
# input_ids_new = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_new_comments]
# input_ids_new = torch.tensor(input_ids_new)
# with torch.no_grad():
#     logits = model(input_ids_new)
#     predicted_labels = torch.argmax(logits, dim=1)

# print(predicted_labels)
