<a href="https://colab.research.google.com/github/hsmu-jinhyeong/deep_learning_project/blob/main/mbti_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -U transformers



In [None]:
import pandas as pd

# 데이터 경로
reddit_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /MBTI 500.csv")
twitter_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /twitter_MBTI.csv")

# 데이터 확인
print(reddit_df.head())
print(twitter_df.head())

                                               posts  type
0  know intj tool use interaction people excuse a...  INTJ
1  rap music ehh opp yeah know valid well know fa...  INTJ
2  preferably p hd low except wew lad video p min...  INTJ
3  drink like wish could drink red wine give head...  INTJ
4  space program ah bad deal meing freelance max ...  INTJ
   Unnamed: 0                                               text label
0           0  @Pericles216 @HierBeforeTheAC @Sachinettiyil T...  intj
1           1  @Hispanthicckk Being you makes you look cute||...  intj
2           2  @Alshymi Les balles sont réelles et sont tirée...  intj
3           3  I'm like entp but idiotic|||Hey boy, do you wa...  intj
4           4  @kaeshurr1 Give it to @ZargarShanif ... He has...  intj


In [None]:
print(reddit_df.columns)
print(twitter_df.columns)

Index(['type', 'text'], dtype='object')
Index(['Unnamed: 0', 'text', 'label'], dtype='object')


In [7]:
# 필요한 라이브러리
!pip install transformers datasets scikit-learn --quiet

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# 1. 데이터 로드
reddit_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /MBTI 500.csv")
twitter_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /twitter_MBTI.csv")

# 2. 열 이름 정리 및 통일
reddit_df = reddit_df.rename(columns={"posts": "text", "type": "label"})
twitter_df = twitter_df.rename(columns={"text": "text", "label": "label"})

# 3. 소문자 처리 및 데이터 합치기
reddit_df['label'] = reddit_df['label'].str.lower()
twitter_df['label'] = twitter_df['label'].str.lower()
combined_df = pd.concat([reddit_df[['text', 'label']], twitter_df[['text', 'label']]])
combined_df = combined_df.dropna(subset=["text", "label"])

# 4. 클래스 균형 맞추기 (undersampling)
min_count = combined_df['label'].value_counts().min()
balanced_df = pd.concat([
    resample(df, replace=False, n_samples=min_count, random_state=42)
    for label, df in combined_df.groupby("label")
])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 5. 토크나이저 준비
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# 6. 라벨 인코딩
labels = balanced_df["label"].tolist()
label_to_id = {label: idx for idx, label in enumerate(sorted(set(labels)))}
id_to_label = {idx: label for label, idx in label_to_id.items()}
encoded_labels = [label_to_id[label] for label in labels]

# 7. 데이터 분할
train_texts, val_texts, train_labels, val_labels = train_test_split(
    balanced_df["text"].tolist(), encoded_labels, test_size=0.2, stratify=encoded_labels, random_state=42
)

# 8. 텍스트 토크나이징
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# 9. PyTorch Dataset 정의
class MBTIDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MBTIDataset(train_encodings, train_labels)
val_dataset = MBTIDataset(val_encodings, val_labels)

# 10. BERT 분류 모델 정의
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_to_id))

# 11. 학습 파라미터 정의
training_args = TrainingArguments(
    output_dir="./result",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=500
)

# 12. Trainer 정의
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# 13. 모델 학습
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········




ValueError: API key must be 40 characters long, yours was 4