<a href="https://colab.research.google.com/github/hsmu-jinhyeong/deep_learning_project/blob/main/mbti_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers



In [None]:
import pandas as pd

# Dataset locations (CSV files under the /content/drive/MyDrive folder)
reddit_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /MBTI 500.csv")
twitter_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /twitter_MBTI.csv")

# Preview the first rows of each frame
for frame in (reddit_df, twitter_df):
    print(frame.head())

                                               posts  type
0  know intj tool use interaction people excuse a...  INTJ
1  rap music ehh opp yeah know valid well know fa...  INTJ
2  preferably p hd low except wew lad video p min...  INTJ
3  drink like wish could drink red wine give head...  INTJ
4  space program ah bad deal meing freelance max ...  INTJ
   Unnamed: 0                                               text label
0           0  @Pericles216 @HierBeforeTheAC @Sachinettiyil T...  intj
1           1  @Hispanthicckk Being you makes you look cute||...  intj
2           2  @Alshymi Les balles sont réelles et sont tirée...  intj
3           3  I'm like entp but idiotic|||Hey boy, do you wa...  intj
4           4  @kaeshurr1 Give it to @ZargarShanif ... He has...  intj


In [None]:
# Show the column names of both frames so their schemas can be compared
for frame in (reddit_df, twitter_df):
    print(frame.columns)

Index(['posts', 'type'], dtype='object')
Index(['Unnamed: 0', 'text', 'label'], dtype='object')


In [None]:
# 필요한 라이브러리
!pip install transformers datasets scikit-learn --quiet

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# 1. Load the two raw datasets
reddit_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /MBTI 500.csv")
twitter_df = pd.read_csv("/content/drive/MyDrive/2025-1/dl/dl project /twitter_MBTI.csv")

# 2. Bring both frames onto the same column names ("text", "label")
reddit_df = reddit_df.rename(columns={"posts": "text", "type": "label"})
twitter_df = twitter_df.rename(columns={"text": "text", "label": "label"})

# 3. Lower-case the labels, stack both sources, drop rows missing text or label
reddit_df['label'] = reddit_df['label'].str.lower()
twitter_df['label'] = twitter_df['label'].str.lower()
combined_df = pd.concat(
    [reddit_df[['text', 'label']], twitter_df[['text', 'label']]]
).dropna(subset=["text", "label"])

# 4. Balance the classes by undersampling every label down to the rarest one,
#    then shuffle the rows and rebuild a clean 0..n-1 index
min_count = combined_df['label'].value_counts().min()
balanced_df = (
    pd.concat([
        resample(group, replace=False, n_samples=min_count, random_state=42)
        for _label_name, group in combined_df.groupby("label")
    ])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 5. Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# 6. Label encoding: class ids follow the alphabetical order of the label strings
labels = balanced_df["label"].tolist()
id_to_label = dict(enumerate(sorted(set(labels))))
label_to_id = {name: idx for idx, name in id_to_label.items()}
encoded_labels = [label_to_id[name] for name in labels]

# 7. Stratified 80/20 train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    balanced_df["text"].tolist(),
    encoded_labels,
    test_size=0.2,
    stratify=encoded_labels,
    random_state=42,
)

# 8. Tokenize both splits with identical settings
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

# 9. PyTorch Dataset definition
class MBTIDataset(torch.utils.data.Dataset):
    """Pairs tokenizer output with integer class labels.

    `encodings` is a mapping from field name to a per-sample sequence
    (anything exposing `.items()` whose values can be indexed by sample),
    and `labels` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field as a tensor, plus "labels".
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

train_dataset = MBTIDataset(train_encodings, train_labels)
val_dataset = MBTIDataset(val_encodings, val_labels)

# 10. BERT classification model.
# The id <-> label mapping is written into the model config so that it is
# saved together with the checkpoint; inference code can then read
# model.config.id2label instead of re-typing the mapping by hand.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)



# 11. Training hyper-parameters
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate = 2e-5,
    num_train_epochs=10,  # upper bound on epochs (can be generous: early stopping ends training sooner)
    logging_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model="eval_loss",  # metric used to pick the best checkpoint
    greater_is_better=False  # lower loss is better
)



# 12. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # stop after 2 evaluations in a row without improvement
)

# 13. Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,2.527,2.08963
2,1.7725,1.547514
3,1.4112,1.454226
4,1.2232,1.474138
5,1.0802,1.517419


TrainOutput(global_step=2290, training_loss=1.6028160928118176, metrics={'train_runtime': 2185.037, 'train_samples_per_second': 16.75, 'train_steps_per_second': 2.096, 'total_flos': 4815537551769600.0, 'train_loss': 1.6028160928118176, 'epoch': 5.0})

Epoch	Training Loss	Validation Loss (epoch = 5, learning rate = 2e-5)


1	2.533000	2.194306

2	1.856900	1.593964

3	1.445100	1.478045

4	1.212600	1.446583

5	1.066600	1.439478

In [None]:
from sklearn.metrics import accuracy_score

# Run the trained model over the validation split
predictions = trainer.predict(val_dataset)

# logits -> predicted class id (index of the largest logit in each row)
preds = torch.tensor(predictions.predictions).argmax(dim=1)

# Fraction of validation samples whose predicted id equals the true id
accuracy = accuracy_score(val_labels, preds)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.5841


In [None]:
# Map class ids back to MBTI strings. int() turns tensor elements into plain
# Python ints so the dictionary lookup does not raise KeyError.
decoded_preds = [id_to_label[int(p)] for p in preds]
decoded_labels = [id_to_label[int(l)] for l in val_labels]

# Print up to 10 examples. Slicing + zip (instead of range(10)) avoids an
# IndexError when the validation split holds fewer than 10 samples.
for text, real, pred in zip(val_texts[:10], decoded_labels, decoded_preds):
    print(f"[TEXT] {text[:100]}...")
    print(f"[REAL] {real} / [PRED] {pred}")
    print("-" * 50)


[TEXT] terrify angry scream calm steadily kind turn psychopath quite literally also scar rarely get angry i...
[REAL] istj / [PRED] istj
--------------------------------------------------
[TEXT] fps league starcraft limit amount possibility possible si prep prepare literally every possibility a...
[REAL] isfp / [PRED] isfp
--------------------------------------------------
[TEXT] chorus carry vein often find ground bring back reality feel bite stimulate luckily know thing satiat...
[REAL] isfp / [PRED] isfp
--------------------------------------------------
[TEXT] set interest maybe could buy help add fund medical bill nadala dami daw niya effort pero walng napal...
[REAL] infj / [PRED] infj
--------------------------------------------------
[TEXT] go away oh god honey imp wristed hit shoulder insu man big cock like hudder go world tbh lot mbti sc...
[REAL] entj / [PRED] entj
--------------------------------------------------
[TEXT] right fi would come say particular situation generall

In [None]:
# Save the model and its tokenizer into the same directory
# ("./bert_mbti_model"). The tokenizer call is the cell's last expression, so
# its return value is what the notebook displays below the cell.
model.save_pretrained("./bert_mbti_model")
tokenizer.save_pretrained("./bert_mbti_model")


('./bert_mbti_model/tokenizer_config.json',
 './bert_mbti_model/special_tokens_map.json',
 './bert_mbti_model/vocab.txt',
 './bert_mbti_model/added_tokens.json')

In [None]:
pip install spotipy

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import os
from getpass import getpass

# 1. Spotify API authentication.
# Credentials must never be stored in the notebook. They are read from the
# environment variables that spotipy itself recognizes, with an interactive
# (non-echoing) prompt as a fallback.
# NOTE(review): the key pair that used to be hard-coded here has been published
# with the notebook and should be revoked/rotated in the Spotify dashboard.
client_id = os.environ.get("SPOTIPY_CLIENT_ID") or getpass("Spotify client id: ")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET") or getpass("Spotify client secret: ")
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

# 2. MBTI -> search keyword mapping (edit freely)
mbti_keywords = {
    "intj": "instrumental",
    "intp": "experimental",
    "infj": "indie",
    "infp": "acoustic",
    "entj": "rock",
    "entp": "hip hop",
    "enfj": "pop",
    "enfp": "dance",
    "istj": "classical",
    "isfj": "lofi",
    "istp": "techno",
    "isfp": "ambient",
    "estj": "metal",
    "esfj": "kpop",
    "estp": "edm",
    "esfp": "rnb"
}

# 3. Pick one predicted MBTI type (here "enfp" is assumed as an example)
predicted_mbti = "enfp"
search_query = mbti_keywords.get(predicted_mbti.lower(), "pop")  # fallback to pop

# 4. Search Spotify.
# A multi-word genre (e.g. "hip hop") has to be quoted, otherwise only the
# first word is bound to the genre: filter and the rest becomes a free keyword.
genre_filter = f'"{search_query}"' if " " in search_query else search_query
results = sp.search(q=f"genre:{genre_filter}", type="track", limit=5)

# 5. Print the results
for i, track in enumerate(results['tracks']['items']):
    name = track['name']
    # Guard against a track that comes back with an empty artist list
    artist = track['artists'][0]['name'] if track['artists'] else "Unknown"
    url = track['external_urls']['spotify']
    print(f"{i+1}. {name} - {artist}\n   ▶ {url}")


In [None]:
# 필요한 라이브러리 설치
!pip install transformers datasets scikit-learn spotipy --quiet

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials

import os
from getpass import getpass

# 1. Spotify API credentials.
# Credentials must never be stored in the notebook. They are read from the
# environment variables that spotipy itself recognizes, with an interactive
# (non-echoing) prompt as a fallback.
# NOTE(review): the key pair that used to be hard-coded here has been published
# with the notebook and should be revoked/rotated in the Spotify dashboard.
client_id = os.environ.get("SPOTIPY_CLIENT_ID") or getpass("Spotify client id: ")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET") or getpass("Spotify client secret: ")
sp = Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

# 2. MBTI -> search keyword mapping
mbti_keywords = {
    "intj": "instrumental",
    "intp": "experimental",
    "infj": "indie",
    "infp": "acoustic",
    "entj": "rock",
    "entp": "hip hop",
    "enfj": "pop",
    "enfp": "dance",
    "istj": "classical",
    "isfj": "lofi",
    "istp": "techno",
    "isfp": "ambient",
    "estj": "metal",
    "esfj": "kpop",
    "estp": "edm",
    "esfp": "rnb"
}

# 3. Load the fine-tuned BERT model and tokenizer (adjust the path to your own checkpoint)
model_path = "/content/bert_mbti_model"  # example path
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# 4. Class id -> label string.
# Training assigns ids with enumerate(sorted(set(labels))), i.e. in ALPHABETICAL
# order of the label strings, so this table has to be alphabetical as well;
# any other order decodes every prediction to the wrong MBTI type.
# Assumes the training data contained exactly these 16 labels; the check
# below catches a differing class count.
id_to_label = {
    0: "enfj",
    1: "enfp",
    2: "entj",
    3: "entp",
    4: "esfj",
    5: "esfp",
    6: "estj",
    7: "estp",
    8: "infj",
    9: "infp",
    10: "intj",
    11: "intp",
    12: "isfj",
    13: "isfp",
    14: "istj",
    15: "istp"
}

# Fail early if the checkpoint was trained with a different number of classes
if model.config.num_labels != len(id_to_label):
    raise ValueError(
        f"Checkpoint has {model.config.num_labels} classes but id_to_label "
        f"defines {len(id_to_label)}; rebuild the mapping from the training labels."
    )
# 5. Predict the MBTI type of a piece of text
def predict_mbti(text, tokenizer, model, id_to_label, max_length=128):
    """Classify a single text and return its label string.

    Args:
        text: The input text (one string; the result is reduced with
            `.item()`, so a batch of several texts is not supported).
        tokenizer: Callable that turns `text` into keyword inputs for `model`.
        model: Classifier whose output exposes `.logits`.
        id_to_label: Mapping from predicted class id (int) to label string.
        max_length: Truncation length in tokens passed to the tokenizer.
            Defaults to 128 (the previous hard-coded value); the training
            cell of this notebook tokenizes with 512, so pass 512 to treat
            long inputs the same way as during training.

    Returns:
        The label string for the class with the highest logit.

    Raises:
        KeyError: If the predicted id is missing from `id_to_label`.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    model.eval()  # inference mode (disables dropout)
    with torch.no_grad():  # no gradients needed for prediction
        logits = model(**encoded).logits
    pred_id = torch.argmax(logits, dim=1).item()
    return id_to_label[pred_id]

# 6. Recommend music for an MBTI type
def recommend_music_by_mbti(predicted_mbti, sp, mbti_keywords, limit=5):
    """Search Spotify for tracks of the genre mapped to an MBTI type and print them.

    Args:
        predicted_mbti: MBTI label; looked up case-insensitively.
        sp: Client object exposing `search(q=..., type=..., limit=...)`.
        mbti_keywords: Mapping from lower-case MBTI label to a genre keyword.
            Unknown labels fall back to "pop".
        limit: Maximum number of tracks to request (default 5).

    Returns:
        None. The recommendations are printed.
    """
    search_query = mbti_keywords.get(predicted_mbti.lower(), "pop")
    # A multi-word genre (e.g. "hip hop") has to be quoted, otherwise only the
    # first word is bound to the genre: filter and the rest becomes a free keyword.
    genre_filter = f'"{search_query}"' if " " in search_query else search_query
    results = sp.search(q=f"genre:{genre_filter}", type="track", limit=limit)
    print(f"\n[MBTI: {predicted_mbti}] 장르 키워드: '{search_query}' 에 맞는 음악 추천입니다.\n")

    tracks = results['tracks']['items']
    if not tracks:
        # Tell the user explicitly instead of printing nothing
        print("추천할 곡을 찾지 못했습니다.")
        return

    for i, track in enumerate(tracks, start=1):
        name = track['name']
        # Guard against a track that comes back with an empty artist list
        artist = track['artists'][0]['name'] if track['artists'] else "Unknown"
        url = track['external_urls']['spotify']
        print(f"{i}. {name} - {artist}\n   ▶ {url}")

# 7. Entry point (input -> prediction -> recommendation)
def main():
    """Prompt for a text, predict its MBTI type and print music recommendations.

    Uses the module-level `tokenizer`, `model`, `id_to_label`, `sp` and
    `mbti_keywords` objects.
    """
    entered_text = input("텍스트를 입력하세요 (MBTI 예측 및 음악 추천):\n")
    mbti_type = predict_mbti(entered_text, tokenizer, model, id_to_label)
    recommend_music_by_mbti(mbti_type, sp, mbti_keywords)


if __name__ == "__main__":
    main()


텍스트를 입력하세요 (MBTI 예측 및 음악 추천):
나는 전화오는게 싫어

[MBTI: istp] 장르 키워드: 'techno' 에 맞는 음악 추천입니다.

1. Clarity - Zedd
   ▶ https://open.spotify.com/track/60wwxj6Dd9NJlirf84wr2c
2. The Middle - Zedd
   ▶ https://open.spotify.com/track/09IStsImFySgyp0pIQdqAc
3. Stay - Zedd
   ▶ https://open.spotify.com/track/6uBhi9gBXWjanegOb2Phh0
4. Everytime We Touch - Cascada
   ▶ https://open.spotify.com/track/5YJtMNWKe55yr49cyJgxva
5. Stay The Night - Featuring Hayley Williams Of Paramore - Zedd
   ▶ https://open.spotify.com/track/2QtJA4gbwe1AcanB2p21aP
