In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

In [2]:
df = pd.read_csv('C:/Users/JenMing/Desktop/MBTI/LSTM/mbti_to_LSTM_DF.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,"['INFP', 'INFP', 'INFJ', 'ENFP', 'ISTP', 'INTP..."
1,ENTP,"['INTJ', 'INTP', 'ENFP', 'INTJ', 'INTP', 'INTP..."
2,INTP,"['INTJ', 'INFP', 'INFP', 'INTP', 'INTP', 'INTJ..."
3,INTJ,"['INTJ', 'ISFJ', 'INFP', 'INTP', 'INTP', 'INTP..."
4,ENTJ,"['ENTJ', 'INTP', 'ENFP', 'INTP', 'ENTJ', 'INTJ..."


In [3]:
# 編碼轉換
personality_mapping = {'INFJ': 0,
                        'ENTP': 1,
                        'INTP': 2,
                        'INTJ': 3,
                        'ENTJ': 4,
                        'ENFJ': 5,
                        'INFP': 6,
                        'ENFP': 7,
                        'ISFP': 8,
                        'ISTP': 9,
                        'ISFJ': 10,
                        'ISTJ': 11,
                        'ESTP': 12,
                        'ESFP': 13,
                        'ESTJ': 14,
                        'ESFJ': 15 }

In [4]:
# 資料載入和轉換
encoded_data = []

chars_to_remove = "][' "    

for index, row in df.iterrows():
    dialogues = row["posts"] #字串
    target_personality = row["type"]
    for char in chars_to_remove:
        dialogues = dialogues.replace(char, "")
    
    dialogues_list = dialogues.split(',')
    
    
    dialogue_ids = [personality_mapping[personality] for personality in dialogues_list]
    target_personality_id = personality_mapping[target_personality]
    
    encoded_data.append((dialogue_ids, target_personality_id))
    

In [5]:
min_dialogue_length = min(len(dialogue) for dialogue, _ in encoded_data)
min_dialogue_length

1

In [6]:
# 將每個對話文本轉換為N-gram序列
    
n = 35  # N-gram的N值
ngram_encoded_data = []

for dialogue_ids, target_personality_id in encoded_data:
    if len(dialogue_ids) < n:
        ngram_encoded_data.append((dialogue_ids, target_personality_id))
    else:
        ngram_dialogue_ids = [dialogue_ids[i:i + n] for i in range(len(dialogue_ids) - n + 1)]
        ngram_encoded_data.extend([(ngram_dialogue, target_personality_id) for ngram_dialogue in ngram_dialogue_ids])


In [7]:
print(*encoded_data[:10])

([6, 6, 0, 7, 9, 2, 3, 7, 0, 0, 3, 7, 7, 9, 3, 2, 2, 2, 9, 2, 6, 0, 6, 1, 9, 9, 6, 6], 0) ([3, 2, 7, 3, 2, 2, 7, 2, 7, 3, 9, 9, 7, 7, 7, 7, 9, 7, 7, 3, 2, 3, 6, 11, 0, 7, 4, 1, 3, 7, 10, 9, 7, 7, 3, 6, 6, 8, 2, 11, 8, 7, 2], 1) ([3, 6, 6, 2, 2, 3, 3, 6, 6, 6, 7, 0, 6, 6, 2, 6, 6, 0, 0, 0, 0, 3, 2, 2, 2, 2, 2, 7, 0, 6, 3, 7, 2, 2, 2, 1, 3, 2, 2], 2) ([3, 10, 6, 2, 2, 2, 6, 7, 2, 7, 2, 3, 9, 0, 0, 9, 7, 2, 9, 1, 9, 9, 4, 2, 9, 2, 9, 2, 7, 6, 2, 9, 2, 2, 1, 2, 2, 7, 11, 2, 1, 6, 2, 6, 3, 0, 3, 3], 3) ([4, 2, 7, 2, 4, 3, 3, 3, 2, 1, 1, 3, 7, 0, 9, 3, 3, 9, 1, 8, 4, 6, 2, 2, 6, 0, 3, 1, 0, 3, 3, 0, 9, 7, 3, 6, 0, 3, 1, 2, 7, 6, 0, 3], 4) ([2, 7, 2, 7, 3, 2, 3, 6, 6, 3, 0, 6, 3, 6, 7, 2, 5, 2, 6, 3, 3, 7, 3, 0, 2, 6, 2, 0, 7, 7, 6, 2, 2, 7, 0, 2, 2, 2, 2, 3, 2, 1, 9, 3, 3, 7, 3], 3) ([8, 6, 7, 0, 3, 6, 6, 7, 3, 7, 6, 7, 11, 7, 7, 3, 7, 6, 7, 6, 2, 6, 0, 7, 0, 2, 0, 2, 2, 7, 0, 6, 6, 0, 0, 7, 7, 4, 6, 6, 0, 6, 0, 1, 0, 6, 6, 1], 0) ([3, 8, 6, 6, 7, 3, 2, 0, 6, 2, 6, 2, 2, 6, 2, 1, 6, 6, 1, 2,

In [8]:
type(encoded_data)

list

In [9]:
ngram_encoded_data

[([6,
   6,
   0,
   7,
   9,
   2,
   3,
   7,
   0,
   0,
   3,
   7,
   7,
   9,
   3,
   2,
   2,
   2,
   9,
   2,
   6,
   0,
   6,
   1,
   9,
   9,
   6,
   6],
  0),
 ([3,
   2,
   7,
   3,
   2,
   2,
   7,
   2,
   7,
   3,
   9,
   9,
   7,
   7,
   7,
   7,
   9,
   7,
   7,
   3,
   2,
   3,
   6,
   11,
   0,
   7,
   4,
   1,
   3,
   7,
   10,
   9,
   7,
   7,
   3],
  1),
 ([2,
   7,
   3,
   2,
   2,
   7,
   2,
   7,
   3,
   9,
   9,
   7,
   7,
   7,
   7,
   9,
   7,
   7,
   3,
   2,
   3,
   6,
   11,
   0,
   7,
   4,
   1,
   3,
   7,
   10,
   9,
   7,
   7,
   3,
   6],
  1),
 ([7,
   3,
   2,
   2,
   7,
   2,
   7,
   3,
   9,
   9,
   7,
   7,
   7,
   7,
   9,
   7,
   7,
   3,
   2,
   3,
   6,
   11,
   0,
   7,
   4,
   1,
   3,
   7,
   10,
   9,
   7,
   7,
   3,
   6,
   6],
  1),
 ([3,
   2,
   2,
   7,
   2,
   7,
   3,
   9,
   9,
   7,
   7,
   7,
   7,
   9,
   7,
   7,
   3,
   2,
   3,
   6,
   11,
   0,
   7,
   4,
   1,
   3,
   7,
   10

In [10]:
# 動態計算 input_size
max_dialogue_length = max(len(dialogue) for dialogue, _ in ngram_encoded_data)
input_size = max_dialogue_length
input_size

35

In [11]:
# 填充序列並轉換為張量
padded_dialogues = [torch.tensor(dialogue, dtype=torch.float32) for dialogue, _ in ngram_encoded_data]
padded_dialogues = pad_sequence(padded_dialogues, batch_first=True)

target_personality = torch.tensor([target for _, target in ngram_encoded_data], dtype=torch.float32)

In [12]:
# 定義 RNN 模型
class PersonalityPredictionRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size,dropout_prob=0.5):
        super(PersonalityPredictionRNN, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        #self.fc1 = nn.Linear(hidden_size, 64)  # 添加全連接層
        #self.relu = nn.ReLU()  # 添加 ReLU 激活函數
        #self.fc2 = nn.Linear(64, output_size)
        self.dropout = nn.Dropout(dropout_prob)
        #單向
        self.fc = nn.Linear(hidden_size, output_size)
        
        #雙向
        #self.fc = nn.Linear(hidden_size, output_size)
        #self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True,  bidirectional=True)
        #self.fc = nn.Linear(hidden_size * 2, output_size)
    
    def forward(self, x):
        #print("Input shape:", x.shape)  # 檢查輸入張量的形狀
        #out, _ = self.rnn(x)
        #out = out.unsqueeze(1)
        #out = out[:, -1, :]  # 只取最後一個時間步的輸出
        #out = self.fc1(out)
        #out = self.relu(out) #relu
        #out = F.sigmoid(out) #sigmoid
        #out = F.tanh(out) #Tanh
        #out = self.fc2(out)
        
        out, _ = self.rnn(x)
        out = out.unsqueeze(1)
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])  # 將 out 張量轉為 3 維再進行索引
        return out


In [13]:
# 資料集切分為訓練集和驗證集
train_dialogues, val_dialogues, train_target, val_target = train_test_split(padded_dialogues, target_personality, test_size=0.15, random_state=42)

# 初始化模型
hidden_size = 128
output_size = len(personality_mapping)
model = PersonalityPredictionRNN(input_size, hidden_size, output_size)

In [14]:
# 定義損失函數和優化器
#criterion = nn.MSELoss()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.05)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

In [15]:
# Early Stopping 相關設定
best_val_loss = float('inf')
best_model_state = None
patience = 10
counter = 0

In [16]:
file_path = "C:/Users/JenMing/Desktop/MBTI/LSTM/Model/n-grams/note.txt"

In [17]:
# 訓練模型
with open(file_path, "w") as f:
    num_epochs = 60
    dimension_counts = {'E/I': 0,
                        'S/N': 0,
                        'T/F': 0,
                        'J/P': 0}
    item_count = 0
    for epoch in range(num_epochs):
        # 在每個訓練循環中檢查 train_dialogues 張量的形狀
        print("Shape of train_dialogues:", train_dialogues.shape)
        f.write("Shape of train_dialogues: {}\n".format(train_dialogues.shape))
        # 在每個驗證循環中檢查 val_dialogues 張量的形狀
        print("Shape of val_dialogues:", val_dialogues.shape)
        f.write("Shape of val_dialogues: {}\n".format(val_dialogues.shape))
        total_loss = 0.0
        model.train()
        for dialogue_batch, target_batch in zip(train_dialogues, train_target):
            target_batch = target_batch.to(torch.long)
            optimizer.zero_grad()

            outputs = model(dialogue_batch.unsqueeze(0))
            loss = criterion(outputs.squeeze(0), target_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        scheduler.step() # 调整学习率
        
        average_loss = total_loss / len(train_dialogues)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}\n")
        f.write(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}\n")
        # 驗證模型
        val_loss = 0.0
        model.eval()
        with torch.no_grad():
            for dialogue_batch, target_batch in zip(val_dialogues, val_target):
                target_batch = target_batch.to(torch.long)
                outputs = model(dialogue_batch.unsqueeze(0))
                loss = criterion(outputs.squeeze(1), target_batch.unsqueeze(0))
                val_loss += loss.item()
                
                predicted_personality_ids = outputs.argmax(dim=1)
                predicted_personalities = [personality for personality, id in personality_mapping.items() if id in predicted_personality_ids]
                true_personality_id = target_batch.item()
                true_personality = [personality for personality, id in personality_mapping.items() if id == true_personality_id][0]
                
                #print(predicted_personalities[0])
                #print(true_personality)
                for n in range(4):
                    if n == 0:
                        if predicted_personalities[0][n] == true_personality[n]:
                            dimension_counts['E/I'] += 1
                    elif n == 1:
                        if predicted_personalities[0][n] == true_personality[n]:
                            dimension_counts['S/N'] += 1
                    elif n == 2:
                        if predicted_personalities[0][n] == true_personality[n]:
                            dimension_counts['T/F'] += 1
                    elif n == 3:
                        if predicted_personalities[0][n] == true_personality[n]:
                            dimension_counts['J/P'] += 1
                item_count += 1
                
        average_val_loss = val_loss / len(val_dialogues)
        print(f"Validation Loss: {average_val_loss:.4f}\n")
        f.write(f"Validation Loss: {average_val_loss:.4f}\n")
        
        # 比較驗證損失，並根據需要保存最佳模型
        '''
        if average_val_loss < best_val_loss:
            best_val_loss = average_val_loss
            best_model_state = model.state_dict()
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print("Early Stopping: Validation loss has not improved for {} epochs. Stopping training.".format(patience))
                break
        '''
    EI_counts = dimension_counts['E/I']
    SN_counts = dimension_counts['S/N']
    TF_counts = dimension_counts['T/F']
    JP_counts = dimension_counts['J/P']
    print(f'E.I: {EI_counts}/{item_count}')
    print('Accuracy: '+ str(EI_counts/item_count)+'\n')
    print(f'S.N: {SN_counts}/{item_count}')
    print('Accuracy: '+ str(SN_counts/item_count)+'\n')
    print(f'T.F: {TF_counts}/{item_count}')
    print('Accuracy: '+ str(TF_counts/item_count)+'\n')
    print(f'J.P: {JP_counts}/{item_count}')
    print('Accuracy: '+ str(JP_counts/item_count)+'\n')
    
    f.write(f'E.I: {EI_counts}/{item_count}')
    f.write('Accuracy: '+ str(EI_counts/item_count)+'\n')
    f.write(f'S.N: {SN_counts}/{item_count}')
    f.write('Accuracy: '+ str(SN_counts/item_count)+'\n')
    f.write(f'T.F: {TF_counts}/{item_count}')
    f.write('Accuracy: '+ str(TF_counts/item_count)+'\n')
    f.write(f'J.P: {JP_counts}/{item_count}')
    f.write('Accuracy: '+ str(JP_counts/item_count)+'\n')
    
    '''
    # 計算整體準確率和各維度的準確率
    total_correct = sum([counts['correct'] for counts in dimension_counts.values()])
    total_total = sum([counts['total'] for counts in dimension_counts.values()])
    overall_accuracy = total_correct / total_total if total_total > 0 else 0.0
    
    print(f"Overall Accuracy: {overall_accuracy:.4f}")
    f.write(f"Overall Accuracy: {overall_accuracy:.4f}\n")
    
    for dimension, counts in dimension_counts.items():
        dimension_accuracy = counts['correct'] / counts['total'] if counts['total'] > 0 else 0.0
        print(f"{dimension} Accuracy: {dimension_accuracy:.4f}")
        f.write(f"{dimension} Accuracy: {dimension_accuracy:.4f}\n")
    '''    
f.close()

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [1/60], Loss: 4.9700

Validation Loss: 3.4946

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [2/60], Loss: 4.5642

Validation Loss: 2.9504

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [3/60], Loss: 4.3810

Validation Loss: 2.8658

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [4/60], Loss: 4.2302

Validation Loss: 2.8005

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [5/60], Loss: 4.2074

Validation Loss: 2.6640

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [6/60], Loss: 4.2082

Validation Loss: 2.8743

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 3

Epoch [55/60], Loss: 2.2865

Validation Loss: 2.2810

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [56/60], Loss: 2.2870

Validation Loss: 2.2814

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [57/60], Loss: 2.2875

Validation Loss: 2.2819

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [58/60], Loss: 2.2879

Validation Loss: 2.2823

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [59/60], Loss: 2.2886

Validation Loss: 2.2828

Shape of train_dialogues: torch.Size([88366, 35])
Shape of val_dialogues: torch.Size([15595, 35])
Epoch [60/60], Loss: 2.2894

Validation Loss: 2.2833

E.I: 655781/935700
Accuracy: 0.7008453564176552

S.N: 809611/935700
Accuracy: 0.8652463396387731

T.F: 500748/935700
Accuracy: 0.535158704713049

J.P: 552916/935700
Accuracy: 0.590911616

In [18]:
if best_model_state:
    torch.save(best_model_state, "C:/Users/JenMing/Desktop/MBTI/LSTM/Model/n-grams/best_model.pth")
else:
    torch.save(model.state_dict(), "C:/Users/JenMing/Desktop/MBTI/LSTM/Model/n-grams/best_model.pth")

In [19]:
# 加載已經訓練好的模型
best_model = PersonalityPredictionRNN(input_size, hidden_size, output_size)
best_model.load_state_dict(torch.load("C:/Users/JenMing/Desktop/MBTI/LSTM/Model/n-grams/best_model.pth"))
best_model.eval()

# 將驗證數據轉換為張量並進行預測
with torch.no_grad():
    val_outputs = best_model(val_dialogues)
    predicted_personality_ids = val_outputs.argmax(dim=1)

# 計算準確率
correct_predictions = (predicted_personality_ids == val_target).sum().item()
total_samples = val_dialogues.shape[0]
accuracy = correct_predictions / total_samples

print("Validation Accuracy: {:.2%}".format(accuracy))
with open(file_path, "a") as f:
    f.write("\n------------\n")
    f.write("Validation Accuracy: {:.2%}".format(accuracy) + "\n")

Validation Accuracy: 21.26%


In [20]:
# 將測試數據進行預處理，並轉換為張量
test_personality_list = ['INFJ', 'INTP', 'ENFP', 'INFP', 'INTJ', 'ENFP', 'INFP', 'ENTP', 'ENFP', 'ENFP']  # 加載測試數據，類似於您的訓練數據
# 使用 personality_mapping 將人格類別轉換為 ID
test_personality_ids = [personality_mapping[personality] for personality in test_personality_list]

# 根據人格類別 ID 找到對應的對話編碼
test_dialogues_encoded = [ngram_encoded_data[id][0] for id in test_personality_ids]

# 填充對話編碼，使其長度與 input_size 相同
max_dialogue_length = input_size
padded_test_dialogues = [dialogue + [0] * (max_dialogue_length - len(dialogue)) for dialogue in test_dialogues_encoded]

# 將對話編碼進行轉換，並轉換為張量
test_dialogues_padded = pad_sequence([torch.tensor(dialogue, dtype=torch.float32) for dialogue in padded_test_dialogues], batch_first=True)

In [21]:
# 使用模型進行預測
with torch.no_grad():
    test_outputs = best_model(test_dialogues_padded)
    predicted_personality_probs = torch.softmax(test_outputs, dim=1)

# 找到最相近的人格及其概率
closest_personality_id = torch.argmax(predicted_personality_probs, dim=1)
closest_personality = [personality for personality, id in personality_mapping.items() if id == closest_personality_id[0].item()][0]
closest_personality_prob = predicted_personality_probs[0][closest_personality_id[0]].item()

# 輸出最相近的人格及其概率
print("Closest Personality:",closest_personality)
print("Probability:",closest_personality_prob)
print("------------")

# 找到前四高的人格及其概率
top_5_personality_probs, top_5_personality_ids = torch.topk(predicted_personality_probs, k=5)
top_5_personality_probs = top_5_personality_probs[0]
top_5_personality_ids = top_5_personality_ids[0]

# 輸出前四高的人格及其概率
with open(file_path, "a") as f:
    f.write("\n------------\n")
    f.write("Closest Personality: {}\n".format(closest_personality))
    f.write("Probability: {}\n".format(closest_personality_prob))
    f.write("\n------------\n")
    
    for i in range(5):
        personality_id = top_5_personality_ids[i].item()
        personality = [personality for personality, id in personality_mapping.items() if id == personality_id][0]
        prob = top_5_personality_probs[i].item()
        print("Top ", i + 1, " Personality:", personality)
        print("\nProbability:", prob , "\n")
        f.write("Top "+ str(i + 1) + " Personality:"+ personality)
        f.write("\nProbability:"+ str(prob) + "\n")
f.close()

Closest Personality: INFP
Probability: 0.19502240419387817
------------
Top  1  Personality: INFP

Probability: 0.19502240419387817 

Top  2  Personality: INFJ

Probability: 0.162883922457695 

Top  3  Personality: INTP

Probability: 0.1398811638355255 

Top  4  Personality: INTJ

Probability: 0.12019474804401398 

Top  5  Personality: ENTP

Probability: 0.0846424400806427 

