In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from torch.nn.functional import softmax
from torch.nn.utils.rnn import pad_sequence

In [2]:
df = pd.read_csv('C:/Users/JenMing/Desktop/MBTI/LSTM/mbti_to_LSTM_DF.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,"['INFP', 'INFP', 'INFJ', 'ENFP', 'ISTP', 'INTP..."
1,ENTP,"['INTJ', 'INTP', 'ENFP', 'INTJ', 'INTP', 'INTP..."
2,INTP,"['INTJ', 'INFP', 'INFP', 'INTP', 'INTP', 'INTJ..."
3,INTJ,"['INTJ', 'ISFJ', 'INFP', 'INTP', 'INTP', 'INTP..."
4,ENTJ,"['ENTJ', 'INTP', 'ENFP', 'INTP', 'ENTJ', 'INTJ..."


In [3]:
# 編碼轉換
personality_mapping = {'INFJ': 0,
                        'ENTP': 1,
                        'INTP': 2,
                        'INTJ': 3,
                        'ENTJ': 4,
                        'ENFJ': 5,
                        'INFP': 6,
                        'ENFP': 7,
                        'ISFP': 8,
                        'ISTP': 9,
                        'ISFJ': 10,
                        'ISTJ': 11,
                        'ESTP': 12,
                        'ESFP': 13,
                        'ESTJ': 14,
                        'ESFJ': 15 }

In [4]:
# 資料載入和轉換
encoded_data = []

chars_to_remove = "][' "    

for index, row in df.iterrows():
    dialogues = row["posts"] #字串
    target_personality = row["type"]
    for char in chars_to_remove:
        dialogues = dialogues.replace(char, "")
    
    dialogues_list = dialogues.split(',')
    
    
    dialogue_ids = [personality_mapping[personality] for personality in dialogues_list]
    target_personality_id = personality_mapping[target_personality]
    
    encoded_data.append((dialogue_ids, target_personality_id))

In [5]:
# 將每個對話轉換為列表
encoded_data = [(list(dialogue), target) for dialogue, target in encoded_data]
padded_dialogues = pad_sequence([torch.tensor(dialogue, dtype=torch.long) for dialogue, _ in encoded_data], batch_first=True)

# 動態計算 input_size
max_dialogue_length = max(len(dialogue) for dialogue, _ in encoded_data)

# 將目標轉換為PyTorch張量
target_personality = torch.tensor([target for _, target in encoded_data], dtype=torch.long)

In [6]:
# 定義CNN模型
class PersonalityPredictionCNN(nn.Module):
    def __init__(self, input_size, num_classes, num_filters, kernel_size):
        super(PersonalityPredictionCNN, self).__init__()
        self.embedding = nn.Embedding(input_size, 64)  # 嵌入層
        self.conv1 = nn.Conv1d(64, num_filters, kernel_size)  # 第一個卷積層
        self.relu = nn.ReLU()  # ReLU激活函數
        self.fc = nn.Linear(num_filters, num_classes)  # 全連接層

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  # 將維度進行調整
        x = self.conv1(x)
        x = self.relu(x)
        x = torch.max(x, dim=2)[0]  # 使用最大池化
        x = self.fc(x)
        return x

In [7]:
# 將序列轉換為張量
train_dialogues = [torch.tensor(dialogue, dtype=torch.long) for dialogue in padded_dialogues]
train_target = torch.tensor(target_personality, dtype=torch.long)

# 創建包含元組的列表
train_data_tuples = [(dialogue, target) for dialogue, target in zip(train_dialogues, train_target)]

# 創建 TensorDataset，分別傳遞對話和目標
train_dataset = TensorDataset(torch.stack(train_dialogues), train_target)

# 切分訓練集和驗證集
train_size = int(0.85 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

# 創建 DataLoader
batch_size = 32  # 設置批次大小
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)


  train_dialogues = [torch.tensor(dialogue, dtype=torch.long) for dialogue in padded_dialogues]
  train_target = torch.tensor(target_personality, dtype=torch.long)


In [8]:
# 初始化模型
input_size = len(personality_mapping)
num_classes = len(personality_mapping)
num_filters = 128  # 卷積層的濾波器數量
kernel_size = 3   # 卷積核大小
model = PersonalityPredictionCNN(input_size, num_classes, num_filters, kernel_size)

# 定義損失函數和優化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [9]:
file_path = "C:/Users/JenMing/Desktop/MBTI/CNN/Model/"

In [28]:
# 設定訓練參數
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        dialogues, targets = batch
        optimizer.zero_grad()
        outputs = model(dialogues)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)

    # 在每個 epoch 結束後評估模型
    model.eval()
    val_loss = 0.0
    val_predictions = []
    val_target = []

    with torch.no_grad():
        a=1
        for batch in val_loader:
            dialogues, targets = batch
            val_target.extend(targets.cpu().numpy())
            val_outputs = model(dialogues)
            batch_loss = criterion(val_outputs, targets)
            val_loss += batch_loss.item()
            val_predictions.extend(val_outputs.argmax(dim=1).cpu().numpy())
            
            # 使用編碼轉換映射
            mbti_labels_pre = [key for key, value in personality_mapping.items() if value in val_predictions]
            mbti_labels_tar = [key for key, value in personality_mapping.items() if value in val_target]

            if a == 1:
                print(mbti_labels_tar)
                print("\n")
                print(mbti_labels_pre)
                print("\n")
                a += 1

    val_accuracy = accuracy_score(val_target, val_predictions)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {average_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

['INFJ', 'INTP', 'INTJ', 'ENTJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISFJ', 'ESFJ']


['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISTJ']


Epoch [1/10], Train Loss: 1.2122, Val Loss: 128.7268, Val Accuracy: 0.2604
['INFJ', 'INTP', 'INTJ', 'ENTJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISFJ', 'ESFJ']


['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP', 'ISTP', 'ISTJ']


Epoch [2/10], Train Loss: 1.2148, Val Loss: 130.5601, Val Accuracy: 0.2911
['INFJ', 'INTP', 'INTJ', 'ENTJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISFJ', 'ESFJ']


['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISTJ']


Epoch [3/10], Train Loss: 1.2115, Val Loss: 127.8226, Val Accuracy: 0.2680
['INFJ', 'INTP', 'INTJ', 'ENTJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISFJ', 'ESFJ']


['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP', 'ISFP', 'ISTP', 'ISTJ']


Epoch [4/10], Train Loss: 1.2164, Val Loss: 126.7238, Val Accuracy: 0.2688
['INFJ', 'IN

In [None]:
with torch.no_grad():
    a = 1
    for batch in val_loader:  # 使用 DataLoader 來遍歷驗證數據
        dialogues, targets = batch
        val_outputs = model(dialogues)
        batch_loss = criterion(val_outputs, targets)
        val_loss += batch_loss.item()
        val_predictions.extend(val_outputs.argmax(dim=1).cpu().numpy())
        
        # 使用編碼轉換映射
        mbti_labels_pre = [key for key, value in personality_mapping.items() if value in val_predictions]
        mbti_labels_tar = [key for key, value in personality_mapping.items() if value in targets]
        
        if a == 1:
            print("Validation Examples:")
            for i in range(10):  # 打印前10個示例
                print(f"Example {i+1}: Target = {mbti_labels_tar[i]}, Prediction = {mbti_labels_pre[i]}")
            print("\n")
            a += 1

val_accuracy = accuracy_score(val_target, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

with open(file_path+"note.txt", "w") as f:
    dimension_counts = {'E/I': 0,
                        'S/N': 0,
                        'T/F': 0,
                        'J/P': 0}
    item_count = 0
    
    num_epochs = 10

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:  # 使用 DataLoader 來遍歷訓練數據
            dialogues, targets = batch
            optimizer.zero_grad()
            outputs = model(dialogues)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)

        # 在每個 epoch 結束後評估模型
        model.eval()
        val_loss = 0.0
        val_predictions = []
        val_target = []
        
        with torch.no_grad():
            a = 1
            for batch in val_loader:  # 使用 DataLoader 來遍歷驗證數據
                dialogues, targets = batch
                val_outputs = model(dialogues)
                batch_loss = criterion(val_outputs, targets)
                val_loss += batch_loss.item()
                val_predictions.extend(val_outputs.argmax(dim=1).cpu().numpy())
                
                val_target.extend(targets.cpu().numpy())
                
                # 使用編碼轉換映射
                mbti_labels_pre = [key for key, value in personality_mapping.items() if value in val_predictions]
                mbti_labels_tar = [key for key, value in personality_mapping.items() if value in targets]
                
                print("Val outputs shape:", val_outputs.shape)
                print("Val targets shape:", targets.shape)
                
    
                if a == 1:
                    print(mbti_labels_pre)
                    print("\n")
                    print(mbti_labels_tar)
                    print("\n")
                    a +=1
                '''
                for n in range(4):
                    if n == 0:
                        if val_predictions[0][n] == val_target[n]:
                            dimension_counts['E/I'] += 1
                    elif n == 1:
                        if val_predictions[0][n] == val_target[n]:
                            dimension_counts['S/N'] += 1
                    elif n == 2:
                        if val_predictions[0][n] == val_target[n]:
                            dimension_counts['T/F'] += 1
                    elif n == 3:
                        if val_predictions[0][n] == val_target[n]:
                            dimension_counts['J/P'] += 1
                item_count += 1
                '''

        val_accuracy = accuracy_score(val_target, val_predictions)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {average_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        f.write(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {average_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        '''
        EI_counts = dimension_counts['E/I']
        SN_counts = dimension_counts['S/N']
        TF_counts = dimension_counts['T/F']
        JP_counts = dimension_counts['J/P']
        print(f'E.I: {EI_counts}/{item_count} ')
        print('Accuracy: '+ str(EI_counts/item_count)+'\n')
        print(f'S.N: {SN_counts}/{item_count} ')
        print('Accuracy: '+ str(SN_counts/item_count)+'\n')
        print(f'T.F: {TF_counts}/{item_count} ')
        print('Accuracy: '+ str(TF_counts/item_count)+'\n')
        print(f'J.P: {JP_counts}/{item_count} ')
        print('Accuracy: '+ str(JP_counts/item_count)+'\n')

        f.write(f'E.I: {EI_counts}/{item_count} ')
        f.write('Accuracy: '+ str(EI_counts/item_count)+'\n')
        f.write(f'S.N: {SN_counts}/{item_count} ')
        f.write('Accuracy: '+ str(SN_counts/item_count)+'\n')
        f.write(f'T.F: {TF_counts}/{item_count} ')
        f.write('Accuracy: '+ str(TF_counts/item_count)+'\n')
        f.write(f'J.P: {JP_counts}/{item_count} ')
        f.write('Accuracy: '+ str(JP_counts/item_count)+'\n')
        '''
        # 保存模型
        torch.save(model.state_dict(), file_path+'cnn_model.pth')
f.close()