分段式處理 TF JP

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

In [2]:
df = pd.read_csv('C:/Users/JenMing/Desktop/MBTI/LSTM/mbti_to_LSTM_DF.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,"['INFP', 'INFP', 'INFJ', 'ENFP', 'ISTP', 'INTP..."
1,ENTP,"['INTJ', 'INTP', 'ENFP', 'INTJ', 'INTP', 'INTP..."
2,INTP,"['INTJ', 'INFP', 'INFP', 'INTP', 'INTP', 'INTJ..."
3,INTJ,"['INTJ', 'ISFJ', 'INFP', 'INTP', 'INTP', 'INTP..."
4,ENTJ,"['ENTJ', 'INTP', 'ENFP', 'INTP', 'ENTJ', 'INTJ..."


In [3]:
# 編碼轉換
personality_mapping_output = {'TJ': 0,
                        'TP': 1,
                        'FJ': 2,
                        'FP': 3}

In [4]:
# 編碼轉換
personality_mapping = {'INFJ': 0,
                        'ENTP': 1,
                        'INTP': 2,
                        'INTJ': 3,
                        'ENTJ': 4,
                        'ENFJ': 5,
                        'INFP': 6,
                        'ENFP': 7,
                        'ISFP': 8,
                        'ISTP': 9,
                        'ISFJ': 10,
                        'ISTJ': 11,
                        'ESTP': 12,
                        'ESFP': 13,
                        'ESTJ': 14,
                        'ESFJ': 15 }

In [5]:
encoded_data = []

chars_to_remove = "][' "  
total = 0
dimension_counts = {'TJ': 0,
                        'TP': 0,
                        'FJ': 0,
                        'FP': 0}

for index, row in df.iterrows():
    dialogues = row["posts"] #字串
    target_personality = row["type"]
    
    target_personality = target_personality[2] + target_personality[3]
    for char in chars_to_remove:
        dialogues = dialogues.replace(char, "")
    
    dialogues_list = dialogues.split(',')
    
    '''
    for i in range(len(dialogues_list)):
        dialogues_list[i] = dialogues_list[i][2] + dialogues_list[i][3]
        if dialogues_list[i] == 'TJ':
            dimension_counts['TJ'] += 1
        elif dialogues_list[i] == 'TP':
            dimension_counts['TP'] += 1
        elif dialogues_list[i] == 'FJ':
            dimension_counts['FJ'] += 1
        elif dialogues_list[i] == 'FP':
            dimension_counts['FP'] += 1
        total += 1

    '''
    dialogue_ids = [personality_mapping[personality] for personality in dialogues_list]
    target_personality_id = personality_mapping_output[target_personality]
    
    encoded_data.append((dialogue_ids, target_personality_id))
'''
TJ_counts = dimension_counts['TJ']
TP_counts = dimension_counts['TP']
FJ_counts = dimension_counts['FJ']
FP_counts = dimension_counts['FP']
print(f'T.J: '+ str(TJ_counts/total*100) +'%\n')
print(f'T.P: '+ str(TP_counts/total*100) +'%\n')
print(f'F.J: '+ str(FJ_counts/total*100) +'%\n')
print(f'F.P: '+ str(FP_counts/total*100) +'%\n')
'''

"\nTJ_counts = dimension_counts['TJ']\nTP_counts = dimension_counts['TP']\nFJ_counts = dimension_counts['FJ']\nFP_counts = dimension_counts['FP']\nprint(f'T.J: '+ str(TJ_counts/total*100) +'%\n')\nprint(f'T.P: '+ str(TP_counts/total*100) +'%\n')\nprint(f'F.J: '+ str(FJ_counts/total*100) +'%\n')\nprint(f'F.P: '+ str(FP_counts/total*100) +'%\n')\n"

In [6]:
# 動態計算 input_size
max_dialogue_length = max(len(dialogue) for dialogue, _ in encoded_data)
input_size = max_dialogue_length

In [7]:
# 填充序列並轉換為張量
padded_dialogues = [torch.tensor(dialogue, dtype=torch.float32) for dialogue, _ in encoded_data]
padded_dialogues = pad_sequence(padded_dialogues, batch_first=True)

target_personality = torch.tensor([target for _, target in encoded_data], dtype=torch.float32)

In [8]:
# 定义 ANN 模型
class PersonalityPredictionANN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_prob=0.5):
        super(PersonalityPredictionANN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        self.fc3 = nn.Linear(hidden_size2, output_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        
        return x

In [9]:
# 資料集切分為訓練集和驗證集
train_dialogues, val_dialogues, train_target, val_target = train_test_split(padded_dialogues, target_personality, test_size=0.15, random_state=42)

# 初始化模型
hidden_size = 64
output_size = len(personality_mapping_output)
model = PersonalityPredictionANN(input_size, hidden_size, hidden_size, output_size)

In [10]:
# 定義損失函數和優化器 (CEL:分類問題 MSE:回归问题)
criterion = nn.CrossEntropyLoss()
#criterion = nn.MSELoss() 
weight_decay = 0.01
#optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay)

In [11]:
#早停
patience = 10  # 設定早期停止的耐心值
best_val_loss = float('inf')
counter = 0  # 用於計算連續的驗證損失沒有改善的次數

In [12]:
with open("C:/Users/JenMing/Desktop/MBTI/ANN/Model/TFJP/note.txt", "w") as f:
    num_epochs = 60
    dimension_counts = {'T/F': 0,
                        'J/P': 0}
    item_count = 0
    for epoch in range(num_epochs):
        model.train()  # 將模型設置為訓練模式
        total_loss = 0.0

        for dialogue_batch, target_batch in zip(train_dialogues, train_target):
            optimizer.zero_grad()

            outputs = model(dialogue_batch)
            loss = criterion(outputs, target_batch.long())  # 使用交叉熵損失
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

        average_loss = total_loss / len(train_dialogues)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")
        f.write(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}\n")
        
        # 驗證模型
        model.eval()  # 將模型設置為評估模式
        correct_predictions = 0
        total_samples = len(val_dialogues)
        val_loss = 0.0

        with torch.no_grad():
            for dialogue_batch, target_batch in zip(val_dialogues, val_target):
                outputs = model(dialogue_batch)
                loss = criterion(outputs, target_batch.long())  # 使用交叉熵損失
                val_loss += loss.item()

                predicted_class = torch.argmax(outputs).item()
                true_class = target_batch.item()

                for personality, value in personality_mapping_output.items():
                    if value == predicted_class:
                        mbti_labels_pre = personality
                        break
                for personality, value in personality_mapping_output.items():
                    if value == int(true_class):
                        mbti_labels_tru = personality 
                        break

                for n in range(4):
                    if n == 0:
                        if mbti_labels_pre[n] == mbti_labels_tru[n]:
                            dimension_counts['T/F'] += 1
                    elif n == 1:
                        if mbti_labels_pre[n] == mbti_labels_tru[n]:
                            dimension_counts['J/P'] += 1

                if predicted_class == true_class:
                    correct_predictions += 1

        item_count += total_samples
        average_val_loss = val_loss / len(val_dialogues)
        accuracy = correct_predictions / total_samples
        print(f"Validation Loss: {average_val_loss:.4f}, Validation Accuracy: {accuracy*100:.4f}%")
        f.write(f"Validation Loss: {average_val_loss:.4f}, Validation Accuracy: {accuracy*100:.4f}%\n")
        
        # 檢查驗證損失是否改善
        if average_val_loss < best_val_loss:
            best_val_loss = average_val_loss
            counter = 0
        else:
            counter += 1

        # 如果連續一定次數（耐心值）驗證損失沒有改善，則停止訓練
        if counter >= patience:
            print(f"Early Stopping: Validation loss has not improved for {patience} epochs. Stopping training.")
            break

    TF_counts = dimension_counts['T/F']
    JP_counts = dimension_counts['J/P']
   
    print(f'T.F: {TF_counts}/{item_count} ')
    print('Accuracy: '+ str(TF_counts/item_count)+'\n')
    print(f'J.P: {JP_counts}/{item_count} ')
    print('Accuracy: '+ str(JP_counts/item_count)+'\n')
    
    f.write(f'T.F: {TF_counts}/{item_count} ')
    f.write('Accuracy: '+ str(TF_counts/item_count)+'\n')
    f.write(f'J.P: {JP_counts}/{item_count} ')
    f.write('Accuracy: '+ str(JP_counts/item_count)+'\n')

Epoch [1/60], Loss: 1.3806
Validation Loss: 1.3365, Validation Accuracy: 35.8679%
Epoch [2/60], Loss: 1.3517
Validation Loss: 1.3264, Validation Accuracy: 37.4808%
Epoch [3/60], Loss: 1.3386
Validation Loss: 1.3172, Validation Accuracy: 39.4777%
Epoch [4/60], Loss: 1.3303
Validation Loss: 1.3148, Validation Accuracy: 39.7081%
Epoch [5/60], Loss: 1.3181
Validation Loss: 1.3079, Validation Accuracy: 39.3241%
Epoch [6/60], Loss: 1.3089
Validation Loss: 1.3014, Validation Accuracy: 39.4777%
Epoch [7/60], Loss: 1.2977
Validation Loss: 1.2988, Validation Accuracy: 39.5545%
Epoch [8/60], Loss: 1.2940
Validation Loss: 1.2953, Validation Accuracy: 38.7097%
Epoch [9/60], Loss: 1.2887
Validation Loss: 1.2954, Validation Accuracy: 40.0922%
Epoch [10/60], Loss: 1.2812
Validation Loss: 1.2934, Validation Accuracy: 39.7081%
Epoch [11/60], Loss: 1.2799
Validation Loss: 1.2891, Validation Accuracy: 39.8618%
Epoch [12/60], Loss: 1.2711
Validation Loss: 1.2883, Validation Accuracy: 40.2458%
Epoch [13/60]

In [13]:
torch.save(model.state_dict(), "C:/Users/JenMing/Desktop/MBTI/ANN/Model/TFJP/best_model.pth")

現在是先把training data的資料改成4個字元的 原因是如果要跟原本的模型在validation的地方和用的話 那資料必須要是一樣的 
只是這樣改的話 TF跟JP就跟之前差不多了
可能要嘗試用2字元 然後再跟之前的模型合用會比較好 