In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [2]:
# Read the MBTI table from CSV and preview its first rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
mbti_csv_path = 'C:/Users/JenMing/Desktop/MBTI/one_type_one_sentence/mbti_to_LSTM_DF.csv'
df = pd.read_csv(mbti_csv_path)
df.head()

Unnamed: 0,type,posts
0,INFJ,"['INFJ', 'INFJ', 'INFP', 'INFJ', 'INFJ', 'INTP..."
1,ENTP,"['INTP', 'INTP', 'INTP', 'ENTP', 'ENTP', 'ENTP..."
2,INTP,"['INTP', 'INTP', 'INTP', 'INTP', 'INTP', 'INFP..."
3,INTJ,"['INTP', 'ENTP', 'INTJ', 'INFP', 'INTJ', 'INTJ..."
4,ENTJ,"['ENTJ', 'ENTJ', 'ENTJ', 'ENTP', 'ENTJ', 'ENTJ..."


In [3]:
# 編碼轉換
personality_mapping = {'INFJ': 0,
                        'ENTP': 1,
                        'INTP': 2,
                        'INTJ': 3,
                        'ENTJ': 4,
                        'ENFJ': 5,
                        'INFP': 6,
                        'ENFP': 7,
                        'ISFP': 8,
                        'ISTP': 9,
                        'ISFJ': 10,
                        'ISTJ': 11,
                        'ESTP': 12,
                        'ESFP': 13,
                        'ESTJ': 14,
                        'ESFJ': 15 }

In [4]:
# Load and encode the data: every row of `df` becomes a pair
# (per-type proportions rounded to 2 decimals, target type id).
encoded_data = []

chars_to_remove = "][' "
# Translation table deleting the list punctuation and spaces of the stringified list.
_strip_table = str.maketrans("", "", chars_to_remove)
_num_types = len(personality_mapping)

for posts_text, type_name in zip(df["posts"], df["type"]):
    # `posts` is a string such as "['INFJ', 'INTP', ...]"; reduce it to bare labels.
    label_names = posts_text.translate(_strip_table).split(',')
    dialogue_ids = [personality_mapping[label_name] for label_name in label_names]

    # Count how often each type id occurs in this row.
    mbti_counts = [0] * _num_types
    for personality_id in dialogue_ids:
        mbti_counts[personality_id] += 1

    # Turn the counts into proportions of the row's labels.
    label_total = len(dialogue_ids)
    mbti_per_count = [round(count / label_total, 2) for count in mbti_counts]

    encoded_data.append((mbti_per_count, personality_mapping[type_name]))

# Disabled alternative to the encoding loop above. It is wrapped in a bare string
# literal, so none of it executes. The variant keeps only the labels that share at
# least 3 of 4 letters with the row's target type, falls back to the unfiltered
# list when nothing survives, and tallies the dropped labels in `total_count_del`.
'''
# 資料載入和轉換
encoded_data = []

chars_to_remove = "][' "    
total_count_del = 0

for index, row in df.iterrows():
    count_del = 0
    mbti_counts = {0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0,10:0,11:0,12:0,13:0,14:0,15:0}
    mbti_per_count = []
    dialogues = row["posts"] #字串
    target_personality = row["type"]
    for char in chars_to_remove:
        dialogues = dialogues.replace(char, "")
    
    dialogues_list = dialogues.split(',')
    new_dialogues_list = []
    #print(dialogues_list)
    for n in range(len(dialogues_list)):
        counter = 0
        for m in range(4):
            if dialogues_list[n][m] == target_personality[m]:
                counter += 1
        if counter >= 3:
            new_dialogues_list.append(dialogues_list[n])
        else:
            count_del += 1
    #print(new_dialogues_list)
    if len(new_dialogues_list) == 0:
        dialogue_ids = [personality_mapping[personality] for personality in dialogues_list]
        count_del = 0
    else:
        dialogue_ids = [personality_mapping[personality] for personality in new_dialogues_list]
    
    
    #print(dialogue_ids)
    
    for personality_id in dialogue_ids:
        mbti_counts[personality_id] += 1
    
    for i in range(len(personality_mapping)):
        mbti_per_count.append(round(mbti_counts[i]/len(dialogue_ids), 2))
    
    target_personality_id = personality_mapping[target_personality]
    
    encoded_data.append((mbti_per_count, target_personality_id))
    
    total_count_del += count_del
    
print(total_count_del)
'''

In [5]:
# Convert the encoded (features, target) pairs into tensors.
_feature_rows = [pair[0] for pair in encoded_data]
_label_ids = [pair[1] for pair in encoded_data]

input_data = torch.tensor(_feature_rows, dtype=torch.float32)
# int64 because the targets are class labels.
target_personality = torch.tensor(_label_ids, dtype=torch.int64)

In [6]:
# First split: 70% of the samples for training, 30% held out.
_holdout_fraction = 0.3
_split_seed = 42
train_dialogues, X_temp, train_target, y_temp = train_test_split(
    input_data,
    target_personality,
    test_size=_holdout_fraction,
    random_state=_split_seed,
)

# Second split: divide the held-out part evenly into validation and test sets.
val_dialogues, X_test, val_target, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=_split_seed,
)


In [7]:
# Show the sizes of the training and validation splits, one per line.
print(len(train_dialogues), len(val_dialogues), sep="\n")

6071
1301


In [8]:
# Inspect the first validation sample: its feature vector, then its label.
print(val_dialogues[0], val_target[0], sep="\n")

tensor([0.6400, 0.0400, 0.0600, 0.0900, 0.0000, 0.0000, 0.1100, 0.0600, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000])
tensor(0)


In [9]:
# Reverse mapping: type id -> MBTI type string.
reverse_personality_mapping = {v: k for k, v in personality_mapping.items()}

# Keep only the three largest proportions of every training row and zero the rest.
# The previous per-row loop built `set(row)` to de-duplicate values, but tensor
# scalars hash by identity, so nothing was ever de-duplicated. Its effective rule
# was "keep every entry that is >= the row's third-largest entry" (ties at that
# value are all kept). The same rule is applied here for all rows at once.
# Assumes `train_dialogues` is a 2-D float tensor with at least 3 columns.
_third_largest = torch.topk(train_dialogues, k=3, dim=1).values[:, -1:]
# Assign in place so the existing tensor object is updated, as before.
train_dialogues[:] = torch.where(
    train_dialogues >= _third_largest,
    train_dialogues,
    torch.zeros_like(train_dialogues),
)
    

In [10]:
# Zero the proportions of MBTI types that share fewer than `similar_tag` letters
# with the sample's own label, then renormalise the row so it sums to 1.
# NOTE(review): this edits the training features using the training labels, and
# no equivalent step is applied to the validation/test features — confirm that
# this asymmetry is intended.
similar_tag = 1  # types matching fewer than this many letters (1~4) are discarded
total_del_count_times = 0
for index, personality_num in enumerate(train_target):
    del_count_times = 0
    # Translate the numeric label back into its MBTI string.
    personality = reverse_personality_mapping.get(int(personality_num), "Unknown")
    # Snapshot the row BEFORE editing it. Indexing a tensor with an integer gives
    # a view of the same storage, so without clone() the "backup" would be zeroed
    # together with the row and the restore branch below would restore nothing.
    tmp_dialogues = train_dialogues[index].clone()
    for n in range(len(personality_mapping)):
        candidate_type = reverse_personality_mapping[n]
        # Number of letter positions where the candidate type agrees with the label.
        counter = sum(1 for m in range(4) if candidate_type[m] == personality[m])
        if counter < similar_tag:
            train_dialogues[index][n] = 0.0
            del_count_times += 1

    # Sum of whatever is left in the row after zeroing.
    non_zero_sum = torch.sum(train_dialogues[index])

    if non_zero_sum == 0:
        # Everything was removed: fall back to the untouched row.
        train_dialogues[index] = tmp_dialogues
    else:
        # Rescale the surviving entries so that they sum to 1.
        train_dialogues[index] = train_dialogues[index] / non_zero_sum
        total_del_count_times += del_count_times

print(total_del_count_times)

6071


from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Baseline: a linear-kernel SVM with C=1, fitted on the training split
# (fit() returns the estimator itself, so the name can be bound to its result).
svm_classifier = svm.SVC(C=1, kernel='linear')
svm_classifier = svm_classifier.fit(train_dialogues, train_target)

# Predict the validation split with the fitted model.
predicted_labels = svm_classifier.predict(val_dialogues)

In [11]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# RBF-kernel SVM whose regularisation strength C is tuned by grid search.
svm_classifier = svm.SVC(kernel='rbf')

# Hyper-parameter grid; this is a minimal example and can be extended.
param_grid = {'C': [0.1, 0.5]}

# 5-fold cross-validated search over the grid.
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5)
grid_search.fit(train_dialogues, train_target)

# Report the best configuration and its mean cross-validation score.
print("最佳超參數配置:", grid_search.best_params_)
print("最佳交叉驗證得分:", grid_search.best_score_)

# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so `best_estimator_` is already trained; fitting it a
# second time only repeated the same work.
best_svm_classifier = grid_search.best_estimator_

# Predict the validation split with the tuned model.
predicted_labels = best_svm_classifier.predict(val_dialogues)

最佳超參數配置: {'C': 0.5}
最佳交叉驗證得分: 0.9932470966298534


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Validation-set metrics. The weighted averages use zero_division=0 so that a
# class with an undefined score contributes 0 explicitly instead of emitting
# sklearn's undefined-metric warning; the resulting values are the same as
# with the default setting.
accuracy = accuracy_score(val_target, predicted_labels)

precision = precision_score(val_target, predicted_labels, average='weighted', zero_division=0)

recall = recall_score(val_target, predicted_labels, average='weighted', zero_division=0)

f1 = f1_score(val_target, predicted_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Write the validation report: overall accuracy plus, for each of the four MBTI
# letter positions, how often the predicted letter equals the true letter.
with open("C:/Users/JenMing/Desktop/MBTI/SVM/note.txt", "w") as f:
    # Keys are listed in letter-position order (position 0 is E/I, ..., 3 is J/P).
    dimension_counts = {'E/I': 0,
                        'S/N': 0,
                        'T/F': 0,
                        'J/P': 0}
    correct_predictions = 0

    item_count = len(predicted_labels)

    # One id -> type lookup table instead of a linear search per sample.
    id_to_type = {type_id: mbti_type for mbti_type, type_id in personality_mapping.items()}

    for sample_idx in range(item_count):
        mbti_labels_pre = id_to_type[int(predicted_labels[sample_idx])]
        mbti_labels_tru = id_to_type[int(val_target[sample_idx])]

        if mbti_labels_pre == mbti_labels_tru:
            correct_predictions += 1

        # Compare letter by letter; the position index has its own name so it
        # no longer shadows the sample index of the enclosing loop.
        for letter_pos, dimension in enumerate(dimension_counts):
            if mbti_labels_pre[letter_pos] == mbti_labels_tru[letter_pos]:
                dimension_counts[dimension] += 1

    accuracy = correct_predictions / len(predicted_labels)

    # precision / recall / f1 are expected to come from the metrics cell run before this one.
    f.write(f"Accuracy: {accuracy:.2f}\n")
    f.write(f"Precision: {precision:.2f}\n")
    f.write(f"Recall: {recall:.2f}\n")
    f.write(f"F1 Score: {f1:.2f}\n\n")

    print(f"Validation Accuracy: {accuracy*100:.4f}%")
    f.write(f"Validation Accuracy: {accuracy*100:.4f}%\n")

    EI_counts = dimension_counts['E/I']
    SN_counts = dimension_counts['S/N']
    TF_counts = dimension_counts['T/F']
    JP_counts = dimension_counts['J/P']

    # Same lines go to stdout and to the note file.
    report_rows = [('E.I', EI_counts), ('S.N', SN_counts), ('T.F', TF_counts), ('J.P', JP_counts)]
    for label, count in report_rows:
        print(f'{label}: {count}/{item_count} ')
        print('Accuracy: '+ str(count/item_count)+'\n')

        f.write(f'{label}: {count}/{item_count} ')
        f.write('Accuracy: '+ str(count/item_count)+'\n')

Validation Accuracy: 98.4627%
E.I: 1294/1301 
Accuracy: 0.994619523443505

S.N: 1292/1301 
Accuracy: 0.9930822444273636

T.F: 1290/1301 
Accuracy: 0.9915449654112222

J.P: 1294/1301 
Accuracy: 0.994619523443505



In [14]:
import joblib  # pickle could be used instead

# Persist the tuned SVM to disk.
# If the grid search was skipped, dump `svm_classifier` here instead.
model_filename = 'C:/Users/JenMing/Desktop/MBTI/SVM/svm_model.pkl'
joblib.dump(value=best_svm_classifier, filename=model_filename)

['C:/Users/JenMing/Desktop/MBTI/SVM/svm_model.pkl']

In [15]:
# Reload the persisted SVM and predict the test split with it.
# NOTE(review): joblib.load unpickles the file — only load model files you trust.
_saved_model_path = 'C:/Users/JenMing/Desktop/MBTI/SVM/svm_model.pkl'
loaded_svm_model = joblib.load(_saved_model_path)

loaded_predicted_labels = loaded_svm_model.predict(X_test)

In [16]:
# From here on `predicted_labels` refers to the TEST-set predictions of the reloaded model.
predicted_labels = loaded_predicted_labels
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set metrics. The weighted averages use zero_division=0 so that a class
# with an undefined score contributes 0 explicitly instead of emitting
# sklearn's undefined-metric warning; the resulting values are the same as
# with the default setting.
accuracy = accuracy_score(y_test, predicted_labels)

precision = precision_score(y_test, predicted_labels, average='weighted', zero_division=0)

recall = recall_score(y_test, predicted_labels, average='weighted', zero_division=0)

f1 = f1_score(y_test, predicted_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Write the test report: overall accuracy plus, for each of the four MBTI letter
# positions, how often the predicted letter equals the true letter.
# The file is opened in append mode: the validation report is written to this
# same note.txt, and opening it with "w" here erased that report.
with open("C:/Users/JenMing/Desktop/MBTI/SVM/note.txt", "a") as f:
    # Blank line separating this section from whatever the file already holds.
    f.write("\n")

    # Keys are listed in letter-position order (position 0 is E/I, ..., 3 is J/P).
    dimension_counts = {'E/I': 0,
                        'S/N': 0,
                        'T/F': 0,
                        'J/P': 0}
    correct_predictions = 0

    item_count = len(predicted_labels)

    # One id -> type lookup table instead of a linear search per sample.
    id_to_type = {type_id: mbti_type for mbti_type, type_id in personality_mapping.items()}

    for sample_idx in range(item_count):
        mbti_labels_pre = id_to_type[int(predicted_labels[sample_idx])]
        mbti_labels_tru = id_to_type[int(y_test[sample_idx])]

        if mbti_labels_pre == mbti_labels_tru:
            correct_predictions += 1

        # Compare letter by letter; the position index has its own name so it
        # no longer shadows the sample index of the enclosing loop.
        for letter_pos, dimension in enumerate(dimension_counts):
            if mbti_labels_pre[letter_pos] == mbti_labels_tru[letter_pos]:
                dimension_counts[dimension] += 1

    accuracy = correct_predictions / len(predicted_labels)

    # precision / recall / f1 are expected to come from the metrics cell run before this one.
    f.write(f"Accuracy: {accuracy:.2f}\n")
    f.write(f"Precision: {precision:.2f}\n")
    f.write(f"Recall: {recall:.2f}\n")
    f.write(f"F1 Score: {f1:.2f}\n\n")

    print(f"Test Accuracy: {accuracy*100:.4f}%")
    f.write(f"Test Accuracy: {accuracy*100:.4f}%\n")

    EI_counts = dimension_counts['E/I']
    SN_counts = dimension_counts['S/N']
    TF_counts = dimension_counts['T/F']
    JP_counts = dimension_counts['J/P']

    # Same lines go to stdout and to the note file.
    report_rows = [('E.I', EI_counts), ('S.N', SN_counts), ('T.F', TF_counts), ('J.P', JP_counts)]
    for label, count in report_rows:
        print(f'{label}: {count}/{item_count} ')
        print('Accuracy: '+ str(count/item_count)+'\n')

        f.write(f'{label}: {count}/{item_count} ')
        f.write('Accuracy: '+ str(count/item_count)+'\n')

Test Accuracy: 97.8495%
E.I: 1299/1302 
Accuracy: 0.9976958525345622

S.N: 1289/1302 
Accuracy: 0.9900153609831029

T.F: 1284/1302 
Accuracy: 0.9861751152073732

J.P: 1290/1302 
Accuracy: 0.9907834101382489



In [18]:
# Show the predicted label ids (at this point: the test-set predictions).
print(str(predicted_labels))

[12  0  0 ...  4  1  6]
