In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Alternative input path, kept for reference (currently disabled).
#df = pd.read_csv('C:/Users/JenMing/Desktop/MBTI/LSTM/mbti_to_LSTM_DF.csv')
# Load the per-user table. The preview below shows a `type` column (the user's
# MBTI type) and a `posts` column holding a stringified list of MBTI labels.
# NOTE(review): absolute local path -- the notebook only runs on this machine.
df = pd.read_csv('C:/Users/JenMing/Desktop/MBTI/one_type_one_sentence/mbti_to_LSTM_DF.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,"['INFJ', 'INFJ', 'INFP', 'INFJ', 'INFJ', 'INTP..."
1,ENTP,"['INTP', 'INTP', 'INTP', 'ENTP', 'ENTP', 'ENTP..."
2,INTP,"['INTP', 'INTP', 'INTP', 'INTP', 'INTP', 'INFP..."
3,INTJ,"['INTP', 'ENTP', 'INTJ', 'INFP', 'INTJ', 'INTJ..."
4,ENTJ,"['ENTJ', 'ENTJ', 'ENTJ', 'ENTP', 'ENTJ', 'ENTJ..."


In [4]:
# 編碼轉換
personality_mapping = {'INFJ': 0,
                        'ENTP': 1,
                        'INTP': 2,
                        'INTJ': 3,
                        'ENTJ': 4,
                        'ENFJ': 5,
                        'INFP': 6,
                        'ENFP': 7,
                        'ISFP': 8,
                        'ISTP': 9,
                        'ISFJ': 10,
                        'ISTJ': 11,
                        'ESTP': 12,
                        'ESFP': 13,
                        'ESTJ': 14,
                        'ESFJ': 15 }

In [5]:
# Data loading and conversion.
#
# Each row's `posts` field is a stringified list of MBTI labels, e.g.
# "['INFJ', 'INFJ', 'INFP', ...]". For every row we build a fixed-width
# feature vector: the share of each of the 16 MBTI types among that row's
# labels (rounded to 2 decimals), paired with the row's own type as the target.
# The fixed width is required by the tensor conversion and by the ELM, which
# reads the input size from the feature matrix.
encoded_data = []

# Characters stripped from the stringified list before splitting on commas.
chars_to_remove = "][' "

for index, row in df.iterrows():
    dialogues = row["posts"]  # string
    target_personality = row["type"]
    for char in chars_to_remove:
        dialogues = dialogues.replace(char, "")

    dialogues_list = dialogues.split(',')

    # Map every label to its class id; an unknown label raises KeyError.
    dialogue_ids = [personality_mapping[personality] for personality in dialogues_list]

    # Count how often each class id occurs in this row.
    mbti_counts = {class_id: 0 for class_id in range(len(personality_mapping))}
    for personality_id in dialogue_ids:
        mbti_counts[personality_id] += 1

    # Turn the counts into proportions, ordered by class id.
    mbti_per_count = [
        round(mbti_counts[i] / len(dialogue_ids), 2)
        for i in range(len(personality_mapping))
    ]

    target_personality_id = personality_mapping[target_personality]

    encoded_data.append((mbti_per_count, target_personality_id))

In [6]:
# Convert the encoded samples into tensors: features as float32, class labels
# as int64 (they are categorical targets).
input_data = torch.tensor([sample[0] for sample in encoded_data], dtype=torch.float)
target_personality = torch.tensor([sample[1] for sample in encoded_data], dtype=torch.long)

In [7]:
# Split the dataset: 70% for training, 30% held out (fixed random_state for a repeatable split).
train_dialogues, X_temp, train_target, y_temp = train_test_split(input_data, target_personality, test_size=0.3, random_state=42)

# Split the held-out 30% in half: one half becomes the validation set, the other the test set.
val_dialogues, X_test, val_target, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Earlier variant kept for reference: a single train/validation split with 15% validation.
#train_dialogues, val_dialogues, train_target, val_target = train_test_split(input_data, target_personality, test_size=0.15, random_state=42)

In [8]:
# Build the reverse mapping (class id -> MBTI type string).
reverse_personality_mapping = {v: k for k, v in personality_mapping.items()}

# Label-guided clean-up of the TRAINING features: for every training sample,
# zero the proportion of each MBTI type that shares fewer than `similar_tag`
# letters (position by position) with the sample's true type, then
# re-normalise the remaining proportions so they sum to 1.
# NOTE(review): this uses the true labels and is applied to the training split
# only; it also mutates `train_dialogues` in place, so re-running the cell
# masks already-masked data.
similar_tag = 1  # types matching fewer than this many letters are dropped (1~4)
total_del_count_times = 0
for index, personality_num in enumerate(train_target):
    del_count_times = 0
    # Translate the numeric label back into its MBTI string.
    personality = reverse_personality_mapping.get(int(personality_num), "Unknown")
    # clone() is essential: indexing a tensor returns a view, so without it the
    # "backup" would be zeroed together with the row and could not restore it.
    tmp_dialogues = train_dialogues[index].clone()
    for n in range(len(personality_mapping)):
        # Number of letter positions on which type n agrees with the true type.
        counter = sum(
            candidate_letter == true_letter
            for candidate_letter, true_letter in zip(reverse_personality_mapping[n], personality)
        )
        if counter < similar_tag:
            train_dialogues[index][n] = 0.0
            del_count_times += 1

    # Sum of whatever survived the masking.
    non_zero_sum = torch.sum(train_dialogues[index])

    if non_zero_sum == 0:
        # Everything was zeroed: fall back to the original, unmasked row.
        train_dialogues[index] = tmp_dialogues
    else:
        # Re-normalise the surviving proportions so they sum to 1.
        train_dialogues[index] = train_dialogues[index] / non_zero_sum
        total_del_count_times += del_count_times

print(total_del_count_times)

In [9]:
# Convert the training / validation / test splits to NumPy arrays for the ELM.
X_train = train_dialogues.numpy()
y_train = train_target.numpy()
X_val = val_dialogues.numpy()
# X_test is rebound to its own converted value, so guard the conversion:
# re-running this cell must not fail once X_test is already an ndarray.
if isinstance(X_test, torch.Tensor):
    X_test = X_test.numpy()

In [10]:
# Sanity check: show the feature vector of the first training sample.
print(X_train[0])

[0.04 0.07 0.7  0.02 0.   0.   0.15 0.02 0.   0.   0.   0.   0.   0.
 0.   0.  ]


In [29]:
# Sanity check: show the class id of the first training sample.
print(y_train[0])

2


In [11]:
# Number of classes.
num_classes = len(personality_mapping)

# One-hot encode the training labels: one row per sample, with a single 1 in
# the column of the sample's class id.
y_train_2d_list = []
for label in y_train:
    one_hot_row = [0] * num_classes
    one_hot_row[label] = 1
    y_train_2d_list.append(one_hot_row)

In [12]:
# Sanity check: the first label next to its one-hot row.
print(y_train[0])
print(y_train_2d_list[0])

2
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
# Initialise the ELM dimensions.
input_size = X_train.shape[1]  # one input per feature column
hidden_size = 25  # number of hidden-layer neurons
# Number of output classes. Taken from the label mapping rather than from the
# labels present in the training split, so it always matches the width of the
# one-hot targets even if some class is missing from the training data.
output_size = len(personality_mapping)

In [14]:
# Draw the random input weights and biases of the ELM hidden layer.
# A seeded generator is used so the model is reproducible across kernel
# restarts (same seed as the train/validation/test split).
elm_rng = np.random.default_rng(42)
input_weights = elm_rng.normal(size=(input_size, hidden_size))
biases = elm_rng.normal(size=(hidden_size,))
# Placeholder with the final shape; the training step overwrites it.
output_weights = np.zeros((hidden_size, output_size))

In [15]:
# Logistic sigmoid activation used by the ELM hidden layer.
def sigmoid(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    The value is computed from ``exp(-|x|)``, which never overflows, so large
    negative inputs give 0.0 instead of triggering an overflow in ``exp``.
    For ``x >= 0`` the expression evaluated is exactly ``1 / (1 + exp(-x))``.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    NumPy scalar or ndarray
        Sigmoid of ``x`` with the same shape as the input.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1]
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same function, no overflow.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Unwrap 0-d results so scalar input still yields a scalar.
    return result[()]


In [16]:
# Train the ELM: project the inputs through the fixed random hidden layer,
# then solve for the output weights in the least-squares sense by multiplying
# the pseudo-inverse of the hidden activations with the one-hot targets.
hidden_activations = sigmoid(X_train.dot(input_weights) + biases)
output_weights = np.linalg.pinv(hidden_activations).dot(y_train_2d_list)

In [17]:
# Predict on the validation set: hidden-layer activations first, then the
# linear read-out that yields one score per class.
hidden_activations_test = sigmoid(X_val.dot(input_weights) + biases)
y_pred = hidden_activations_test.dot(output_weights)

In [18]:
# The predicted class is the column with the highest score in each row.
predicted_labels = y_pred.argmax(axis=1)
print(predicted_labels)

[ 0  2 11 ...  5  8  0]


In [19]:
# Sanity check: largest class id that was predicted.
print(predicted_labels.max())

15


In [20]:
# Compute the validation accuracy (labels converted from tensor to NumPy first).
accuracy = accuracy_score(val_target.numpy(), predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9861644888547272


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Validation metrics. Accuracy is computed directly; precision, recall and F1
# are each averaged over the classes weighted by class support.
accuracy = accuracy_score(val_target, predicted_labels)
precision, recall, f1 = (
    metric(val_target, predicted_labels, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.98


In [22]:
# Validation report: overall accuracy plus the accuracy on each of the four
# MBTI letter positions (E/I, S/N, T/F, J/P). Results are printed and written
# to note.txt (mode "w": the file is recreated on every run of this cell).
with open("C:/Users/JenMing/Desktop/MBTI/ELM/note.txt", "w") as f:
    dimension_counts = {'E/I': 0,
                        'S/N': 0,
                        'T/F': 0,
                        'J/P': 0}
    correct_predictions = 0

    item_count = len(predicted_labels)

    # Direct class-id -> type lookup; an id missing from the mapping raises
    # KeyError instead of silently reusing the previous sample's label.
    id_to_type = {value: personality for personality, value in personality_mapping.items()}

    for sample_idx in range(item_count):
        mbti_labels_pre = id_to_type[int(predicted_labels[sample_idx])]
        mbti_labels_tru = id_to_type[int(val_target[sample_idx])]

        if mbti_labels_pre == mbti_labels_tru:
            correct_predictions += 1

        # Letter position k of an MBTI string corresponds to the k-th dimension.
        for letter_pos, dimension in enumerate(('E/I', 'S/N', 'T/F', 'J/P')):
            if mbti_labels_pre[letter_pos] == mbti_labels_tru[letter_pos]:
                dimension_counts[dimension] += 1

    accuracy = correct_predictions / len(predicted_labels)

    #f.write(f"del_count: {total_del_count_times}\n")
    f.write(f"Accuracy: {accuracy:.2f}\n")
    f.write(f"Precision: {precision:.2f}\n")
    f.write(f"Recall: {recall:.2f}\n")
    f.write(f"F1 Score: {f1:.2f}\n\n")

    print(f"Validation Accuracy: {accuracy*100:.4f}%")
    f.write(f"Validation Accuracy: {accuracy*100:.4f}%\n")

    EI_counts = dimension_counts['E/I']
    SN_counts = dimension_counts['S/N']
    TF_counts = dimension_counts['T/F']
    JP_counts = dimension_counts['J/P']

    # (label shown in the report, number of samples with that letter correct)
    dimension_report = (('E.I', EI_counts),
                        ('S.N', SN_counts),
                        ('T.F', TF_counts),
                        ('J.P', JP_counts))

    for tag, hits in dimension_report:
        print(f'{tag}: {hits}/{item_count} ')
        print('Accuracy: ' + str(hits / item_count) + '\n')

    for tag, hits in dimension_report:
        f.write(f'{tag}: {hits}/{item_count} ')
        f.write('Accuracy: ' + str(hits / item_count) + '\n')

Validation Accuracy: 98.6164%
E.I: 1296/1301 
Accuracy: 0.9961568024596464

S.N: 1292/1301 
Accuracy: 0.9930822444273636

T.F: 1295/1301 
Accuracy: 0.9953881629515757

J.P: 1291/1301 
Accuracy: 0.9923136049192929



In [23]:
import joblib  # pickle would work as well

# Bundle everything needed to run the ELM forward pass: the random hidden
# layer (weights and biases) and the fitted output weights.
elm_model = dict(
    input_weights=input_weights,
    biases=biases,
    output_weights=output_weights,
)

# Persist the ELM model to disk.
model_filename = 'C:/Users/JenMing/Desktop/MBTI/ELM/elm_model.pkl'

joblib.dump(elm_model, model_filename)

['C:/Users/JenMing/Desktop/MBTI/ELM/elm_model.pkl']

In [24]:
# Load the saved ELM model.
# NOTE(review): joblib.load unpickles the file -- only load files you trust.
loaded_elm_model = joblib.load(model_filename)

# Run the loaded model on the test set (X_test was already converted to a
# NumPy array in an earlier cell, so no conversion is done here).
hidden_activations_test = sigmoid(
    np.dot(X_test, loaded_elm_model['input_weights']) + loaded_elm_model['biases']
)
y_pred = hidden_activations_test.dot(loaded_elm_model['output_weights'])
predicted_labels = y_pred.argmax(axis=1)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set metrics. Accuracy is computed directly; precision, recall and F1
# are each averaged over the classes weighted by class support.
accuracy = accuracy_score(y_test, predicted_labels)
precision, recall, f1 = (
    metric(y_test, predicted_labels, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


In [30]:
# Test-set report: overall metrics plus the accuracy on each of the four MBTI
# letter positions (E/I, S/N, T/F, J/P). Results are printed and written to
# note.txt. The file is opened in APPEND mode: the validation report is
# written to the same file earlier in the notebook, and "w" would erase it.
with open("C:/Users/JenMing/Desktop/MBTI/ELM/note.txt", "a") as f:
    dimension_counts = {'E/I': 0,
                        'S/N': 0,
                        'T/F': 0,
                        'J/P': 0}
    correct_predictions = 0

    item_count = len(predicted_labels)

    # Direct class-id -> type lookup; an id missing from the mapping raises
    # KeyError instead of silently reusing the previous sample's label.
    id_to_type = {value: personality for personality, value in personality_mapping.items()}

    for sample_idx in range(item_count):
        mbti_labels_pre = id_to_type[int(predicted_labels[sample_idx])]
        mbti_labels_tru = id_to_type[int(y_test[sample_idx])]

        if mbti_labels_pre == mbti_labels_tru:
            correct_predictions += 1

        # Letter position k of an MBTI string corresponds to the k-th dimension.
        for letter_pos, dimension in enumerate(('E/I', 'S/N', 'T/F', 'J/P')):
            if mbti_labels_pre[letter_pos] == mbti_labels_tru[letter_pos]:
                dimension_counts[dimension] += 1

    # `accuracy`, `precision`, `recall` and `f1` are the test-set metrics
    # computed in the preceding metrics cell; they are not recomputed here.
    #accuracy = correct_predictions / len(predicted_labels)

    # Blank line separating this section from whatever is already in the file.
    f.write("\n")
    #f.write(f"del_count: {total_del_count_times}\n")
    f.write(f"Accuracy: {accuracy:.2f}\n")
    f.write(f"Precision: {precision:.2f}\n")
    f.write(f"Recall: {recall:.2f}\n")
    f.write(f"F1 Score: {f1:.2f}\n\n")

    print(f"Test Accuracy: {accuracy*100:.4f}%")
    f.write(f"Test Accuracy: {accuracy*100:.4f}%\n")

    EI_counts = dimension_counts['E/I']
    SN_counts = dimension_counts['S/N']
    TF_counts = dimension_counts['T/F']
    JP_counts = dimension_counts['J/P']

    # (label shown in the report, number of samples with that letter correct)
    dimension_report = (('E.I', EI_counts),
                        ('S.N', SN_counts),
                        ('T.F', TF_counts),
                        ('J.P', JP_counts))

    for tag, hits in dimension_report:
        print(f'{tag}: {hits}/{item_count} ')
        print('Accuracy: ' + str(hits / item_count) + '\n')

    for tag, hits in dimension_report:
        f.write(f'{tag}: {hits}/{item_count} ')
        f.write('Accuracy: ' + str(hits / item_count) + '\n')

Test Accuracy: 98.4639%
E.I: 1292/1302 
Accuracy: 0.9923195084485407

S.N: 1290/1302 
Accuracy: 0.9907834101382489

T.F: 1292/1302 
Accuracy: 0.9923195084485407

J.P: 1293/1302 
Accuracy: 0.9930875576036866



In [26]:
# Inspect the raw per-class scores produced for the test set (before argmax).
print(y_pred)

[[-0.10501957  0.03113521  0.06603042 ...  0.03802156  0.00727338
   0.04746361]
 [ 0.84181579  0.00311728  0.01315136 ...  0.00641402  0.00645428
   0.00793565]
 [ 1.14392709 -0.03434104 -0.00979809 ... -0.00812583  0.00752317
  -0.01404023]
 ...
 [ 0.02672751  0.05672022 -0.00200336 ...  0.00716682  0.01972081
   0.02158144]
 [-0.02605363  1.04994902  0.02437874 ... -0.00729558 -0.02234383
  -0.01253467]
 [-0.02718612  0.02073254 -0.04145333 ... -0.03295789  0.01374188
  -0.01214026]]
