In [1]:
import xgboost as xgb

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence


In [3]:
# Load the dataset from CSV. The later cells read its "posts" and "type" columns.
csv_path = 'C:/Users/JenMing/Desktop/MBTI/LSTM/mbti_to_LSTM_DF.csv'
df = pd.read_csv(csv_path)

In [4]:
# 編碼轉換
personality_mapping = {'INFJ': 0, 'ENTP': 1, 'INTP': 2, 'INTJ': 3, 'ENTJ': 4, 'ENFJ': 5, 'INFP': 6, 'ENFP': 7, 
                       'ISFP': 8, 'ISTP': 9, 'ISFJ': 10, 'ISTJ': 11, 'ESTP': 12, 'ESFP': 13, 'ESTJ': 14, 'ESFJ': 15 }


In [5]:
# Build one (feature_vector, label_id) pair per row of df.
#
# The "posts" cell is a string holding a comma-separated list of MBTI type
# names wrapped in list punctuation; the characters in chars_to_remove are
# stripped before splitting. The feature vector holds, for every type id in
# personality_mapping order, the share of the row's entries that carry that
# type, rounded to two decimals. The label is the id of the row's "type".
encoded_data = []

chars_to_remove = "][' "
strip_table = str.maketrans("", "", chars_to_remove)
type_count = len(personality_mapping)

for _, record in df.iterrows():
    cleaned_posts = record["posts"].translate(strip_table)
    post_type_ids = [personality_mapping[name] for name in cleaned_posts.split(',')]

    # Tally how often each type id occurs among this row's entries.
    tallies = [0] * type_count
    for type_id in post_type_ids:
        tallies[type_id] += 1

    entry_total = len(post_type_ids)
    type_shares = [round(tally / entry_total, 2) for tally in tallies]

    label_id = personality_mapping[record["type"]]
    encoded_data.append((type_shares, label_id))

In [6]:
# Separate the (features, label) pairs and turn each side into a tensor.
feature_rows = [pair[0] for pair in encoded_data]
label_ids = [pair[1] for pair in encoded_data]

input_data = torch.tensor(feature_rows, dtype=torch.float32)
# int64 because these are class labels, not measurements.
target_personality = torch.tensor(label_ids, dtype=torch.int64)

# Hold out 15% of the rows for validation; the fixed seed keeps the split repeatable.
train_dialogues, val_dialogues, train_target, val_target = train_test_split(
    input_data,
    target_personality,
    test_size=0.15,
    random_state=42,
)

In [7]:
# Convert the training and validation tensors to NumPy arrays for the
# estimators and metric functions used in the following cells.
X_train, y_train = train_dialogues.numpy(), train_target.numpy()
X_test, Y_test = val_dialogues.numpy(), val_target.numpy()

In [12]:
# Candidate hyperparameter values for the exhaustive search
# (4 x 4 x 3 = 48 combinations).
param_grid = {
    'n_estimators': [ 50, 100, 200, 500],  # number of boosted trees
    'max_depth': [1, 3, 5, 10],  # maximum depth of each tree
    'learning_rate': [0.1, 0.05, 0.5]  # boosting learning rate (step size shrinkage)
}

# Base estimator with default settings; the grid search supplies the varied values.
xgb_model = xgb.XGBClassifier()

# Evaluate every combination with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Keep the winning combination for the final fit in the next cell.
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')



Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 200}


In [13]:
# Fit the final classifier on the training split using the hyperparameters
# selected by the grid search (fit returns the estimator itself).
best_XGB_classifier = xgb.XGBClassifier(**best_params).fit(X_train, y_train)

# Predict class ids for the held-out validation split.
y_pred = best_XGB_classifier.predict(X_test)

# Share of validation rows whose predicted id equals the true id.
val_accuracy = accuracy_score(Y_test, y_pred)
print(f'Validation Accuracy: {val_accuracy}')

Validation Accuracy: 0.42089093701996927


# Baseline for comparison: an XGBoost classifier with default hyperparameters.
#
# Its predictions and accuracy are stored under baseline_* names so they do
# not overwrite y_pred / val_accuracy from the tuned model. The later metric
# and report cells read y_pred, and the tuned model is the one that gets
# saved, so overwriting y_pred here would make a top-to-bottom run report the
# wrong model.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

baseline_y_pred = xgb_model.predict(X_test)

baseline_val_accuracy = accuracy_score(Y_test, baseline_y_pred)
print(f'Validation Accuracy: {baseline_val_accuracy}')

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary metrics for the predictions currently held in y_pred.
predicted_labels = y_pred

# Score against the NumPy label array (Y_test holds the same values as the
# val_target tensor) so scikit-learn receives plain arrays on both sides.
accuracy = accuracy_score(Y_test, predicted_labels)

# Weighted averages: each class's score is weighted by its number of true rows.
# zero_division=0 scores an undefined per-class metric as 0, which is what the
# default setting does as well, but without emitting an UndefinedMetricWarning.
precision = precision_score(Y_test, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(Y_test, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(Y_test, predicted_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.42
Precision: 0.38
Recall: 0.42
F1 Score: 0.39


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Per-dimension evaluation report.
#
# Decodes predicted and true class ids back to four-letter MBTI strings, counts
# exact matches and per-letter matches (position 0 = E/I, 1 = S/N, 2 = T/F,
# 3 = J/P), prints the summary and writes it to note.txt together with the
# precision / recall / f1 values computed in the metrics cell.

# id -> type string, so decoding is a single lookup instead of a scan.
inverse_personality_mapping = {
    type_id: mbti_type for mbti_type, type_id in personality_mapping.items()
}

# Dictionary key for each letter position, in position order.
dimension_names = ('E/I', 'S/N', 'T/F', 'J/P')
dimension_counts = dict.fromkeys(dimension_names, 0)
correct_predictions = 0

item_count = len(predicted_labels)

for predicted_id, true_id in zip(predicted_labels, val_target):
    # int() turns NumPy / tensor scalars into plain ints usable as dict keys.
    mbti_labels_pre = inverse_personality_mapping[int(predicted_id)]
    mbti_labels_tru = inverse_personality_mapping[int(true_id)]

    if mbti_labels_pre == mbti_labels_tru:
        correct_predictions += 1

    for position, dimension in enumerate(dimension_names):
        if mbti_labels_pre[position] == mbti_labels_tru[position]:
            dimension_counts[dimension] += 1

accuracy = correct_predictions / item_count

print(f"Validation Accuracy: {accuracy*100:.4f}%")
for dimension in dimension_names:
    dimension_label = dimension.replace('/', '.')  # 'E/I' is reported as 'E.I'
    matched = dimension_counts[dimension]
    print(f'{dimension_label}: {matched}/{item_count} ')
    print('Accuracy: ' + str(matched / item_count) + '\n')

# The file is opened only after every value has been computed, so a failure
# above cannot leave a truncated or empty report behind.
with open("C:/Users/JenMing/Desktop/MBTI/XGBoost/note.txt", "w") as f:
    f.write(f"Accuracy: {accuracy:.2f}\n")
    f.write(f"Precision: {precision:.2f}\n")
    f.write(f"Recall: {recall:.2f}\n")
    f.write(f"F1 Score: {f1:.2f}\n\n")

    f.write(f"Validation Accuracy: {accuracy*100:.4f}%\n")

    for dimension in dimension_names:
        dimension_label = dimension.replace('/', '.')
        matched = dimension_counts[dimension]
        f.write(f'{dimension_label}: {matched}/{item_count} ')
        f.write('Accuracy: ' + str(matched / item_count) + '\n')

Validation Accuracy: 42.0891%
E.I: 1014/1302 
Accuracy: 0.7788018433179723

S.N: 1144/1302 
Accuracy: 0.8786482334869432

T.F: 1033/1302 
Accuracy: 0.793394777265745

J.P: 878/1302 
Accuracy: 0.674347158218126



In [16]:
# Show the predicted class ids currently stored in y_pred.
print(y_pred)

[6 6 0 ... 7 3 6]


In [17]:
import joblib

# Destination file for the fitted XGBoost classifier (best_XGB_classifier).
model_filename = 'C:/Users/JenMing/Desktop/MBTI/XGBoost/XGB_model.pkl'

# Serialize the model to that file.
joblib.dump(best_XGB_classifier, model_filename)

['C:/Users/JenMing/Desktop/MBTI/XGBoost/XGB_model.pkl']

# Reload the classifier from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_model = joblib.load(model_filename)

# Predict with the reloaded model. No `new_data` is defined anywhere in this
# notebook, so the held-out validation features are used; replace X_test with
# real unseen data (same 16-column feature layout) when it is available.
new_predictions = loaded_model.predict(X_test)

# Round-trip check: the reloaded model must reproduce the in-memory model.
assert np.array_equal(new_predictions, best_XGB_classifier.predict(X_test)), \
    "reloaded model predictions differ from the in-memory model"