In [1]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
df = pd.read_csv('mbti_1.csv')

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|[^a-z\s]", "", text)
    return text

df['clean_posts'] = df['posts'].apply(clean_text)

df['IE'] = df['type'].apply(lambda x: 0 if x[0] == 'I' else 1)
df['NS'] = df['type'].apply(lambda x: 0 if x[1] == 'N' else 1)
df['FT'] = df['type'].apply(lambda x: 0 if x[2] == 'F' else 1)
df['JP'] = df['type'].apply(lambda x: 0 if x[3] == 'J' else 1)

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['clean_posts']).toarray()

joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

def train_binary_ann(X, y, label):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1, verbose=1)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    print(f"\n--- {label} Trait ---")
    print(classification_report(y_test, y_pred))
    model.save(f'{label}_model.h5')
    return model

model_IE = train_binary_ann(X, df['IE'], 'IE')
model_NS = train_binary_ann(X, df['NS'], 'NS')
model_FT = train_binary_ann(X, df['FT'], 'FT')
model_JP = train_binary_ann(X, df['JP'], 'JP')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.7524 - loss: 0.5661 - val_accuracy: 0.7824 - val_loss: 0.4724
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7808 - loss: 0.4490 - val_accuracy: 0.8357 - val_loss: 0.3771
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8586 - loss: 0.3451 - val_accuracy: 0.8343 - val_loss: 0.3607
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8919 - loss: 0.2884 - val_accuracy: 0.8228 - val_loss: 0.3718
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8855 - loss: 0.2801 - val_accuracy: 0.8401 - val_loss: 0.3687
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step

--- IE Trait ---




              precision    recall  f1-score   support

           0       0.89      0.93      0.91      1353
           1       0.70      0.58      0.64       382

    accuracy                           0.85      1735
   macro avg       0.80      0.76      0.77      1735
weighted avg       0.85      0.85      0.85      1735

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.8378 - loss: 0.4650 - val_accuracy: 0.8559 - val_loss: 0.3922
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8700 - loss: 0.3573 - val_accuracy: 0.8559 - val_loss: 0.3461
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8603 - loss: 0.3109 - val_accuracy: 0.8660 - val_loss: 0.3050
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9009 - loss: 0.2428 - val_accuracy: 0.8818 - val_loss: 0.2965
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9178 - loss: 0.2171 - val_accuracy: 0.8818 - val_loss: 0.3164
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step





--- NS Trait ---
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1489
           1       0.78      0.41      0.54       246

    accuracy                           0.90      1735
   macro avg       0.85      0.70      0.74      1735
weighted avg       0.89      0.90      0.89      1735



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.6252 - loss: 0.6399 - val_accuracy: 0.7594 - val_loss: 0.4838
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8239 - loss: 0.3974 - val_accuracy: 0.8401 - val_loss: 0.3774
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8683 - loss: 0.3215 - val_accuracy: 0.8401 - val_loss: 0.3770
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8880 - loss: 0.2833 - val_accuracy: 0.8444 - val_loss: 0.3824
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8812 - loss: 0.2950 - val_accuracy: 0.8357 - val_loss: 0.3779
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step





--- FT Trait ---
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       937
           1       0.83      0.80      0.81       798

    accuracy                           0.83      1735
   macro avg       0.83      0.83      0.83      1735
weighted avg       0.83      0.83      0.83      1735



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.5932 - loss: 0.6655 - val_accuracy: 0.6888 - val_loss: 0.5816
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7337 - loss: 0.5393 - val_accuracy: 0.7536 - val_loss: 0.4928
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8097 - loss: 0.4349 - val_accuracy: 0.7651 - val_loss: 0.4798
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8258 - loss: 0.4075 - val_accuracy: 0.7594 - val_loss: 0.4819
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8440 - loss: 0.3755 - val_accuracy: 0.7579 - val_loss: 0.5016
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step





--- JP Trait ---
              precision    recall  f1-score   support

           0       0.79      0.59      0.68       669
           1       0.78      0.90      0.84      1066

    accuracy                           0.78      1735
   macro avg       0.79      0.75      0.76      1735
weighted avg       0.79      0.78      0.78      1735



In [3]:
np.random.seed(42)
responses = np.random.randint(1, 6, size=(1000, 20))

def assign_label(block):
    avg = block.mean(axis=1)
    return (avg > 3).astype(int) 

IE = assign_label(responses[:, 0:5])    
NS = assign_label(responses[:, 5:10])    
FT = assign_label(responses[:, 10:15])   
JP = assign_label(responses[:, 15:20])   

labels = np.stack([IE, NS, FT, JP], axis=1)

X_train, X_test, y_train, y_test = train_test_split(responses, labels, test_size=0.2)

model = Sequential([
    Dense(64, activation='relu', input_shape=(20,)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(4, activation='sigmoid') 
])

model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

model.save('questionnaire_ann_model.h5')
print("✅ Questionnaire model saved as questionnaire_ann_model.h5")


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.1626 - loss: 1.0288 - val_accuracy: 0.3625 - val_loss: 0.7492
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2748 - loss: 0.7814 - val_accuracy: 0.4125 - val_loss: 0.6770
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.3386 - loss: 0.7001 - val_accuracy: 0.5250 - val_loss: 0.6475
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.3914 - loss: 0.6684 - val_accuracy: 0.4125 - val_loss: 0.6250
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.3815 - loss: 0.6352 - val_accuracy: 0.4125 - val_loss: 0.5984
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4461 - loss: 0.6061 - val_accuracy: 0.4250 - val_loss: 0.5767
Epoch 7/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━



✅ Questionnaire model saved as questionnaire_ann_model.h5
