In [None]:
import os

# Only prompt for an upload when the dataset is not already on disk, so that
# "Restart & Run All" does not block on the interactive file picker (and a
# repeated upload does not leave a renamed duplicate next to the original).
if os.path.exists("mbti_1.csv"):
    uploaded = {}  # nothing new was uploaded during this run
else:
    from google.colab import files  # Colab-only helper, imported lazily
    uploaded = files.upload()


Saving mbti_1.csv to mbti_1.csv


In [None]:
import pandas as pd

# Read the uploaded dataset into a DataFrame.
csv_path = "mbti_1.csv"
data = pd.read_csv(csv_path)

# Show the leading rows as a quick sanity check of the parsed columns.
preview = data.head()
print(preview)


   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


In [None]:
# Per-column count of missing values.
null_counts = data.isna().sum()
print(null_counts)


type     0
posts    0
dtype: int64


In [None]:
# Number of rows per personality type (reveals the class imbalance).
type_counts = data['type'].value_counts()
print(type_counts)


type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: count, dtype: int64


In [None]:
# Normalise the raw post text in one pass:
#   1. lowercase;
#   2. turn the '|||' post separator (visible in the data preview) into a space.
#      This must happen before URL stripping: '\S+' would otherwise run through
#      the separator and delete the first word of the following post, and the
#      punctuation pass would glue neighbouring posts together;
#   3. drop URLs ('http\S+' already covers https, so no separate branch is needed);
#   4. drop every remaining character that is neither a word character nor whitespace.
posts_clean = (
    data['posts']
    .str.lower()
    .str.replace('|||', ' ', regex=False)
    .str.replace(r'http\S+|www\S+', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
)
data['posts'] = posts_clean


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

# Learn the vocabulary and IDF weights from the training rows only; fitting on
# every row would leak test-set statistics into the features.
# NOTE: test_size / random_state must stay identical to the train_test_split
# call that later splits X, so that both calls select the same rows.
posts_train, posts_holdout = train_test_split(data['posts'], test_size=0.2, random_state=42)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Set a reasonable feature limit
vectorizer.fit(posts_train)

X = vectorizer.transform(data['posts'])  # Feature matrix (all rows, train-fitted vocabulary)
y = data['type']  # Labels


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline classifier; max_iter is raised above the default to give the solver
# more room to converge (increase further if needed).
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)


Accuracy: 0.6


In [None]:
# Using unigrams and bigrams.
# The split is redone here on purpose: reassigning X alone would leave
# X_train / X_test pointing at the earlier unigram-only matrix, so nothing
# downstream would ever see the bigram features. Splitting the raw text first
# also lets the vectorizer be fitted on the training rows only.
posts_train, posts_test, y_train, y_test = train_test_split(
    data['posts'], data['type'], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(posts_train)
X_test = vectorizer.transform(posts_test)

# Full feature matrix, kept so that X still reflects the current vectorizer.
X = vectorizer.transform(data['posts'])


In [None]:
# Class-weighted variant of the baseline. The original cell only constructed
# the estimator; it is fitted and scored here so the effect of
# class_weight='balanced' on the imbalanced labels is actually measured.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)
print("Balanced class-weight Accuracy:", accuracy_score(y_test, model.predict(X_test)))


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C, spaced one decade apart.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Cross-validated search over C, ranked by accuracy.
base_estimator = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(base_estimator, param_grid, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)


Best Parameters: {'C': 10}


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Use the grid-search winner rather than a hard-coded C=10, which silently goes
# stale whenever the search is re-run on different features. GridSearchCV is
# left at its default refit=True above, so best_estimator_ has already been
# retrained on all of X_train with the best parameters.
optimized_model = grid_search.best_estimator_

# Evaluate on test data
y_pred = optimized_model.predict(X_test)
print("Optimized Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 keeps the 0.00 scores for classes that are never predicted
# but suppresses the repeated undefined-metric warnings.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Optimized Accuracy: 0.6305475504322766
Classification Report:
               precision    recall  f1-score   support

        ENFJ       0.67      0.29      0.41        41
        ENFP       0.65      0.62      0.63       125
        ENTJ       0.67      0.45      0.54        44
        ENTP       0.60      0.53      0.56       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.00      0.00      0.00         7
        ESTP       1.00      0.20      0.33        15
        INFJ       0.59      0.62      0.60       288
        INFP       0.65      0.80      0.72       370
        INTJ       0.56      0.68      0.61       193
        INTP       0.65      0.77      0.71       293
        ISFJ       0.94      0.36      0.52        45
        ISFP       0.72      0.34      0.46        53
        ISTJ       0.79      0.34      0.48        44
        ISTP       0.69      0.43      0.53        67

    accuracy     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
pip install transformers scikit-learn


In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import torch

# Inputs for the embedding stage, taken from the `data` frame built in the
# earlier cells: post text as features, personality type as target.
y = data['type']  # Labels
X = data['posts']  # Text data

# Step 1: BERT Embedding
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
_bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_bert_checkpoint)
model = BertModel.from_pretrained(_bert_checkpoint)

def bert_embeddings(text):
    """Return the [CLS] embedding(s) of `text` as a 2-D NumPy array.

    Args:
        text: a single string or a list of strings, passed straight to the
            module-level `tokenizer`. Inputs are padded to a common length and
            truncated to at most 128 tokens.

    Returns:
        Array with one row per input text, holding the final-layer hidden
        state of the first token. A single string therefore yields one row.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built or kept
    # alive for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # Get CLS token embedding (the first token's embedding)
    return outputs.last_hidden_state[:, 0, :].numpy()

# Generate embeddings for all text data.
# bert_embeddings returns a 2-D array (one row per text), so the per-batch
# results are stacked row-wise with np.vstack. Wrapping per-text results in
# np.array([...]) instead produces a 3-D (n, 1, hidden) array, which the
# scikit-learn estimators below reject at fit time.
EMBED_BATCH_SIZE = 16  # texts per forward pass; lower this if memory is tight
texts = X.tolist()
X_embeddings = np.vstack([
    bert_embeddings(texts[start:start + EMBED_BATCH_SIZE])
    for start in range(0, len(texts), EMBED_BATCH_SIZE)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

# Step 2: Define Base Models (SVM and Logistic Regression)
svm_clf = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1, probability=True, class_weight='balanced'))
log_reg_clf = LogisticRegression(C=10, max_iter=1000, class_weight='balanced')

# Step 3: Stacking Ensemble
estimators = [('svm', svm_clf), ('log_reg', log_reg_clf)]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Step 4: Train Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Step 5: Predictions and Evaluation
y_pred = stacking_clf.predict(X_test)
print("Stacking Ensemble Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0: a class that is never predicted scores 0.0. The previous
# value of 1 reported a perfect precision for such classes and inflated the
# averaged metrics.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import torch

# Re-read the dataset from disk (adjust the path if the file lives elsewhere).
csv_path = 'mbti_1.csv'
data = pd.read_csv(csv_path)

# Print the column index so the names used below can be checked by eye.
print("Columns in data:", data.columns)

# Pick features and target by column name; change these if the file uses other names.
y = data['type']   # labels
X = data['posts']  # post text

# Step 1: BERT Embedding
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
_bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_bert_checkpoint)
model = BertModel.from_pretrained(_bert_checkpoint)

def bert_embeddings(text):
    """Return the [CLS] embedding(s) of `text` as a 2-D NumPy array.

    Args:
        text: a single string or a list of strings, passed straight to the
            module-level `tokenizer`. Inputs are padded to a common length and
            truncated to at most 128 tokens.

    Returns:
        Array with one row per input text, holding the final-layer hidden
        state of the first token. A single string therefore yields one row.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built or kept
    # alive for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # CLS token embedding

# Generate embeddings for all text data.
# bert_embeddings returns a 2-D array (one row per text), so the per-batch
# results are stacked row-wise with np.vstack. Wrapping per-text results in
# np.array([...]) instead produces a 3-D (n, 1, hidden) array, which the
# scikit-learn estimators below reject at fit time.
EMBED_BATCH_SIZE = 16  # texts per forward pass; lower this if memory is tight
texts = X.tolist()
X_embeddings = np.vstack([
    bert_embeddings(texts[start:start + EMBED_BATCH_SIZE])
    for start in range(0, len(texts), EMBED_BATCH_SIZE)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

# Step 2: Define Base Models (SVM and Logistic Regression)
svm_clf = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1, probability=True, class_weight='balanced'))
log_reg_clf = LogisticRegression(C=10, max_iter=1000, class_weight='balanced')

# Step 3: Stacking Ensemble
estimators = [('svm', svm_clf), ('log_reg', log_reg_clf)]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Step 4: Train Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Step 5: Predictions and Evaluation
y_pred = stacking_clf.predict(X_test)
print("Stacking Ensemble Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0: a class that is never predicted scores 0.0. The previous
# value of 1 reported a perfect precision for such classes and inflated the
# averaged metrics.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
