In [6]:
# Personality Prediction from Social Media Data (MBTI + Big Five Mapping)

import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# 1. Load Dataset
df = pd.read_csv("D:/Users/KUSUMA/mbti_1.csv")   # change filename if needed
print("Dataset loaded successfully. Shape:", df.shape)

# Dataset has columns: "type" (MBTI label), "posts" (user posts)

# 2. Text Cleaning
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', str(text))  # remove links
    text = re.sub(r'[^A-Za-z\s]', '', text)                  # remove symbols
    text = text.lower()
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

df['clean_text'] = df['posts'].apply(clean_text)

# 3. Features & Labels
X = df['clean_text']
y = df['type']  # MBTI personality types (16 classes)

# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X).toarray()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

# 4. Train Model with balanced classes
model = LogisticRegression(max_iter=500, class_weight="balanced")
model.fit(X_train, y_train)

# 5. Predictions
y_pred = model.predict(X_test)

# 6. Evaluation
print("\n✅ Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# 7. MBTI → Big Five Mapping
def mbti_to_bigfive(mbti_type):
    traits = {}
    traits["Extraversion"] = "Introvert" if mbti_type[0] == "I" else "Extravert"
    traits["Openness"] = "Intuitive (Open to ideas)" if mbti_type[1] == "N" else "Sensing (Practical)"
    traits["Agreeableness"] = "Thinking (Logical)" if mbti_type[2] == "T" else "Feeling (Empathetic)"
    traits["Conscientiousness"] = "Judging (Organized)" if mbti_type[3] == "J" else "Perceiving (Flexible)"
    return traits

# 8. AI Personality Summary
def generate_summary(mbti_type):
    traits = mbti_to_bigfive(mbti_type)
    summary = (f"As a {mbti_type}, you are likely {traits['Extraversion'].lower()}, "
               f"{traits['Openness'].lower()}, "
               f"{traits['Agreeableness'].lower()}, "
               f"and {traits['Conscientiousness'].lower()}.")
    return summary

# 9. Show sample prediction
sample_text = X_test[0]  # first sample from test
predicted_mbti = model.predict([X_test[0]])[0]
print("\n🔮 Predicted MBTI Type:", predicted_mbti)
print("🧠 Big Five Traits:", mbti_to_bigfive(predicted_mbti))
print("📝 Summary:", generate_summary(predicted_mbti))


Dataset loaded successfully. Shape: (8675, 2)


[nltk_data] Downloading package stopwords to C:\Users\GOUTHAM
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



✅ Model Accuracy: 0.6680115273775216

Classification Report:
               precision    recall  f1-score   support

        ENFJ       0.50      0.51      0.51        41
        ENFP       0.68      0.69      0.69       125
        ENTJ       0.41      0.66      0.50        44
        ENTP       0.60      0.65      0.62       135
        ESFJ       0.12      0.29      0.17         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.38      0.43      0.40         7
        ESTP       0.67      0.53      0.59        15
        INFJ       0.75      0.58      0.65       288
        INFP       0.76      0.73      0.75       370
        INTJ       0.64      0.68      0.66       193
        INTP       0.75      0.74      0.75       293
        ISFJ       0.74      0.58      0.65        45
        ISFP       0.53      0.58      0.55        53
        ISTJ       0.59      0.68      0.63        44
        ISTP       0.55      0.72      0.62        67

    accuracy     