In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [None]:
# Fetch the NLTK resources needed by the tokenizer, the stop-word list and the lemmatizer
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the raw MBTI dataset
df = pd.read_csv('../data/mbti_1.csv')

In [4]:
# Text preprocessing function
def preprocess(text):
    """Clean one raw post string and return its lemmatized tokens.

    Steps, in order: strip URL-like substrings, lowercase, tokenize with
    NLTK's ``word_tokenize``, keep only purely alphabetic tokens that are not
    English stop words, and lemmatize what remains with ``WordNetLemmatizer``.

    Args:
        text: Raw text of a single entry (a ``str``).

    Returns:
        list[str]: Lemmatized tokens, in their original order.
    """
    # NOTE(review): both patterns are case-sensitive and run before lowercasing,
    # and 'www\S+' can match inside a longer token — confirm this is acceptable.
    text = re.sub(r'http\S+', '', text)  # Removes URLs that start with http
    text = re.sub(r'www\S+', '', text)   # Removes URLs that start with www

    text = text.lower()  # Lowercase
    tokens = word_tokenize(text)  # Tokenize

    # Build the stop-word collection once per call (not once per token) and as
    # a set, so each membership test is a hash lookup instead of a list scan.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Apply preprocessing
df['processed_posts'] = df['posts'].apply(preprocess)

# TF-IDF
# The posts are already token lists, so the vectorizer receives an identity
# tokenizer. token_pattern=None states that the default regex is deliberately
# unused, which avoids scikit-learn's "token_pattern will not be used" warning.
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
# NOTE(review): .toarray() converts the sparse result to a dense array; for the
# recorded shape (8675, 79874) that is roughly 5.5 GB if stored as float64.
# It is kept dense here because the save step passes this array to np.save.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_posts']).toarray()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Word2Vec
# Passing `sentences` to the constructor already builds the vocabulary and
# trains the model, so the epoch count is given here rather than through a
# second train() call that would add further passes on top of the first run.
word2vec_model = Word2Vec(
    sentences=df['processed_posts'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
)

# Example: Get the vector for a word
# (raises KeyError if "climate" is not in the learned vocabulary)
word_vector = word2vec_model.wv['climate']  # Get the vector for the word "climate"

# Show TF-IDF result
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
print("TF-IDF Feature Names:", tfidf_feature_names)

# Example output from Word2Vec
print("Word Vector for 'climate':", word_vector)

[nltk_data] Downloading package punkt to /Users/luanan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luanan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/luanan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF Matrix Shape: (8675, 79874)
TF-IDF Feature Names: ['aa' 'aaa' 'aaaa' ... 'ﾉｼ' 'ﾟ' 'ﾟдﾟщ']
Word Vector for 'climate': [ 1.90252766e-01  1.48653913e+00 -3.71686339e-01 -2.38267437e-01
 -3.82821679e-01  2.07242668e-01 -6.41261637e-02  1.50640702e+00
  5.11804104e-01 -2.56112039e-01 -1.93555343e+00  5.99927306e-01
 -6.67593300e-01 -1.69792816e-01  3.74819905e-01 -1.75895822e+00
  1.54398731e-04 -2.61034757e-01 -8.36529732e-01  1.14265487e-01
  1.39894664e+00 -4.06002641e-01 -6.70712531e-01  3.16157162e-01
  1.17947578e+00  5.12195706e-01 -1.29408395e+00 -8.09257090e-01
 -1.35896909e+00  9.06167805e-01 -1.91821493e-02 -1.40571013e-01
  1.88803303e+00  1.20748699e+00 -6.21329188e-01  6.91762924e-01
  9.66809750e-01 -2.07818389e-01 -2.25952808e-02 -4.32303488e-01
  2.39120558e-01 -1.57070351e+00 -2.52456486e-01 -4.93468612e-01
 -3.66193771e-01 -1.97554088e+00  6.96181297e-01  8.45668316e-01
 -5.95512629e-01  4.14425820e-01  1.51373878e-01  1.07751215e+00
  1.53143597e+00 -1.95859993e+0

In [9]:
# Save the dense TF-IDF matrix as a NumPy .npy file
np.save('../data/tfidf_matrix.npy', tfidf_matrix)

# To load the TF-IDF matrix back:
# loaded_matrix = np.load('../data/tfidf_matrix.npy')

# Save the trained Word2Vec model
word2vec_model.save('../data/word2vec_model.gensim')
# To load the model back (bind it to a model name, not to `df`):
# word2vec_model = Word2Vec.load('../data/word2vec_model.gensim')

In [17]:
# Re-read the CSV. This rebinds `df`, so columns added to the earlier frame
# (e.g. 'processed_posts') are not present on this one.
df = pd.read_csv("../data/mbti_1.csv")
"""
Here we create 4 new columns each containing information about one of the key dichotomies of MBTI
"""

def label_mbti_ie(row):
    """Encode the I/E dichotomy: 0 if ``row['type']`` contains "I", otherwise 1."""
    mbti_type = row['type']
    return int("I" not in mbti_type)
    
def label_mbti_ns(row):
    """Encode the N/S dichotomy: 0 if ``row['type']`` contains "N", otherwise 1."""
    mbti_type = row['type']
    return int("N" not in mbti_type)

def label_mbti_ft(row):
    """Encode the F/T dichotomy: 0 if ``row['type']`` contains "F", otherwise 1."""
    mbti_type = row['type']
    return int("F" not in mbti_type)

def label_mbti_pj(row):
    """Encode the P/J dichotomy: 0 if ``row['type']`` contains "P", otherwise 1."""
    mbti_type = row['type']
    return int("P" not in mbti_type)

# Derive one 0/1 column per dichotomy by running its labeller over every row
for column_name, labeller in (
    ('I/E', label_mbti_ie),
    ('N/S', label_mbti_ns),
    ('F/T', label_mbti_ft),
    ('P/J', label_mbti_pj),
):
    df[column_name] = df.apply(labeller, axis=1)

In [6]:
# Load the TF-IDF matrix from the same location the save step writes it to
# ('../data/'), rather than from the notebook's working directory.
embedding_matrix = np.load("../data/tfidf_matrix.npy")
# Store each row vector as one object in a single column named 'Row'.
# NOTE(review): the [1:-1] slice drops the first and last feature of every
# row — confirm this is intended and not a leftover from stripping brackets.
embed = pd.DataFrame({'Row': [np.array(row)[1:-1] for row in embedding_matrix]})

In [30]:
# Keep only the four 0/1 label columns and display the result
four_cat = df[['I/E', 'N/S','F/T','P/J']]
four_cat

Unnamed: 0,I/E,N/S,F/T,P/J
0,0,0,0,1
1,1,0,1,0
2,0,0,1,0
3,0,0,1,1
4,1,0,1,1
...,...,...,...,...
8670,0,1,0,0
8671,1,0,0,0
8672,0,0,1,0
8673,0,0,0,0


In [28]:
# Append the 'Row' embedding column to the labelled frame (axis=1 joins
# column-wise, aligning rows on the index) and display the result
new_df = pd.concat([df, embed], axis=1)
new_df

Unnamed: 0,type,posts,I/E,N/S,F/T,P/J,Row
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ENTP,'I'm finding the lack of me in these posts ver...,1,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0,0,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,ENTJ,'You're fired.|||That's another silly misconce...,1,0,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8671,ENFP,'So...if this thread already exists someplace ...,1,0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8672,INTP,'So many questions when i do these things. I ...,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8673,INFP,'I am very conflicted right now when it comes ...,0,0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [29]:
# Write the combined frame to CSV.
# NOTE(review): the 'Row' column holds NumPy arrays, which CSV can only store
# as text; long arrays may be abbreviated with '...' in that text — verify the
# file round-trips before relying on it.
new_df.to_csv('../data/tfidf_preprocessed.csv')

In [31]:
# Write the four label columns to CSV (the index is written as well, since index=False is not passed)
four_cat.to_csv('../data/four_cat.csv')