In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Make sure to download these resources if you haven't already.
# 'punkt_tab' is the tokenizer data that word_tokenize() loads in recent NLTK
# releases (3.9+), where it replaced the pickled 'punkt' models; 'punkt' is
# still fetched so the script keeps working on older NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the dataset; the code below reads its 'posts' column.
df = pd.read_csv('data/mbti_1.csv')


# Text preprocessing function
_HTTP_URL_RE = re.compile(r'http\S+')  # URLs that start with http
_WWW_URL_RE = re.compile(r'www\S+')    # URLs that start with www
_PREPROCESS_RESOURCES = {}  # lazily filled cache shared by all preprocess() calls


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, building both on first use.

    The English stop-word list is loaded once and stored as a frozenset so
    membership tests are O(1); the lemmatizer is created once and reused.
    Loading is deferred to the first call so that defining this function does
    not require the NLTK corpora to be present yet.
    """
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Turn raw text into a list of lemmatized, lowercase word tokens.

    Steps: strip ``http...`` and ``www...`` URLs, lowercase, tokenize with
    NLTK's ``word_tokenize``, drop tokens that are not purely alphabetic or
    that are English stop words, then lemmatize what remains.

    Args:
        text: The raw text to clean.

    Returns:
        A list of lemmatized tokens. Non-string input (for example a missing
        value, which pandas represents as a float NaN) yields an empty list
        instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return []

    # Two separate passes, in this order, to keep the original URL-stripping
    # semantics exactly.
    text = _HTTP_URL_RE.sub('', text)
    text = _WWW_URL_RE.sub('', text)

    tokens = word_tokenize(text.lower())

    # Fetch the shared resources once per call rather than reloading the
    # stop-word list for every single token.
    stop_words, lemmatizer = _get_preprocess_resources()
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Apply preprocessing
df['processed_posts'] = df['posts'].apply(preprocess)

# TF-IDF
# The posts are already token lists, so the tokenizer passes them through
# unchanged and lowercasing is disabled. token_pattern=None tells scikit-learn
# that the default regex is intentionally unused, which silences its
# "token_pattern will not be used" warning.
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, token_pattern=None)
# NOTE(review): .toarray() materializes a dense (documents x vocabulary) array,
# which can be very large; kept dense in case later code relies on an ndarray.
# Drop .toarray() to keep the sparse matrix if memory becomes a problem.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_posts']).toarray()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Word2Vec
# Passing `sentences` to the constructor builds the vocabulary AND trains the
# model, so the epoch count is set here. A separate .train() call afterwards
# would train a second time with a restarted learning-rate schedule.
word2vec_model = Word2Vec(
    sentences=df['processed_posts'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
)

# Example: Get the vector for a word
# Guard the lookup: indexing wv with a word that is not in the vocabulary
# raises KeyError, so fall back to None when "climate" never occurs.
word_vector = word2vec_model.wv['climate'] if 'climate' in word2vec_model.wv else None

# Show TF-IDF result
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
print("TF-IDF Feature Names:", tfidf_feature_names)

# Example output from Word2Vec
print("Word Vector for 'climate':", word_vector)