# Train Word2Vec word embeddings, explore and visualize them

In [2]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import random

In [7]:
# Ensure NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [30]:
# Load your data
# file_path = '/content/combined_text_visual_labels.csv'
file_path = '/content/messages.txt'
# data = pd.read_csv(file_path)
data = pd.read_csv(file_path, sep='\t')

In [39]:
# Preprocessing function for text
# def preprocess_text(text):
#     # Lowercase
#     text = text.lower()
#     # Remove non-alphanumeric characters (basic cleaning)
#     text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
#     # Tokenize
#     tokens = word_tokenize(text)
#     return tokens

# Preprocessing function for text
def preprocess_text(text):
    # Check if the text is a string before applying lower()
    if isinstance(text, str):
        # Remove links and similar non-words using regular expressions
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove links
        text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags

        # Lowercase
        text = text.lower()
        # Remove non-alphanumeric characters (basic cleaning)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Tokenize
        tokens = word_tokenize(text)
        return tokens
    else:
        # Handle non-string values (e.g., float) - you can choose to skip them,
        # replace them with an empty string, or handle them in a way that makes
        # sense for your data.
        return []  # Returning an empty list for non-string values


# Apply preprocessing to the 'utterance' column
data['tokens'] = data['utterance'].apply(preprocess_text)
# print(data.head())

In [40]:
# Train Word2Vec model
tokenized_corpus = data['tokens'].tolist()
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

In [41]:
# Extract word vectors for visualization
words = list(model.wv.index_to_key)
word_vectors = model.wv[words]

In [43]:
# Get the vector for a specific word
word_vector = model.wv['rapture']

# # Example:
# word_vector = model.wv['hello']
print(word_vector)  # Prints the vector representation of 'hello'

[-1.6375694   0.87641317  0.47428113 -0.6601066   1.1313608  -0.28947112
 -0.16658887  2.0931907   2.5158696  -0.19141366 -0.1302067  -0.77523404
  0.6766735   0.55199283  0.72020084 -1.097189    0.65282977  0.11378887
  0.13688046 -1.5672868   0.868893   -0.10696364  2.2126522   1.155414
  0.6142786  -1.315182    0.09236972  0.5233969   0.06617496  0.59227705
 -0.29106268 -0.52912974 -0.97642875  0.19655797 -0.00990307  0.41919494
  0.28485382 -0.16640693  0.36843815 -0.05643533  0.176577    0.3282708
  0.6719951  -0.83766675 -0.12759058 -0.17451616 -1.3079484  -0.02385449
 -0.30156586 -0.40379208  0.5687018  -0.5405348   0.8332079  -0.6553475
  0.7039992  -0.6530111   0.34768307  0.914566    0.3236766  -1.4503043
 -0.47211426  0.05285298  0.6264814   0.48881993 -0.05239639 -1.6242442
  0.14549874  0.36125132  0.6770163   1.1054379   0.3455163   1.0821544
 -0.03672126 -0.6040956  -0.19070745  0.2411879  -0.17256866  0.7501581
  0.2256451  -2.0480902  -0.67912954  0.6607992   0.2974880

In [45]:
# # Calculate cosine similarity between two word vectors
# similarity = model.wv.similarity('word1', 'word2')

# # Find similarity between words:
# similarity = model.wv.similarity('hello', 'hi')
# print(similarity)  # Prints the similarity score between 'hello' and 'hi'

# Find the most similar words
similar_words = model.wv.most_similar(positive=[word_vector], topn=20)

# Print the results:
for word, similarity in similar_words:
    print(f"{word}: {similarity}")

rapture: 1.0
bliss: 0.8620722889900208
piti: 0.8212586045265198
sukkha: 0.8126170039176941
sukha: 0.80827796459198
pleasure: 0.8054018020629883
joy: 0.793758749961853
contentment: 0.7523196935653687
tranquility: 0.7446420192718506
lightness: 0.741054892539978
calmness: 0.7373017072677612
pleasantness: 0.7068516612052917
spaciousness: 0.6907825469970703
wave: 0.6906237602233887
blissful: 0.6746405363082886
waves: 0.6698039770126343
euphoria: 0.6593618392944336
bursts: 0.6495271921157837
heaviness: 0.6328808665275574
elation: 0.6288173794746399


In [42]:
# Reduce dimensions using t-SNE
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(word_vectors)

KeyboardInterrupt: 

In [None]:
# Get the list of English stop words from nltk
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# Random shuffle the words in the data to be able to look at a different subset every time
#random.shuffle(words)

# filter stopwords before display
words_subset = [w for w in words if not w in stop_words][:200]

In [None]:
# Plotting the embeddings
plt.figure(figsize=(12, 8))
for i, word in enumerate(words_subset):
    plt.scatter(reduced_vectors[i, 0], reduced_vectors[i, 1])
    plt.text(reduced_vectors[i, 0] + 0.1, reduced_vectors[i, 1] + 0.1, word, fontsize=9)
plt.title('t-SNE visualization of Word2Vec word embeddings')
plt.show()