In [None]:
# Required libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [None]:
# Toy corpus: five short sentences used to fit the tokenizer and the embedding layer
corpus = [
    "Natural language processing with word vectors is interesting",
    "Word embeddings capture relationships between words",
    "PCA helps reduce the dimensions of word vectors",
    "Visualizing word vectors reveals patterns in text data",
    "Machine learning models can be applied to natural language tasks",
]

In [None]:
# Turn the raw sentences into equal-length integer sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Word -> integer id mapping built from the corpus; one extra slot is
# reserved in the vocabulary size for the padding value.
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

# Encode every sentence, then pad at the end ('post') so all rows share one length.
sequences = tokenizer.texts_to_sequences(corpus)
padded_sequences = pad_sequences(sequences, padding='post')

In [None]:
# Build a small network whose first layer holds the trainable word vectors.
embedding_dim = 50  # length of each word vector

model = Sequential([
    # One embedding_dim-sized row per vocabulary index; input length is the padded width.
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=padded_sequences.shape[1],
    ),
    # Collapse the (sequence, embedding) axes so a Dense layer can consume them.
    Flatten(),
    # Single sigmoid unit: a binary head that exists only to give training a target.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Fit against a placeholder target (all zeros, one per sentence). The task itself
# is meaningless; it only drives gradient updates through the embedding layer.
dummy_targets = np.zeros((len(padded_sequences), 1))
model.fit(padded_sequences, dummy_targets, epochs=10, verbose=0)

In [None]:
# Pull the trained embedding matrix out of the network.
# The Embedding layer was added first, so it is the layer at index 0; the first
# array in its weight list is the (vocab_size, embedding_dim) lookup table.
embedding_layer = model.get_layer(index=0)
word_embeddings = embedding_layer.get_weights()[0]

In [None]:
# Reduce dimensions using PCA
def plot_word_vectors_2d(embeddings, word_index):
    """Project word vectors onto two principal components and plot them.

    Parameters
    ----------
    embeddings : array-like of shape (n_rows, n_dims)
        Embedding matrix; row ``i`` is the vector of the word whose index is ``i``.
    word_index : dict
        Mapping from word (str) to its integer row in ``embeddings``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``embeddings`` is not 2-D, or if there are fewer than two words or
        fewer than two embedding dimensions (a 2-component PCA is undefined then).
    IndexError
        If ``word_index`` refers to a row that ``embeddings`` does not have.
    """
    embeddings = np.asarray(embeddings)
    words = list(word_index)

    # Fail early with a readable message instead of an opaque error from inside PCA.
    if embeddings.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D matrix, got {embeddings.ndim} dimension(s)"
        )
    if len(words) < 2 or embeddings.shape[1] < 2:
        raise ValueError(
            "PCA to 2 components needs at least 2 words and 2 embedding "
            f"dimensions, got {len(words)} word(s) and {embeddings.shape[1]} dimension(s)"
        )

    # Gather one row per word in a single indexing operation, preserving word order.
    rows = [word_index[word] for word in words]
    vectors = embeddings[rows]

    # Apply PCA to reduce vector dimensions to 2
    pca = PCA(n_components=2)
    projected = pca.fit_transform(vectors)
    var_pc1, var_pc2 = pca.explained_variance_ratio_

    # Explicit figure/axes rather than the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(projected[:, 0], projected[:, 1], edgecolors='k', c='r', s=100)

    # Annotate points with words, nudged a few points away so text does not sit on the marker.
    for word, (x, y) in zip(words, projected):
        ax.annotate(
            word,
            xy=(x, y),
            xytext=(6, 6),
            textcoords='offset points',
            fontsize=12,
        )

    # Axis labels state how much variance each component retains.
    ax.set_xlabel(f"PC 1 ({var_pc1:.1%} of variance)")
    ax.set_ylabel(f"PC 2 ({var_pc2:.1%} of variance)")
    ax.set_title("Word Vectors Visualized with PCA (Keras Embeddings)")
    plt.show()

In [None]:
# Draw the 2-D PCA projection of the learned vectors, one labelled point per vocabulary word
plot_word_vectors_2d(embeddings=word_embeddings, word_index=word_index)