In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Lambda, Dense, Input
from tensorflow.keras.models import Model, Sequential
import tensorflow.keras.backend as K
from sklearn.metrics.pairwise import euclidean_distances
import re

In [2]:
data = """
Climate change refers to significant, long-term changes in the global climate.
It is primarily driven by human activities, such as burning fossil fuels, deforestation, and industrial processes,
which increase the levels of greenhouse gases in the atmosphere. These gases trap heat, leading to a warming effect known as global warming.
Consequences of climate change include more frequent and severe weather events, rising sea levels, and impacts on ecosystems and biodiversity.
Efforts to address climate change focus on reducing emissions, transitioning to renewable energy, and enhancing adaptation strategies.
"""

# Preprocessing: lowercasing and removing punctuation
climate_data = re.sub(r'[^\w\s]', '', data.lower()).split()

In [3]:
tokenizer = tf.keras.layers.TextVectorization(split='whitespace')
tokenizer.adapt(climate_data)
vocab = tokenizer.get_vocabulary()
word2id = {word: index for index, word in enumerate(vocab)}
id2word = {index: word for word, index in word2id.items()}
vocab_size = len(vocab)

In [4]:
print('Vocabulary Size:', vocab_size)
print('Sample Vocabulary:', list(word2id.items())[:10])

Vocabulary Size: 67
Sample Vocabulary: [('', 0), ('[UNK]', 1), ('and', 2), ('to', 3), ('climate', 4), ('the', 5), ('change', 6), ('warming', 7), ('on', 8), ('of', 9)]


In [5]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for i in range(window_size, len(corpus) - window_size):
        context = corpus[i - window_size:i] + corpus[i + 1:i + window_size + 1]
        target = corpus[i]
        x = pad_sequences([context], maxlen=context_length, padding='post')
        y = to_categorical([target], vocab_size)
        yield x, y

In [6]:
corpus_ids = [word2id[word] for word in climate_data]

In [7]:
embed_size = 100  # Embedding vector size
window_size = 2

cbow = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2),
    Lambda(lambda x: K.mean(x, axis=1)),
    Dense(vocab_size, activation='softmax')
])

cbow.compile(optimizer='adam', loss='categorical_crossentropy')
cbow.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            6700      
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 67)                6767      
                                                                 
Total params: 13,467
Trainable params: 13,467
Non-trainable params: 0
_________________________________________________________________


In [8]:
epochs = 20
for epoch in range(epochs):
    loss = 0
    for x, y in generate_context_word_pairs(corpus=corpus_ids, window_size=window_size, vocab_size=vocab_size):
        loss += cbow.train_on_batch(x, y)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}")

Epoch 1/20, Loss: 349.2967734336853
Epoch 2/20, Loss: 344.31340503692627
Epoch 3/20, Loss: 339.5966351032257
Epoch 4/20, Loss: 333.8280704021454
Epoch 5/20, Loss: 326.6017556190491
Epoch 6/20, Loss: 317.60401368141174
Epoch 7/20, Loss: 306.6628601551056
Epoch 8/20, Loss: 293.7828891277313
Epoch 9/20, Loss: 279.1548116207123
Epoch 10/20, Loss: 263.1309566497803
Epoch 11/20, Loss: 246.17086362838745
Epoch 12/20, Loss: 228.77025842666626
Epoch 13/20, Loss: 211.39238679409027
Epoch 14/20, Loss: 194.4177689552307
Epoch 15/20, Loss: 178.12046813964844
Epoch 16/20, Loss: 162.6712628006935
Epoch 17/20, Loss: 148.1606655716896
Epoch 18/20, Loss: 134.62850654125214
Epoch 19/20, Loss: 122.08715283870697
Epoch 20/20, Loss: 110.53339275717735


In [9]:
weights = cbow.get_layer('embedding').get_weights()[0]
print("Embedding Matrix Shape:", weights.shape)

Embedding Matrix Shape: (67, 100)


In [10]:
embeddings_df = pd.DataFrame(weights, index=[id2word[i] for i in range(vocab_size)])
print(embeddings_df.head())

               0         1         2         3         4         5         6   \
        -0.001759 -0.049306 -0.007503  0.016459 -0.030443 -0.034881 -0.008138   
[UNK]   -0.007659 -0.034708 -0.010460 -0.002799  0.049308 -0.036350 -0.031440   
and      0.174661 -0.114852 -0.128286 -0.151958  0.573530 -0.061079 -0.409518   
to      -0.267396  0.527209 -0.029663 -0.206797  0.146578 -0.154393 -0.012032   
climate -0.217565 -0.438991 -0.042155  0.270349  0.011287  0.456488 -0.034412   

               7         8         9   ...        90        91        92  \
         0.034004  0.034712  0.023625  ...  0.026993  0.040169  0.013453   
[UNK]   -0.002631  0.015436  0.045613  ...  0.036392 -0.037663  0.045154   
and     -0.392264  0.080493 -0.323466  ... -0.270399 -0.077988 -0.020328   
to       0.186201  0.020365 -0.325079  ... -0.585742 -0.024113 -0.258423   
climate  0.265897  0.063658 -0.324793  ...  0.299212  0.094291 -0.222663   

               93        94        95        96        9

In [11]:
distance_matrix = euclidean_distances(weights)


In [12]:
def get_similar_words(search_term, top_n=5):
    term_id = word2id[search_term]
    distances = distance_matrix[term_id]
    closest_ids = distances.argsort()[1:top_n + 1]
    similar_words = [id2word[idx] for idx in closest_ids]
    return similar_words

print("Similar words to 'climate':", get_similar_words('climate'))

Similar words to 'climate': ['change', '', 'address', '[UNK]', 'more']
