In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation
from lib.preprocessing_data import Preprocessing


# Sampling configuration. A fixed seed makes the 2% subset (and therefore the
# vocabulary and everything trained on it) identical on every run.
SAMPLE_FRACTION = 0.02
SAMPLE_SEED = 42
PREVIEW_ROWS = 20  # how many sampled rows to echo below

data_set = DataPreparation().load_data()
data_set = Preprocessing().lemmatization(data_set)

# NOTE(review): assumes load_data()/lemmatization() return a pandas DataFrame
# with 'sentiment' and 'processed_tweet' columns, as the calls below imply.
data_set = data_set.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED).reset_index(drop=True)
processed_tweets = data_set['processed_tweet']

# Print only a short preview instead of every sampled row, so the cell output
# stays readable regardless of how large the sample is.
for sentiment, tweet in zip(data_set['sentiment'].head(PREVIEW_ROWS), processed_tweets.head(PREVIEW_ROWS)):
    print(f"{sentiment}: {tweet}")


In [None]:
# Create vocabulary

# Sort the unique tokens so that every word receives the same index on every
# run. Iterating a bare set of strings yields an order that can change between
# interpreter sessions (string hash randomisation), which would silently
# reshuffle the embedding rows. `vocab` is kept as a list so that iterating it
# elsewhere follows exactly the index order used by `word_to_idx`.
# NOTE(review): assumes tokens are mutually comparable (e.g. all strings).
vocab = sorted(set(word for tweet in processed_tweets for word in tweet))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))
vocab_size = len(vocab)


In [None]:
import numpy as np

# Hyperparameters
embedding_size = 71
learning_rate = 0.01
random_seed = 42

# Seed NumPy's global generator so the initial weights (and any later draws
# made through np.random, such as negative sampling) are reproducible.
np.random.seed(random_seed)

# Initialize embeddings: one row per vocabulary entry, small Gaussian noise
# (mean 0, standard deviation 0.1) for both the center-word and context tables.
main_embeddings = np.random.normal(0, 0.1, (vocab_size, embedding_size))
context_embeddings = np.random.normal(0, 0.1, (vocab_size, embedding_size))

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) element-wise.

    np.where evaluates both of its branch arguments for every element, so
    computing exp(-x) and exp(x) directly overflows (and emits
    RuntimeWarnings) for large |x| even when that branch is discarded.
    Here exp is only ever applied to -|x| <= 0, which cannot overflow.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        numpy.ndarray with the same shape as x (0-d for a scalar input).
    """
    x = np.asarray(x)
    # decay is in (0, 1]; it underflows harmlessly to 0 for very large |x|.
    decay = np.exp(-np.abs(x))
    return np.where(
        x >= 0,
        1 / (1 + decay),      # x >= 0: 1 / (1 + exp(-x))
        decay / (1 + decay)   # x <  0: exp(x) / (1 + exp(x))
    )

In [None]:
def normalize_embeddings(embeddings):
    """Scale every row of a 2-D embedding matrix to unit Euclidean length.

    Rows whose norm is zero are returned unchanged (all zeros) instead of
    being divided by zero, which would turn them into NaN.

    Args:
        embeddings: 2-D numpy array, one embedding vector per row.

    Returns:
        A new array of the same shape with each non-zero row L2-normalised.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Substitute 1 for zero norms so the division leaves those rows as zeros.
    safe_norms = np.where(norms > 0, norms, 1.0)
    return embeddings / safe_norms

In [None]:
def get_negative_samples(vocab_size, exclude_idx, num_samples, word_freq):
    """Sample negative examples based on word frequencies.

    Indices are drawn with replacement from the distribution proportional to
    freq ** 0.75, restricted to indices other than `exclude_idx`. Zeroing the
    excluded index and renormalising yields the same distribution as
    rejecting unwanted draws, but needs a single call to the random generator
    and cannot spin forever when the excluded index carries all of the
    probability mass.

    Args:
        vocab_size: Number of vocabulary entries; indices are drawn from
            range(vocab_size).
        exclude_idx: Index that must never be returned. Values outside the
            valid index range (or None) exclude nothing.
        num_samples: How many indices to draw.
        word_freq: Sequence of per-index frequencies, length vocab_size.

    Returns:
        List of `num_samples` sampled indices.

    Raises:
        ValueError: If no index other than `exclude_idx` has a positive
            frequency, so nothing can be sampled.
    """
    if num_samples <= 0:
        return []

    # The power creates a fresh array, so the caller's word_freq is untouched.
    probabilities = np.asarray(word_freq, dtype=float) ** 0.75
    if exclude_idx is not None and 0 <= exclude_idx < probabilities.size:
        probabilities[exclude_idx] = 0.0

    total = probabilities.sum()
    if total <= 0:
        raise ValueError(
            "Cannot draw negative samples: no index other than exclude_idx "
            "has a positive frequency."
        )
    probabilities /= total

    return list(np.random.choice(vocab_size, size=num_samples, p=probabilities))

In [None]:
def update_embeddings(center_idx, context_idx, label):
    """Apply one gradient step for a single (center, context) index pair.

    Reads and updates the notebook-level `main_embeddings` and
    `context_embeddings` tables in place, scaled by the global
    `learning_rate`.

    Args:
        center_idx: Row of `main_embeddings` for the center word.
        context_idx: Row of `context_embeddings` for the context word.
        label: Target for the pair; the prediction sigmoid(center . context)
            is pushed towards this value.
    """
    center_row = main_embeddings[center_idx]
    context_row = context_embeddings[context_idx]

    # Residual between the target and the predicted pair probability.
    residual = label - sigmoid(np.dot(center_row, context_row))

    # Both steps are computed from the pre-update vectors before either table
    # is modified, so the two updates do not influence each other.
    center_step = learning_rate * (residual * context_row)
    context_step = learning_rate * (residual * center_row)

    main_embeddings[center_idx] += center_step
    context_embeddings[context_idx] += context_step

In [None]:
def plot_words2(word1, word2, word_to_index, main_embeddings, context_embeddings):
    """Plot two words' vectors from the main and the context embedding tables.

    One panel is drawn per embedding table. Each panel shows the first two
    components of both word vectors as arrows from the origin, and its title
    reports the cosine similarity computed over the full vectors.

    Args:
        word1: First word to plot (drawn in red).
        word2: Second word to plot (drawn in blue).
        word_to_index: Mapping from word to row index in the embedding tables.
            A word missing from the mapping propagates the mapping's lookup
            error (KeyError for a dict).
        main_embeddings: 2-D array of center-word embeddings.
        context_embeddings: 2-D array of context-word embeddings, or None to
            draw the main-embedding panel only.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.spatial.distance import cosine

    row1 = word_to_index[word1]
    row2 = word_to_index[word2]

    panels = [("Main Embeddings", main_embeddings)]
    if context_embeddings is not None:
        panels.append(("Context Embeddings", context_embeddings))

    # Points of the optional unit-circle reference, shared by all panels.
    t = np.linspace(0, 2 * np.pi, 100)

    # squeeze=False keeps `axes` two-dimensional even with a single panel.
    fig, axes = plt.subplots(1, len(panels), figsize=(8, 4), squeeze=False)

    for ax, (panel_title, table) in zip(axes[0], panels):
        vector1 = table[row1]
        vector2 = table[row2]

        # Plot vectors for word1 and word2 (first two dimensions only)
        for vector, color, word in ((vector1, 'red', word1), (vector2, 'blue', word2)):
            ax.scatter(vector[0], vector[1], color=color, label=word)
            ax.arrow(0, 0, vector[0], vector[1], color=color, head_width=0.00)

        # Add similarity as title
        similarity = 1 - cosine(vector1, vector2)
        ax.set_title(f"{panel_title} (Sim = {round(similarity, 4)})", fontsize=12)

        # Add gridlines and axes
        ax.axvline(0, color='gray', linestyle='--', alpha=0.5)
        ax.axhline(0, color='gray', linestyle='--', alpha=0.5)
        ax.grid()

        # Set zoomed-in axis limits before drawing the circle so the reference
        # line does not rescale the view.
        ax.set_xlim(-0.2, 0.2)
        ax.set_ylim(-0.2, 0.2)

        ax.legend()
        ax.plot(np.cos(t), np.sin(t), linewidth=0.5, color='black', linestyle='--', alpha=0.5)

    fig.tight_layout()
    plt.show()
    # Release the figure; this function is called repeatedly during training.
    plt.close(fig)


In [None]:
# Extract the three most frequent word pairs for each sentiment

from collections import Counter
from itertools import combinations


TOP_PAIRS = 3

sentiment_pairs = {'positive': [], 'negative': [], 'neutral': []}

for sentiment, processed_tweet in zip(data_set['sentiment'], data_set['processed_tweet']):
    # combinations() yields pairs in tweet order, so the same two words can
    # show up as (a, b) in one tweet and (b, a) in another. Sorting each pair
    # gives one canonical key per unordered pair so their counts are merged.
    # NOTE(review): assumes tokens are mutually comparable (e.g. all strings).
    pairs = [tuple(sorted(pair)) for pair in combinations(processed_tweet, 2)]
    # setdefault() tolerates sentiment labels beyond the three listed above
    # instead of failing with KeyError.
    sentiment_pairs.setdefault(sentiment, []).extend(pairs)

frequent_pairs = {}
for sentiment, pairs in sentiment_pairs.items():
    pair_counts = Counter(pairs)
    frequent_pairs[sentiment] = pair_counts.most_common(TOP_PAIRS)  # Top 3 pairs


for sentiment, pairs in frequent_pairs.items():
    print(f"{str(sentiment).upper()} SENTIMENT:")
    for pair, count in pairs:
        print(f"  Pair: {pair}, Count: {count}")



In [None]:
from collections import Counter

word_counts = Counter(word for tweet in processed_tweets for word in tweet)
# Build the frequency vector by explicit index lookup so that entry i always
# belongs to embedding row i, independent of how `vocab` happens to iterate.
word_freq = np.array([word_counts[idx_to_word[idx]] for idx in range(vocab_size)])

epochs = 50
window_size = 3
num_negative_samples = 3
plot_every = 5                   # draw a progress plot every this many epochs
plot_pair = ('thank', 'happy')   # words tracked in the progress plots

# The vocabulary comes from a random sample of the data, so the tracked words
# may be absent. Check once up front rather than failing with a KeyError
# after the first (expensive) epoch has finished.
can_plot = all(word in word_to_idx for word in plot_pair)
if not can_plot:
    print(f"Skipping plots: {plot_pair} not all present in the vocabulary")

# Training Loop
for epoch in range(epochs):
    for tweet in processed_tweets:
        # Resolve every token to its vocabulary index once per tweet.
        token_indices = [word_to_idx[word] for word in tweet]

        for center_idx, center_word_idx in enumerate(token_indices):
            start = max(center_idx - window_size, 0)
            end = min(center_idx + window_size + 1, len(token_indices))

            for context_idx in range(start, end):
                if center_idx == context_idx:
                    continue
                context_word_idx = token_indices[context_idx]

                # Positive sample
                update_embeddings(center_word_idx, context_word_idx, 1)
                # Negative samples (the center word itself is never drawn)
                negative_samples = get_negative_samples(vocab_size, center_word_idx, num_negative_samples, word_freq)
                for negative_idx in negative_samples:
                    update_embeddings(center_word_idx, negative_idx, 0)

    # Normalize embeddings after each epoch
    main_embeddings = normalize_embeddings(main_embeddings)
    context_embeddings = normalize_embeddings(context_embeddings)

    print(f"Epoch {epoch + 1}/{epochs} completed")
    # Plot the tracked pair on the first epoch and then every `plot_every` epochs
    if can_plot and epoch % plot_every == 0:
        plot_words2(plot_pair[0], plot_pair[1], word_to_idx, main_embeddings, context_embeddings)

# Final visualization
if can_plot:
    plot_words2(plot_pair[0], plot_pair[1], word_to_idx, main_embeddings, context_embeddings)

