In [27]:
import collections
import numpy as np
import random
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()


class SkipGram:
    """Minimal NumPy skip-gram model: embedding lookup followed by a full softmax.

    Attributes (all created in ``__init__``):
        vocab_size: number of rows in ``embeddings`` and columns in ``W1``.
        embedding_dim: width of each embedding vector.
        embeddings: ``(vocab_size, embedding_dim)`` input embedding matrix.
        W1: ``(embedding_dim, vocab_size)`` output projection.
        b1: ``(vocab_size,)`` output bias.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Randomly initialise the embedding and output-layer parameters."""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embeddings = np.random.randn(vocab_size, embedding_dim)
        self.W1 = np.random.randn(embedding_dim, vocab_size)
        self.b1 = np.zeros(vocab_size)

    def forward(self, inputs):
        """Return softmax probabilities over the vocabulary for each input id.

        Args:
            inputs: a single integer word id or an integer array of ids.

        Returns:
            Array of shape ``np.shape(inputs) + (vocab_size,)`` whose last
            axis sums to 1.
        """
        z = np.dot(self.embeddings[inputs], self.W1) + self.b1
        # Shift by the per-row maximum so np.exp cannot overflow; the softmax
        # value is unchanged by a constant shift.
        z = z - np.max(z, axis=-1, keepdims=True)
        exp_scores = np.exp(z)
        # Normalise over the vocabulary axis (the last one), not over the batch.
        return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    def backward(self, inputs, labels, learning_rate):
        """Apply one SGD step of softmax cross-entropy loss, summed over the batch.

        Args:
            inputs: integer word id(s); a scalar or any array-like, flattened
                to shape ``(batch,)``.
            labels: target context id(s), one per input; flattened the same
                way, so ``(batch,)`` and ``(batch, 1)`` are both accepted.
            learning_rate: step size for the parameter update.

        Raises:
            ValueError: if ``inputs`` and ``labels`` differ in length.
        """
        idx = np.asarray(inputs).reshape(-1)
        targets = np.asarray(labels).reshape(-1)
        if idx.shape != targets.shape:
            raise ValueError(
                "inputs and labels must contain the same number of ids, got "
                f"{idx.size} and {targets.size}"
            )

        hidden = self.embeddings[idx]          # (batch, dim); fancy indexing copies
        dL_dZ = self.forward(idx)              # (batch, vocab); fresh array
        # d(cross-entropy)/d(logits) = softmax - one_hot(label), per example.
        dL_dZ[np.arange(idx.size), targets] -= 1

        dL_dW1 = np.dot(hidden.T, dL_dZ)
        dL_db1 = np.sum(dL_dZ, axis=0)
        # Must be computed with the pre-update W1 so all gradients refer to
        # the same parameter values.
        dL_dEmbeddings = np.dot(dL_dZ, self.W1.T)

        self.W1 -= learning_rate * dL_dW1
        self.b1 -= learning_rate * dL_db1
        # Unbuffered subtract: a word id that appears several times in the
        # batch accumulates every one of its gradient rows.
        np.subtract.at(self.embeddings, idx, learning_rate * dL_dEmbeddings)

def read_data(filename):
    """Return the full text content of ``filename``.

    The file is opened in text read mode with the platform default encoding
    and closed before returning.
    """
    with open(filename, "r") as source:
        return source.read()

def build_vocab(data, vocab_size):
    """Map the ``vocab_size`` most frequent words of ``data`` to integer ids.

    Args:
        data: corpus as a single string; words are split on whitespace.
        vocab_size: maximum number of distinct words to keep.

    Returns:
        dict mapping each kept word to its frequency rank (0 = most common),
        plus an ``"UNK"`` entry for out-of-vocabulary words. ``"UNK"`` gets id
        ``len(kept words)`` unless the corpus itself contains the literal
        token ``"UNK"`` among the kept words, in which case that id is reused
        so the ids stay contiguous. An empty corpus yields ``{"UNK": 0}``.
    """
    # Count the frequency of each word in the data
    counter = collections.Counter(data.split())
    # Keep only the most common vocab_size words. A comprehension (rather than
    # unpacking zip(*pairs)) also works when there are no words at all.
    words = [word for word, _count in counter.most_common(vocab_size)]
    # Create a dictionary mapping words to integers
    word_to_int = {word: index for index, word in enumerate(words)}
    # Add a special token for unknown words without clobbering an existing id
    word_to_int.setdefault("UNK", len(words))
    return word_to_int

def convert_to_ints(data, word_to_int):
    """Translate the whitespace-separated words of ``data`` into integer ids.

    Words missing from ``word_to_int`` are replaced by the id stored under
    the ``"UNK"`` key.
    """
    ids = []
    for token in data.split():
        ids.append(word_to_int.get(token, word_to_int["UNK"]))
    return ids

def generate_sample(data, window_size):
    """Yield ``(center, context)`` pairs for every position in ``data``.

    For each element, every other element at most ``window_size`` positions
    to its left or right is paired with it, left neighbours first.
    """
    for center, target in enumerate(data):
        lo = max(0, center - window_size)
        hi = min(len(data), center + window_size + 1)
        # Left-hand context, then right-hand context; the centre is skipped.
        for k in range(lo, center):
            yield (target, data[k])
        for k in range(center + 1, hi):
            yield (target, data[k])

def get_batch(data, batch_size, window_size):
    """Yield an endless stream of random ``(inputs, labels)`` batches.

    Args:
        data: sequence of integer word ids.
        batch_size: number of (word, context_word) pairs per batch.
        window_size: context radius passed to ``generate_sample``.

    Yields:
        Tuple ``(inputs, labels)`` of two 1-D arrays of length ``batch_size``:
        the centre words and their sampled context words.

    Raises:
        ValueError: on the first ``next()`` if fewer than ``batch_size`` pairs
            are available.
    """
    # Generate pairs of (word, context_word) from the data. They are
    # materialised once: generate_sample returns a one-shot generator, so
    # calling list() on it inside the loop would leave nothing to sample from
    # after the first batch (and would rebuild the list for every batch).
    pairs = list(generate_sample(data, window_size))
    if batch_size > len(pairs):
        raise ValueError(
            f"batch_size={batch_size} exceeds the {len(pairs)} available "
            "(word, context_word) pairs"
        )
    # Group the pairs into batches of size batch_size
    while True:
        batch = np.array(random.sample(pairs, batch_size))
        yield (batch[:, 0], batch[:, 1])

def train_skip_gram(data, vocab_size, embedding_dim, batch_size, window_size, learning_rate, num_steps):
    """Prepare the data and TensorFlow graph pieces for a skip-gram model.

    As written this only builds things; it does not train. It creates the
    vocabulary, the integer-encoded corpus, a batch generator, two
    placeholders and three variables, then returns the embedding variable.
    No loss, optimizer or session is created here.

    Args:
        data: corpus as a single whitespace-separated string.
        vocab_size: number of words kept by ``build_vocab``; also the number
            of rows of the embedding and output-weight matrices.
        embedding_dim: width of each embedding vector.
        batch_size: static batch dimension of the placeholders and the size
            requested from ``get_batch``.
        window_size: context radius passed to ``get_batch``.
        learning_rate: currently unused.
        num_steps: currently unused.

    Returns:
        The ``embeddings`` ``tf.Variable`` of shape
        ``[vocab_size, embedding_dim]``. It is only a graph handle: nothing in
        this function initialises or updates it.
    """
    # Create a word to integer mapping
    word_to_int = build_vocab(data, vocab_size)
    # Convert the data to integers (rebinds ``data`` from str to a list of ids)
    data = convert_to_ints(data, word_to_int)
    # Create a generator for generating batches
    # NOTE(review): ``batches`` is never consumed below.
    batches = get_batch(data, batch_size, window_size)
    # Define the Tensorflow placeholders for the input words and context words
    # NOTE(review): neither placeholder is fed or used by any op in this function.
    inputs = tf.placeholder(tf.int32, shape=[batch_size], name="inputs")
    labels = tf.placeholder(tf.int32, shape=[batch_size, 1], name="labels")
    # Define the embedding matrix
    # NOTE(review): build_vocab assigns "UNK" the id len(words), which can equal
    # vocab_size, one past the last row of this matrix. Confirm before adding
    # an embedding lookup.
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_dim], -1.0, 1.0), name="embeddings")
    # Define the weights and biases for the output layer
    # NOTE(review): W and b are created but unused; the [vocab_size, embedding_dim]
    # layout presumably targets a sampled loss such as tf.nn.nce_loss. TODO confirm.
    W = tf.Variable(tf.truncated_normal([vocab_size, embedding_dim], stddev=0.1), name="W")
    b = tf.Variable(tf.zeros([vocab_size]), name="b")
    
    # TODO(review): loss, optimizer and the num_steps training loop are missing.
    return embeddings

In [35]:
# Hyper-parameters are passed by keyword so each number is self-describing.
# Relies on `result` and `vocab`, which are built by the In [34] / In [33]
# cells that appear further down in this notebook.
emb = train_skip_gram(
    result,
    vocab_size=len(vocab),
    embedding_dim=256,
    batch_size=1024,
    window_size=3,
    learning_rate=0.001,
    num_steps=3,
)

In [36]:
# Show what train_skip_gram returned; the repr below is a tf.Variable handle.
emb

<tf.Variable 'embeddings_3:0' shape=(60695, 256) dtype=float32_ref>

In [None]:
# NOTE(review): repeats the display cell above and was never executed (In [None]); candidate for removal.
emb

In [32]:
# Load the processed sentences and discard every row that has a missing value.
df = pd.read_csv('./../data/processed/processed_data_2.csv').dropna()

In [33]:
# Distinct whitespace-separated tokens across all sentences. A set
# comprehension replaces Series.apply used only for its side effect, which
# built a throwaway Series of None and rebound `_`.
vocab = {token for sentence in df['sentences'] for token in sentence.split()}
print("Vocab length : " + str(len(vocab)))

Vocab length : 60695


In [34]:
# Concatenate all sentences into one corpus string, each followed by a single
# space (so the string ends with a trailing space, as before). One join
# replaces repeated `+=`, which can copy the growing string on every iteration.
result = ''.join(row + ' ' for row in df['sentences'])