<a href="https://colab.research.google.com/github/harshithamadarapu/Team16_Hinglish-Auto-suggestions/blob/main/embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Count the distinct whitespace-separated tokens in the training phrases.
file_path = '/content/preprocessed_train.csv'
data = pd.read_csv(file_path)

# Missing phrases are read as NaN (a float) and would break `.split()`,
# so drop them and coerce the remainder to str before tokenising.
phrases = data['phrases'].dropna().astype(str)
unique_words = {word for phrase in phrases for word in phrase.split()}

print(f"Number of unique words: {len(unique_words)}")
# Show a small sorted sample only; printing the full vocabulary floods the cell output.
print(f"Sample of unique words: {sorted(unique_words)[:20]}")


Number of unique words: 38200


In [None]:
import nltk
# Fetch the NLTK tokenizer resources needed by the word_tokenize calls in later cells.
nltk.download('punkt')        # Punkt tokenizer package
nltk.download('punkt_tab')    # tokenizers/punkt_tab data; presumably what newer NLTK releases load for word_tokenize/sent_tokenize — confirm against the installed version

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Train a Word2Vec model on the training phrases and export one row per
# vocabulary word (word + vector components) to 'word_embeddings.csv'.
nltk.download('punkt')
nltk.download('punkt_tab')

EMBEDDING_DIM = 100  # Width of every word vector; also drives the CSV column count
SEED = 42            # Fixes the initial vectors; exact repeatability also needs workers=1

file_path = '/content/preprocessed_train.csv'
data = pd.read_csv(file_path)

# Missing phrases are read as NaN (a float) and would break `.lower()`,
# so drop them and coerce the remainder to str before tokenising.
phrases = data['phrases'].dropna().astype(str)
tokenized_phrases = [word_tokenize(phrase.lower()) for phrase in phrases]

model = Word2Vec(
    tokenized_phrases,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
    workers=4,
    seed=SEED,
)

model.save("word2vec_model.bin")

# `model.wv.vectors` is the (vocab_size, vector_size) matrix aligned row-for-row
# with `model.wv.index_to_key`, so the table can be built without a per-word loop.
embedding_columns = [f'Embedding_{i+1}' for i in range(model.vector_size)]
embeddings_df = pd.DataFrame(model.wv.vectors, columns=embedding_columns)
embeddings_df.insert(0, 'Word', model.wv.index_to_key)

# Fail early if the exported table does not match the model's vocabulary/shape.
assert embeddings_df.shape == (len(model.wv.index_to_key), model.vector_size + 1)

embeddings_df.to_csv('word_embeddings.csv', index=False)

print("Word embeddings saved to 'word_embeddings.csv'")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Word embeddings saved to 'word_embeddings.csv'


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

# NOTE(review): no visible cell defines `tokenizer`, and the global `model` at
# this point is the Word2Vec model from the training cell, so the original
# function could not run on a fresh kernel. The BERT objects are created here
# under their own names. The checkpoint is an assumption (multilingual BERT,
# picked because the repository name suggests Hinglish text) — confirm.
BERT_CHECKPOINT = 'bert-base-multilingual-cased'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
bert_model.eval()  # Inference only: make sure dropout is disabled


def get_context_from_bert(sentence):
    """Embed one sentence with BERT.

    Args:
        sentence: Text to encode; it is truncated to at most 512 tokens.

    Returns:
        A tuple ``(word_embeddings, context_embedding)`` where
        ``word_embeddings`` is the last hidden state with the leading batch
        dimension squeezed out (a torch tensor, one row per token), and
        ``context_embedding`` is the mean of those rows as a NumPy array.
    """
    # Tokenize the sentence into the tensor format BERT expects
    inputs = bert_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # No gradients are needed for feature extraction
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Keep the embeddings of all tokens (not just the [CLS] token)
    word_embeddings = outputs.last_hidden_state.squeeze(0)

    # Average over the token axis to get a single sentence-level vector
    context_embedding = word_embeddings.mean(dim=0).numpy()

    return word_embeddings, context_embedding

# `.iloc[0]` picks the first row by position, independent of the index labels.
word_embeddings, context_embedding = get_context_from_bert(data['phrases'].iloc[0])


print(f"Word embeddings for the first phrase: {word_embeddings}")
print(f"Context embedding (mean of word embeddings) for the first phrase: {context_embedding}")


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

# Load word embeddings from the CSV file.
# The 'Word' column is forced to str and default NA parsing is switched off so
# that tokens such as "nan", "null" or "123" remain usable string keys instead
# of being read back as NaN or numbers.
embedding_df = pd.read_csv(
    '/content/word_embeddings.csv',
    dtype={'Word': str},
    keep_default_na=False,
)

# Every column after 'Word' is a vector component; convert them in one pass to
# a float32 matrix (row-wise slicing of a mixed-type row yields object arrays).
embedding_matrix = embedding_df.iloc[:, 1:].to_numpy(dtype=np.float32)

# Map each word to its row of the matrix
word_embeddings_dict = dict(zip(embedding_df['Word'], embedding_matrix))

# Define the LSTM-based model
class LSTMModel(nn.Module):
    """Encode a sequence of word vectors into one fixed-size vector.

    A batch-first LSTM reads the sequence and a linear layer projects the
    final hidden state to ``output_dim`` features.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the projected final hidden state for the batch-first input ``x``."""
        # Only the final hidden state is needed; the per-step outputs are discarded.
        _, (final_hidden, _) = self.lstm(x)
        # Index -1 selects the last entry of the hidden-state stack.
        return self.fc(final_hidden[-1])

# Demo: embed one example sentence with the LSTM encoder.

# Dimensions are declared first so the out-of-vocabulary fallback and the
# model are guaranteed to agree on the vector width.
input_dim = 100   # Word2Vec embedding dimension (adjust based on your embeddings)
hidden_dim = 128  # Hidden layer dimension
output_dim = 100  # Output embedding dimension, same as input_dim for simplicity

# Example sentence to process
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize (lowercased) and look up each word, falling back to a zero vector
# for words missing from the embeddings dictionary.
words = word_tokenize(sentence.lower())
oov_vector = np.zeros(input_dim, dtype=np.float32)
word_embeddings = [word_embeddings_dict.get(word, oov_vector) for word in words]

# Collapse the list into a single float32 array before handing it to torch;
# building a tensor straight from a list of ndarrays takes torch's slow path
# and emits a UserWarning. `unsqueeze(0)` adds the batch dimension.
word_embeddings_tensor = torch.from_numpy(
    np.asarray(word_embeddings, dtype=np.float32)
).unsqueeze(0)

# NOTE(review): no visible cell trains this LSTM, so its weights are the random
# initial ones. Seeding makes the printed embedding repeatable across runs.
torch.manual_seed(42)
lstm_model = LSTMModel(input_dim, hidden_dim, output_dim)

# Inference only: skip building an autograd graph
with torch.no_grad():
    context_embedding = lstm_model(word_embeddings_tensor)

# Output the sentence embedding
print("Context-Aware Sentence Embedding:", context_embedding)


Context-Aware Sentence Embedding: tensor([[-0.0439,  0.0905,  0.1324, -0.0541,  0.0201, -0.0119, -0.0419, -0.0401,
          0.0483, -0.0762, -0.0088,  0.0682, -0.0658, -0.0358,  0.1398, -0.0504,
         -0.0788, -0.0527, -0.0335, -0.1159,  0.0285, -0.0054,  0.0067,  0.1360,
         -0.0298, -0.0324,  0.0181,  0.0146, -0.0517, -0.0500, -0.1385, -0.0677,
          0.0504,  0.1450,  0.0262, -0.0896, -0.0563,  0.0746, -0.0943,  0.1089,
          0.0167, -0.0025, -0.1449, -0.0225, -0.0215,  0.0261,  0.0108, -0.0036,
         -0.0585,  0.0381, -0.0212,  0.0614,  0.1048, -0.0120,  0.0450, -0.0526,
         -0.0095,  0.0482,  0.0009, -0.0860, -0.1285, -0.0775,  0.0306,  0.0531,
         -0.0131, -0.0112,  0.1275,  0.0519, -0.1563,  0.0422, -0.0003, -0.0189,
         -0.0487,  0.0752, -0.0520,  0.0811, -0.0168, -0.0760, -0.0379, -0.1008,
          0.0225,  0.1279,  0.0421, -0.1284,  0.0517, -0.0125,  0.0018, -0.0247,
         -0.0122,  0.0437,  0.0741, -0.1089,  0.0101,  0.0005, -0.0669,  0.

  word_embeddings_tensor = torch.tensor(word_embeddings, dtype=torch.float32).unsqueeze(0)  # Add batch dimension


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize

# Load the dataset containing phrases (sentences)
dataset_df = pd.read_csv('/content/preprocessed_train.csv')

# Load word embeddings from the CSV file.
# The 'Word' column is forced to str and default NA parsing is switched off so
# that tokens such as "nan", "null" or "123" remain usable string keys instead
# of being read back as NaN or numbers.
embedding_df = pd.read_csv(
    '/content/word_embeddings.csv',
    dtype={'Word': str},
    keep_default_na=False,
)

# Every column after 'Word' is a vector component; convert them in one pass to
# a float32 matrix (row-wise slicing of a mixed-type row yields object arrays).
embedding_matrix = embedding_df.iloc[:, 1:].to_numpy(dtype=np.float32)

# Map each word to its row of the matrix
word_embeddings_dict = dict(zip(embedding_df['Word'], embedding_matrix))

# Define the LSTM-based model
class LSTMModel(nn.Module):
    """LSTM sentence encoder followed by a linear projection.

    The batch-first LSTM consumes a sequence of word vectors; its final hidden
    state is mapped to ``output_dim`` features by the linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Project the LSTM's final hidden state for the batch-first input ``x``."""
        # The per-step outputs and the cell state are not used.
        _, (last_hidden, _) = self.lstm(x)
        sentence_vector = self.fc(last_hidden[-1])
        return sentence_vector

# Example setup for LSTM model
input_dim = 100  # Word embedding dimension (adjust if needed)
hidden_dim = 128  # Hidden layer dimension
output_dim = 100  # Output embedding dimension (same as input_dim)

# NOTE(review): no visible cell trains this model, so the embeddings written
# further down come from its random initial weights. Seeding right before
# construction makes those weights, and therefore the saved CSV, repeatable.
torch.manual_seed(42)
lstm_model = LSTMModel(input_dim, hidden_dim, output_dim)
lstm_model.eval()  # Used for inference only

# Function to get context embedding for a sentence
def get_context_embedding(sentence):
    """Return the LSTM sentence embedding for ``sentence`` as a 1-D float32 array.

    The sentence is lowercased and tokenized with ``word_tokenize``; each token
    is looked up in ``word_embeddings_dict`` (zero vector when absent) and the
    resulting sequence is passed through the global ``lstm_model``.

    Args:
        sentence: The phrase to embed. Non-string values (e.g. the NaN that
            ``read_csv`` produces for a missing phrase) and phrases with no
            tokens yield an all-zero vector, so one output row per input row
            is still produced.

    Returns:
        NumPy array of length ``lstm_model.fc.out_features``.
    """
    # Read the sizes from the model so the fallbacks always match it.
    input_size = lstm_model.lstm.input_size
    output_size = lstm_model.fc.out_features

    words = word_tokenize(sentence.lower()) if isinstance(sentence, str) else []
    if not words:
        # The LSTM cannot consume a zero-length sequence.
        return np.zeros(output_size, dtype=np.float32)

    oov_vector = np.zeros(input_size, dtype=np.float32)  # Default for unknown words
    # Build one contiguous float32 array first; creating a tensor directly from
    # a list of ndarrays takes torch's slow path and emits a UserWarning.
    word_embeddings = np.asarray(
        [word_embeddings_dict.get(word, oov_vector) for word in words],
        dtype=np.float32,
    )
    word_embeddings_tensor = torch.from_numpy(word_embeddings).unsqueeze(0)  # Add batch dimension

    # Inference only: skip building an autograd graph
    with torch.no_grad():
        context_embedding = lstm_model(word_embeddings_tensor)
    return context_embedding.numpy().flatten()  # Flatten the output to 1D

# Embed every phrase, in dataset order (one vector per row)
context_embeddings_list = [
    get_context_embedding(phrase) for phrase in dataset_df['phrases']
]

# One DataFrame row per sentence, one column per embedding component
context_embeddings_df = pd.DataFrame(context_embeddings_list)

# Persist the embeddings without the index column
context_embeddings_df.to_csv('/content/context_embeddings.csv', index=False)

print("Context embeddings saved to '/content/context_embeddings.csv'")


Context embeddings saved to '/content/context_embeddings.csv'
