First, compute the embeddings of all words in the dataset. Then pick a random word and find the 10 words closest to it. Use **Euclidean distance** as the similarity metric.

# Load Pretrained Model

In [None]:
!pip install transformers[sentencepiece]

In [None]:
import torch
import re
from transformers import BertModel, BertTokenizer

In [None]:
# Checkpoint name passed to both BertModel.from_pretrained and
# BertTokenizer.from_pretrained below, so model and tokenizer stay in sync.
model_checkpoint = "bert-base-uncased"

In [None]:
# Tokenizer and encoder are both loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# output_hidden_states=True asks the model to also return its per-layer
# hidden states, which get_bert_embeddings reads from the output.
model = BertModel.from_pretrained(model_checkpoint, output_hidden_states=True)
# Switch the module to evaluation mode; it is only used for inference here.
model.eval()

In [None]:
def bert_text_preparation(text, tokenizer):
  """
  Turn a raw sentence into the inputs used for the BERT forward pass.

  The sentence is wrapped in "[CLS]" / "[SEP]" markers, split into tokens by
  `tokenizer`, and the tokens are mapped to their vocabulary ids.

  Args:
    text: the sentence to encode (must be a string).
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    A tuple `(tokenized_text, tokens_tensor, segments_tensor)`:
    the token list (markers included), a 1 x N tensor of token ids, and a
    1 x N tensor filled with ones (one entry per token).
  """
  wrapped = "[CLS] " + text + " [SEP]"
  tokenized_text = tokenizer.tokenize(wrapped)
  token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
  # The extra outer list adds a leading batch dimension of size 1.
  tokens_tensor = torch.tensor([token_ids])
  # Single-sentence input: every position gets the same segment id (1).
  segments_tensor = torch.tensor([[1] * len(token_ids)])
  return tokenized_text, tokens_tensor, segments_tensor

In [None]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
  """
  Compute one embedding vector per input token from the model's last layer.

  Args:
    tokens_tensor: token ids for a single sentence (batch of size 1).
    segments_tensor: tensor forwarded as the second positional argument of
      `model`.
    model: callable whose output holds the tuple of per-layer hidden states
      at index 2.

  Returns:
    A list with one tensor per token, each taken from the last entry of the
    hidden-state tuple (no summing or averaging across layers).
  """
  # Inference only: no autograd graph is recorded.
  with torch.no_grad():
    # NOTE(review): segments_tensor is passed positionally. For Hugging Face's
    # BertModel the second positional parameter appears to be attention_mask
    # rather than token_type_ids -- confirm this is intended.
    layer_outputs = model(tokens_tensor, segments_tensor)[2]
  # Stack the layers, drop the size-1 batch axis, then put the token axis
  # first: [layers, 1, tokens, hidden] -> [tokens, layers, hidden].
  per_token = torch.stack(layer_outputs, dim=0).squeeze(dim=1).permute(1, 0, 2)
  # For every token keep only the vector produced by the final layer.
  return [layers[-1] for layers in per_token]

# Load Dataset and Preprocess

In [None]:
import pandas as pd
# Load the training data from a CSV given by a relative path; its "text"
# column is what the preprocessing cells below read.
df = pd.read_csv("q1_sent_train.csv")

In [None]:
def delete_hashtag_usernames(text):
  """
  Remove hashtags and @usernames from a piece of text.

  The text is split on whitespace, every word starting with '@' or '#' is
  dropped, and the remaining words are re-joined with single spaces.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string. Input that cannot be processed as text (e.g. None or
    a float NaN coming from a missing CSV cell) yields ''.
  """
  try:
    kept_words = [word for word in text.split() if not word.startswith(('@', '#'))]
    return ' '.join(kept_words)
  except (AttributeError, TypeError):
    # Only non-text input is swallowed; unrelated errors such as
    # KeyboardInterrupt are no longer hidden by a bare except.
    return ''

def delete_url(text):
  """
  Strip URLs from `text`.

  Every run of non-whitespace characters that starts with "http" is removed;
  the whitespace around it is left untouched.
  """
  url_pattern = re.compile(r'http\S+')
  return url_pattern.sub('', text)

In [None]:
import string
special_tokens = ['[UNK]', '[CLS]', '[SEP]']
def remove_unuseful_tokens(tokens):
  """
  Filter out tokens that should not be used as vocabulary words.

  A token is dropped when it
    * is a substring of `string.punctuation` (this also covers the empty
      string),
    * contains '#' (e.g. '##'-prefixed sub-word pieces),
    * is listed in `special_tokens`, or
    * starts with '[' (any other bracketed marker).

  Args:
    tokens: iterable of token strings, in the order they should be kept.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  kept_tokens = []
  # Iterate over the argument itself, not over a global list of the caller.
  for token in tokens:
    if token in string.punctuation or '#' in token:
      continue
    if token in special_tokens or token[0] == '[':
      continue
    kept_tokens.append(token)
  return kept_tokens

In [None]:
# Clean every entry of the "text" column: URLs are stripped first, then
# hashtags and @usernames are removed.
texts = df["text"]
normalized_list = [delete_hashtag_usernames(delete_url(text)) for text in texts]

# Re-wrap the cleaned strings in a one-column frame; `tweets` is that column.
normalized_texts = pd.DataFrame(data=normalized_list, columns=['text'])
tweets = normalized_texts["text"]

In [None]:
# Only the first 1000 tweets are embedded.
sentences = tweets[:1000]
from collections import OrderedDict
# token -> {'embedding': running sum of its vectors, 'count': occurrences}
context_dict = {}
result_dict1 = dict()
context_embeddings = []
context_tokens = []
for sentence in sentences:
  tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
  list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
  # Positions 0 and -1 hold the [CLS] / [SEP] markers and are skipped.
  # Enumerating from 1 keeps `position` aligned with list_token_embeddings,
  # so repeated tokens need no per-token rescan of the sentence.
  for position, token in enumerate(tokenized_text[1:-1], start=1):
    token_vec = list_token_embeddings[position]
    entry = context_dict.get(token)
    if entry is None:
      # get_bert_embeddings returns slices of its stacked hidden-state
      # tensor. Store a copy so the in-place `+=` below never writes into
      # that tensor and the whole per-sentence stack is not kept alive.
      context_dict[token] = {'embedding': token_vec.clone(), 'count': 1}
    else:
      entry['embedding'] += token_vec
      entry['count'] += 1

# Most frequent tokens first (ascending sort, then reversed, so equally
# frequent tokens appear in reverse order of first occurrence).
context_sorted = sorted(context_dict, key=lambda k: context_dict[k]['count'])
context_sorted.reverse()

tokens_without_stopwords = remove_unuseful_tokens(context_sorted)

# Keep the 1000 most frequent surviving tokens together with their mean
# embedding (summed vectors divided by the occurrence count).
for token in tokens_without_stopwords[:1000]:
  mean_embedding = context_dict[token]['embedding'] / context_dict[token]['count']
  context_tokens.append(token)
  context_embeddings.append(mean_embedding)
  result_dict1[token] = mean_embedding

# Find the 10 Nearest Neighbors

In [None]:
def find_k_nearest_neighbors(word, embedding_dict, k):
  """
  Return the `k` words whose embeddings are closest to that of `word`.

  Closeness is the Euclidean distance sqrt(sum((a_i - b_i)^2)) between the
  two embedding vectors; smaller means more similar.

  Args:
    word: the query word; must be a key of `embedding_dict`.
    embedding_dict: mapping word -> embedding (a tensor or anything
      `torch.as_tensor` accepts); all embeddings must have the same length.
    k: maximum number of neighbors to return.

  Returns:
    A list of at most `k` words ordered from nearest to farthest. The query
    word itself is never included. Words at equal distance keep the order in
    which they appear in `embedding_dict`.

  Raises:
    KeyError: if `word` has no embedding in `embedding_dict`.
  """
  if word not in embedding_dict:
    raise KeyError(f"{word!r} has no embedding in embedding_dict")
  if k <= 0:
    return []
  target = torch.as_tensor(embedding_dict[word]).float()
  distances = []
  for candidate, vector in embedding_dict.items():
    if candidate == word:
      # Distance to itself is always 0; it is not a neighbor.
      continue
    difference = torch.as_tensor(vector).float() - target
    distances.append((difference.pow(2).sum().sqrt().item(), candidate))
  # list.sort is stable, so ties keep dictionary order.
  distances.sort(key=lambda pair: pair[0])
  return [candidate for _, candidate in distances[:k]]

In [None]:
import random

# 1. Pick a random word among those that actually have an embedding
#    (an empty-string placeholder is never part of the vocabulary).
word = random.choice(list(result_dict1))
# 2. Find its 10 nearest words.
top_10_nearest_words = find_k_nearest_neighbors(word, result_dict1, 10)

# Print the top-10 words; a separate loop name keeps `word` (the query) intact.
for i, neighbor in enumerate(top_10_nearest_words):
    print(f"{i}- {neighbor}")