# Task 1: Word Embeddings (10 points)

This notebook will guide you through all steps necessary to train a word2vec model (Detailed description in the PDF).

## Imports

This code block is reserved for your imports. 

You are free to use the following packages: 

(List of packages)

In [None]:
# Imports
import math
import random
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import regex
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# 1.1 Get the data (0.5 points)

The Hindi portion of the HASOC corpus from [github.io](https://hasocfire.github.io/hasoc/2019/dataset.html) is already available in the repo at `data/hindi_hatespeech.tsv`. Load it into a data structure of your choice. Then, split off a small part of the corpus as a development set (~100 data points).

If you are using Colab the first two lines will let you upload folders or files from your local file system.

In [None]:
#FOR HINDI DATASET
# Load the corpus and normalise the 'text' column step by step.
url = 'https://raw.githubusercontent.com/alch00001/NNTI-WS2021-NLP-Project/main/data/hindi_hatespeech.tsv'
development = pd.read_csv(url, sep='\t')

# Lower-case every token and collapse whitespace runs into single spaces.
# (The text stays lower-case afterwards because the steps below only delete characters.)
development['text'] = development['text'].apply(lambda x: ' '.join(tok.lower() for tok in x.split()))
# Remove hashtags.
development['text'] = development['text'].apply(lambda x: regex.sub(r'(#[^\s]*)*', '', x))
# Remove @mentions, digits, '~', '|', punctuation and URLs.
development['text'] = development['text'].apply(lambda x: regex.sub(r'(@[\w]*)*[\d~\|\p{Punct}*]*(http[^\s]*)*', '', x))
# Remove HTML tags and leftover 'href=' fragments.
development['text'] = development['text'].apply(lambda x: regex.sub(r'<[^<]+?>', '', x))
development['text'] = development['text'].apply(lambda x: regex.sub(r'href=', '', x))

#import and remove stopwords
stopurl = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-hi/master/stopwords-hi.txt'
stopwords = pd.read_csv(stopurl, sep='\t', header=None)
# Build the lookup once: a set gives constant-time membership tests, instead of
# re-materialising the stop-word list for every single token.
stopword_set = set(stopwords[0].tolist())
development['text'] = development['text'].apply(lambda x: " ".join(tok for tok in x.split() if tok not in stopword_set))
#remove emojis, this script was taken from github
def remove_emoji(text):
    """Return ``text`` with every run of emoji / pictographic code points deleted.

    The character class below lists the Unicode ranges that are stripped;
    anything outside those ranges is left untouched.
    """
    blocked_ranges = (
        u"\U0001F600-\U0001F64F"   # emoticons
        u"\U0001F300-\U0001F5FF"   # symbols & pictographs
        u"\U0001F680-\U0001F6FF"   # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"   # flags
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + blocked_ranges + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)
development['text'] = development['text'].apply(remove_emoji)

#all words, in corpus order
corpus = ' '.join(development['text']).split()
#this is only unique words
# Derive the vocabulary from the very same tokens as `corpus`, so both agree on
# tokenisation and no empty-string token can appear. This also avoids summing
# Python lists through pandas, which re-copies the growing list for every row.
# Sorting makes the word -> index assignment reproducible across kernel restarts.
V = sorted(set(corpus))

KeyboardInterrupt: ignored

In [None]:
#FOR BENGALI DATASET
# The Bengali variant of the preprocessing is kept disabled inside a string
# literal: running this cell executes none of it and only echoes the string.
# To use it, remove the surrounding triple quotes.
'''
url = 'https://raw.githubusercontent.com/alch00001/NNTI-WS2021-NLP-Project/main/data/bengali_hatespeech.csv'
bengali_data = pd.read_csv(url,header = None)

#Making equal to Hindi set which has 2469 hate  and 2196 non hate
#lets just make 4700 Bengali corpus, 2500 hate and 2200 not hate
development = bengali_data.loc[1:2500].copy()
not_hate = bengali_data.loc[10001:12200].copy()
development = development.append(not_hate)
development = development.sample(frac=1) #shuffle it so its random

#preprocess as before
development[0] = development[0].apply(lambda x: " ".join(x.lower() for x in x.split()))
development[0] = development[0].apply(lambda x: regex.sub(r'(#[^\s]*)*', '',x))                                    #removing hashtags     
development[0] = development[0].apply(lambda x: regex.sub(r'(@[\w]*)*[\d~\|\p{Punct}*]*(http[^\s]*)*', '',x)) 
development[0] = development[0].apply(lambda x: regex.sub(r'<[^<]+?>','',x)) #remove html 
development[0] = development[0].apply(lambda x: regex.sub(r'href=','',x))    
development[0] = development[0].apply(lambda x: regex.sub(r'\s{2,}',' ',x)) 
development[0] = development[0].str.replace('\s{2,}', ' ')                                                    #make lower case
#import and remove stopwords
stopurl = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-bn/master/stopwords-bn.txt'
stopwords = pd.read_csv(stopurl, sep='\t', header=None)
development[0] = development[0].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords[:][0].tolist()))
#remove emojis, this script was taken from github
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
development[0] = development[0].apply(lambda x: remove_emoji(x))
 
development[0] = development[0].replace(r'^\s*$', '', regex=True)
print(development[0].head())

V = list(set(development[0].str.split(' ').sum()))
corpus = ' '.join([i for i in development[0]]).split()
'''

'\nurl = \'https://raw.githubusercontent.com/alch00001/NNTI-WS2021-NLP-Project/main/data/bengali_hatespeech.csv\'\nbengali_data = pd.read_csv(url,header = None)\n\n#Making equal to Hindi set which has 2469 hate  and 2196 non hate\n#lets just make 4700 Bengali corpus, 2500 hate and 2200 not hate\ndevelopment = bengali_data.loc[1:2500].copy()\nnot_hate = bengali_data.loc[10001:12200].copy()\ndevelopment = development.append(not_hate)\ndevelopment = development.sample(frac=1) #shuffle it so its random\n\n#preprocess as before\ndevelopment[0] = development[0].apply(lambda x: " ".join(x.lower() for x in x.split()))\ndevelopment[0] = development[0].apply(lambda x: regex.sub(r\'(#[^\\s]*)*\', \'\',x))                                    #removing hashtags     \ndevelopment[0] = development[0].apply(lambda x: regex.sub(r\'(@[\\w]*)*[\\d~\\|\\p{Punct}*]*(http[^\\s]*)*\', \'\',x)) \ndevelopment[0] = development[0].apply(lambda x: regex.sub(r\'<[^<]+?>\',\'\',x)) #remove html \ndevelopment[0] = de

In [None]:
# Drop empty-string tokens wherever they sit in the vocabulary. V is built from
# a set, so an empty token is not guaranteed to be the first element.
V = [w for w in V if w]

# Bidirectional lookup tables between words and their vocabulary positions.
word2idx = {w: idx for (idx, w) in enumerate(V)}
idx2word = {idx: w for (idx, w) in enumerate(V)}
print("V is of length ",len(V))
print(len(corpus))
print(V[0])

V is of length  6485
15991
शिवसेना


* Then, write a function ```word_to_one_hot``` that returns a one-hot encoding of an arbitrary word in the vocabulary. The size of the one-hot encoding should be ```len(v)```.

In [None]:
#TODO: implement!

def word_to_one_hot(word):
  """Encode ``word`` as a list of ``len(V)`` zeros and ones.

  A position holds 1 when the vocabulary entry at that position equals
  ``word``; a word that is not in ``V`` therefore yields an all-zero list.
  """
  encoding = [0] * len(V)
  for position, entry in enumerate(V):
    if word == entry:
      encoding[position] = 1
  return encoding

# Quick check: the encoding is as long as the vocabulary.
X = word_to_one_hot(V[1])
len(X)
        

6485

## 1.4 Subsampling (0.5 points)

The probability to keep a word in a context is given by:

$P_{keep}(w_i) = \Big(\sqrt{\frac{z(w_i)}{0.001}}+1\Big) \cdot \frac{0.001}{z(w_i)}$

Where $z(w_i)$ is the relative frequency of the word $w_i$ in the corpus. Now,
* Calculate word frequencies
* Define a function ```sampling_prob``` that takes a word (string) as input and returns the probability to **keep** the word in a context.

In [None]:
#TODO: implement!
# Token counts of `corpus`, built lazily and rebuilt whenever `corpus` is
# rebound to another list or changes length.
_WORD_COUNT_CACHE = {"corpus": None, "size": -1, "counts": None}

def word_frequency(word):
  """Return how many times ``word`` occurs in the global ``corpus``.

  All tokens are counted once and the result is cached, so repeated lookups
  no longer scan the whole corpus on every call.

  Args:
    word: token to look up.

  Returns:
    The absolute number of occurrences (0 for a word that never occurs).
  """
  cache = _WORD_COUNT_CACHE
  if cache["corpus"] is not corpus or cache["size"] != len(corpus):
    cache["counts"] = Counter(corpus)
    cache["corpus"] = corpus
    cache["size"] = len(corpus)
  return cache["counts"][word]

def sampling_prob(word, threshold=0.001):
  """Return the probability of keeping ``word`` when subsampling a context.

  Implements P_keep(w) = (sqrt(z(w) / t) + 1) * t / z(w), where z(w) is the
  relative frequency of ``word`` among all tokens of ``corpus`` and t is
  ``threshold``.

  Args:
    word: token whose keep-probability is requested.
    threshold: the constant t of the formula. The default 0.001 is the value
      given in the task statement; pass a smaller value (e.g. 0.000001) for
      much more aggressive subsampling.

  Returns:
    A value in [0, 1]; 0 for a word that does not occur in the corpus.
  """
  count = word_frequency(word)
  if count == 0:          #if word is not present in the corpus
    return 0
  # Relative frequency is measured against the number of tokens in the corpus,
  # not against the number of distinct words.
  relative_frq = count / len(corpus)
  p_keep = (math.sqrt(relative_frq / threshold) + 1) * (threshold / relative_frq)
  # The raw formula exceeds 1 for rare words; clamp so the result is a probability.
  return min(1.0, p_keep)


In [None]:
#Challenge Task 3 - implementing negative sampling

def gen_negative_sample_table(table_size=int(1e5), exponent=0.75):
  """Build the lookup table used to draw negative samples.

  Each vocabulary index j occupies a share of the table proportional to
  ``word_frequency(V[j]) ** exponent``, so drawing a uniformly random slot
  samples words from the smoothed unigram distribution.

  Args:
    table_size: number of slots; larger tables approximate the distribution
      more finely.
    exponent: smoothing exponent applied to the raw counts.

  Returns:
    A 1-D ``np.int64`` array of length ``table_size`` holding indices into ``V``.

  Raises:
    ValueError: if no vocabulary word has a positive frequency.
  """
  # Look every frequency up exactly once.
  weights = np.array([word_frequency(word) for word in V], dtype=np.float64) ** exponent
  total = weights.sum()
  if total <= 0:
    raise ValueError("cannot build a negative-sampling table: no word has a positive frequency")

  # Slot i belongs to the first word whose cumulative probability exceeds i / table_size.
  cumulative = np.cumsum(weights / total)
  slots = np.arange(table_size, dtype=np.float64) / table_size
  table = np.searchsorted(cumulative, slots, side='right')
  # Rounding can leave the final cumulative value marginally below 1; keep
  # every slot inside the vocabulary.
  np.minimum(table, len(V) - 1, out=table)
  return table.astype(np.int64)

neg_samples = gen_negative_sample_table()
np.random.shuffle(neg_samples)

In [None]:
#generate 5 words for negative samples
def get_neg_sample(table,count):
    """Draw ``count`` words by picking uniformly random slots of ``table``.

    Every slot of ``table`` holds an index into ``V``; the corresponding
    words are returned as a list (repetitions are possible).
    """
    slots = np.random.randint(low=0, high=len(table), size=count)
    drawn = []
    for slot in slots:
        drawn.append(V[table[slot]])
    return drawn

# 1.5 Skip-Grams (1 point)

Now that you have the vocabulary and one-hot encodings at hand, you can start to do the actual work. The skip gram model requires training data of the shape ```(current_word, context)```, with ```context``` being the words before and/or after ```current_word``` within ```window_size```. 

* Have a closer look at the original paper. Once you feel you understand how skip-gram works, implement a function ```get_target_context``` that takes a sentence as input and [yield](https://docs.python.org/3.9/reference/simple_stmts.html#the-yield-statement)s a ```(current_word, context)``` pair.

* Use your ```sampling_prob``` function to drop words from contexts as you sample them. 

In [None]:
#TODO: implement!
window_size = 5

def get_target_context(sentence, n_negative=5, n_candidates=10):
  """Yield the positive and negative training pairs of one sentence.

  Steps:
    1. Subsample: each token is kept with probability ``sampling_prob(token)``.
    2. Positive pairs: every kept token is paired with each neighbour at most
       ``window_size`` positions away, as ``[word, context, 1]``.
    3. Negative pairs: for every kept token, up to ``n_negative`` words are
       taken from ``n_candidates`` draws of the negative-sampling table, as
       ``[word, negative, 0]``. Candidates equal to the token itself or
       forming one of the sentence's positive pairs are skipped.

  Args:
    sentence: whitespace-separated string of tokens.
    n_negative: maximum number of negative pairs per token.
    n_candidates: number of candidates drawn per token.

  Yields:
    Exactly one tuple ``(training_data, neg_data)`` holding the two lists.
  """
  # Build a new list instead of deleting while iterating, which would skip the
  # token following every removed one.
  words = [w for w in sentence.split() if random.random() < sampling_prob(w)]

  #for actual(positive) contexts
  training_data = []
  # enumerate supplies the true position even when a word occurs several times.
  for position, word in enumerate(words):
    first = max(0, position - window_size)
    last = min(len(words), position + window_size + 1)
    for neighbour in range(first, last):
      if neighbour != position:
        training_data.append([word, words[neighbour], 1])

  #now get random(negative) contexts
  # Set of (word, context) tuples so the membership test below really matches.
  positive_pairs = {(target, context) for target, context, _ in training_data}
  neg_data = []
  for word in words:
    n_neg_examples = 0
    for candidate in get_neg_sample(neg_samples, n_candidates):
      if n_neg_examples >= n_negative:
        break
      if candidate != word and (word, candidate) not in positive_pairs:
        neg_data.append([word, candidate, 0])
        n_neg_examples += 1
  yield training_data, neg_data


# 1.6 Hyperparameters (0.5 points)

According to the word2vec paper, what would be a good choice for the following hyperparameters? 

* Embedding dimension
* Window size

Initialize them in a dictionary or as independent variables in the code block below. 

In [None]:
# Set hyperparameters
# Model shape
vocab_size = len(V)
embedding_size = 300
window_size = 5

# Optimisation schedule
epochs = 8
batch_size = 7000
learning_rate = 0.01

# Prefer the GPU whenever PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# 1.7 Pytorch Module (0.5 + 0.5 + 0.5 points)

Pytorch provides a wrapper for your fancy and super-complex models: [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). The code block below contains a skeleton for such a wrapper. Now,

* Initialize the two weight matrices of word2vec as fields of the class.

* Override the ```forward``` method of this class. It should take a one-hot encoding as input, perform the matrix multiplications, and finally apply a log softmax on the output layer.

* Initialize the model and save its weights in a variable. The Pytorch documentation will tell you how to do that.

In [None]:
#Create Model

class Word2Vec(nn.Module):
  """Skip-gram model scoring (target, context) pairs for negative sampling.

  Two bias-free linear layers project one-hot vectors into the embedding
  space: ``l1`` for target words and ``l2`` for context words. The score of a
  pair is the sigmoid of the dot product of its two projections.
  """

  def __init__(self, vocab_size=None, embedding_dim=None):
    """Create the two projection layers.

    Args:
      vocab_size: length of the one-hot inputs; defaults to ``len(V)``.
      embedding_dim: size of the embeddings; defaults to the global
        ``embedding_size``.
    """
    super(Word2Vec, self).__init__()
    if vocab_size is None:
      vocab_size = len(V)
    if embedding_dim is None:
      embedding_dim = embedding_size

    # Hidden layer
    self.l1 = nn.Linear(vocab_size, embedding_dim, bias=False)
    self.l2 = nn.Linear(vocab_size, embedding_dim, bias=False)

  def forward(self, targ, context):
    """Score a batch of (target, context) pairs.

    Args:
      targ: float tensor of one-hot target words, one row per pair.
      context: float tensor of one-hot context words, one row per pair.

    Returns:
      Tensor of shape ``(batch, 1)`` with values in (0, 1). It is returned on
      the CPU, as before, because the training loop of this notebook builds
      its label tensor on the CPU.
    """
    Z1 = self.l1(targ)
    Z2 = self.l2(context)
    # Row-wise dot product in a single vectorised operation instead of one
    # torch.dot call per batch row.
    dot_u_v = (Z1 * Z2).sum(dim=1, keepdim=True)
    return torch.sigmoid(dot_u_v).cpu()


# 1.8 Loss function and optimizer (0.5 points)

Initialize variables with [optimizer](https://pytorch.org/docs/stable/optim.html#module-torch.optim) and loss function. You can take what is used in the word2vec paper, but you can use alternative optimizers/loss functions if you explain your choice in the report.

In [None]:
# Define optimizer and loss
# Move the model to its device first and only then build the optimizer, so the
# optimizer is created from the parameters that are actually trained.
model = Word2Vec().to(device=device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Binary cross-entropy over the sigmoid scores: label 1 = real context pair,
# label 0 = negative sample.
criterion = nn.BCELoss()

# 1.9 Training the model (3 points)

As everything is prepared, implement a training loop that performs several passes of the data set through the model. You are free to do this as you please, but your code should:

* Load the weights saved in 1.6 at the start of every execution of the code block
* Print the accumulated loss at least after every epoch (the accumulated loss should be reset after every epoch)
* Define a criterion for the training procedure to terminate if a certain loss value is reached. You can find the threshold by observing the loss for the development set.

You can play around with the number of epochs and the learning rate.

In [None]:
#iterate through every sentence and create a list 
#of target,context pairs stored in variable 'pairs'
# Split every text at commas and glue the pieces back together, i.e. drop the commas.
train_set = development['text'].str.split(',').to_list()
print("sentences :",len(train_set))
train_set = list(map(''.join, train_set))
print('train set created')

# Gather the positive and negative pairs of every sentence.
pos_pairs, neg_pairs = [], []
for sentence in train_set:
  pos, neg = next(get_target_context(sentence))
  pos_pairs.extend(pos)
  neg_pairs.extend(neg)

print('pairs formed')
print('len positive: ', len(pos_pairs))
print('len negative: ', len(neg_pairs))

# One shuffled list with both kinds of examples.
pairs = [*pos_pairs, *neg_pairs]
random.shuffle(pairs)
print(pairs[:10])


sentences : 1001
train set created
pairs formed
len positive:  122091
len negative:  76215
[['इतना', 'apman', 0], ['दिलाएँगे', 'पाएँगे', 1], ['गठबंधन', 'धेर्य', 1], ['खिराज', 'शहीद', 1], ['आलोकधन्वा', 'जन्मदिन', 1], ['रुपये', 'नौकरी', 1], ['अवार्ड', 'पता', 1], ['श्रीराम', 'वन्देमातरम', 1], ['शाख़', 'बरगद', 1], ['देशविरोधी', 'नईम', 0]]


In [None]:
from torch.utils.data import Dataset, DataLoader
# Each batch arrives as (target words, context words, labels): two sequences of
# strings and an integer tensor.
dataloader = DataLoader(
    pairs,
    batch_size=100)
for epoch in range(epochs):
  total_loss = 0.0
  for center, contexts, labels in dataloader:
    #convert target/center and context words to one hot encodings
    word = torch.FloatTensor([word_to_one_hot(x) for x in center])
    context = torch.FloatTensor([word_to_one_hot(x) for x in contexts])

    optimizer.zero_grad()
    output = model(word.to(device=device), context.to(device=device))
    # Put the labels into the shape and onto the device of the model output.
    target = labels.float().view(output.shape[0], 1).to(output.device)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    # Accumulate a plain float: adding the loss tensor itself would keep the
    # autograd graph of every batch alive until the end of the epoch.
    total_loss += loss.item()
  print("Total loss at epoch: ",epoch+1, "loss: ", total_loss/(len(dataloader)))

print("Training finished")


KeyboardInterrupt: ignored

In [None]:
# Weight matrix of the target-word projection layer; the embeddings are read from it later.
# NOTE(review): nn.Linear stores weights as (out_features, in_features), so this
# is presumably (embedding_size, len(V)) — confirm with the printed shape.
w1 = model.l1.weight
print(w1.shape)

# 1.10 Train on the full dataset (0.5 points)

Now, go back to 1.1 and remove the restriction on the number of sentences in your corpus. Then, reexecute code blocks 1.2, 1.3 and 1.6 (or those relevant if you created additional ones). 

* Then, retrain your model on the complete dataset.

* Now, the input weights of the model contain the desired word embeddings! Save them together with the corresponding vocabulary items (Pytorch provides a nice [functionality](https://pytorch.org/tutorials/beginner/saving_loading_models.html) for this).

In [None]:
#Saving Model

#from google.colab import drive
#drive.mount('/content/gdrive')
# Optional Google Drive target (Colab only), kept for reference:
model_save_name = 'classifier.pth'
#path = F"/content/gdrive/My Drive/{model_save_name}" 
#torch.save(model.state_dict(), path)

# Write the trained weights to disk, then read them straight back.
torch.save(model.state_dict(), 'checkpoint.pth') #saving weights
state_dict = torch.load('checkpoint.pth')
print(state_dict.keys())

# l1 is the target-word layer; its weights are used as the embeddings.
weights = state_dict['l1.weight']
print(state_dict['l2.weight'].shape)

In [None]:
#create dictionary mapping vocab words to their embedding vectors
# Always start from the saved layer weights rather than transposing `weights`
# in place: re-running this cell then gives the same result every time instead
# of flipping the matrix back and forth.
weights = torch.transpose(state_dict['l1.weight'], 0, 1)
w = weights.cpu()
assert w.shape[0] == len(V), "embedding rows do not match the vocabulary size"
vectors = {word: row.numpy() for word, row in zip(V, w)}


In [None]:
#Write the dictionary to a text file, one word per line, in the format:
# word value1 value2 ... valueN
from itertools import chain
import io
# Relative path: the file is created in the notebook's working directory, so
# the cell also runs on machines other than the original author's.
embeddings_path = 'HindiEmbeddingsUpdated.txt'
with open(embeddings_path, 'w', encoding="utf-8") as f:
  f.write("\n".join(" ".join(chain([key], map(str, value))) for key, value in vectors.items()))
# No explicit close needed: the with-block has already closed the file.

In [None]:
# IPython shell escape: runs the `nvidia-smi` command and shows its output in the cell.
!nvidia-smi

In [None]:
# Name of the active CUDA device; falls back to 'cpu' so the cell does not
# raise on machines without a GPU.
torch.cuda.get_device_name() if torch.cuda.is_available() else 'cpu'