In [55]:
import numpy as np
import pandas as pd
import torch
from bpemb import BPEmb
from torch import nn

In [21]:
# Install the bpemb dependency; uncomment and run once on a fresh runtime where it is missing.
#!pip install bpemb


In [22]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
# Use the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [67]:
# Load the training data into a DataFrame (path is relative to the working directory).
dft_eng = pd.read_csv('dft_eng.csv')

# Load the validation data into a DataFrame.
dfv_eng = pd.read_csv('dfv_eng.csv')

# Load the object stored in last_hidden.pt, mapping its storages onto the CPU
# (this does not follow the `device` selected in the earlier cell).
# NOTE(review): torch.load unpickles the file, so only point it at trusted files.
# NOTE(review): the `git pull` output in this notebook lists the file as
# data/last_hidden.pt; confirm the working directory makes this relative path resolve.
embedding = torch.load('last_hidden.pt', map_location=torch.device('cpu'))

In [31]:
# Load the English BPEmb subword embeddings (dim=100, vs=25000).
bpemb_en = BPEmb(lang='en', dim=100, vs=25000)
# Display the embedding matrix (shown as a float32 NumPy array in the output below).
bpemb_en.vectors

array([[ 0.305192, -0.486759, -0.361542, ..., -0.205049,  0.33516 ,
        -0.428452],
       [-0.015292, -0.072622,  0.131374, ..., -0.291093, -0.15548 ,
        -0.329501],
       [ 0.266255,  0.113249, -0.081075, ..., -0.316839,  0.012411,
        -0.232759],
       ...,
       [ 0.608477, -0.223953, -1.449336, ..., -0.07385 , -0.800959,
         0.657389],
       [ 0.167647, -0.133789,  0.252258, ...,  0.708369, -0.09607 ,
         0.120539],
       [-0.039888, -0.158139,  0.41632 , ..., -0.181057,  0.534418,
        -0.487808]], dtype=float32)

In [57]:
# Shell escape: update the current git checkout from its remote.
!git pull

remote: Enumerating objects: 6, done.
remote: Counting objects: 100% (6/6), done.
remote: Compressing objects: 100% (2/2), done.
remote: Total 4 (delta 2), reused 4 (delta 2), pack-reused 0
Unpacking objects: 100% (4/4), done.
From https://github.com/hrobjarturh/nlptasks
   8eedb6a..db99f32  main       -> origin/main
Updating 8eedb6a..db99f32
Fast-forward
 data/last_hidden.pt        | Bin 0 -> 3819 bytes
 data/last_hidden_trans.txt | 155 ---------------------------------------------
 2 files changed, 155 deletions(-)
 create mode 100644 data/last_hidden.pt
 delete mode 100644 data/last_hidden_trans.txt


On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	notebooks/hali-temp/colab_subtask2-2.ipynb

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
class LSTM_LM(nn.Module):
    """
    LSTM Language Model

    Pipeline: pretrained embeddings -> dropout -> (stacked) unidirectional LSTM
    -> dropout -> linear layer producing one score per vocabulary entry.
    """
    def __init__(
            self,
            pretrained_embeddings: torch.Tensor,
            lstm_dim: int,
            dropout_prob: float = 0.0,
            lstm_layers: int = 1,
    ):
        """
        Initializer for LSTM Language Model
        :param pretrained_embeddings: (vocab_size x edim) tensor containing the pretrained
            BPE embeddings; the last row is registered as the padding index
        :param lstm_dim: The hidden size of the (unidirectional) LSTM
        :param dropout_prob: Dropout probability
        :param lstm_layers: The number of stacked LSTM layers
        """

        # First thing is to call the superclass initializer
        super().__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, an LSTM layer, a dropout layer, and a feed-forward output layer
        self.vocab_size = pretrained_embeddings.shape[0]
        self.model = nn.ModuleDict({
            # from_pretrained keeps the embedding weights frozen by default
            'embeddings': nn.Embedding.from_pretrained(
                pretrained_embeddings,
                padding_idx=pretrained_embeddings.shape[0] - 1
            ),
            'lstm': nn.LSTM(
                pretrained_embeddings.shape[1],
                lstm_dim,
                num_layers=lstm_layers,
                batch_first=True,
                # nn.LSTM applies dropout only *between* stacked layers; with a single
                # layer it has no effect and merely triggers a warning, so pass 0 then.
                dropout=dropout_prob if lstm_layers > 1 else 0.0),
            'ff': nn.Linear(lstm_dim, pretrained_embeddings.shape[0]),
            'drop': nn.Dropout(dropout_prob)
        })

        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):
        """
        Xavier-normal initialise every weight matrix of the LSTM and the output layer and
        zero their biases. The pretrained embeddings are left untouched.
        """
        all_params = list(self.model['lstm'].named_parameters()) + \
                     list(self.model['ff'].named_parameters())
        for n, p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, input_ids, input_lens, hidden_states=None):
        """
        Defines how tensors flow through the model
        :param input_ids: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) Tensor with the length of each instance's text
        :param hidden_states: Optional (h_0, c_0) tuple, each (lstm_layers x b x lstm_dim),
            used as the initial LSTM state; None starts from zeros
        :return: (logits, hidden) where logits is ((b * max_len) x vocab_size), max_len being
            the longest length in input_lens, and hidden is the LSTM's final (h_n, c_n) tuple
        """

        # Get embeddings (b x sl x edim)
        embeds = self.model['drop'](self.model['embeddings'](input_ids))

        # Pack so the LSTM skips padded positions; lengths must live on the CPU
        lstm_in = nn.utils.rnn.pack_padded_sequence(
            embeds,
            input_lens.to('cpu'),
            batch_first=True,
            enforce_sorted=False
        )

        # Pass the packed sequence through the LSTM, starting from the supplied state
        lstm_out, hidden = self.model['lstm'](lstm_in, hidden_states)
        # Unpack the packed sequence --> (b x max_len x lstm_dim).
        # The second return value is the sequence lengths, NOT the hidden state.
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = self.model['drop'](lstm_out)
        # generate the prediction of each word in the vocabulary being the next
        lstm_out = self.model['ff'](lstm_out)
        lstm_out = lstm_out.reshape(-1, self.vocab_size)

        return lstm_out, hidden