# Setup of WikiText2 Dataset

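Load the datasets: the WikiText-2 corpus (via the `datasets` library) and a small local text file (`datasets/test_text.txt`) that is used as the development corpus.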

In [27]:
from datasets import load_dataset

# Load the raw WikiText-2 corpus via the `datasets` library.
wiki_dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
# Show one training record as a sanity check that the corpus loaded.
# (The variable is `wiki_dataset`; the name `dataset` is never defined in this notebook.)
print(wiki_dataset['train'][4])

# Path to the small local text file used as the development corpus.
dev_dataset_string = "datasets/test_text.txt"

# Read the whole development file into memory as a single string.
# The encoding is stated explicitly so the result does not depend on the
# platform default; SentencePiece treats the same file as UTF-8 when training.
with open(dev_dataset_string, 'r', encoding='utf-8') as file:
    dev_dataset = file.read()

# dev_dataset now holds the entire content of the file as a string
print(dev_dataset)


{'text': " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa . A large team of writers handled the script . The game 's opening theme was sung by May 'n . \n"}
Flowers captivate with their myriad forms and hues, serving both as nature's art and essential elements in ecosystems. They symbolize emotions and mark significant life events across cultures. Beyond beauty, flowers play crucial roles in agriculture by attracting pollinators, essential for crop production. The floriculture industry, while economically significant, faces challenges of sustainability, prompting a shift towards environmentally friendly practice

# Step 2 - Train the sentencepiece model on the data from the test file

In [16]:
import sentencepiece as spm

# Train a SentencePiece model on the dev text file; this writes m.model and m.vocab.
# Keyword arguments are used instead of a hand-built flag string so that a path
# containing whitespace is passed through intact rather than being split into flags.
spm.SentencePieceTrainer.train(
    input=dev_dataset_string,
    model_prefix="m",
    vocab_size=100,
)

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=datasets/test_text.txt --model_prefix=m --vocab_size=100
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: datasets/test_text.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 100
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_pi

# Step 3 - Load the model from the model file, and encode the test text

In [23]:
import sentencepiece as spm

# Load the SentencePiece model written by the training step (model_prefix "m")
sp = spm.SentencePieceProcessor(model_file='m.model')

# Round-trip a sample sentence: text -> subword pieces -> text
sentence = "This is a test sentence."
encoded_pieces = sp.encode(sentence, out_type=str)
print(encoded_pieces)

# Decode back to a sentence
decoded_sentence = sp.decode(encoded_pieces)
print(decoded_sentence)

# Encode the whole dev dataset as one flat sequence of token ids
encoded_dev_dataset = sp.encode(dev_dataset)
print(encoded_dev_dataset)

# dev_dataset is a single string, so iterating over it directly yields one
# character at a time. Tokenize line by line instead, skipping blank lines.
dev_lines = [line for line in dev_dataset.splitlines() if line.strip()]

# Tokenize text: one list of subword pieces per line
tokenized_text = [sp.encode_as_pieces(line) for line in dev_lines]

# Convert tokens to IDs: one list of token ids per line
token_ids = [sp.encode_as_ids(line) for line in dev_lines]

['▁', 'T', 'hi', 's', '▁', 'i', 's', '▁a', '▁', 'te', 's', 't', '▁', 's', 'ent', 'e', 'n', 'ce', '.']
This is a test sentence.
[3, 90, 63, 24, 25, 31, 14, 83, 41, 14, 35, 28, 8, 58, 20, 8, 82, 70, 6, 23, 14, 15, 59, 75, 5, 9, 50, 34, 7, 3, 4, 61, 62, 12, 22, 18, 58, 5, 4, 55, 13, 45, 4, 3, 42, 11, 5, 9, 3, 74, 33, 8, 17, 3, 71, 69, 33, 4, 38, 43, 4, 6, 4, 35, 75, 19, 47, 6, 44, 8, 92, 13, 3, 40, 18, 29, 4, 5, 9, 70, 42, 91, 51, 54, 76, 41, 33, 4, 5, 64, 4, 4, 3, 27, 4, 19, 3, 93, 13, 6, 18, 9, 48, 7, 30, 24, 25, 32, 21, 14, 6, 31, 82, 98, 99, 8, 17, 67, 68, 34, 38, 5, 97, 23, 27, 22, 6, 5, 11, 11, 46, 12, 32, 68, 73, 87, 14, 11, 36, 4, 7, 3, 74, 33, 8, 17, 59, 3, 64, 85, 57, 15, 98, 99, 29, 19, 47, 30, 23, 27, 3, 79, 52, 82, 6, 7, 28, 60, 37, 43, 87, 18, 69, 53, 21, 21, 6, 51, 7, 72, 14, 66, 31, 88, 17, 37, 12, 34, 16, 10, 3, 4, 52, 14, 8, 87, 14, 86, 8, 73, 11, 6, 7, 57, 78, 77, 12, 5, 3, 4, 60, 10, 11, 3, 11, 18, 96, 42, 15, 4, 49, 81, 6, 72, 65, 9, 21, 6, 32, 46, 66, 19, 3, 89, 4, 4

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim

class SkipGramModel(nn.Module):
    """Embedding lookup followed by a linear projection and log-softmax.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output scores produced per input index.
        embeddings_dimensions: width of each embedding vector.
    """

    def __init__(self, vocab_size, embeddings_dimensions):
        super().__init__()
        # Layers are created in this order: embedding table first, then projection.
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embeddings_dimensions,
        )
        self.output_layer = nn.Linear(
            in_features=embeddings_dimensions,
            out_features=vocab_size,
        )
        # Normalise over the last axis, i.e. across the vocab_size scores.
        self.activation_function = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each input index."""
        return self.activation_function(
            self.output_layer(self.embeddings(inputs))
        )
    

# Number of pieces in the loaded SentencePiece model.
vocab_size = sp.GetPieceSize()
print(vocab_size)

100


# Appendix

## A conceptual implementation of nn.Embedding

An embedding layer is a lookup table: it stores a matrix with one row vector per vocabulary item and, given a set of input indices, returns the corresponding rows, which are then passed on to the next layer in the model. The section below shows a minimal implementation using NumPy.

In [30]:
import numpy as np

class Embedding:
    """Minimal NumPy lookup table: one row vector per vocabulary index."""

    def __init__(self, vocab_size, embedding_dim):
        """Fill a (vocab_size, embedding_dim) table with uniform samples from [-1.0, 1.0)."""
        table_shape = (vocab_size, embedding_dim)
        self.embeddings = np.random.uniform(low=-1.0, high=1.0, size=table_shape)

    def forward(self, indices):
        """Return the rows of the table selected by `indices`."""
        table = self.embeddings
        return table[indices]
    
# Toy sizes for the demonstration. A demo-specific name is used for the
# vocabulary size so that running this appendix cell does not overwrite the
# notebook-level `vocab_size` (set from the SentencePiece model earlier).
demo_vocab_size = 5
embedding_dim = 4

embedding_layer = Embedding(demo_vocab_size, embedding_dim)

# Look up the vectors for three example indices.
word_indices = np.array([1,3,4])
selected_embeddings = embedding_layer.forward(word_indices)

# Print the full table and the selected rows so the lookup can be checked by eye.
print(f"Embeddings: {embedding_layer.embeddings}")
print(f"Selected embeddings: {selected_embeddings} ")


Embeddings: [[-0.94403489  0.56867671  0.58760799  0.86947277]
 [ 0.33459191  0.12253984  0.03845223 -0.88417206]
 [ 0.69562544 -0.30883693  0.66455824  0.80469115]
 [ 0.03204974 -0.35790267 -0.99944344  0.81280022]
 [-0.16043952  0.85588285  0.30353961  0.53162284]]
Selected embeddings: [[ 0.33459191  0.12253984  0.03845223 -0.88417206]
 [ 0.03204974 -0.35790267 -0.99944344  0.81280022]
 [-0.16043952  0.85588285  0.30353961  0.53162284]] 
