# WikiText2
At the suggestion of my new AI, Pi, I am trying to use the autoencoder on a smaller data set - WikiText2.

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List

class BottleneckT5Autoencoder:
    """Thin wrapper around a bottleneck T5 checkpoint: text -> embedding -> text.

    ``embed`` runs the model in ``encode_only`` mode to get one embedding per
    input string; ``generate_from_latent`` samples text back from such an
    embedding.
    """

    def __init__(self, model_path: str, device='cpu'):
        """Load the tokenizer and model from ``model_path`` and move the model to ``device``.

        Args:
            model_path: Name or path passed to ``from_pretrained``.
            device: Device the model and all tokenized inputs are moved to.
        """
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512)
        # NOTE(review): trust_remote_code=True lets the checkpoint's repository
        # supply the model code, so only use it with checkpoints you trust.
        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(self.device)
        self.model.eval()

    @torch.no_grad()
    def embed(self, text: List[str], batch_size: int = 100) -> List[List[float]]:
        """Embed ``text`` in batches and return the results as nested Python lists.

        Args:
            text: Strings to embed.
            batch_size: Maximum number of strings sent through the model at
                once. Big batches run out of memory, hence the limit.

        Returns:
            The model outputs of every batch, moved to the CPU and concatenated
            in input order (presumably one vector per input string - the output
            shape is decided by the remote model code).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        embeddings = []
        if len(text) == 0:
            return embeddings

        # The decoder prompt is the same for every batch, so tokenize it once.
        decoder_input_ids = self.tokenizer('', return_tensors='pt').to(self.device)['input_ids']

        for start in range(0, len(text), batch_size):
            # Slicing clamps at the end of the list, so the last batch may be short.
            batch = text[start:start + batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(self.device)
            latents = self.model(
                **inputs,
                decoder_input_ids=decoder_input_ids,
                encode_only=True,
            )
            embeddings.extend(latents.to('cpu').tolist())

        return embeddings

    @torch.no_grad()
    def generate_from_latent(self, latent: List[float], max_length=512, temperature=1.0) -> str:
        """Sample text from an embedding produced by ``embed``.

        The model is steered by storing ``latent - embed('.')`` on
        ``self.model.perturb_vector`` and then generating from the prompt
        ``'.'`` (how the model consumes ``perturb_vector`` is defined by the
        remote model code, not here).

        Args:
            latent: Embedding to decode; a list of floats or a tensor.
            max_length: Passed to ``generate`` as ``max_length``.
            temperature: Sampling temperature passed to ``generate``.

        Returns:
            The first generated sequence, decoded without special tokens.
            Sampling is enabled, so repeated calls can return different text.
        """
        dummy_text = ['.']
        # Use the instance's device rather than a notebook-level global so the
        # class works wherever it is instantiated.
        dummy = torch.tensor(self.embed(dummy_text)).to(self.device)
        # as_tensor accepts lists and existing tensors alike without the
        # copy-construct warning torch.tensor() raises for tensor inputs.
        latent = torch.as_tensor(latent).to(self.device)
        self.model.perturb_vector = latent - dummy
        input_ids = self.tokenizer(dummy_text, return_tensors='pt').to(self.device).input_ids
        output = self.model.generate(
            input_ids=input_ids,
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
            num_return_sequences=1,
        )
        return self.tokenizer.decode(output[0], skip_special_tokens=True)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the pretrained bottleneck autoencoder checkpoint onto that device.
autoencoder = BottleneckT5Autoencoder(
    model_path='thesephist/contra-bottleneck-t5-large-wikipedia',
    device=device,
)


In [1]:
from datasets import load_dataset
# Training split of the "wikitext-103-v1" configuration; it is used again for
# the WikiText-103 pass at the end of the notebook.
wiki = load_dataset(
    path="wikitext",
    name="wikitext-103-v1",
    split="train",
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Test split of the same "wikitext-103-v1" configuration.
wiki_test = load_dataset(
    path="wikitext",
    name="wikitext-103-v1",
    split="test",
)

In [3]:
# Validation split of the same "wikitext-103-v1" configuration.
wiki_valid = load_dataset(
    path="wikitext",
    name="wikitext-103-v1",
    split="validation",
)


In [4]:
# No split argument: load every split of "wikitext-103-v1" in one object.
wiki_all = load_dataset(
    path="wikitext",
    name="wikitext-103-v1",
)


In [5]:
# Print the summary of wiki_all (its splits, features and row counts).
print(wiki_all)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [None]:
from datasets import Dataset

# Skim off the first 10 rows for testing. Slice the rows first and pick the
# column second: wiki['text'][:10] asks for the whole 'text' column of the
# split (which may be materialized as one huge Python list) just to keep 10
# values, whereas wiki[:10] only reads the rows that are needed.
test_wiki = wiki[:10]['text']
for s in test_wiki:
    print(s)


In [4]:
# Training split of the much smaller "wikitext-2-raw-v1" configuration.
wikitext2 = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)

In [6]:
# Take the first 10 rows as a small working sample and print each row's text.
test_wiki2 = wikitext2.select(range(10))
for row in test_wiki2:
    print(row['text'])



 = Valkyria Chronicles III = 


 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 

 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series 

In [15]:
# Check which class load_dataset returned for a single split.
print(type(wikitext2))

<class 'datasets.arrow_dataset.Dataset'>


In [7]:
import re

def get_sentences(article, max_words=400) -> list:
    """Split a block of text into trimmed, non-empty sentence-like fragments.

    The text is cut at any run of whitespace that follows one of
    ``. ! ? ; :`` or a newline / carriage return, so clauses ending in ``;``
    or ``:`` become fragments of their own.

    Args:
        article: Text to split.
        max_words: Fragments with more whitespace-separated words than this
            are dropped (overly long inputs are not wanted downstream).

    Returns:
        The surviving fragments, stripped of surrounding whitespace, in their
        original order. Empty or whitespace-only input yields ``[]``.
    """
    fragments = re.split(r'(?<=[.!?;:\n\r])\s+', article)

    sentences = []
    for fragment in fragments:
        fragment = fragment.strip()
        # Skip blanks, and skip anything longer than the word limit.
        if fragment and len(fragment.split()) <= max_words:
            sentences.append(fragment)
    return sentences

# Add a 'sentences' column holding each sample row's text split by get_sentences.
test_wiki_sentences = test_wiki2.map(
    lambda row: {'sentences': get_sentences(row['text'])}
)

In [8]:
# Predicate used to drop rows that have only a single sentence or no sentences.
def has_multiple_sentences(item):
    """Return True when the row's 'sentences' list holds at least two entries."""
    sentence_count = len(item['sentences'])
    return sentence_count >= 2

# Keep only the sample rows that contain more than one sentence.
test_wiki_sentences = test_wiki_sentences.filter(
    function=has_multiple_sentences,
)

In [22]:
# Inspect the schema: 'text' plus the added 'sentences' sequence column.
print(test_wiki_sentences.features)

{'text': Value(dtype='string', id=None), 'sentences': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


In [9]:
def get_embeddings_batch(sentences):
    """Embed a list of sentences with the notebook's global ``autoencoder``.

    Returns whatever ``autoencoder.embed`` returns for ``sentences``.
    """
    embedded = autoencoder.embed(sentences)
    return embedded

# Add an 'embeddings' column: the embeddings of each row's 'sentences' list.
test_wiki_embeddings = test_wiki_sentences.map(
    lambda row: {'embeddings': get_embeddings_batch(row['sentences'])}
)

Map: 100%|██████████| 4/4 [00:00<00:00,  5.94 examples/s]


In [35]:
# Round-trip check: print every sentence next to the text decoded from its embedding.
print(test_wiki_embeddings.column_names)
for e in test_wiki_embeddings:
    # 'sentences' and 'embeddings' are index-aligned lists within a row.
    for sentence, embedding in zip(e['sentences'], e['embeddings']):
        print(sentence)
        # Pass the stored list straight through. generate_from_latent builds
        # its own tensor and moves it to the device; handing it a ready-made
        # tensor makes torch.tensor() emit a copy-construct UserWarning.
        print(autoencoder.generate_from_latent(embedding))
        print('-----------------')

['text', 'sentences', 'embeddings']
Senjō no Valkyria 3 :


  latent = torch.tensor(latent).to(device)


Senj Valkyria 3: Victory of the Serpent :
-----------------
Unrecorded Chronicles ( Japanese :
Unrecorded Chronicles (in Japanese : )
-----------------
戦場のヴァルキュリア3 , lit .
3, lit..
-----------------
Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable .
Valkyria Chronicles 3, commonly known as Valkyria – Battlefield III for the Nintendo Entertainment System () or simply, is a tactical role-playing video game (T2P) developed and published by Sega in the United States by Media Blasters. The game follows the Valkyrie Chronicles III video game series.
-----------------
Released in January 2011 in Japan , it is the third game in the Valkyria series .
Released in January 2011 in Japan, it is the third game in the Valkyria series.
-----------------
Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story 

In [36]:
# Dump one full row (index 3) with its 'text', 'sentences' and 'embeddings' columns.
print(test_wiki_embeddings[3])

{'text': " As with previous Valkyira Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces . Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced text . The player progresses through a series of linear missions , gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked . The route to each story location on the map varies depending on an individual player 's approach : when one option is selected , the other is sealed off to the player . Outside missions , the player characters rest in a camp , where units can be customized and character growth occurs . Alongside the main story missions are character @-@ specific sub missions relating to different squad members . After the game 's completion , additional

Now save the embeddings and read them back to prove that we have a working representation of the articles.

In [37]:
# Persist the sample dataset (text, sentences and embeddings) to a local directory.
test_wiki_embeddings.save_to_disk(
    dataset_path='test_wiki2_embeddings',
)

Saving the dataset (1/1 shards): 100%|██████████| 4/4 [00:00<00:00, 158.67 examples/s]


In [38]:
# Reload the directory written by save_to_disk into a fresh Dataset object.
test_wiki_embeddings_back = Dataset.load_from_disk(
    dataset_path='test_wiki2_embeddings',
)

In [39]:
# Repeat the round-trip check on the reloaded dataset, first row only.
print(test_wiki_embeddings_back.column_names)
for e in test_wiki_embeddings_back:
    # 'sentences' and 'embeddings' are index-aligned lists within a row.
    for sentence, embedding in zip(e['sentences'], e['embeddings']):
        print(sentence)
        # The tensor is built only to display the reloaded values on the device.
        embeddings = torch.tensor(embedding).to(device)
        print(embeddings)
        # Decode from the stored list rather than the tensor: generate_from_latent
        # builds its own tensor, and re-wrapping an existing tensor with
        # torch.tensor() emits a copy-construct UserWarning.
        print(autoencoder.generate_from_latent(embedding))
        print('-----------------')
    # One row is enough to show that the saved embeddings survived the round trip.
    break

['text', 'sentences', 'embeddings']
Senjō no Valkyria 3 :
tensor([-0.0746, -0.0810,  0.0340,  ..., -0.0173,  0.0492, -0.0693],
       device='cuda:0')


  latent = torch.tensor(latent).to(device)


Senj no Valkyria 3: 
-----------------
Unrecorded Chronicles ( Japanese :
tensor([-0.0203, -0.0304, -0.0649,  ..., -0.0971,  0.0625, -0.0397],
       device='cuda:0')
The Unrecorded Chronicles (Japanese: )
-----------------
戦場のヴァルキュリア3 , lit .
tensor([ 0.0021, -0.0556, -0.0508,  ..., -0.0414, -0.1776, -0.0985],
       device='cuda:0')
3 , lit..
-----------------
Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable .
tensor([-0.0459, -0.1662, -0.0365,  ..., -0.0843,  0.0490,  0.0246],
       device='cuda:0')
Valkyria – The Battle of Earth III, commonly referred to as Valkyria  and, is a tactical role-playing video game (T3P) developed by Sega for the Sega Virtual Console under the Mediacom brand in Japan..
-----------------
Released in January 2011 in Japan , it is the third game in the Valkyria series .
tensor([-0.1342, -0.0589, -0.0030,

OK - now let's do it for the whole dataset.

In [40]:
# Row count and column names of the full WikiText-2 training split.
print(len(wikitext2))
print(wikitext2.column_names)


36718
['text']


In [10]:
# Split every WikiText-2 row into sentences, then keep only the rows with more
# than one sentence. The row count is printed before and after the filter.
wiki_sentences = wikitext2.map(
    lambda row: {'sentences': get_sentences(row['text'])}
)
print(wiki_sentences.num_rows)
wiki_sentences = wiki_sentences.filter(function=has_multiple_sentences)
print(wiki_sentences.num_rows)


36718
15388


In [11]:
# Embed the sentences of every remaining WikiText-2 row, then persist the result.
wiki_text_embeddings = wiki_sentences.map(
    lambda row: {'embeddings': get_embeddings_batch(row['sentences'])}
)
wiki_text_embeddings.save_to_disk(dataset_path='wikitext2_embeddings')


Map: 100%|██████████| 15388/15388 [06:48<00:00, 37.68 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 15388/15388 [00:00<00:00, 30531.21 examples/s]


Now compute the embeddings for WikiText-103.

In [12]:
# Same sentence split and multi-sentence filter, applied to the WikiText-103
# training split. The row count is printed before and after the filter.
wiki_103_sentences = wiki.map(
    lambda row: {'sentences': get_sentences(row['text'])}
)
print(wiki_103_sentences.num_rows)
wiki_103_sentences = wiki_103_sentences.filter(function=has_multiple_sentences)
print(wiki_103_sentences.num_rows)


Map: 100%|██████████| 1801350/1801350 [01:26<00:00, 20743.73 examples/s]


1801350


Filter: 100%|██████████| 1801350/1801350 [00:10<00:00, 163914.56 examples/s]

749034





In [None]:
# Embed the sentences of every remaining WikiText-103 row, then persist the result.
wiki_text_103_embeddings = wiki_103_sentences.map(
    lambda row: {'embeddings': get_embeddings_batch(row['sentences'])}
)
wiki_text_103_embeddings.save_to_disk(dataset_path='wikitext103_embeddings')