In [None]:
!git clone https://github.com/cocoxu/Shakespeare.git

Cloning into 'Shakespeare'...
remote: Enumerating objects: 9016, done.[K
remote: Total 9016 (delta 0), reused 0 (delta 0), pack-reused 9016[K
Receiving objects: 100% (9016/9016), 556.83 MiB | 21.29 MiB/s, done.
Resolving deltas: 100% (3354/3354), done.
Updating files: 100% (4160/4160), done.


In [None]:
import os
import re

DIR = 'Shakespeare/data/align/plays/merged'
PATTERN = re.compile(r'(?P<base>[a-z_-]+)(?P<era>original|modern)\.snt\.aligned')

plays = set(PATTERN.match(file_name).group('base') for file_name in os.listdir(DIR))
for play in plays:
    with open(DIR + '/' + play + 'modern.snt.aligned', 'r') as reading:
        open('modern.txt', 'a').write(reading.read())
    with open(DIR + '/' + play + 'original.snt.aligned', 'r') as reading:
        open('original.txt', 'a').write(reading.read())

In [None]:
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [None]:
import pandas as pd
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

In [None]:
df = pd.DataFrame({
    'original': open('original.txt').readlines(),
    'modern': open('modern.txt').readlines()
})

In [None]:
train_df = df.iloc[:-1000, :].copy()
train_df['combined'] = train_df['original'].str.strip() + ' <transition> ' + train_df['modern'].str.strip()
test_df = df.drop(train_df.index)

In [None]:
train_df.shape, test_df.shape

((20079, 3), (1000, 2))

In [None]:
# Well, that's convenient; GPT-2 takes up to 1024 characters of input
train_df['combined'].str.len().max() + len('<startoftext>  <endoftext>')

1020

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

encoded = []
for phrase in train_df['combined']:
    encoded.append(torch.tensor(
        tokenizer.encode(f'<startoftext> {phrase} <endoftext>')
    ))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
tokenizer.save_pretrained("./models/tokenizer")

('./models/tokenizer/tokenizer_config.json',
 './models/tokenizer/special_tokens_map.json',
 './models/tokenizer/vocab.json',
 './models/tokenizer/merges.txt',
 './models/tokenizer/added_tokens.json')

In [None]:
!zip tokenizer.zip models/tokenizer/*

  adding: models/tokenizer/merges.txt (deflated 53%)
  adding: models/tokenizer/special_tokens_map.json (deflated 74%)
  adding: models/tokenizer/tokenizer_config.json (deflated 70%)
  adding: models/tokenizer/vocab.json (deflated 68%)


In [None]:
tokenizer.decode(encoded[0])

'<startoftext> Lie thou there (throwing down a letter), for here comes the trout that must be caught with tickling. <transition> Now, you lie there on the path. <endoftext>'

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def train(
    dataset,
    model,
    tokenizer,
    batch_size=16,
    epochs=5,
    lr=2e-5,
    max_seq_len=400,
    warmup_steps=200
):
    acc_steps = 100
    device = torch.device('cuda')
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):
        print(f'Training epoch {epoch}')
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        print("Loss:", loss)

    return model

In [None]:
model = train(encoded, model, tokenizer, epochs=10)



Training epoch 0


20079it [03:48, 87.98it/s]


Loss: tensor(2.8929, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 1


20079it [04:05, 81.68it/s]


Loss: tensor(2.6872, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 2


20079it [04:02, 82.72it/s]


Loss: tensor(2.6452, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 3


20079it [04:03, 82.54it/s]


Loss: tensor(2.0279, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 4


20079it [04:05, 81.77it/s]


Loss: tensor(1.8184, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 5


20079it [04:02, 82.76it/s]


Loss: tensor(2.3304, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 6


20079it [04:01, 83.02it/s]


Loss: tensor(1.9647, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 7


20079it [04:01, 83.12it/s]


Loss: tensor(1.8187, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 8


20079it [04:02, 82.75it/s]


Loss: tensor(2.5710, device='cuda:0', grad_fn=<NllLossBackward0>)
Training epoch 9


20079it [04:02, 82.86it/s]

Loss: tensor(2.9977, device='cuda:0', grad_fn=<NllLossBackward0>)





In [None]:
torch.save(model, open('shakespeare_translator', 'wb'))

In [None]:
!ls -sh

total 489M
1004K modern.txt     4.0K sample_data
 1.1M original.txt   487M shakespeare_translator


In [None]:
!zip shakespeare_translator


zip error: Nothing to do! (shakespeare_translator.zip)


In [None]:
!zip --help

Copyright (c) 1990-2008 Info-ZIP - Type 'zip "-L"' for software license.
Zip 3.0 (July 5th 2008). Usage:
zip [-options] [-b path] [-t mmddyyyy] [-n suffixes] [zipfile list] [-xi list]
  The default action is to add or replace zipfile entries from list, which
  can include the special name - to compress standard input.
  If zipfile and list are omitted, zip compresses stdin to stdout.
  -f   freshen: only changed files  -u   update: only changed or new files
  -d   delete entries in zipfile    -m   move into zipfile (delete OS files)
  -r   recurse into directories     -j   junk (don't record) directory names
  -0   store only                   -l   convert LF to CR LF (-ll CR LF to LF)
  -1   compress faster              -9   compress better
  -q   quiet operation              -v   verbose operation/print version info
  -c   add one-line comments        -z   add zipfile comment
  -@   read names from stdin        -o   make zipfile as old as latest entry
  -x   exclude the following nam

In [None]:
!zip -9 shakespeare_translator.zip shakespeare_translator

  adding: shakespeare_translator


zip error: Interrupted (aborting)


In [None]:
!ls -sh

total 930M
1004K modern.txt     487M shakespeare_translator
 1.1M original.txt   442M shakespeare_translator.zip
 4.0K sample_data


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil

shutil.copyfile('shakespeare_translator', '/content/drive/MyDrive/shakespeare_translator')

'/content/drive/MyDrive/shakespeare_translator'

In [None]:
def translate(
    model,
    tokenizer,
    prompt,
    entry_count=1,
    entry_length=30,
    top_p = 0.8,
    temperature=1.,
):
    if not isinstance(prompt, str):
        raise TypeError('prompt must be a str')
    if '<startoftext>' not in prompt and isinstance(prompt, str):
        prompt = '<startoftext> ' + prompt.lstrip()
    if '<transition>' not in prompt and isinstance(prompt, str):
        prompt = prompt.rstrip() + ' <transition>'

    model.eval()
    translated_num=0
    translated_list=[]

    filter_value = -float("Inf")

    with torch.no_grad():
        for entry_idx in trange(entry_count):
            entry_finished = False
            translated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
            for i in range(entry_length):
                outputs = model(translated, labels=translated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :]/(temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                translated = torch.cat((translated, next_token), dim=-1)

                if next_token in tokenizer.encode("<endoftext>"):
                    entry_finished = True

                if entry_finished:
                    translated_num = translated_num + 1

                    output_list = list(translated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    translated_list.append(output_text)
                    break

            if not entry_finished:
                output_list = list(translated.squeeze().numpy())
                output_text = f"{tokenizer.decode(output_list)}<endoftext>"
                translated_list.append(output_text)

    return translated_list[0].split('<transition> ')[1].rstrip('<end')


def text_translation(test_data):
    translate_text = []
    for i in range(len(test_data)):
        x = translate(model.to('cpu'), tokenizer, test_data[i], entry_count=1)
        translate_text.append(x)

    return translate_text
df_test = test_df.iloc[:100,:]
# generated = generate(model.to('cpu'), tokenizer, test_df['original'].iloc[1], entry_count=1)
# actual = test_df['modern'].iloc[1]
# generated, actual
translate_text = text_translation(df_test['original'].tolist())

100%|██████████| 1/1 [00:02<00:00,  2.23s/it]
100%|██████████| 1/1 [00:01<00:00,  1.85s/it]
100%|██████████| 1/1 [00:03<00:00,  3.17s/it]
100%|██████████| 1/1 [00:02<00:00,  2.00s/it]
100%|██████████| 1/1 [00:07<00:00,  7.08s/it]
100%|██████████| 1/1 [00:01<00:00,  1.89s/it]
100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
100%|██████████| 1/1 [00:05<00:00,  5.71s/it]
100%|██████████| 1/1 [00:04<00:00,  4.97s/it]
100%|██████████| 1/1 [00:02<00:00,  2.86s/it]
100%|██████████| 1/1 [00:03<00:00,  3.10s/it]
100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
100%|██████████| 1/1 [00:02<00:00,  2.31s/it]
100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
100%|██████████| 1/1 [00:08<00:00,  8.74s/it]
100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
100%|██████████| 1/1 [00:02<00:00,

In [None]:
print(translate_text)

['I thank you for your service and help. ', 'And he that hath commanded us is our king. ', 'And that same vengeance doth He pierce that same wall. ', 'Do you love your dear heart? ', 'How can God compel us to tolerate that law if he could, right in front of us, being very plainly locked in such misery? ', 'For who did I help to hurt my wife? ', 'For Edward, for my brother, for his sake. ', "If God wants to be punished, you know he won't even give in on that promise. ", 'Take him to his rightful point, then I will allow him to forgive those who have offended Him. ', 'My brother’s love, my demon, and my rage. ', 'I am his brother and I love him very well. ', "You're not fooled. ", 'Your brother Gloucester hates you. ', 'Oh, I love you, and he loves me. ', 'Go you to him from me. ', 'Ay, so we will. ', 'Tell him, when your father York Blessed his three sons with his victorious arm, He was not concerned about a battle for the king, rather he did<endoftext>', 'She will weep, and it will be 

In [None]:
# adding generated text to df

df_test['Translated_Text'] = translate_text




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['Translated_Text'] = translate_text


In [None]:
df_test

Unnamed: 0,original,modern,Translated_Text
20079,"I do beseech your Grace to pardon me, and with...",I beg your \n,I thank you for your service and help.
20080,And he that hath commanded is our king.\n,And the one who commands us is our king.\n,And he that hath commanded us is our king.
20081,And that same vengeance doth He hurl on thee F...,"He does, and now he’s taking revenge on you—fo...",And that same vengeance doth He pierce that sa...
20082,Whom thou wert sworn to cherish and defend.\n,Whom you swore you would protect and defend.\n,Do you love your dear heart?
20083,How canst thou urge God’s dreadful law to us W...,How dare you tell us about God’s laws when you...,How can God compel us to tolerate that law if ...
...,...,...,...
20174,"A most ridiculous monster, to make a wonder of...","What a silly monster, to think a poor drunk is...","A most ridiculous monster, to make a wonder of..."
20175,"I prithee now, lead the way without any more t...",Show us the way without further delay.—Trincul...,"I’ll drink some of my old liquor, lead the way..."
20176,"A howling monster, a drunken monster.\n","A loud-mouthed, drunken monster.\n","A howling monster, a drunken monster."
20177,O brave monster!\n,Good monster!\n,What is it you're going to do with me?


In [None]:
# # performance evaluation

# import statistics
# from nltk.translate.bleu_score import sentence_bleu

# scores = []

# for i in range(len(df_test)):
#   reference = df_test['modern'].iloc[i]
#   generated_candidate = df_test['Translated_Text'].iloc[i]
#   scores.append(sentence_bleu(reference, generated_candidate))
# statistics.mean(scores)

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=1,
    entry_length=50,
    top_p = 0.8,
    temperature=1.,
):

    model.eval()
    generated_num=0
    generated_list=[]

    filter_value = -float("Inf")

    with torch.no_grad():
        for entry_idx in trange(entry_count):
            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :]/(temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=-1)

                if next_token in tokenizer.encode("<endoftext>"):
                    entry_finished = True

                if entry_finished:
                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break

            if not entry_finished:
                output_list = list(generated.squeeze().numpy())
                output_text = f"{tokenizer.decode(output_list)}<endoftext>"
                generated_list.append(output_text)

    return generated_list[0]


def text_generation(test_data):
    generate_text = []
    for i in range(len(test_data)):
        x = generate(model.to('cpu'), tokenizer, test_data[i], entry_count=1)
        generate_text.append(x)

    return generate_text
df_test = test_df.iloc[:100,:]
# generated = generate(model.to('cpu'), tokenizer, test_df['original'].iloc[1], entry_count=1)
# actual = test_df['modern'].iloc[1]
# generated, actual
generate_text = text_generation(df_test['original'].tolist()[-10:])

100%|██████████| 1/1 [00:15<00:00, 15.49s/it]
100%|██████████| 1/1 [00:11<00:00, 11.01s/it]
100%|██████████| 1/1 [00:10<00:00, 10.86s/it]
100%|██████████| 1/1 [00:11<00:00, 11.84s/it]
100%|██████████| 1/1 [00:09<00:00,  9.71s/it]
100%|██████████| 1/1 [00:12<00:00, 12.08s/it]
100%|██████████| 1/1 [00:21<00:00, 21.85s/it]
100%|██████████| 1/1 [00:10<00:00, 10.85s/it]
100%|██████████| 1/1 [00:08<00:00,  8.65s/it]
100%|██████████| 1/1 [00:09<00:00,  9.71s/it]


In [None]:
print(generate_text)

['An abominable monster!\n\nHe had a fiendish presence that was full of pain.\n\nBut the only other thing he could do was insult himself.\n\nHe could no longer let himself be impudent and abuse himself,\n\nAnd with his soft voice<endoftext>', 'I’ll show thee the best springs.\n\n[There is no aid, nor force in our com\xadmon business. Oh, you!"] He does as he pleases, but with hard and sharp teeth, so he doesn’t bite into the flesh. If he goes<endoftext>', 'I’ll pluck thee berries.\n\nPut me out of bed, knock it off! Come back to your tree!\n\nGive me this time to get rid of you. Get back to your tree! Give me this time to use that life. Get back to your tree!<endoftext>', 'I’ll fish for thee and get thee wood enough.\n\nGrow your hatred and shame, and tell me what you shall do, To my love you. And I won’t forget your letter to me. I won’t have to look to see you. I don’t<endoftext>', 'A plague upon the tyrant that I serve!\n\nYou are such a wicked creature, my lord!\n\nYou carry such g

In [None]:
print(generate_text[1])

I’ll show thee the best springs.

[There is no aid, nor force in our com­mon business. Oh, you!"] He does as he pleases, but with hard and sharp teeth, so he doesn’t bite into the flesh. If he goes<endoftext>


In [None]:
import re

def clean(text):
  if '<end' not in text:
    text = '. '.join(text.split('.')[:-1])
  else:
    text = re.sub(r'<end(oftext>)?', '.', text)
  text = re.sub(r'[\[\]]', '', text)
  return text

print(clean(generate_text[4]))

A plague upon the tyrant that I serve!

You are such a wicked creature, my lord!

You carry such guilt for your father!

I can take you to hell, my lord!

I will kill you like a murderer!

But you, a r.


In [None]:
# 0, 4