# Poking at Ever Larger Language Models
## An introduction for (digital) humanists



## Introduction 

## Download data

In [None]:
!wget -O animacy.zip https://bl.iro.bl.uk/downloads/59a8c52f-e0a5-4432-9897-0db8c067627c

In [None]:
!unzip animacy.zip -d animacy_data

In [None]:
!ls

## From mini to somewhat larger

... but not large

### A Shakespeare language model

In [1]:
# !pip install requests

In [2]:
import numpy as np
import requests
from collections import Counter
import re

In [3]:
# Download the plain-text "Complete Works of William Shakespeare" from Project Gutenberg.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get('https://www.gutenberg.org/cache/epub/100/pg100.txt', timeout=60)
# Fail loudly on an HTTP error instead of silently building a model from an error page.
response.raise_for_status()
# Lower-case the text so that e.g. "The" and "the" end up as the same token.
text = response.text.lower().strip()

In [4]:
text[:100]

'\ufeffthe project gutenberg ebook of the complete works of william shakespeare, by william shakespeare\r\n\r'

In [23]:
# Raw string so that `\w` reaches the regex engine verbatim; in a plain string
# literal `\w` is an invalid escape sequence, which recent Python versions warn about.
pattern = re.compile(r'\w+')
# Every maximal run of word characters becomes one token; punctuation and
# whitespace act as separators and are dropped.
tokens = pattern.findall(text)

In [24]:
print(tokens[:50])

['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'complete', 'works', 'of', 'william', 'shakespeare', 'by', 'william', 'shakespeare', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away']


In [25]:
#tokens = text.split()

In [26]:
def ngrams(tokens: list, n: int = 2) -> list:
    """
    Build all contiguous n-grams of a token sequence.

    Arguments:
        tokens: list of string tokens, in document order.
        n: number of tokens per n-gram (must be at least 1).
    Returns:
        A list of strings, each consisting of `n` consecutive tokens joined by
        a single space. The list is empty when there are fewer than `n` tokens.
    Raises:
        ValueError: if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n}')
    # Stop at the last position where a full window of n tokens still fits;
    # iterating over every index would append truncated n-grams at the end.
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

bigrams = ngrams(tokens,2)

In [27]:
bigrams[:10]

['the project',
 'project gutenberg',
 'gutenberg ebook',
 'ebook of',
 'of the',
 'the complete',
 'complete works',
 'works of',
 'of william',
 'william shakespeare']

In [28]:
# Collapse the list of bigram strings into a Counter mapping each bigram to its frequency.
# Note: this rebinds `bigrams`, so from here on it is a Counter rather than a list.
bigrams = Counter(bigrams)

In [29]:
# Frequency table of every three-token window in the corpus.
trigrams = Counter()
trigrams.update(ngrams(tokens, 3))

In [30]:
trigrams.most_common(10)

[('_exeunt _ scene', 320),
 ('i pray you', 252),
 ('a room in', 245),
 ('i will not', 229),
 ('room in the', 174),
 ('i know not', 171),
 ('i do not', 160),
 ('the duke of', 157),
 ('i am a', 149),
 ('the king s', 148)]

In [31]:
# Frequency table of every four-token window in the corpus.
tetragram = Counter()
tetragram.update(ngrams(tokens, 4))

In [32]:
tetragram.most_common(10)

[('a room in the', 151),
 ('another part of the', 109),
 ('_exeunt _ scene ii', 92),
 ('what s the matter', 77),
 ('_exeunt _ scene iii', 76),
 ('room in the palace', 69),
 ('act iv scene i', 63),
 ('act iii scene i', 62),
 ('act v scene i', 62),
 ('act i scene i', 60)]

In [33]:
# The vocabulary is the set of distinct tokens; its size is shown as the cell output.
vocabulary = set()
vocabulary.update(tokens)
len(vocabulary)

26984

In [34]:
sequence = 'the duke of'
# Estimate P(word | sequence) for every vocabulary word as
# count(sequence + word) / count(sequence).
context_count = trigrams[sequence]
prob_next_word = Counter({
    candidate: tetragram[f'{sequence} {candidate}'] / context_count
    for candidate in vocabulary
})

In [35]:
prob_next_word.most_common(20)

[('york', 0.2229299363057325),
 ('norfolk', 0.10828025477707007),
 ('suffolk', 0.08917197452229299),
 ('gloucester', 0.08917197452229299),
 ('buckingham', 0.07006369426751592),
 ('albany', 0.050955414012738856),
 ('somerset', 0.044585987261146494),
 ('exeter', 0.044585987261146494),
 ('burgundy', 0.03821656050955414),
 ('lancaster', 0.03184713375796178),
 ('florence', 0.03184713375796178),
 ('cornwall', 0.03184713375796178),
 ('clarence', 0.025477707006369428),
 ('hereford', 0.025477707006369428),
 ('milan', 0.01910828025477707),
 ('bedford', 0.012738853503184714),
 ('orleans', 0.012738853503184714),
 ('aumerle', 0.006369426751592357),
 ('venice', 0.006369426751592357),
 ('lorraine', 0.006369426751592357)]

In [36]:
# Split the distribution into two parallel lists: candidate words and their probabilities.
words = list(prob_next_word.keys())
probs = list(prob_next_word.values())

In [37]:
np.random.choice(words, p=probs)

'norfolk'

In [38]:
# Sample one next word from the distribution and append it to the context.
# Caution: `+=` makes this cell non-idempotent; every re-run appends another word to `sequence`.
sequence += ' ' + np.random.choice(words, p=probs)
sequence

'the duke of york'

In [39]:
# Generate up to 20 words, always conditioning on the last three words produced so far.
total_sequence = 'the duke of'
sequence = total_sequence
for _ in range(20):
    # How often each vocabulary word follows the current three-word context.
    continuation_counts = {w: tetragram[f'{sequence} {w}'] for w in vocabulary}
    total_count = sum(continuation_counts.values())
    if total_count == 0:
        # The context is never followed by another word in the corpus (it is
        # unseen, or only occurs at the very end of the text). There is no
        # distribution to sample from, so stop instead of raising.
        print(f'No continuation found for {sequence!r}; stopping early.')
        break
    # Normalise by the summed continuation counts so the probabilities always
    # add up to 1, as np.random.choice requires. This matches dividing by the
    # trigram count of the context, except when the context also closes the corpus.
    prob_next_word = Counter({w: count / total_count for w, count in continuation_counts.items()})
    words, probs = list(prob_next_word), list(prob_next_word.values())
    total_sequence += ' ' + np.random.choice(words, p=probs)
    # Slide the context window: keep only the last three words.
    sequence = ' '.join(total_sequence.split()[-3:])
    print(total_sequence)

the duke of exeter
the duke of exeter his
the duke of exeter his brother
the duke of exeter his brother archbishop
the duke of exeter his brother archbishop late
the duke of exeter his brother archbishop late of
the duke of exeter his brother archbishop late of canterbury
the duke of exeter his brother archbishop late of canterbury sir
the duke of exeter his brother archbishop late of canterbury sir thomas
the duke of exeter his brother archbishop late of canterbury sir thomas erpingham
the duke of exeter his brother archbishop late of canterbury sir thomas erpingham williams
the duke of exeter his brother archbishop late of canterbury sir thomas erpingham williams a
the duke of exeter his brother archbishop late of canterbury sir thomas erpingham williams a good
the duke of exeter his brother archbishop late of canterbury sir thomas erpingham williams a good old
the duke of exeter his brother archbishop late of canterbury sir thomas erpingham williams a good old commander
the duke of 

In [40]:
!pip install transformers



In [None]:
#!pip3 install torch torchvision torchaudio

In [43]:
sequence = 'the duke of'

In [44]:
from transformers import pipeline

# Text-generation pipeline backed by the 'gpt2' checkpoint.
generator = pipeline('text-generation', model='gpt2')
# Request three continuations of the prompt, each generated with max_length=30.
generator(sequence, num_return_sequences=3, max_length=30)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'the duke of Jersey with its royal and military background and its history as a leader of modern civilised and democratic society.\n\nIn March 2015'},
 {'generated_text': 'the duke of York" is the president of the United States.\n\nDuke of York is one of the most prestigious hereditary monarchy in the'},
 {'generated_text': 'the duke of Naples – that was a real good decision."\n\nBut the case goes deeper. It also raises serious questions concerning the effectiveness or'}]

In [48]:
#https://huggingface.co/blog/how-to-generate

In [47]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Tokenizer that belongs to the "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Reuse the end-of-sequence token as the padding token so that generation
# does not emit a warning about a missing pad token.
model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)

In [50]:
# Encode the prompt into token ids; return_tensors='pt' asks for a PyTorch tensor,
# which is the input format passed to `model.generate` below.
input_ids = tokenizer.encode(sequence, return_tensors='pt')


In [51]:
input_ids

tensor([[1169,  288, 4649,  286]])

In [52]:
# Continue the prompt with the model's default decoding settings, up to max_length=50.
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
# Turn the first (and only) generated id sequence back into text, dropping special tokens.
decoded_text = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
print(decoded_text)

Output:
----------------------------------------------------------------------------------------------------
the duke of York, who was a member of the royal family, was a member of the royal family, and was a member of the royal family.

The Duke of York was a member of the royal family, and was a member
