<h1 style="text-align: center;">Language Models - Tutorial</h1> 

## Install dependencies (Optional)

In [None]:
!pip3 install transformers nltk

In [None]:
!pip3 install "numpy<2.0.0" --force-reinstall

## N-Grams Using Brown Corpus

In [None]:
import nltk
from nltk.corpus import brown
from nltk.util import ngrams
from collections import Counter # used for frequency counting

# Make sure the Brown corpus data is available locally before `brown` is used below
# (presumably a no-op when it has already been downloaded — see NLTK's downloader docs).
nltk.download('brown')

### Basic stats

In [None]:
# Tokens, sentences, paragraphs and category labels of the corpus
words = brown.words()  # every token, punctuation included
sents = brown.sents()
paras = brown.paras()
categories = brown.categories()

# Case-insensitive vocabulary: the distinct lowercased tokens
vocabulary = {token.lower() for token in words}

for unit, collection in (
    ("words", words),
    ("sentences", sents),
    ("paragraphs", paras),
    ("categories", categories),
):
    print(f"Number of {unit}: {len(collection)}")
print(f"Categories: {categories}")
print(f"Vocabulary size: {len(vocabulary)}")

In [None]:
import string

# Load and preprocess Brown corpus (convert to lowercase)
tokens = [word.lower() for word in words]  # punctuation tokens are still included here

# Drop tokens made up entirely of punctuation characters.
# A plain `w not in string.punctuation` is a substring test against the
# punctuation string, so multi-character tokens such as "``", "''" or "--"
# are not substrings of it and would slip through the filter.
punctuation_chars = set(string.punctuation)
clean_tokens = [w for w in tokens if not punctuation_chars.issuperset(w)]

print("Done")

### Unigrams

In [None]:
# Each token becomes a 1-tuple, the same shape ngrams() yields for larger n
unigrams = [*ngrams(tokens, 1)]
unigram_freq = Counter()
unigram_freq.update(unigrams)

print("Top 10 Unigrams:")
for gram, count in unigram_freq.most_common(10):
    print(f"{gram}: {count}")

In [None]:
# Peek at the Counter rather than printing every distinct token, which floods
# the cell output and bloats the saved notebook.
print(f"Distinct unigrams: {len(unigram_freq)}")
print(dict(unigram_freq.most_common(20)))

In [None]:
# Recount over the punctuation-filtered tokens; keys are now plain strings, not 1-tuples
unigram_freq = Counter()
unigram_freq.update(clean_tokens)

# Ten most frequent unigrams once punctuation is excluded
print("Top 10 Unigrams (Punctuation Removed):")
for token, count in unigram_freq.most_common(10):
    print(f"{token}: {count}")

### Bigrams

In [None]:
# Count adjacent word pairs straight from the n-gram generator.
# The pairs are deliberately not stored in a global list called `bigrams`:
# that name is also the `nltk.bigrams` function imported further down, so
# re-running this cell later would shadow the function and break its callers.
# The Counter is the only consumer, so no intermediate list is needed either.
bigram_freq = Counter(ngrams(clean_tokens, 2))

print("Top 10 Bigrams:")
for pair, freq in bigram_freq.most_common(10):
    print(f"{pair}: {freq}")

### Trigrams

In [None]:
# Count consecutive word triples straight from the n-gram generator.
# The triples are deliberately not stored in a global list called `trigrams`:
# that name is also the `nltk.trigrams` function imported further down, so
# re-running this cell later would shadow the function and break its callers.
# The Counter is the only consumer, so no intermediate list is needed either.
trigram_freq = Counter(ngrams(clean_tokens, 3))

print("Top 10 Trigrams:")
for triplet, freq in trigram_freq.most_common(10):
    print(f"{triplet}: {freq}")

## Next Word Prediction

### Input Sentence

In [None]:
# Sentence prefix whose next word the models below will try to predict
partial_sentence = "The doctor said the patient might have to"
print(f"Input: {partial_sentence}")

### Statistical Language Model

In [None]:
import nltk
from nltk.corpus import brown
from nltk import FreqDist # NLTK specific frequency counting
from nltk import bigrams, trigrams, ngrams
from collections import defaultdict
import random

import string  # used by the punctuation filter below; keeps this cell runnable on its own

# Lowercase words from Brown corpus
tokens = [w.lower() for w in brown.words()]

# Keep only tokens that contain at least one non-punctuation character.
# A plain `w not in string.punctuation` is a substring test against the
# punctuation string, so multi-character tokens such as "``", "''" or "--"
# are not substrings of it and would remain in the n-gram counts.
punctuation_chars = set(string.punctuation)
clean_tokens = [w for w in tokens if not punctuation_chars.issuperset(w)]
print("Done")

In [None]:
# Count every adjacent pair / triple of cleaned tokens; each FreqDist maps an n-gram tuple to its count
bi_freq = FreqDist(bigrams(clean_tokens))
tri_freq = FreqDist(trigrams(clean_tokens))

for label, dist in (("Bigram", bi_freq), ("Trigram", tri_freq)):
    print(f"{label} frequency distribution sample: {dist.most_common(5)}")

# Prediction context: the lowercased, whitespace-split input sentence
input_text = partial_sentence.lower().split()

last_input = tuple(input_text[-1:])   # final word -> context for the bigram model
last_bigram = tuple(input_text[-2:])  # final two words -> context for the trigram model

print("=" * 100)
print(f"Input: {partial_sentence}____")
print(f"Last input for Bigram: {last_input}")
print(f"Last bigram for Trigram: {last_bigram}")

In [None]:
# Collect every trigram whose leading words are "as well"
as_well_prefix = ('as', 'well')
item = [trigram for trigram in tri_freq if trigram[:-1] == as_well_prefix]
print(item)

In [None]:
# Get the most likely next words that follow a given context (top 5 by default)
def get_most_likely_next_word(freq_dist, context, top_k=5):
    """Rank the words that follow `context` in an n-gram frequency table.

    Args:
        freq_dist: Mapping of n-gram tuple -> count (e.g. an NLTK FreqDist or
            a plain dict). Every key's leading n-1 items are compared to `context`.
        context: The preceding word(s). May be a tuple, a list, or a single
            string (treated as a one-word context).
        top_k: Maximum number of candidates to return; `None` returns all of them.

    Returns:
        A list of `(next_word, count)` pairs sorted by count, highest first.
        Ties keep the order in which the n-grams appear in `freq_dist`.
        The list is empty when no n-gram starts with `context`.
    """
    # Keys are tuples, so a list or bare-string context would never compare equal
    # to `ngram[:-1]`; normalise it instead of silently returning no predictions.
    if isinstance(context, str):
        context = (context,)
    else:
        context = tuple(context)

    # {next_word: count} for every n-gram whose prefix matches the context
    candidates = {
        ngram[-1]: count
        for ngram, count in freq_dist.items()
        if ngram[:-1] == context
    }
    # Sort candidates by frequency (descending); sorted() is stable, so ties keep table order
    sorted_candidates = sorted(candidates.items(), key=lambda pair: -pair[1])
    return sorted_candidates[:top_k]

# Query both models first, then report: one preceding word for the bigram table, two for the trigram table
bigram_predictions = get_most_likely_next_word(bi_freq, last_input)
trigram_predictions = get_most_likely_next_word(tri_freq, last_bigram)

print("Bigram prediction:", bigram_predictions, sep="\n")
print("\nTrigram prediction:", trigram_predictions, sep="\n")

In [None]:
# Show the input sentence completed with each candidate word, followed by that candidate's count
prediction_sets = (
    ("Input Sentence Bigram Predictions:", bigram_predictions),
    ("\nInput Sentence Trigram Predictions:", trigram_predictions),
)
for heading, predictions in prediction_sets:
    print(heading)
    for next_word, freq in predictions:
        print(f"{partial_sentence} {next_word}  [{freq}]")

## Transformer-based Language Model (Hugging Face GPT-2)
* GPT-2 is pretrained on massive web text (WebText corpus) in an unsupervised, causal language modeling fashion (i.e., predicting the next token given all previous ones).
* GPT-2 generates one token at a time, feeding each new token back into itself until it reaches the token limit or produces an end-of-text token.
* GPT-2 is **pretrained but not fine-tuned** for a specific task.
* You control how many words/tokens it outputs using **max_new_tokens**.

In [None]:
from transformers import pipeline, set_seed

# Load text generation pipeline with GPT-2
generator = pipeline("text-generation", model="gpt2")
set_seed(42) # random seed for reproducibility

prompt = "The doctor said the patient might have to" # → 8 tokens
# Ask for exactly one new token after the prompt, i.e. a next-word prediction.
# Only `max_new_tokens` is passed: combining it with `max_length` sets two
# conflicting length limits (transformers warns and lets `max_new_tokens` win),
# so the previous `max_length=15` had no effect on the output.
# Increase `max_new_tokens` to generate longer continuations.
# `do_sample=True` is spelled out because returning several different
# sequences relies on sampling, which is also what the seed above controls.
outputs = generator(
    prompt,
    max_new_tokens=1,
    num_return_sequences=3,
    do_sample=True,
)

print("\nGPT-2 Predictions:")
for i, output in enumerate(outputs):
    print(f"{i+1}: {output['generated_text']}")


### Final Notes:
* GPT-3 is not on Hugging Face since it is a proprietary model developed by OpenAI.
* How about Llama?
* Access to Llama models is restricted and **requires accepting the license on HF**.