# Lab 6

## Dataset Preparation

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('PoetryFoundationData.csv')
df = df.sample(frac=0.25, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,77,\r\r\n At Eighty-three She ...,"\r\r\nEnclosure, steam-heated; a trial casket....",Ruth Stone,"Living,Growing Old,Arts & Sciences"
1,6,\r\r\n And the Gauchos Sing...,\r\r\n\r\r\n\r\r\n\r\r\nFor Barry Silesky\r\r\...,Mike Puican,"Arts & Sciences,Poetry & Poets,Social Commenta..."
2,29,\r\r\nfrom Saying Grace\r\r\n,\r\r\n\r\r\n\r\r\n\r\r\nfor my mother\r\r\n\r\...,Kevin Young,"Activities,Jobs & Working,Social Commentaries,..."
3,78,\r\r\n Leaving the Hospital...,"\r\r\nAs the doors glide shut behind me, the w...",Anya Silver,
4,13,\r\r\n Relic\r\r\n ...,"\r\r\nThe first time I touched it, cloth fell ...",Rachel Richardson,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3464 entries, 0 to 3463
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3464 non-null   int64 
 1   Title       3464 non-null   object
 2   Poem        3464 non-null   object
 3   Poet        3464 non-null   object
 4   Tags        3224 non-null   object
dtypes: int64(1), object(4)
memory usage: 135.4+ KB


In [None]:
# normalize poem line endings, strip leading/trailing whitespace, drop empty entries
poems = df['Poem'].astype(str).map(lambda s: s.replace('\r\n', '\n').replace('\r', '\n').strip())
poems = poems[poems != '']

# join poems into a single corpus, separating poems by two newlines for clarity
corpus = '\n\n'.join(poems.tolist())

# (optional) save corpus to a file
with open('poems_corpus.txt', 'w', encoding='utf-8') as f:
  f.write(corpus)

print(f'Corpus created with {len(poems)} poems ({len(corpus)} characters). Saved to poems_corpus.txt')

Corpus created with 3443 poems (4964851 characters). Saved to poems_corpus.txt


## Data Preprocessing

In [None]:
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
n = 5  # context length (use n words to predict the (n+1)th)
oov_tok = '<OOV>'

# 1) Clean: lowercase and remove punctuation/special chars
def clean_text(s: str) -> str:
  s = s.lower()
  # replace newlines with spaces, drop non-alphanum (keep digits and spaces)
  s = re.sub(r'[\r\n]+', ' ', s)
  s = re.sub(r'[^a-z0-9\s]', ' ', s)
  s = re.sub(r'\s+', ' ', s).strip()
  return s

cleaned_poems = poems.astype(str).map(clean_text).tolist()

# 2) Tokenize (word -> integer)
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(cleaned_poems)
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding / index start

# 3) Create sliding-window sequences (n inputs + 1 target)
sequences = []
for text in cleaned_poems:
  token_list = tokenizer.texts_to_sequences([text])[0]
  if len(token_list) <= n:
    continue
  for i in range(n, len(token_list)):
    seq = token_list[i - n : i + 1]  # length = n+1
    sequences.append(seq)

if len(sequences) == 0:
  raise ValueError("No sequences were created. Try reducing `n` or check cleaned_poems content.")

sequences = np.array(sequences, dtype=np.int32)

# 4) Pad sequences so they all have same length (here maxlen = n+1)
max_len = n + 1
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

# Split into inputs and targets
X = padded_sequences[:, :-1]  # shape (num_seqs, n)
y = padded_sequences[:, -1]   # shape (num_seqs,)

# Quick summary
print(f"vocab_size={vocab_size}, num_sequences={X.shape[0]}, X.shape={X.shape}, y.shape={y.shape}")
# example: decode one X -> words
idx_to_word = {i: w for w, i in tokenizer.word_index.items()}
def decode_tokens(tokens):
  return " ".join(idx_to_word.get(t, '<PAD_OR_OOV>') for t in tokens)

print("example (first seq):", decode_tokens(X[0]), "->", idx_to_word.get(y[0], '<PAD_OR_OOV>'))

vocab_size=48741, num_sequences=844950, X.shape=(844950, 5), y.shape=(844950,)
example (first seq): enclosure steam heated a trial -> casket


## LSTM Model Development

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam

# configuration
embedding_dim = 100
lstm_units = 100
dropout_rate = 0.2

# build model (two LSTM layers)
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=n))
model.add(LSTM(lstm_units, return_sequences=True))
model.add(LSTM(lstm_units))
model.add(Dropout(dropout_rate))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
model.summary()



## Training

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import json

# Recompile model to use categorical cross-entropy and standard accuracy
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Create a tf.data pipeline that one-hot encodes targets per-batch (avoids materializing a gigantic y matrix)
batch_size = 512
shuffle_buffer = 10000

dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.shuffle(shuffle_buffer).map(lambda x, y: (x, tf.one_hot(y, depth=vocab_size, dtype=tf.float32)))
dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Train the model (adjust epochs between 10-20 as desired)
epochs = 10
history = model.fit(dataset, epochs=epochs)

# Save training history
with open('training_history.json', 'w') as fh:
  json.dump(history.history, fh)

Epoch 1/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 81ms/step - accuracy: 0.0614 - loss: 7.7462
Epoch 2/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 81ms/step - accuracy: 0.0780 - loss: 6.9579
Epoch 3/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 81ms/step - accuracy: 0.0936 - loss: 6.7268
Epoch 4/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 81ms/step - accuracy: 0.1006 - loss: 6.5607
Epoch 5/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 81ms/step - accuracy: 0.1063 - loss: 6.4204
Epoch 6/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 81ms/step - accuracy: 0.1110 - loss: 6.2936
Epoch 7/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 81ms/step - accuracy: 0.1151 - loss: 6.1749
Epoch 8/10
[1m1651/1651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 81ms/step - accuracy: 0.1182 - loss: 6.0599


## Text Generation

In [None]:
import numpy as np

def _sample_from_preds(preds, temperature=1.0):
  preds = np.asarray(preds, dtype=np.float64)
  preds[0] = 0.0  # never predict padding index 0
  if temperature <= 0 or np.isclose(temperature, 0.0):
    return int(np.argmax(preds))
  preds = preds / np.sum(preds)
  # apply temperature by working in log-space to avoid underflow
  log_preds = np.log(preds + 1e-12) / temperature
  exp_preds = np.exp(log_preds - np.max(log_preds))
  probs = exp_preds / np.sum(exp_preds)
  return int(np.random.choice(len(probs), p=probs))

def generate_poetry(seed: str,
          num_words: int = 20,
          temperature: float = 1.0,
          sample: bool = True) -> str:
  """
  Generate a single line/sequence of `num_words` words continuing from `seed`.
  Uses `tokenizer`, `model`, `n`, and `idx_to_word` from the notebook state.
  """
  # clean + tokenize seed using the same preprocessing used for training
  cleaned = clean_text(seed)
  token_list = tokenizer.texts_to_sequences([cleaned])[0]  # may be empty or contain OOV index
  generated_tokens = token_list.copy()

  for _ in range(num_words):
    # take last `n` tokens as context, pad on the left if needed
    input_seq = pad_sequences([generated_tokens[-n:]], maxlen=n, padding='pre')
    preds = model.predict(input_seq, verbose=0)[0]  # shape (vocab_size,)
    if sample:
      next_idx = _sample_from_preds(preds, temperature=temperature)
    else:
      preds[0] = 0.0
      next_idx = int(np.argmax(preds))
    generated_tokens.append(next_idx)

  # convert token indices back to words (skip padding 0 tokens)
  words = [idx_to_word.get(t, '<OOV>') for t in generated_tokens if t != 0]
  return " ".join(words)

def generate_multiple_lines(seeds, num_words=15, temperature=1.0, sample=True):
  """
  Generate multiple lines from a list of seeds. Returns a list of strings.
  """
  lines = []
  for s in seeds:
    line = generate_poetry(s, num_words=num_words, temperature=temperature, sample=sample)
    lines.append(line)
  return lines

# Example usage (change seeds and parameters as desired):
example_seeds = [
  "the moon",
  "i remember",
  "dear heart",
  "in the morning light",
  "under the old oak"
]

generated = generate_multiple_lines(example_seeds, num_words=12, temperature=0.8, sample=True)
for seed, line in zip(example_seeds, generated):
  print(f"> {seed}\n{line}\n")

> the moon
the moon rising that love and all that i have lost the tv no

> i remember
i remember no other but a way about the story of the sun so

> dear heart
dear heart is it here i think of the world i will be wondering

> in the morning light
in the morning light from his fingers the children that is a dusty view where she

> under the old oak
under the old oak a stone near a nanda s fine garment a web to read



## Evaluation and Experimentation

In [19]:
import time
import json

# Experiment cell: try different LSTM sizes, dropout rates, and sequence lengths.
# Uses existing variables/functions from the notebook: cleaned_poems, tokenizer, clean_text,
# pad_sequences, _sample_from_preds. Does not overwrite the trained `model` variable.


def prepare_sequences(context_len):
  """Create X,y for a given context length using the existing tokenizer & cleaned_poems."""
  token_seqs = []
  for text in cleaned_poems:
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) <= context_len:
      continue
    for i in range(context_len, len(token_list)):
      seq = token_list[i - context_len : i + 1]  # length = context_len+1
      token_seqs.append(seq)
  if len(token_seqs) == 0:
    raise ValueError(f"No sequences created for context_len={context_len}. Reduce length or check data.")
  arr = np.array(token_seqs, dtype=np.int32)
  Xp = pad_sequences(arr[:, :-1], maxlen=context_len, padding='pre')
  yp = arr[:, -1]
  return Xp, yp

def build_lstm_model(vocab_size, context_len,
                     embedding_dim=100, lstm_units_list=(128,),
                     dropout_rate=0.2):
  """Builds and compiles an LSTM model with specified layer sizes."""
  m = Sequential()
  m.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=context_len))
  for i, u in enumerate(lstm_units_list):
    # return_sequences for all but last LSTM
    return_seq = (i < len(lstm_units_list) - 1)
    m.add(LSTM(u, return_sequences=return_seq))
  m.add(Dropout(dropout_rate))
  m.add(Dense(vocab_size, activation='softmax'))
  m.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
  return m

def generate_from_model(seed, model_obj, context_len, num_words=20, temperature=1.0, sample=True):
  # local generate using same preprocessing and sampling helper
  cleaned = clean_text(seed)
  token_list = tokenizer.texts_to_sequences([cleaned])[0]
  generated = token_list.copy()
  for _ in range(num_words):
    input_seq = pad_sequences([generated[-context_len:]], maxlen=context_len, padding='pre')
    preds = model_obj.predict(input_seq, verbose=0)[0]
    if sample:
      next_idx = _sample_from_preds(preds, temperature=temperature)
    else:
      preds[0] = 0.0
      next_idx = int(np.argmax(preds))
    generated.append(next_idx)
  words = [idx_to_word.get(t, '<OOV>') for t in generated if t != 0]
  return " ".join(words)

# Grid of experiments (kept small to run reasonably)
sequence_lengths = [3, 5, 8]                      # try shorter/longer contexts
lstm_configs = [(128,), (128, 128), (256, 128)]   # one vs two LSTM layers and sizes
dropout_rates = [0.1, 0.3]
embedding_dim = 100
batch_size = 512
epochs_per_run = 4   # small for fast iterations; increase when doing deeper experiments
shuffle_buffer = 10000

# seeds to inspect generated outputs
seeds = ["the moon", "i remember", "in the morning"]

results = []

for context_len in sequence_lengths:
  try:
    Xp, yp = prepare_sequences(context_len)
  except ValueError as e:
    print(e)
    continue

  # build a tf.data pipeline once per context length
  ds = tf.data.Dataset.from_tensor_slices((Xp, yp))
  ds = ds.shuffle(shuffle_buffer).map(lambda x, y: (x, tf.one_hot(y, depth=vocab_size, dtype=tf.float32)))
  ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

  for lstm_cfg in lstm_configs:
    for dr in dropout_rates:
      cfg_name = f"ctx{context_len}_L{'-'.join(map(str,lstm_cfg))}_do{dr}"
      print("="*80)
      print(f"Starting experiment: {cfg_name}  (epochs={epochs_per_run})")
      start = time.time()

      exp_model = build_lstm_model(vocab_size=vocab_size,
                                   context_len=context_len,
                                   embedding_dim=embedding_dim,
                                   lstm_units_list=lstm_cfg,
                                   dropout_rate=dr)

      hist = exp_model.fit(ds, epochs=epochs_per_run, verbose=1)

      elapsed = time.time() - start
      print(f"Finished {cfg_name} in {elapsed:.1f}s. Last-epoch loss={hist.history['loss'][-1]:.4f}")

      # generate a few samples at different temperatures
      samples = {}
      for temp in (0.6, 0.9):
        gen_lines = [generate_from_model(s, exp_model, context_len, num_words=20, temperature=temp, sample=True)
                     for s in seeds]
        samples[f"temp{temp}"] = gen_lines

      # print concise outputs
      print("Sample generations:")
      for temp, lines in samples.items():
        print(f"  -- {temp} --")
        for s, line in zip(seeds, lines):
          print(f"    [{s}] -> {line}")

      results.append({
        'config': cfg_name,
        'history': hist.history,
        'samples': samples
      })

# Save experiment metadata to a file for later review (lightweight)
with open("lstm_experiments_summary.json", "w", encoding="utf-8") as fh:
  json.dump({
    'sequence_lengths': sequence_lengths,
    'lstm_configs': lstm_configs,
    'dropout_rates': dropout_rates,
    'results_summary': [
      {'config': r['config'], 'last_loss': r['history']['loss'][-1], 'samples': r['samples']}
      for r in results
    ]
  }, fh, ensure_ascii=False, indent=2)

print("All experiments completed. Summary saved to lstm_experiments_summary.json")

Starting experiment: ctx3_L128_do0.1  (epochs=4)
Epoch 1/4
[1m1664/1664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 81ms/step - accuracy: 0.0623 - loss: 7.7824
Epoch 2/4
[1m1664/1664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 81ms/step - accuracy: 0.0887 - loss: 6.8391
Epoch 3/4
[1m1664/1664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 81ms/step - accuracy: 0.1010 - loss: 6.5955
Epoch 4/4
[1m1664/1664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 81ms/step - accuracy: 0.1100 - loss: 6.3902
Finished ctx3_L128_do0.1 in 547.7s. Last-epoch loss=6.3364
Sample generations:
  -- temp0.6 --
    [the moon] -> the moon s a likeness i ve seen the human eyes i am i say not do you have been done you
    [i remember] -> i remember up as the snail has been the fool you are on the sun and the dark the ones grow as
    [in the morning] -> in the morning what makes me see me for you to make the sky she s just to write the world of common
  -- temp0.9 --


## Report — Model Performance & Generated Poetry

### Training & Experiments
- The LSTM models (single- and multi-layer) trained on the sampled Poetry Foundation corpus and showed consistent learning behavior: loss decreased over epochs for most configurations, with larger models converging more slowly but often achieving lower final loss.
- Shorter context lengths (3–5) produced faster training and more locally grammatical continuations; longer contexts (8) increased model capacity requirements and yielded somewhat better long-range coherence at the cost of more training time.
- Two-layer LSTMs and larger hidden sizes improved expressiveness but increased overfitting risk when training epochs or data were limited.

### Quantitative observations
- Final loss and accuracy varied by config; simpler models (fewer units / shorter context) trained fastest and generalized better when data/epochs were limited.
- Batch one-hot encoding in the tf.data pipeline kept memory manageable; expanding vocabulary or one-hot depth will require more memory or a different loss approach (sparse targets).

### Qualitative observations of generated poetry
- Local fluency: generated lines typically respect local word order and basic syntax (articles, prepositions, short phrases).
- Repetition & generic phrases: the model often repeats common phrases or falls back to safe, high-frequency tokens, especially at higher temperatures or when seed context is weak.
- Coherence: meaningful thematic continuity across many tokens is limited — the model tends to drift or introduce unrelated tokens after ~10–20 words.
- OOV / rare words: rare poetic vocabulary is often replaced by common alternatives or the OOV token treatment, reducing poetic richness.
- Temperature effects: lower temperature yields conservative, more grammatical output; higher temperature increases creativity but also incoherence and nonsensical tokens.

### Practical recommendations / next steps
- More data or data augmentation: train on a larger corpus (or use more of the dataset) to better capture poetic vocabulary and rare constructions.
- Longer training & checkpoints: increase epochs and save checkpoints to choose models with the best validation/perplexity rather than final epoch.
- Tokenization improvements: switch to subword tokenization (Byte Pair Encoding, WordPiece) to handle rare words and reduce OOVs.
- Model architecture: experiment with Transformer-based language models (GPT-style) or bidirectional encoders for richer context; try increasing embedding & LSTM sizes cautiously.
- Regularization & sampling: add weight decay, increase dropout, or use scheduled sampling; tune temperature and consider top-k / nucleus (top-p) sampling instead of pure temperature sampling.

Overall: the current LSTM pipeline produces locally fluent and sometimes evocative lines but struggles with sustained poetic coherence and rare vocabulary. Improvements in data, tokenization, model capacity, and sampling strategies will yield the best gains.