# Trigram Lyrics generation

## Sample sentence: 
Carlos needs a haircut

## Bigrams: 

- (Carlos, needs)
- (needs, a)
- (a, haircut)

## Trigrams

- (Carlos, needs, a)
- (needs, a, haircut)

## Acknowledgments
Based on the poetry-generator talk given by Carlos Castro, Software Engineer at Microsoft.

# Setup

In [1]:
# Imports

import nltk
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

In [2]:
# Download data
# - 'reuters': the corpus read through nltk.corpus.reuters in the cells below
# - 'punkt': tokenizer data; presumably what nltk.word_tokenize relies on later (TODO confirm)

nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\lirosale\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lirosale\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

# Exploration

In [3]:
# Look at the first sentence of the Reuters corpus (a list of word tokens)

all_sentences = reuters.sents()
first_sentence = all_sentences[0]
print(first_sentence)  # ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM' ...

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']


In [4]:
# Show the trigrams of the first sentence; padding adds None markers at both ends

first_sentence_trigrams = list(trigrams(first_sentence, pad_left=True, pad_right=True))
print(first_sentence_trigrams)

[(None, None, 'ASIAN'), (None, 'ASIAN', 'EXPORTERS'), ('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict

In [5]:
# How many trigrams are there in an n-word sentence?

In [6]:
# Number of tokens in the first sentence
word_count = len(first_sentence)
print(word_count)

49


In [7]:
# Number of padded trigrams: with None padding on both sides,
# an n-token sentence yields n + 2 trigrams
padded_trigrams = list(trigrams(first_sentence, pad_left=True, pad_right=True))
print(len(padded_trigrams))

51


# Reuters trigram model

In [8]:
# The model maps each (w1, w2) context to a dictionary {w3: count}, i.e. how many
# times w3 follows that word pair in the Reuters data.
# Unseen entries default to zero (this is why we use defaultdict and not dict)
model = defaultdict(lambda: defaultdict(lambda: 0))

# Walk through every sentence in the dataset
for sentence in reuters.sents():
    # Padding adds None markers so sentence starts and ends are counted too
    padded_trigrams = trigrams(sentence, pad_right=True, pad_left=True)
    for first, second, third in padded_trigrams:
        # Increase occurrence count
        model[(first, second)][third] += 1

In [9]:
# How many times does "economists" follow "what the"?
# The context is the (w1, w2) tuple key; the follower word indexes the inner dict.
print(model["what", "the"]["economists"])  # "economists" follows "what the" 2 times

2


In [10]:
# How many times does "nonexistingword" follow "what the"?
# .get() is used so that probing an unseen word does not insert a zero-count
# entry into the defaultdict-based model.
print(model["what", "the"].get("nonexistingword", 0))  # 0 times

0


In [11]:
# A sentence start is represented by the two None padding markers as context
print(model[None, None]["The"])  # 8839 sentences start with "The"

8839


## Intuition for probabilities in trigram model

Consider the sentences 

- _Carlos needs a haircut_
- _Carlos needs a new pair of shoes_

The trigrams are

- (Carlos, needs, a) (2)
- (needs, a, haircut) (1)
- (needs, a, new) (1)
- (a, new, pair) (1)
- (new, pair, of) (1)
- (pair, of, shoes) (1)

Our language model can now predict conditional probabilities for words given the context:

Carlos, needs -> ???

a: 1.0
carlos: 0.0
needs: 0.0
new: 0.0
haircut: 0.0
shoes: 0.0

needs, a ->


a: 0.0
carlos: 0.0
needs: 0.0
new: 0.5
haircut: 0.5
shoes: 0.0

Mathematically, this is the conditional probability $P(w_i \mid w_{i-2}, w_{i-1})$, estimated as $\dfrac{\text{count}(w_{i-2}, w_{i-1}, w_i)}{\text{count}(w_{i-2}, w_{i-1})}$.

## Reuters trigram text generation

Let's get a word prediction given 2 words from our trigram model


In [12]:
# Turn raw follower counts into conditional probabilities P(w3 | w1, w2)
for followers in model.values():
    # Total number of trigrams observed for this (w1, w2) context
    total_count = float(sum(followers.values()))
    for w3 in followers:
        # Share of this context's trigrams that end in w3
        followers[w3] /= total_count

In [13]:
# View the language model's next-word distribution for the context ("I", "Shall"),
# most probable word first
prediction = model["I", "Shall"]

prediction_sorted = sorted(prediction.items(), key=lambda kv: kv[1], reverse=True)

for word, conditional_probability in prediction_sorted:
    print(word, conditional_probability)
    

In [14]:
import numpy as np
from itertools import compress

# Generate one sentence by sampling word after word from the Reuters trigram model.
# The two leading None values are the left padding that marks a sentence start.
text = [None, None]
end_sentence = False

while not end_sentence:

    # Obtain the next-word distribution for the latest 2 words
    predictions = model[tuple(text[-2:])]
    words = list(predictions)
    probs = list(predictions.values())

    # Randomly select the next word according to its conditional probability.
    # The draw is a one-hot vector such as [0 0 0 1 0 0 0]; its single 1 marks
    # the position of the chosen word.
    s = np.random.multinomial(1, probs)
    candidate = words[int(np.argmax(s))]
    text.append(candidate)

    # None is the right padding, i.e. the end-of-sentence marker
    if candidate is None:
        end_sentence = True

print(text)
    

[None, None, 'HILTON', '&', 'lt', ';', 'XON', '>', 'EXTENDS', 'MID', '-', 'GRADE', 'UNLEADED', 'MARKETING', 'Exxon', 'Co', 'U', '.', 'S', '.', 'companies', 'from', 'trading', 'on', 'NASDAQ', ',', 'while', 'residual', 'fuel', 'went', 'into', 'stocks', '.', None]


# Lyrics

In [28]:
# Load poetry
new_line_sentinel = " _NL_ "

# In poetry, new lines are very important. Workaround to capture the new line in the tokenizer:
# every line break is replaced by a sentinel word surrounded by spaces.
# The context manager closes the file as soon as it has been read.
with open("DonOmar/Blue Zone.txt", "r") as file:
    poetry = file.read().replace("\n", new_line_sentinel)

In [41]:
import os

In [42]:
# Names of all entries in the lyrics folder (one file per song is assumed);
# os.listdir does not sort its result
all_files = os.listdir("DonOmar/")

In [69]:
# Load lyrics: read every song file and encode its line breaks with the sentinel
all_songs = []
for name_file in all_files:
    # The context manager closes each file right after reading, so no file
    # handle is leaked per song
    with open("DonOmar/" + name_file, 'r') as file:
        all_songs.append(file.read().replace("\n", new_line_sentinel))

In [77]:
# How many songs?
print(len(all_songs))


358


In [70]:
# Tokenization: one flat token list covering every song, in song order
poetry_tokenized = [
    token
    for song in all_songs
    for token in nltk.word_tokenize(song)
]

In [74]:
# Peek at the first 100 tokens
print(poetry_tokenized[:100])

['Natty', 'natasha_NL_Ahora', 'entiendo', 'es', 'tan', 'difícil', 'de', 'aceptar_NL_Que', 'tuvimos', 'tanto', 'para', 'dar_NL_Y', 'con', 'el', 'tiempo', 'se', 'acabo', 'nuestro', 'camino_NL__NL_Natty', 'natasha_NL_Ahora', 'entiendo', 'es', 'tan', 'difícil', 'de', 'aceptar_NL_Que', 'tuvimos', 'tanto', 'para', 'dar_NL_Y', 'con', 'el', 'tiempo', 'se', 'acabo', 'nuestro', 'camino_NL_Ahora', 'entiendo', 'es', 'tan', 'difícil', 'de', 'aceptar_NL_Que', 'tuvimos', 'tanto', 'para', 'dar_NL_Y', 'con', 'el', 'tiempo', 'se', 'acabo', 'nuestro', 'caminoDon', 'omar_NL_Que', 'paraíso', 'construimos', 'si', 'al', 'final_NL_Termino', 'por', 'derribarnos', 'la', 'ansiedad_NL_No', 'supimos', 'rescatarnos', 'y', 'cambio', 'nuestro', 'destino_NL_Don', 'omar_NL_Que', 'paraíso', 'construimos', 'si', 'al', 'final_NL_Termino', 'por', 'derribarnos', 'la', 'ansiedad_NL_No', 'supimos', 'rescatarnos', 'y', 'cambio', 'nuestro', 'destinoNatty', 'natasha_NL_Me', 'duele', 'tanto', 'imaginar_NL_Todo', 'lo', 'que', 'no'

In [108]:
# Generate some poetry: build the language model first

# The model maps each (w1, w2) context to a dictionary {w3: count}, i.e. how many
# times w3 follows that word pair in the lyrics.
# Unseen entries default to zero (this is why we use defaultdict and not dict)
poetry_model = defaultdict(lambda: defaultdict(lambda: 0))

# All songs form one token stream, so trigrams are taken over the whole corpus
# (padded with None at both ends) rather than sentence by sentence
corpus_trigrams = trigrams(poetry_tokenized, pad_right=True, pad_left=True)
for first, second, third in corpus_trigrams:
    # Increase occurrence count
    poetry_model[(first, second)][third] += 1

In [109]:
# Display the whole model: {(w1, w2): {w3: value}}
poetry_model

defaultdict(<function __main__.<lambda>()>,
            {(None,
              None): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'Natty': 1}),
             (None,
              'Natty'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'natasha_NL_Ahora': 1}),
             ('Natty',
              'natasha_NL_Ahora'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'entiendo': 1}),
             ('natasha_NL_Ahora',
              'entiendo'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'es': 2}),
             ('entiendo',
              'es'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'tan': 3}),
             ('es',
              'tan'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'difícil': 3,
                          'fácil': 2,
                          'impactante_NL_tan': 3,
                          'sato_NL_se': 7,
                          'puro_NL_Que': 4}),
             ('

In [107]:
# Turn raw follower counts into conditional probabilities P(w3 | w1, w2)
for context, followers in poetry_model.items():
    # Total number of trigrams observed for this (w1, w2) context
    context_total = float(sum(followers.values()))
    for follower in followers:
        # Share of this context's trigrams that end in this follower word
        followers[follower] /= context_total

In [106]:
# Display the next-word entries stored for the context ("el", "tiempo")
poetry_model["el", "tiempo"]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'se': 0.14285714285714285,
             'corre': 0.02857142857142857,
             'no': 0.07142857142857142,
             'me': 0.02857142857142857,
             'si': 0.05714285714285714,
             'de': 0.02857142857142857,
             'junto': 0.04285714285714286,
             'pasa_NL_Te': 0.02857142857142857,
             'lo': 0.14285714285714285,
             'decida_NL_Si': 0.02857142857142857,
             ',': 0.02857142857142857,
             'pasa': 0.05714285714285714,
             'q': 0.02857142857142857,
             'todavía': 0.08571428571428572,
             'y': 0.11428571428571428,
             'todavia': 0.02857142857142857,
             'te': 0.05714285714285714})

In [None]:
# Compute probabilities from counts.
# NOTE: this repeats the normalization cell above; dividing an already
# normalized distribution by its sum (about 1.0) leaves it essentially unchanged.
for followers in poetry_model.values():
    normalizer = float(sum(followers.values()))
    for w3 in followers:
        # Divide each entry by the total over all followers of this context
        followers[w3] /= normalizer

In [111]:
# Generate lyrics by sampling from the poetry trigram model, starting from a
# two-word seed.
# The bare line-break token gets its own name: rebinding new_line_sentinel here
# would change how songs are loaded if the loading cell were run again (line
# breaks would no longer be surrounded by spaces and words would merge).
line_break_token = "_NL_"
poetry = ["tu", "sabe"]

for _ in range(1, 50):

    # Obtain the next-word distribution for the latest 2 words
    predictions = poetry_model[tuple(poetry[-2:])]
    words = list(predictions)
    probs = list(predictions.values())

    # A context that never occurred in the lyrics has nothing to sample from
    if not words:
        break

    # Randomly select the next word according to its conditional probability;
    # the draw is a one-hot vector whose single 1 marks the chosen word
    s = np.random.multinomial(1, probs)
    candidate = words[int(np.argmax(s))]

    # None is the padding at the end of the corpus, not a word; appending it
    # would make the join below fail, so treat it as the end of the text
    if candidate is None:
        break
    poetry.append(candidate)


print(" ".join(poetry).replace(line_break_token, "\n"))


tu sabe tu sabe
la invacion
ok mano arriba
es el presidente
el rey don
let me see ya.inside
echo diesel esta es la invasion
nosotros somos genios . ) 
 ( Coro ) Asi de sencillo ando con 50 titeres y a voz tan siquiera verte , por eso es lo mismo , no
yo soy tu hombre
tu
