# Recurrent Neural Networks

El objetivo del notebook es generar sentencias usando redes neuronales recurrentes

In [2]:
from datetime import datetime
import itertools
import numpy as np
import nltk  # Natural Language Toolkit
import os
import operator
import sys

In [4]:
# Descargamos el modelo de datos NLTK
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/jcla/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to /home/jcla/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to /home/jcla/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/jcla/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /home/jcla/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /home/jcla/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /home/jcla/nltk_data...
[nltk_data]    |   Unzipping corpora/dependenc

True

In [18]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

corpora_dir = "/home/jcla/nltk_data/corpora/state_union"

In [25]:
# Leemos los datos y agregamos los tokens SENTENCE_START y SENTENCE_END
print("Reading data")

# Read all file paths in corpora directory
file_list = []
for root, _, files in os.walk(corpora_dir):
    for filename in files:
        file_list.append(os.path.join(root, filename))

# Extraemos todas las sentencias en una lista
sentences = []

for files in file_list:
    with open(files, 'r') as fin:
        try:
            str_form = fin.read().replace('\n', '')
            sentences.extend(nltk.sent_tokenize(str_form))
        except UnicodeDecodeError:
            # Algunas sentencias tienen carcateres weird, los ignoramos por ahora
            pass

# Mostremos las primeras sentencias en una lista
sentences[:5]

Reading data


["PRESIDENT GERALD R. FORD'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS REPORTING ON THE STATE OF THE UNION January 19, 1976Mr.",
 'Speaker, Mr. Vice President, Members of the 94th Congress, and distinguished guests:As we begin our Bicentennial, America is still one of the youngest nations in recorded history.',
 "Long before our forefathers came to these shores, men and women had been struggling on this planet to forge a better life for themselves and their families.In man's long, upward march from savagery and slavery--throughout the nearly 2,000 years of the Christian calendar, the nearly 6,000 years of Jewish reckoning--there have been many deep, terrifying valleys, but also many bright and towering peaks.One peak stands highest in the ranges of human history.",
 'One example shines forth of a people uniting to produce abundance and to share the good life fairly and with freedom.',
 'One union holds out the promise of justice and opportunity for every citizen: That union is the

In [26]:
# Agregamos delimitadores de sentencia
sentences = [sentence_start_token + " " + x + " " + sentence_end_token for x in sentences]

In [28]:
sentences[:5]

["SENTENCE_START PRESIDENT GERALD R. FORD'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS REPORTING ON THE STATE OF THE UNION January 19, 1976Mr. SENTENCE_END",
 'SENTENCE_START Speaker, Mr. Vice President, Members of the 94th Congress, and distinguished guests:As we begin our Bicentennial, America is still one of the youngest nations in recorded history. SENTENCE_END',
 "SENTENCE_START Long before our forefathers came to these shores, men and women had been struggling on this planet to forge a better life for themselves and their families.In man's long, upward march from savagery and slavery--throughout the nearly 2,000 years of the Christian calendar, the nearly 6,000 years of Jewish reckoning--there have been many deep, terrifying valleys, but also many bright and towering peaks.One peak stands highest in the ranges of human history. SENTENCE_END",
 'SENTENCE_START One example shines forth of a people uniting to produce abundance and to share the good life fairly and with freedom. 

In [29]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

print("Found ", len(word_freq.items()), " unique words tokens.")

Found  18331  unique words tokens.


In [41]:
# Obtengamos las palabras más frecuentes y construyamos dos vectores: index_to_word y word_to_index
vocab = word_freq.most_common(vocabulary_size - 1)  # Get the most frequent words to construct vocab
index_to_word = [x[0] for x in vocab]  # Extract word
index_to_word.append(unknown_token)  # Add an extra token called "unknown" for the words in corpora, but not in vocab
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])  # Create word-index map

print("Using vocabulary size ", vocabulary_size)
print("The least frequent word in our vocabulary is '", vocab[-1][0],
      "' and appeard ", vocab[-1][1], " times.")

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w  if w in word_to_index else unknown_token for w in sent]

Using vocabulary size  8000
The least frequent word in our vocabulary is ' bet ' and appeard  2  times.


In [48]:
## Create the training data
# Every X represents a word. Every y represents a word that follows it in the sequence
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [50]:
# Print and training data example
x_example, y_example = X_train[10], y_train[10]
print("The 10th sentence input and expected output to every neuron looks like this\n")
print(list(zip([index_to_word[x] for x in x_example], [index_to_word[y] for y in y_example])))

The 10th sentence input and expected output to every neuron looks like this

[('SENTENCE_START', 'I'), ('I', 'know'), ('know', 'it'), ('it', 'will'), ('will', 'be'), ('be', 'better'), ('better', 'for'), ('for', 'my'), ('my', 'children'), ('children', 'because'), ('because', 'my'), ('my', 'hands'), ('hands', ','), (',', 'my'), ('my', 'brains'), ('brains', ','), (',', 'my'), ('my', 'voice'), ('voice', ','), (',', 'and'), ('and', 'my'), ('my', 'vote'), ('vote', 'can'), ('can', 'help'), ('help', 'make'), ('make', 'it'), ('it', 'UNKNOWN_TOKEN'), ('UNKNOWN_TOKEN', 'has'), ('has', 'happened'), ('happened', 'here'), ('here', 'in'), ('in', 'America'), ('America', '.'), ('.', 'SENTENCE_END')]


In [58]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [None]:
class RNN:
    
    def __init__(self, word_dim, hidden_dim=50, bptt_truncate=4)