In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd gdrive/My\ Drive/Colab\ Notebooks/word_generator/

/content/gdrive/My Drive/Colab Notebooks/word_generator


In [20]:
!pwd

/content/gdrive/My Drive/Colab Notebooks/word_generator


## Process text

In [0]:
import string

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()


In [4]:

# load document
in_filename = 'republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])

# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

﻿BOOK I. The Republic opens with a truly Greek scene--a festival in
honour of the goddess Bendis which is held in the Piraeus; to this is
added the promise of an equestrian torch-race in the evening. 
['i', 'the', 'republic', 'opens', 'with', 'a', 'truly', 'greek', 'scene', 'a', 'festival', 'in', 'honour', 'of', 'the', 'goddess', 'bendis', 'which', 'is', 'held', 'in', 'the', 'piraeus', 'to', 'this', 'is', 'added', 'the', 'promise', 'of', 'an', 'equestrian', 'torchrace', 'in', 'the', 'evening', 'the', 'whole', 'work', 'is', 'supposed', 'to', 'be', 'recited', 'by', 'socrates', 'on', 'the', 'day', 'after', 'the', 'festival', 'to', 'a', 'small', 'party', 'consisting', 'of', 'critias', 'timaeus', 'hermocrates', 'and', 'another', 'this', 'we', 'learn', 'from', 'the', 'first', 'words', 'of', 'the', 'timaeus', 'when', 'the', 'rhetorical', 'advantage', 'of', 'reciting', 'the', 'dialogue', 'has', 'been', 'gained', 'the', 'attention', 'is', 'not', 'distracted', 'by', 'any', 'reference', 'to', 'th

## get training data

Read the vector from glove file

In [0]:
import numpy as np

def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1

    return words_to_index, index_to_words, word_to_vec_map

In [0]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [0]:
# GRADED FUNCTION: sentences_to_indices

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    vec_len = word_to_vec_map['baseball'].shape[0]
    # import pdb; pdb.set_trace()
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros( shape = (m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        # sentence_words = list(map(lambda x : x.lower() , X[i].split(" ")))
        sentence_words = [i.lower() for i in X[i].split()]
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            try:
              X_indices[i, j] = word_to_index[w]
            except KeyError:
              print(w + " doesnt have index, new entry created")
              vocab_size = len(word_to_index) + 1
              word_to_index[w] = vocab_size
              X_indices[i, j] = vocab_size
              word_to_vec_map[w] = np.random.rand(vec_len)
              continue
            # Increment j to j + 1
            j += 1
    ### END CODE HERE ###
    # pdb.set_trace()
    return X_indices

In [9]:
X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
X1_indices = sentences_to_indices(X1,word_to_index, max_len = 5)
print("X1 =", X1)
print("X1_indices =", X1_indices)

X1 = ['funny lol' 'lets play baseball' 'food is ready for you']
X1_indices = [[155345. 225122.      0.      0.      0.]
 [220930. 286375.  69714.      0.      0.]
 [151204. 192973. 302254. 151349. 394475.]]


In [10]:
from numpy import array
from pickle import dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding


In [0]:
# load
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
lines = np.array(lines)
length = 50 + 1


In [12]:
print(lines[:5])

['i the republic opens with a truly greek scene a festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole work is supposed to be recited by socrates on the day after the'
 'the republic opens with a truly greek scene a festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole work is supposed to be recited by socrates on the day after the festival'
 'republic opens with a truly greek scene a festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole work is supposed to be recited by socrates on the day after the festival to'
 'opens with a truly greek scene a festival in honour of the goddess bendis which is held in the piraeus to this is added the promise of an equestrian torchrace in the evening the whole wor

In [13]:
# integer encode sequences of words
#okenizer = Tokenizer()
#tokenizer.fit_on_texts(lines)

#array([   15,     1,   306, 10270,    28,     7,   363,   225,  4604,....])
sequences = sentences_to_indices(lines,word_to_index, max_len = length)
# vocabulary size
vocab_size = len(word_to_index) + 1  
X, y = sequences[:,:-1], sequences[:,-1]
#y = to_categorical(y, num_classes=vocab_size) eats all memory use sparse_categorical_crossentropy
seq_length = X.shape[1]

torchrace doesnt have index, new entry created
polemarchus doesnt have index, new entry created
adeimantus doesnt have index, new entry created
oftener doesnt have index, new entry created
seriphian doesnt have index, new entry created
mythus doesnt have index, new entry created
ismenias doesnt have index, new entry created
punishest doesnt have index, new entry created
kheyam doesnt have index, new entry created
polydamas doesnt have index, new entry created
cleitophon doesnt have index, new entry created
unmeaning doesnt have index, new entry created
drivelling doesnt have index, new entry created
rogueries doesnt have index, new entry created
episcopari doesnt have index, new entry created
goodhumour doesnt have index, new entry created
semiwickedness doesnt have index, new entry created
enquirers doesnt have index, new entry created
illtreated doesnt have index, new entry created
enigmatical doesnt have index, new entry created
reconcilement doesnt have index, new entry created
rea

In [14]:
word_to_vec_map['unmindfulness']

array([0.25804905, 0.42782601, 0.08253279, 0.94189494, 0.90483662,
       0.80605249, 0.75530373, 0.53488257, 0.99771895, 0.6904369 ,
       0.62830752, 0.25978722, 0.61121753, 0.73058863, 0.86849375,
       0.11660682, 0.80018255, 0.45473913, 0.59649186, 0.55138229,
       0.67770633, 0.84527166, 0.18786813, 0.48001956, 0.18850275,
       0.64891165, 0.11234195, 0.8189257 , 0.85143895, 0.44849895,
       0.42619307, 0.56067617, 0.43850789, 0.20578361, 0.52413191,
       0.45231781, 0.0306588 , 0.84735471, 0.16005981, 0.84826822,
       0.97089961, 0.52815292, 0.75219495, 0.80375093, 0.56572554,
       0.79102657, 0.31173895, 0.73245766, 0.41836962, 0.04117114])

## Pretrain embedding layer

In [0]:

def pretrained_embedding_layer(word_to_vec_map, word_to_index, trainable = False):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros(shape=(vocab_len,emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it non-trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len,emb_dim,trainable=trainable)
    
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [16]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
weights[0][1][3] = -0.3403


In [0]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.optimizers import SGD
# define model
model = Sequential()
# in : (batch, input_lengh) , [12,3,2,3,4,5,12,..., input_lentgh]

model.add(pretrained_embedding_layer(word_to_vec_map,word_to_index,trainable = True))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
# opt = SGD(learning_rate=0.001, momentum=0.0)
# opt = Adagrad(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

# save the model to file
model.save('model_embpre.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [24]:
!ls -lah

total 247M
-rw------- 1 root root 3.7K Nov  8 00:15 emo_utils.py
drwx------ 7 root root 4.0K Nov 10 04:38 .git
-rw------- 1 root root 164M Nov  8 00:10 glove.6B.50d.txt
-rw------- 1 root root  20M Nov  6 18:22 model.h5
drwx------ 2 root root 4.0K Nov  8 00:11 __pycache__
-rw------- 1 root root 1.2M Nov  5 20:25 republic_clean.txt
-rw------- 1 root root  55M Nov 10 04:12 republic_sequences.txt
-rw------- 1 root root 1.2M Nov  5 20:16 republic.txt
-rw------- 1 root root  18K Nov  5 20:06 text_gen_words.ipynb
-rw------- 1 root root 484K Nov  6 18:22 tokenizer.pkl
-rw------- 1 root root 5.6M Nov  7 21:38 vects.tsv
-rw------- 1 root root  53K Nov 10 04:37 word_gen_emb_pretrained.ipynb
-rw------- 1 root root 237K Nov 10 02:08 word_gen_MLM.ipynb
-rw------- 1 root root  88K Nov  7 21:38 words.tsv


In [0]:
!git remote add origin https://jcrangel:kikkomanchocolatemint@github.com/jcrangel/text-gen.git

In [57]:
!git push origin master

Counting objects: 13, done.
Delta compression using up to 4 threads.
Compressing objects: 100% (13/13), done.
Writing objects: 100% (13/13), 70.48 MiB | 2.98 MiB/s, done.
Total 13 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), done.[K
remote: error: GH001: Large files detected. You may want to try Git Large File Storage - https://git-lfs.github.com.[K
remote: error: Trace: 2e3232f5e71593d94a670cf79e4ea993[K
remote: error: See http://git.io/iEPt8g for more information.[K
remote: error: File glove.6B.50d.txt is 163.41 MB; this exceeds GitHub's file size limit of 100.00 MB[K
To https://github.com/jcrangel/text-gen.git
 ! [remote rejected] master -> master (pre-receive hook declined)
error: failed to push some refs to 'https://jcrangel:kikkomanchocolatemint@github.com/jcrangel/text-gen.git'


In [58]:
!git status

On branch master
Changes to be committed:
  (use "git reset HEAD <file>..." to unstage)

	[32mdeleted:    glove.6B.50d.txt[m
	[32mdeleted:    republic_sequences.txt[m

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   word_gen_emb_pretrained.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31m__pycache__/[m
	[31mglove.6B.50d.txt[m
	[31mmodel.h5[m
	[31mrepublic_sequences.txt[m



In [61]:
!git init

Initialized empty Git repository in /content/gdrive/My Drive/Colab Notebooks/word_generator/.git/
