In [1]:
import tensorflow 
import tensorflow as tf
import json 
import os
import pickle
import numpy as np
import string, os 
from gensim.models import KeyedVectors
import gensim.downloader as api
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku 
import random
import sys
# Load the pretrained "glove-twitter-100" word vectors by name through the
# gensim downloader API (not from a local path in this notebook).
word_vectors = api.load("glove-twitter-100")



In [2]:
# Read the project configuration (JSON) into `cfg`.
# NOTE(review): `cfg` is not referenced by any of the cells visible below - confirm it is still needed.
with open('../configs/config.json','r') as cfgFile:
    cfg = json.load(cfgFile)

In [3]:
# NOTE: despite its name, `data_dir` is the path of a single file, and despite the
# .txt extension that file is read as a pickle.
data_dir = '../data/processed/verses.txt'
# SECURITY: pickle.load can execute arbitrary code while deserialising; only load
# files that come from a trusted source.
with open(data_dir, "rb") as fp:   # Unpickling
    lyrics = pickle.load(fp)   
    
def clean_text(txt):
    """Normalise one line of text.

    Removes every character listed in string.punctuation, lower-cases the
    result and finally drops any character that cannot be represented in ASCII.
    Note that removing punctuation joins hyphenated words ("mega-hits" -> "megahits").
    """
    without_punct = txt.translate(str.maketrans('', '', string.punctuation))
    lowered = without_punct.lower()
    # Round-trip through bytes so that non-ASCII characters are silently discarded.
    return lowered.encode("utf8").decode("ascii", 'ignore')

In [4]:
lyrics[0]

'i live life lavish and my chain is karats \n the last name on the train to paris \n used to be lame then attained the merit \n so many clothes cant name the fabrics \n dynamics i want the fame \n and my name engraved in granites \n heres his lane now came to grab it \n you moving sideways change your habits \n used to rock minks then i changed to rabbit \n from out the garbage i came from average \n used to be righteous that changed to savage \n bang my ratchet like bangkok dangerous \n 36 chamber fist trianglist \n watch me mangle this star spanglist \n rock cowboy wrangler creating mega-hits \n im from the grain game at my fingertips \n \n\n'

In [5]:
# `lyrics` is rebound to an ndarray of verse strings.
lyrics = np.array(lyrics)
# One list of cleaned lines per verse: split each verse on the ' \n ' line
# separator, skip fragments of at most one character and the trailing '\n\n'
# verse terminator, and normalise what is left with clean_text().
arr = [
    [clean_text(line) for line in verse.split(' \n ') if len(line) > 1 and line != '\n\n']
    for verse in lyrics
]

In [6]:
arr[0]

['i live life lavish and my chain is karats',
 'the last name on the train to paris',
 'used to be lame then attained the merit',
 'so many clothes cant name the fabrics',
 'dynamics i want the fame',
 'and my name engraved in granites',
 'heres his lane now came to grab it',
 'you moving sideways change your habits',
 'used to rock minks then i changed to rabbit',
 'from out the garbage i came from average',
 'used to be righteous that changed to savage',
 'bang my ratchet like bangkok dangerous',
 '36 chamber fist trianglist',
 'watch me mangle this star spanglist',
 'rock cowboy wrangler creating megahits',
 'im from the grain game at my fingertips']

In [7]:
# Shuffle the verses in place. A dedicated, seeded generator is used so that the
# verse order - and therefore the order-based train/validation/test splits made
# later in the notebook - is identical after every kernel restart, without
# touching NumPy's global random state.
np.random.RandomState(42).shuffle(arr)
arr[0]

['you got another thing comin if you think me and my man aint sellin',
 'got more gold than mr drum and we sellin',
 'through welcome to the terrordome niggas are ever wrong',
 'thats why you get no ends but just in case',
 'you got a death wish for flare for the dramatics',
 'theres some static in the attic  hid it looking at it',
 'im a matic nice to the fifth power son and im still at it',
 'i was about to flip power son til i saw a vision caz']

In [8]:
# Flatten the per-verse lists into one 1-D array holding every lyric line.
flattened_list = np.asarray([line for verse in arr for line in verse])

In [9]:
# Module-level tokenizer: get_sequence_of_tokens() fits it and the generation
# callback later reads its word_index / index_word mappings.
tokenizer = Tokenizer()
corpus = flattened_list # one entry per lyric line (alternative: one entry per verse, ' '.join(i) for i in arr)
def get_sequence_of_tokens(corpus):
    """Fit the module-level tokenizer on `corpus` and build n-gram training sequences.

    Every line is converted to its list of word indices and expanded into all of
    its prefixes of length >= 2 (first two tokens, first three tokens, ...).
    Lines that tokenize to fewer than two tokens contribute nothing.

    Arguments:
    corpus -- iterable of text lines

    Returns:
    input_sequences -- list of prefix sequences (lists of word indices)
    total_words -- number of entries in tokenizer.word_index plus one
    """
    # Learn the vocabulary as a side effect on the shared tokenizer.
    tokenizer.fit_on_texts(corpus)
    total_words = 1 + len(tokenizer.word_index)

    # Tokenize all lines in one call, then emit every prefix of each line.
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )
    return input_sequences, total_words

# Fit the tokenizer on the corpus and expand every line into its n-gram prefixes.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
# Preview the first few prefix sequences.
inp_sequences[:10]

[[5, 24],
 [5, 24, 195],
 [5, 24, 195, 202],
 [5, 24, 195, 202, 514],
 [5, 24, 195, 202, 514, 46],
 [5, 24, 195, 202, 514, 46, 5],
 [5, 24, 195, 202, 514, 46, 5, 130],
 [5, 24, 195, 202, 514, 46, 5, 130, 15],
 [5, 24, 195, 202, 514, 46, 5, 130, 15, 6],
 [5, 24, 195, 202, 514, 46, 5, 130, 15, 6, 7]]

In [10]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'a': 3,
 'to': 4,
 'you': 5,
 'and': 6,
 'my': 7,
 'in': 8,
 'it': 9,
 'on': 10,
 'of': 11,
 'im': 12,
 'like': 13,
 'that': 14,
 'me': 15,
 'with': 16,
 'your': 17,
 'is': 18,
 'for': 19,
 'up': 20,
 'we': 21,
 'but': 22,
 'they': 23,
 'got': 24,
 'get': 25,
 'this': 26,
 'all': 27,
 'be': 28,
 'so': 29,
 'its': 30,
 'dont': 31,
 'when': 32,
 'out': 33,
 'no': 34,
 'know': 35,
 'was': 36,
 'aint': 37,
 'from': 38,
 'just': 39,
 'what': 40,
 'now': 41,
 'shit': 42,
 'niggas': 43,
 'nigga': 44,
 'at': 45,
 'if': 46,
 'back': 47,
 'cause': 48,
 'he': 49,
 'she': 50,
 'see': 51,
 'do': 52,
 'her': 53,
 'can': 54,
 'thats': 55,
 'never': 56,
 'fuck': 57,
 'one': 58,
 'not': 59,
 'make': 60,
 'off': 61,
 'or': 62,
 'then': 63,
 'as': 64,
 'em': 65,
 'them': 66,
 'how': 67,
 'down': 68,
 'some': 69,
 'man': 70,
 'time': 71,
 'go': 72,
 'his': 73,
 'cant': 74,
 'who': 75,
 'life': 76,
 'these': 77,
 'yall': 78,
 'bitch': 79,
 'take': 80,
 'still': 81,
 'love': 82,
 'by': 

In [11]:
# Length of the longest n-gram prefix; the model's input width is derived from it.
max_sequence_len = max(len(seq) for seq in inp_sequences)

# Left-pad every prefix with zeros to max_sequence_len + 1 columns, so that after
# the last column is split off as the label the predictors are exactly
# max_sequence_len wide.
input_sequences = np.array(
    pad_sequences(inp_sequences, maxlen=max_sequence_len + 1, padding='pre')
)

# Everything but the last token is the input; the last token is the word to predict.
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1]

In [12]:
predictors.shape, label.shape

((1552419, 64), (1552419,))

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for testing. With shuffle=False the original order is
# kept, so the test set is the tail of the data and random_state has no effect.
X_train, X_test, y_train, y_test = train_test_split(predictors, label, test_size=0.10, shuffle=False, random_state=42)

In [14]:
word_vectors.get_vector('word')

array([ 0.57479 ,  0.27959 , -0.17003 ,  1.0926  , -0.5678  ,  0.13946 ,
       -0.22845 ,  0.27979 ,  0.1436  ,  0.25408 ,  0.14175 ,  0.47737 ,
       -4.1063  , -0.45932 , -0.78775 , -0.061295,  0.28098 ,  0.55691 ,
        0.040097, -0.33675 ,  0.10952 ,  0.32482 , -0.60996 ,  0.77837 ,
        1.0855  ,  0.092512, -0.34347 , -0.52561 , -0.32974 , -0.45062 ,
       -0.33763 ,  0.26943 , -0.7608  , -0.013459, -0.097348, -0.40263 ,
        0.22523 ,  0.40602 ,  0.34765 , -1.2264  , -0.81516 , -0.57451 ,
        0.084248,  0.36518 ,  0.24649 , -0.26708 ,  0.074   ,  0.73033 ,
       -0.34619 ,  0.29964 ,  0.49903 ,  0.46251 , -0.68305 , -0.92597 ,
        0.075895, -0.51661 , -0.67615 , -0.017943, -1.1911  , -0.12817 ,
        0.27478 , -0.77928 , -0.35465 ,  0.39712 ,  0.22347 ,  0.38169 ,
       -0.067566, -0.24608 ,  0.34249 , -0.26701 , -0.78815 , -0.79426 ,
       -0.57019 ,  0.14404 ,  0.23621 , -0.067121,  0.31948 ,  0.06233 ,
       -0.3619  , -0.012909,  0.91253 ,  0.21408 , 

In [15]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- gensim KeyedVectors-style object: exposes `vector_size` and
                       `get_vector(word)`, the latter raising KeyError for unknown words.
    word_to_index -- dictionary mapping words to their indices in the vocabulary
                     (indices are expected to lie in 1..len(word_to_index)).

    Returns:
    embedding_layer -- Keras Embedding layer (trainable, mask_zero=True) whose weight
                       matrix holds the pre-trained vector of every known word and a
                       random vector for every word missing from word_to_vec_map.
    """

    # One extra row: index 0 is not assigned to any word and is masked by the layer.
    vocab_len = len(word_to_index) + 1
    # Take the dimensionality from the vectors that were actually passed in,
    # instead of probing a hard-coded word on a global object.
    emb_dim = word_to_vec_map.vector_size

    # Row 0 stays all-zero; every other row is filled below.
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Row `idx` receives the vector of the word whose vocabulary index is `idx`.
    for word, idx in word_to_index.items():
        try:
            emb_matrix[idx, :] = word_to_vec_map.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: start from a uniform random vector in [0, 1).
            emb_matrix[idx, :] = np.random.rand(emb_dim)

    # The layer is left trainable, so the vectors are fine-tuned during training.
    embedding_layer = tensorflow.keras.layers.Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=True, mask_zero=True)

    # The layer has to be built before its weights can be assigned.
    embedding_layer.build((None,))

    # Replace the freshly initialised weights with the embedding matrix.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [16]:
# Build an embedding layer from the GloVe vectors and the tokenizer vocabulary.
# NOTE(review): `embedding` is not referenced by the later visible cells (the model
# cell builds its own layer) - confirm whether this call is still needed.
embedding = pretrained_embedding_layer(word_vectors, tokenizer.word_index)

In [17]:
# The model consumes fixed-length sequences of max_sequence_len word indices.
input_shape = (max_sequence_len,)
print(max_sequence_len)
sentence_indices = Input(shape=input_shape, dtype='int32')

# Create the embedding layer initialised with the pretrained GloVe vectors
embedding_layer = pretrained_embedding_layer(word_vectors, tokenizer.word_index)

# Propagate sentence_indices through the embedding layer
embeddings = embedding_layer(sentence_indices) 

# Propagate the embeddings through an LSTM layer with 256-dimensional hidden state
# The returned output is a batch of sequences (one hidden state per time step).
X = LSTM(units=256, return_sequences=True)(embeddings)
# Add dropout with a probability of 0.5
X = Dropout(rate=0.5)(X)
# Propagate X through another LSTM layer with 256-dimensional hidden state
# The returned output is a single hidden state, not a batch of sequences.
X = LSTM(units=256, return_sequences=False)(X)
# Add dropout with a probability of 0.5
X = Dropout(rate=0.5)(X)
# Propagate X through a Dense layer with one unit per vocabulary index (total_words)
X = Dense(units=total_words)(X)
# Add a softmax activation to obtain a probability for every word index
X = Activation('softmax')(X)



# Create Model instance which converts sentence_indices into X.
model = Model(inputs=sentence_indices, outputs=X)
model.summary()

64
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 64, 100)           5012100   
_________________________________________________________________
lstm (LSTM)                  (None, 64, 256)           365568    
_________________________________________________________________
dropout (Dropout)            (None, 64, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 50121)             128

In [18]:
def sample(preds, temperature=1.0):
    """Draw one index from the probability array `preds`.

    The probabilities are re-weighted by `temperature` (log, divide, exponentiate,
    renormalise) before a single multinomial draw is made with NumPy's global
    random generator. Lower temperatures concentrate the draw on the most likely
    indices.

    Arguments:
    preds -- array-like of probabilities
    temperature -- re-weighting factor, 1.0 by default

    Returns:
    the sampled index
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)
    # One experiment with one trial yields a one-hot row; argmax recovers its index.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, _):
    """Keras epoch-end hook: print text generated by the current model.

    One random lyric line is chosen as the seed. For each sampling temperature
    ("diversity") the seed is extended by up to 40 words, one word per model
    prediction, and the words are streamed to stdout as they are produced.

    Arguments:
    epoch -- epoch index passed by Keras (only used in the printed banner)
    _ -- second callback argument passed by Keras (ignored)
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    text = flattened_list
    # The seed is a whole line, so any line of the corpus is a valid choice.
    start_index = random.randint(0, len(text) - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        # Every diversity starts again from the same seed line.
        sentence = text[start_index]
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(sentence)

        for _step in range(40):
            # Let the tokenizer map the running sentence to indices; pad_sequences
            # then left-pads/truncates it to the model's input width.
            token_ids = tokenizer.texts_to_sequences([sentence])
            x_pred = np.array(pad_sequences(token_ids, maxlen=max_sequence_len, padding='pre'))
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)

            # The softmax also covers index 0 (padding), which has no entry in
            # index_word; skip such a draw instead of aborting training with a KeyError.
            next_word = tokenizer.index_word.get(next_index)
            if next_word is None:
                continue
            next_char = ' ' + next_word

            sentence = sentence + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap the generation hook in a Keras callback so it runs at the end of every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [19]:
# The labels are integer word indices (not one-hot vectors), hence the sparse
# categorical loss and accuracy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])
# validation_split=0.33 sets aside 33% of X_train for validation; print_callback
# prints generated text after each epoch.
history = model.fit(X_train, y_train, epochs = 25, batch_size = 32, validation_split=0.33, shuffle=True,callbacks=[print_callback])

Train on 936108 samples, validate on 461069 samples
Epoch 1/25
 40224/936108 [>.............................] - ETA: 3:28:26 - loss: 7.6273 - sparse_categorical_accuracy: 0.0498
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "talk real good cause im smart and stuff"
talk real good cause im smart and stuff

KeyboardInterrupt: 

In [None]:
# Report loss and sparse categorical accuracy on the held-out test split.
model.evaluate(X_test, y_test, batch_size = 32)