Running this notebook on a GPU is highly recommended.

# Imports

In [1]:
import collections
import numpy as np
import codecs
import random
import sys
import os
import re

from keras.models import Sequential
from keras.models import Model
from keras.models import load_model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.layers import Dense, Input, Flatten
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding
from gensim.models import Word2Vec
import gensim as gs

Using TensorFlow backend.


# Clean Txt File

In [2]:
def clean_text(path):
    """Read a corpus file and normalise it for word-level modelling.

    Steps, in order:
      1. read the file (platform default encoding first, UTF-8 as fallback)
         and lower-case it;
      2. replace every run of four digits with the placeholder 'hedgehog' and
         every remaining digit run with 'warthog' (ordinal suffixes such as
         "2d" / "4th" are folded into the placeholder);
      3. replace every non-letter character with a space;
      4. rewrite selected abbreviations and surnames (see `replacer`).

    Step 4 only matches whole words/phrases: a key is not replaced when it is
    directly preceded or followed by another letter, so 'u s' no longer
    rewrites the middle of "thus saying".

    path: path of the text file to read.
    Returns the cleaned text as a single string.
    Prints the length of the raw (lower-cased) corpus.
    """
    try:
        # Context managers guarantee the handle is closed even if read() fails.
        with open(path) as handle:
            text = handle.read().lower()
    except UnicodeDecodeError:
        with codecs.open(path, encoding='utf-8') as handle:
            text = handle.read().lower()
    print('corpus length:', len(text))

    # remove numbers
    text = re.sub(r'[0-9]{4}', 'hedgehog', text)  # replace years with hedgehog
    text = re.sub(r'\d+', 'warthog', text)  # replace numbers with warthog
    text = text.replace('warthogd', 'warthog').replace('warthogth', 'warthog')
    # remove all punctuation and special characters
    text = re.sub("[^A-Za-z]", " ", text)

    # replace dictionary
    # NOTE(review): 'o conner' looks like a misspelling of "O'Connor" -- confirm
    # against the corpus before changing the key.
    replacer = {
        'u s': 'United_States', 's ct': 'Supreme_Court', ' v ': ' versus ',
        'ginsburg': 'Ginsburg', 'roberts': 'Roberts', 'kennedy': 'Kennedy',
        'thomas': 'Thomas', 'scalia': 'Scalia', 'breyer': 'Breyer', 'alito': 'Alito',
        'sotomayor': 'Sotomayor', 'kagan': 'Kagan', 'o conner': 'O\'Conner',
        'souter': 'Souter', 'stevens': 'Stevens', 'rehnquist': 'Rehnquist',
        'blackmun': 'Blackmun', 'powell': 'Powell', 'burger': 'Burger',
        'marshall': 'Marshall', 'brennan': 'Brennan',
    }
    for key, replacement in replacer.items():
        # Only guard the sides of the key that are letters; keys such as ' v '
        # already carry their own whitespace delimiters.
        lead = r'(?<![A-Za-z_])' if key[0].isalpha() else ''
        trail = r'(?![A-Za-z_])' if key[-1].isalpha() else ''
        pattern = lead + re.escape(key) + trail
        # A callable replacement keeps the text literal (no escape processing).
        text = re.sub(pattern, lambda _match, rep=replacement: rep, text)

    return text

In [7]:
# GRAB TXT DATA
# Load and normalise the training corpus. `text` is read as a global by
# train_generate, so this cell must run before training.
text = clean_text("ginsaff.txt")
# Alternative corpora (swap the filename above):
# ginsaff.txt for GinsBot on Affirmative Action
# scaliaff.txt for Scalianator on Affirmative Action
# ginstax.txt for GinsBot on Federal Tax

corpus length: 186483


# Word2Vec Word Embeddings

In [4]:
# File name of the binary word2vec vectors, resolved relative to the working directory.
pretrained = 'GoogleNews-vectors-negative300.bin'
# Loaded once and used as a global lookup table by get_embeds, clean_seed and text_generator.
pretrained_embeds = gs.models.KeyedVectors.load_word2vec_format(pretrained, binary=True)

## Text to Vector Dictionary

In [5]:
def get_embeds(list_text, embeddings=None):
    """Map each distinct in-vocabulary word of `list_text` to its vector.

    list_text:  iterable of word strings.
    embeddings: object supporting `embeddings[word]` and raising KeyError for
                unknown words; defaults to the global `pretrained_embeds`.

    Returns an OrderedDict {word: vector}, keyed in order of first appearance.
    Words missing from the embedding vocabulary are skipped.
    Prints the type and size of the resulting dictionary.
    """
    if embeddings is None:
        embeddings = pretrained_embeds

    words2v = collections.OrderedDict()
    for word in list_text:
        if word in words2v:
            # Already resolved: avoid repeating the lookup for frequent words.
            continue
        try:
            words2v[word] = embeddings[word]
        except KeyError:
            # Out-of-vocabulary word: leave it out. Any other error propagates.
            continue
    print("word_vectors", type(words2v), "length:", len(words2v))
    return words2v

# Process text

In [None]:
maxlen = 15 # length of a 'sentence' (words per input window; also the model's input length)
step = 5 # stride, in words, between the starts of consecutive training windows

In [None]:
def vectorize_xy(text, maxlen, step, word_vectors=None):
    """Turn a cleaned corpus into (window, next-word) training arrays.

    The corpus is split on whitespace, words without an embedding are dropped,
    and a window of `maxlen` words is slid over the remainder with stride
    `step`. Each window becomes one row of X and the word immediately after it
    becomes the matching row of y.

    text:         corpus string.
    maxlen:       number of words per input window.
    step:         stride between window starts.
    word_vectors: optional mapping {word: vector}; when omitted it is built
                  with get_embeds(). Passing it in lets callers reuse a table
                  they have already computed.

    Returns (X, y) as numpy arrays: X stacks the per-window lists of word
    vectors, y stacks the vectors of the following words.
    Prints progress and shape information.
    """
    tokens = text.split()
    if word_vectors is None:
        word_vectors = get_embeds(tokens)

    # Keep only words that have an embedding, preserving corpus order.
    list_words = [word for word in tokens if word in word_vectors]

    print('number of vectorized words in text:', len(list_words))
    print('number of unique words left in text:', len(set(list_words)))
    print("maxlen:",maxlen,"step:", step)

    # Stop early enough that every window still has a following word.
    starts = range(0, len(list_words) - maxlen, step)
    sentences = [list_words[i:i + maxlen] for i in starts]
    next_words = [list_words[i + maxlen] for i in starts]

    print('length of sentence list:', len(sentences))
    print("length of next_word list", len(next_words))

    print('Vectorization...')
    X = np.asarray([[word_vectors[word] for word in sentence]
                    for sentence in sentences])
    y = np.asarray([word_vectors[word] for word in next_words])

    print('X: ', X.shape)
    print('y: ', y.shape)
    return X, y

# Build Model

In [None]:
def start_model(model_name):
    """Build, compile and save a fresh (untrained) next-word model.

    Architecture: four stacked LSTM layers with dropout, followed by a dense
    layer that emits one vector of the same width as the word embeddings.
    The input shape is (maxlen, 300), using the global `maxlen`.

    The targets produced by vectorize_xy are raw word-embedding vectors
    (real-valued, including negative components), so this is a regression
    problem: the output layer is linear and the loss is mean squared error.
    A softmax output with categorical cross-entropy assumes the targets are
    probability distributions, which they are not.

    Models already saved to disk are unaffected; load_model restores whatever
    configuration was stored with them.

    model_name: path the model is saved to (converted with str()).
    Returns the compiled model (the original returned None).
    """
    #build the model: 4 stacked LSTM
    print('Building model...')

    embed_dim = 300  # length of the word vectors

    model = Sequential()
    model.add(LSTM(300, return_sequences=True, input_shape=(maxlen, embed_dim)))
    model.add(Dropout(0.1))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.1))
    model.add(LSTM(512, return_sequences=True))
    model.add(Dropout(0.1))
    model.add(LSTM(200, return_sequences=False))
    model.add(Dropout(0.1))
    model.add(Dense(embed_dim))
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer='rmsprop')

    model.save(str(model_name))
    return model

In [14]:
# Create the untrained model file only when it is missing: train_generate
# loads 'gins_affac.h5', so a fresh "Restart & Run All" needs it to exist,
# while an existing checkpoint must not be overwritten.
if not os.path.exists('gins_affac.h5'):
    start_model('gins_affac.h5')

Building model...


# Train Model

## Functions

In [15]:
def clean_seed(seed_sentence, maxlen, embeddings=None):
    """Normalise a seed sentence into at most `maxlen` in-vocabulary words.

    Non-letter characters are replaced by spaces, the sentence is lower-cased
    and split on whitespace, and words without an embedding are dropped.

    seed_sentence: raw seed text.
    maxlen:        maximum number of words to return.
    embeddings:    object supporting `embeddings[word]` and raising KeyError
                   for unknown words; defaults to the global
                   `pretrained_embeds`.

    Returns a list with the first `maxlen` surviving words.
    """
    if embeddings is None:
        embeddings = pretrained_embeds

    letters_only = re.sub("[^A-Za-z]"," ", seed_sentence).lower()
    clean_sentence = []
    for word in letters_only.split():
        try:
            embeddings[word]  # lookup only; the vector itself is not needed
        except KeyError:
            continue
        clean_sentence.append(word)
    return clean_sentence[0:maxlen] # returns list

In [16]:
def text_generator(seeds, model, para_length, seq_len=None, embeddings=None):
    """Generate `para_length` words by repeatedly predicting the next word.

    At every step the current window of words is converted to vectors, the
    model predicts an output vector, and the vocabulary word most similar to
    that vector is emitted. The window then drops its first word and gains
    the new one. Each generated word is echoed to stdout as it is produced.

    seeds:       list of seed words. It is copied, so the caller's list is
                 left unchanged.
    model:       object with `predict(x, verbose=0)` returning a batch of
                 output vectors.
    para_length: number of words to generate.
    seq_len:     maximum number of word vectors fed to the model per step;
                 defaults to the global `maxlen`.
    embeddings:  object supporting `embeddings[word]` (KeyError for unknown
                 words) and `most_similar(positive=..., topn=...)`; defaults
                 to the global `pretrained_embeds`.

    Returns a flat list alternating ' ' and each generated word, suitable for
    ''.join().
    Raises ValueError if no word in the current window has an embedding.
    """
    if seq_len is None:
        seq_len = maxlen
    if embeddings is None:
        embeddings = pretrained_embeds

    window = list(seeds)  # private copy: do not mutate the caller's list
    gentext = []
    for _ in range(para_length):  # length of paragraph
        vectors = []
        for word in window:
            try:
                vectors.append(embeddings[word])
            except KeyError:
                continue  # skip words without an embedding
        if not vectors:
            raise ValueError('seed contains no words with a known embedding')

        # Batch of one sequence, truncated to the model's input length.
        x = np.asarray([vectors[:seq_len]])

        preds = model.predict(x, verbose=0)[0]
        next_word = embeddings.most_similar(positive=[preds], topn=1)[0][0]

        # Slide the window forward by one word.
        window = window[1:] + [next_word]

        gentext.append(' ')
        gentext.append(next_word)

        sys.stdout.write(' ')
        sys.stdout.write(next_word)
        sys.stdout.flush()
    print()
    return gentext

In [26]:
def train_generate(seed_sentence, iterations, epochs,
                   model_path="gins_affac.h5",
                   output_path="ginsbot_affaction.txt",
                   para_length=200):
    """Alternate between training the saved model and sampling text from it.

    The model stored at `model_path` is loaded once. For each iteration it is
    trained for `epochs` epochs on the global corpus `text` (windowed with the
    global `maxlen` and `step`), then `para_length` words are generated from
    the seed sentence and appended to `output_path`. A ModelCheckpoint
    callback writes the model back to `model_path` whenever the training loss
    improves.

    seed_sentence: raw seed text; cleaned with clean_seed() before generation.
    iterations:    number of train-then-generate rounds.
    epochs:        training epochs per round.
    model_path:    model file to load and to checkpoint to.
    output_path:   text file that generated paragraphs are appended to.
    para_length:   number of words generated per round.
    """
    checkpoint = ModelCheckpoint(model_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    loaded_model = load_model(model_path)

    # The training arrays depend only on the corpus and the window settings,
    # so build them once instead of once per iteration.
    X, y = vectorize_xy(text, maxlen, step)

    for iteration in range(1, iterations+1):
        print()
        print('-' * 50)
        print('Iteration', iteration)

        # `epochs` is the Keras 2 name of this argument (formerly `nb_epoch`).
        loaded_model.fit(X, y, batch_size=128, epochs=epochs, callbacks=callbacks_list)

        seeds = clean_seed(seed_sentence, maxlen)
        print('----- Generating with seed: "' , seed_sentence , '"')
        print()

        gentext = text_generator(seeds, loaded_model, para_length)

        generation = ''.join(gentext)
        with open(output_path, "a") as myfile:
            myfile.write('iteration = ' + str(iteration) + '/' + str(iterations)
                         + ' epoch = ' + str(epochs)
                         + '\n' + generation + '\n' + '\n' + '\n')

## Train & Generate

In [27]:
# Seed text for generation. train_generate passes it through clean_seed, which
# replaces non-letters (including these line breaks) with spaces and keeps at
# most `maxlen` in-vocabulary words.
seed_sentence = '''use of race discrimination in university 
admissions policy is lawful to achieve critical mass 
student body diversity'''

In [1]:
# 50 train-then-generate iterations of 100 epochs each; the text generated after
# every iteration is appended to ginsbot_affaction.txt.
train_generate(seed_sentence, 50, 100)