In [3]:
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
import pandas as pd
import json
import requests
import numpy as np
import re
import time as time
import os
import json
import tensorflow as tf
from tensorflow import keras

In [4]:
# Seed both NumPy's and TensorFlow's global random generators with the same fixed value
# so that the random draws made later in the notebook start from a known state.
np.random.seed(4)
tf.random.set_seed(4)

# Web Scraping
We are going to gather our data while putting minimal pressure on the target website's servers. We will use www.rev.com, which has many transcripts available. I will mostly focus on the transcripts of Biden and Trump campaign speeches from before the 2020 election.

In [3]:
# Browser-like request headers, so the site serves the same pages a regular visitor would get.
# NOTE: 'br' (Brotli) is deliberately not advertised in accept-encoding. requests can only
# decode Brotli responses when an optional brotli package is installed; without it the server
# may still answer with Brotli and response.text would be undecoded bytes.
headers= {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9,fr;q=0.8,ro;q=0.7,ru;q=0.6,la;q=0.5,pt;q=0.4,de;q=0.3',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}

# More training data did not necessarily give better quality, so we use a smaller training set
# (~200,000 characters) and compensate by training for more epochs.
# Every transcript lives under the same path, so only the page slugs are listed here.
_TRANSCRIPT_BASE = 'https://www.rev.com/blog/transcripts/'
_TRANSCRIPT_SLUGS = (
    'joe-biden-election-day-remarks-transcript-scranton-pa',
    'joe-biden-kamala-harris-election-day-eve-rally-speech-transcript-november-2',
    'joe-biden-campaign-event-speech-transcript-pittsburgh-pa-november-2',
    'joe-biden-campaign-speech-beaver-county-pennsylvania-november-2',
    'joe-biden-drive-in-rally-speech-transcript-cleveland-november-2',
    'joe-biden-campaign-event-speech-transcript-philadelphia-november-1',
    'joe-biden-barack-obama-campaign-event-speech-transcript-flint-mi-october-31',
    'joe-biden-campaign-speech-transcript-milwaukee-wisconsin-october-30',
    'joe-biden-campaign-event-speech-transcript-st-paul-minnesota-october-30',
    'joe-biden-campaign-event-speech-transcript-des-moines-iowa-october-30',
    'joe-biden-campaign-rally-speech-transcript-tampa-fl-october-29',
    'joe-biden-drive-in-rally-speech-transcript-broward-county-fl-october-29',
    'joe-biden-campaign-speech-transcript-atlanta-georgia-october-27',
    'joe-biden-campaign-speech-transcript-warm-springs-ga-october-27',
    'joe-biden-north-carolina-speech-transcript-october-18',
    'joe-biden-detroit-speech-transcript-october-16',
    'joe-biden-campaign-speech-cincinnati-museum-center-transcript-october-12',
    'joe-biden-mobilization-campaign-event-miramar-florida-transcript-october-13',
    'joe-biden-las-vegas-speech-transcript-october-9',
    'joe-biden-community-center-visit-las-vegas-transcript-october-9',
    'joe-biden-kamala-harris-campaign-event-phoenix-az-transcript-october-8',
    'joe-biden-miami-campaign-speech-transcript-october-5',
    'joe-biden-campaign-speech-transcript-october-3',
    'joe-biden-train-tour-speech-transcript-cleveland-ohio-september-30',
    'joe-biden-train-tour-campaign-speech-transcript-alliance-ohio-september-30',
    'joe-biden-remarks-transcript-september-26-us-conference-of-mayors',
    'joe-biden-philadelphia-speech-transcript-sept-20-accuses-trump-republicans-of-abuse-of-power-over-scotus',
    'joe-biden-cnn-town-hall-fracking-has-to-continue-transcript-september-17',
    'joe-biden-pittsburgh-speech-transcript-august-31',
    'joe-biden-and-kamala-harris-speech-transcript-august-12-first-campaign-event-as-running-mates',
    'joe-biden-speech-transcript-warren-michigan-september-9',
)
url_list = [_TRANSCRIPT_BASE + slug for slug in _TRANSCRIPT_SLUGS]

### Our web-scraping function, which returns only the statements made by Joe Biden:

In [8]:
def get_speeches(url_list):
    """Download every transcript page and return Joe Biden's statements as one string.

    Before each request the function sleeps for a random interval, then fetches
    the page and reads the text of all ``<p>`` elements. A paragraph counts as
    Biden's when one of his speaker labels appears near its start; for those,
    the text after the paragraph's last newline is appended to the result,
    preceded by a newline and a space.

    Side effect: the global ``string_all`` is reset to an empty string and then
    extended with the text of every paragraph of every page that was fetched
    successfully.

    Args:
        url_list: Sized sequence of transcript page URLs (``len()`` is used for
            the progress report).

    Returns:
        str: All of Biden's statements concatenated; ``''`` when nothing matched
        or ``url_list`` is empty.

    Raises:
        requests.RequestException: If a request fails at the connection level
            or times out.
    """
    global string_all #A global variable containing all of the strings, for greater flexibility in the future.
    string_all = ''
    biden_parts = []  # Collected pieces; joined once at the end instead of repeated string +=
    total = len(url_list)

    for position, url in enumerate(url_list, start=1):
        # Be gentle with the server (and keep the caller's IP from being blocked):
        # wait a normally-distributed random time before every request.
        time.sleep(abs(np.random.normal(40, 10)))

        # A timeout keeps one unresponsive server from hanging the whole scrape.
        response = requests.get(str(url), headers=headers, timeout=60)
        if response.ok:
            soup = BeautifulSoup(response.text, 'html.parser')
            # The transcript is laid out as <p> blocks, one per statement.
            paragraphs = [tag.text for tag in soup.find_all('p')]
            # The speaker label comes first; the spoken words follow the last newline.
            biden_parts.extend('\n ' + text.rsplit('\n', 1)[-1]
                               for text in paragraphs if _is_biden_statement(text))
            string_all += ''.join(paragraphs)
        else:
            # An error page is not a transcript: keep it out of the training text,
            # but do not throw away the pages that were already downloaded.
            print('Skipping {} (HTTP {})'.format(url, response.status_code))

        # Report progress from the loop position, which stays correct even when
        # the same URL appears more than once in url_list.
        if position % 3 == 0:
            print('{}% completed'.format(round(100 * (position / total), 1)))
    print('Done.')
    return ''.join(biden_parts)


def _is_biden_statement(text):
    """Return True when a Biden speaker label appears near the start of ``text``."""
    return ('Joe Biden:' in text[:35]
            or 'Joseph R Biden:' in text[:30]
            or 'Joseph R. Biden:' in text[:35])

In [15]:
# Scrape every page in url_list. get_speeches sleeps a random interval (drawn around 40 s)
# before each request, so this cell takes a long time to finish.
string_Biden=get_speeches(url_list)

23.1% completed
46.2% completed
69.2% completed
92.3% completed
Done.


In [6]:
#We can also load in the text directly, which I saved from earlier
def load_Biden_str(path='text_Biden.txt'):
    """Load the saved Biden corpus into the global ``string_Biden`` and return it.

    The file is read as UTF-8, the same encoding ``write_Biden_str`` uses, so
    the round trip does not depend on the platform's default encoding. The
    global is only assigned after the file was read successfully, so a missing
    file no longer wipes text that is already in memory.

    Args:
        path: File to read. Defaults to ``'text_Biden.txt'``.

    Returns:
        str: The full contents of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    global string_Biden
    with open(path, 'r', encoding='utf-8') as file:
        string_Biden = file.read()
    return string_Biden


def write_Biden_str(path='text_Biden.txt', text=None):
    """Save the Biden corpus to disk as UTF-8.

    Args:
        path: File to write. Defaults to ``'text_Biden.txt'``.
        text: Text to save. When omitted, the global ``string_Biden`` is written.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    if text is None:
        text = string_Biden
    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)

## Training our RNN on the data
We create a Keras Tokenizer, which assigns a unique integer to every word seen in the data.

In [11]:
# Word-level tokenizer (char_level=False): fitting it on the corpus builds the
# word -> integer id vocabulary used by every later cell.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=False)
tokenizer.fit_on_texts([string_Biden])

In [12]:
# Sanity check: encode a short phrase into its word ids.
tokenizer.texts_to_sequences(["Example input"])

[[484, 4335]]

In [14]:
# ...and decode the same ids back into text (the recorded output comes back lower-cased).
tokenizer.sequences_to_texts([[484, 4335]])

['example input']

In [15]:
max_id = len(tokenizer.word_index) # number of distinct words (the tokenizer is word-level, not char-level)
dataset_size=sum(tokenizer.word_counts.values()) # total number of words in the corpus. Use for word-level
#dataset_size = tokenizer.document_count # total number of characters. Use for char-level

## Preprocessing Step
In order to train our model, we have to create pairs of word sequences. Each pair is cut from a window of 101 consecutive words: the input is the first 100 words of the window, and the target is the same window shifted one word ahead (also 100 words). At every position the model therefore has to predict the next word given the words it has seen so far, so the last prediction of each window is made with a full 100 words of context (more words for longer-term memory). We will shuffle the windows so that the training examples are approximately i.i.d. With n_steps = 100, the model is trained on a fairly long memory.

In [24]:
# texts_to_sequences returns one id list per input text; we pass a single text and unpack it.
# Subtracting 1 shifts every id down by one (presumably because Tokenizer ids start at 1, which
# makes them 0-based and fit tf.one_hot(depth=max_id) — confirm).
[encoded] = np.array(tokenizer.texts_to_sequences([string_Biden])) - 1
# 100 // 100 keeps 100% of the corpus for training; lower the first 100 to hold data out.
train_size = dataset_size * 100 // 100 #We will not be using the validation set for the time being.
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

n_steps = 100 
window_length = n_steps + 1 # target = input shifted 1 word ahead
# repeat() makes the stream endless (model.fit bounds it with steps_per_epoch);
# window() then yields overlapping windows of window_length ids, sliding one id at a time.
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

In [17]:
# window() produces nested datasets; batching each one at its full length and flattening
# turns every window into a single tensor of window_length ids.
dataset = dataset.flat_map(lambda window: window.batch(window_length))
batch_size = 32
# Shuffle with a 7600-element buffer, then group the windows into batches.
dataset = dataset.shuffle(7600).batch(batch_size)
# Inputs are all ids but the last; targets are all ids but the first (inputs shifted by one).
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [18]:
# One-hot encode only the inputs (depth = vocabulary size); the targets stay integer ids,
# which is what the sparse_categorical_crossentropy loss used at compile time expects.
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [19]:
dataset = dataset.prefetch(1) #Add prefetching: keep one batch ready ahead of the consumer

## Model training
We will use GRU layers instead of LSTM layers, since the GRU is simpler (and therefore less memory-intensive) and provides very similar accuracy. Dropout has been shown to improve performance in select cases (especially when using Monte Carlo dropout) and to greatly reduce overfitting. The output layer is a softmax, which produces an array of probabilities with one entry per word id. We will use the Adam optimizer, which uses momentum-like properties to speed up our gradient descent.

In [None]:
# Start of the wall-clock timing for building and training the model.
first_time=datetime.now()

# Two stacked GRU layers that both return their full output sequence, followed by a
# per-time-step softmax over the max_id word ids. Inputs are one-hot vectors of width
# max_id; the sequence length is left unspecified (None).
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,activation="softmax"))
])
# The "sparse" loss takes integer class ids as targets rather than one-hot vectors.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# The dataset repeats indefinitely, so steps_per_epoch defines what counts as one epoch.
history = model.fit(dataset, steps_per_epoch=train_size // batch_size,
                    epochs=10)

# Despite its name, time_end holds the elapsed duration (a timedelta), not a timestamp.
time_end=datetime.now()-first_time
print(time_end)

In [48]:
# Optional: uncomment the next line to save the trained model to disk (HDF5 file).
#model.save('BidenBot.h5')