In [1]:
import glob 
import os
import numpy as np

In [2]:
speeches_dir = 'data/speeches/'

In [3]:
from gensim.parsing.preprocessing import preprocess_string, \
                                         strip_non_alphanum, strip_tags

def clean(text):
    text = strip_tags(text)
    text = strip_non_alphanum(text)
    return text

def load_speeches(category, filename='*.txt'):
    """
       :param category: What type of speeches to load
                        - women or comedians
       :param filename: The filename pattern
    """
    category_dir = os.path.join(speeches_dir,category)
    for filename in glob.glob(os.path.join(category_dir, filename)):
        with open(filename, encoding='latin-1') as f:
            yield filename, clean(f.read())

In [4]:
womens_speeches = load_speeches('women')
comedian_speeches = load_speeches('comedians')

In [5]:
from nltk.tokenize import word_tokenize


def load_corpus(speeches):
    corpus = []
    for filename, speech in speeches:
        print(f'Loading speech {filename}')
        tokens = word_tokenize(speech)
        corpus = corpus + tokens
    return corpus

In [6]:
corpus = load_corpus(womens_speeches)

Loading speech data/speeches/women/SpeechToTheTroopsAtTillsbury-ElizabethI.txt
Loading speech data/speeches/women/PulseOfTheMorning-MayaAngelou.txt
Loading speech data/speeches/women/AintIAWoman-SojournerTruth.txt
Loading speech data/speeches/women/RoomOfOnesOwn-VirginiaWoolf.txt
Loading speech data/speeches/women/FreedomFromFear-AungSuuKyi.txt
Loading speech data/speeches/women/MisogynySpeech-JuliaGillard.txt
Loading speech data/speeches/women/FreedomOrDeath-EmmelinePankhurst.txt
Loading speech data/speeches/women/WellesleyCommencement-NoraEphron.txt


In [9]:
print(len(corpus))
print(corpus[:200]) #first 200 words of the corpus

49127
['My', 'loving', 'people', 'We', 'have', 'been', 'persuaded', 'by', 'some', 'that', 'are', 'careful', 'of', 'our', 'safety', 'to', 'take', 'heed', 'how', 'we', 'commit', 'our', 'selves', 'to', 'armed', 'multitudes', 'for', 'fear', 'of', 'treachery', 'but', 'I', 'assure', 'you', 'I', 'do', 'not', 'desire', 'to', 'live', 'to', 'distrust', 'my', 'faithful', 'and', 'loving', 'people', 'Let', 'tyrants', 'fear', 'I', 'have', 'always', 'so', 'behaved', 'myself', 'that', 'under', 'God', 'I', 'have', 'placed', 'my', 'chiefest', 'strength', 'and', 'safeguard', 'in', 'the', 'loyal', 'hearts', 'and', 'good', 'will', 'of', 'my', 'subjects', 'and', 'therefore', 'I', 'am', 'come', 'amongst', 'you', 'as', 'you', 'see', 'at', 'this', 'time', 'not', 'for', 'my', 'recreation', 'and', 'disport', 'but', 'being', 'resolved', 'in', 'the', 'midst', 'and', 'heat', 'of', 'the', 'battle', 'to', 'live', 'and', 'die', 'amongst', 'you', 'all', 'to', 'lay', 'down', 'for', 'my', 'God', 'and', 'for', 'my', 'king

In [10]:
def make_pairs(corpus):
    for i in range(len(corpus)-1):
        yield (corpus[i], corpus[i+1])
          
def load_word_dict(corpus):
    pairs = make_pairs(corpus)
    word_dict = {}
    for word_1, word_2 in pairs:
        if word_1 in word_dict.keys():
            word_dict[word_1].append(word_2)
        else:
            word_dict[word_1] = [word_2]
    return word_dict
            
def load_markov_dict(category, filename='*.txt'):
    speeches = load_speeches(category, filename)
    corpus = load_corpus(speeches)
    return load_word_dict(corpus)

In [11]:
womens_speeches_word_dict = load_markov_dict('women')

Loading speech data/speeches/women/SpeechToTheTroopsAtTillsbury-ElizabethI.txt
Loading speech data/speeches/women/PulseOfTheMorning-MayaAngelou.txt
Loading speech data/speeches/women/AintIAWoman-SojournerTruth.txt
Loading speech data/speeches/women/RoomOfOnesOwn-VirginiaWoolf.txt
Loading speech data/speeches/women/FreedomFromFear-AungSuuKyi.txt
Loading speech data/speeches/women/MisogynySpeech-JuliaGillard.txt
Loading speech data/speeches/women/FreedomOrDeath-EmmelinePankhurst.txt
Loading speech data/speeches/women/WellesleyCommencement-NoraEphron.txt


In [13]:
womens_speeches_word_dict['the']

['loyal',
 'midst',
 'battle',
 'dust',
 'body',
 'heart',
 'borders',
 'field',
 'word',
 'mean',
 'camp',
 'field',
 'mastodon',
 'gloom',
 'Rock',
 'wall',
 'world',
 'songs',
 'Tree',
 'stone',
 'wise',
 'Asian',
 'Hispanic',
 'Jew',
 'Sioux',
 'Muslim',
 'French',
 'Greek',
 'Rabbi',
 'Priest',
 'Sheikh',
 'Straight',
 'Preacher',
 'homeless',
 'Teacher',
 'Tree',
 'first',
 'River',
 'River',
 'employment',
 'Turk',
 'Swede',
 'German',
 'Scot',
 'Ashanti',
 'Yoruba',
 'Kru',
 'Tree',
 'River',
 'Rock',
 'River',
 'Tree',
 'dream',
 'palms',
 'shape',
 'pulse',
 'courage',
 'Rock',
 'River',
 'Tree',
 'mendicant',
 'mastodon',
 'pulse',
 'grace',
 'negroes',
 'South',
 'women',
 'North',
 'white',
 'best',
 'lash',
 'head',
 'first',
 'world',
 'men',
 'banks',
 'words',
 'BrontÃ',
 'words',
 'fiction',
 'fiction',
 'subject',
 'most',
 'first',
 'pages',
 'mantelpiece',
 'great',
 'true',
 'true',
 'duty',
 'room',
 'money',
 'train',
 'ideas',
 'prejudices',
 '4',
 'chance',
 '

In word dict dictionary, for every word as a key, it stores all the words the follows it in the value

In [14]:
comedians_word_dict = load_markov_dict('comedians')

Loading speech data/speeches/comedians/HardvardLawSchool-MindyKaling.txt
Loading speech data/speeches/comedians/UniversityOfVirginia-StephenColbert.txt
Loading speech data/speeches/comedians/TulaneGraduation-MayaRudolph.txt
Loading speech data/speeches/comedians/HarvardU-WillFerrell.txt
Loading speech data/speeches/comedians/WilliamAndMary-JonStewart.txt
Loading speech data/speeches/comedians/Harvard-AmyPoehler.txt
Loading speech data/speeches/comedians/Dartmouth-Conan.txt


In [15]:
def get_sentence(word_dict, n_words=15):
    first_word = np.random.choice(list(word_dict.keys()))
    while first_word.islower():
        first_word = np.random.choice(corpus)
    chain = [first_word]
    for i in range(n_words):
        chain.append(np.random.choice(word_dict[chain[-1]]))
    return ' '.join(chain)

In [16]:
get_sentence(womens_speeches_word_dict)

'Then she writes will choose they opened a muddy market and Mouse Act The Gay by'

In [17]:
get_sentence(womens_speeches_word_dict)

'Nobody could work expressed you actually wrote novels that my list I won your libraries and'

In [18]:
get_sentence(comedians_word_dict)

'He was really talking about to graduate from Harvard The group of Mexico Ernesto Zedillo Ernieâ'

In [20]:
get_sentence(comedians_word_dict)

'Shield But this Don t get childish applause thank you donâ t know him in which'