In [6]:
# Read in the corpus, including punctuation!
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code, so only load a
# corpus.pkl that you created yourself or otherwise trust.
data = pd.read_pickle('corpus.pkl')
# Echo the loaded frame as the cell output: one row per transcript file,
# with `transcript` and `full_name` columns.
data

Unnamed: 0,transcript,full_name
BattleCreekDec19_2019.txt,Thank you. Thank you. Thank you to Vice Presid...,BattleCreekDec19_2019
BemidjiSep18_2020.txt,There's a lot of people. That's great. Thank y...,BemidjiSep18_2020
CharlestonFeb28_2020.txt,Thank you. Thank you. Thank you. All I can say...,CharlestonFeb28_2020
CharlotteMar2_2020.txt,"I want to thank you very much. North Carolina,...",CharlotteMar2_2020
CincinnatiAug1_2019.txt,Thank you all. Thank you very much. Thank you ...,CincinnatiAug1_2019
ColoradorSpringsFeb20_2020.txt,"Hello Colorado. We love Colorado, most beautif...",ColoradorSpringsFeb20_2020
DallasOct17_2019.txt,Thank you. Thank you very much. Hello Dallas. ...,DallasOct17_2019
DesMoinesJan30_2020.txt,I worked so hard for this state. I worked so h...,DesMoinesJan30_2020.txt
FayettevilleSep19_2020.txt,"What a crowd, what a crowd. Get those people o...",FayettevilleSep19_2020
FayettevilleSep9_2019.txt,Thank you everybody. Thank you and Vice Presi...,FayettevilleSep9_2019


In [7]:
# Pull the transcript of a single speech (rows are labelled by file name)
# and preview its first 200 characters.
test_text = data.loc['TexasSep23_2019.txt', 'transcript']
test_text[0:200]

"Hello, Houston. I am so thrilled to be here in the great state of Texas with one of America's greatest, most devoted and most loyal friends, Prime Minister Modi of India. Thank you. Thank you. And Pri"

In [8]:
from collections import defaultdict

def markov_chain(text):
    '''Build a first-order, word-level Markov chain from a string of text.

    Parameters
    ----------
    text : str
        The raw text. Punctuation is left attached to the words.

    Returns
    -------
    dict
        Maps each word to the list of words that immediately follow it in
        the text, in order of occurrence and with repeats kept, so a uniform
        random pick from a list reflects how often each follower was seen.
        A word that only ever appears as the very last token has no key.
    '''

    # Tokenize on any run of whitespace. Splitting on a single ' ' would emit
    # empty-string "words" for repeated/leading/trailing spaces and would keep
    # newlines and tabs glued inside tokens.
    words = text.split()

    # Collect followers for every adjacent (current, next) pair; zip stops at
    # the shorter sequence, so the final word is never used as a "current".
    m_dict = defaultdict(list)
    for current_word, next_word in zip(words, words[1:]):
        m_dict[current_word].append(next_word)

    # Hand back a plain dict so missing keys raise instead of being created
    return dict(m_dict)

In [9]:
# Build the word -> list-of-following-words dictionary for the single
# Texas speech held in test_text
test_dict = markov_chain(test_text)
# Echo the whole dictionary as the cell output
test_dict

{'Hello,': ['Houston.'],
 'Houston.': ['I'],
 'I': ['am',
  'understand',
  'know',
  'want',
  'just',
  'was',
  'know',
  'have',
  'want',
  'want',
  'promised',
  'can',
  'can',
  'look',
  'want',
  'invited,',
  'may',
  'may',
  'want',
  'will',
  'want',
  'want',
  'want',
  'would'],
 'am': ['so'],
 'so': ['thrilled'],
 'thrilled': ['to', 'to'],
 'to': ['be',
  'be',
  'be',
  'the',
  'take',
  'address',
  'Houston',
  'celebrate',
  'express',
  'the',
  'be',
  'have',
  'know',
  'recognize',
  'India',
  'report',
  'democracy.',
  'our',
  'justice,',
  'the',
  'our',
  'be',
  "America's",
  'this',
  'strengthen',
  'save',
  'thousands',
  'working',
  'make',
  'hire,',
  '$500',
  'revitalize',
  'thank',
  'say',
  'expand',
  'India,',
  'ensuring',
  'the',
  'another',
  'watch',
  'expanding',
  'hear',
  'purchase',
  '5',
  'billions',
  'India',
  'India',
  'soar',
  'India',
  'concluding',
  'enhance',
  'safeguard',
  'protecting',
  'keep',
  'im

In [10]:
import random

def generate_sentence(chain, count=15):
    '''Generate a random sentence by walking a word-level Markov chain.

    Parameters
    ----------
    chain : dict
        Dictionary in the format key = current word, value = list of next
        words.
    count : int, optional
        Number of words wanted in the sentence (default 15). The sentence is
        shorter if the walk reaches a word that has no recorded next words.

    Returns
    -------
    str
        The generated words joined by single spaces, with the first
        character upper-cased and a period appended.

    Raises
    ------
    IndexError
        If ``chain`` is empty, since there is no word to start from.
    '''

    # Start from a uniformly random key of the chain
    word1 = random.choice(list(chain))

    # Upper-case only the first character; str.capitalize() would also
    # lower-case the rest of the word (e.g. "USA" -> "Usa").
    sentence = word1[:1].upper() + word1[1:]

    # Repeatedly pick a follower of the current word and make it the new
    # current word.
    for _ in range(count - 1):
        next_words = chain.get(word1)
        if not next_words:
            # Dead end: this word has no followers (for instance the final
            # word of the source text), so stop instead of raising KeyError.
            break
        word1 = random.choice(next_words)
        sentence += ' ' + word1

    # End it with a period
    sentence += '.'
    return sentence

In [19]:
# Generate one sentence from the Texas speech chain with the default count=15.
# The result is random, and no seed is set in the visible cells, so expect a
# different sentence on each run.
generate_sentence(test_dict)

"On a moment to millions of America's greatest, most loyal friends, Prime Minister Modi, who's."