# Markov Chains from Scratch
Reference: http://www.johnwittenauer.net/markov-chains-from-scratch/

In [10]:
import urllib2
import random

# Download the corpus once and keep every raw line in memory. urlopen hands back
# a one-shot file-like stream: iterating it directly leaves it exhausted, so a
# second run of the parsing cell would silently build an empty word list.
SPEECHES_URL = 'https://raw.githubusercontent.com/ryanmcdermott/trump-speeches/master/speeches.txt'

response = urllib2.urlopen(SPEECHES_URL)
try:
    # list of undecoded lines, each still carrying its line ending
    text = response.readlines()
finally:
    # release the connection even when the download fails part-way
    response.close()

In [11]:
# create ordered word list from speeches
words = []

for line in text:
    # decode as UTF-8 (dropping a leading BOM), then discard every non-ASCII character
    line = line.decode('utf-8-sig', errors='ignore').encode('ascii', errors='ignore')
    # line endings become spaces, full stops are removed, everything is lower-cased
    line = line.replace('\r', ' ').replace('\n', ' ').replace('.', '').lower()
    # splitting on a single space yields '' for runs of spaces; skip those.
    # extend() grows the list in place instead of copying it for every line.
    words.extend(word for word in line.split(' ') if word)

print('corpus size: {0} words'.format(len(words)))

corpus size: 166222 words


In [12]:
# peek at the first ten tokens to sanity-check the cleaning step
words[:10]

['speech', '1', 'thank', 'you', 'so', 'much', "that's", 'so', 'nice', "isn't"]

In [13]:
# first order chain: maps each word to the list of words seen directly after it
# (repeats are kept, so a uniform draw from the list follows observed frequency)
chain = {}
n_words = len(words)

# visit every position that has a successor; the final word never becomes a key
# unless it also occurs earlier in the corpus
for position in range(n_words - 1):
    current, following = words[position], words[position + 1]
    chain.setdefault(current, []).append(following)

print('Chain size: {0} distinct words.'.format(len(chain)))

Chain size: 9094 distinct words.


In [25]:
# create first order markov chain tweets
w1 = random.choice(words)
tweet = w1

# keep appending until the text reaches 140 characters; the length is checked
# before each append, so the finished tweet usually ends a little past 140
while len(tweet) < 140:
    # a word with no recorded successor (e.g. one seen only at the very end of
    # the corpus) is a dead end: stop there instead of raising KeyError
    followers = chain.get(w1)
    if not followers:
        break
    w2 = random.choice(followers)
    tweet += ' ' + w2
    w1 = w2

tweet

'you look at what i came out and gentlemen, thank you know very nice or eight or punches you can by 10 feet then all over and storming and all'

In [26]:
# second order chain: maps each (word, next word) pair to the list of words seen
# directly after that pair.
# NOTE: this rebinds `chain`, replacing the first order table built earlier, so
# the first order tweet cell needs its own chain cell re-run before it is used again.
chain = {}
n_words = len(words)

# only positions with two words after them contribute a (pair -> successor) entry
for start in range(n_words - 2):
    pair = (words[start], words[start + 1])
    chain.setdefault(pair, []).append(words[start + 2])

print('Chain size: {0} distinct word pairs.'.format(len(chain)))

Chain size: 62358 distinct word pairs.


In [27]:
# successors recorded for the pair ('its', 'so'); repeats are kept in the list,
# so random.choice picks each follower in proportion to how often it occurred
chain[('its','so')]

['nice',
 'great',
 'great',
 'easy',
 'bucolic,',
 'beautiful',
 'big',
 'preposterous',
 'important',
 'important',
 'simple',
 'simple',
 'horrible',
 'out',
 'terrible',
 'wrong',
 'important',
 'sad',
 'sad',
 'much',
 'sad',
 'important',
 'unfair',
 'important',
 'bad,',
 'can',
 'easy',
 'embarrassing',
 'important,',
 'important',
 'important',
 'astronomical',
 'incredible',
 'complicated',
 'easy',
 'little',
 'important',
 'simple',
 'important']

In [29]:
def markov_tweet(chain, words, min_length=140):
    """Generate a random tweet-sized string from a second order Markov chain.

    Args:
        chain: dict mapping a (word, next_word) tuple to the list of words
            observed right after that pair.
        words: ordered list of corpus words; used only to pick the starting pair.
        min_length: generation stops once the text is at least this many
            characters long, so the result usually overshoots it slightly.

    Returns:
        The generated words joined by single spaces. It can be shorter than
        min_length when the walk reaches a pair with no recorded successor.

    Raises:
        ValueError: if words holds fewer than two entries.
    """
    if len(words) < 2:
        raise ValueError('markov_tweet needs at least two words to form a starting pair')

    # random.randint includes both bounds. Stopping at len(words) - 3 keeps
    # words[r + 1] in range and, when possible, leaves a successor at r + 2 so
    # the starting pair is one that was recorded as a chain key.
    last_start = max(len(words) - 3, 0)
    r = random.randint(0, last_start)
    key = (words[r], words[r + 1])
    tweet = key[0] + ' ' + key[1]

    while len(tweet) < min_length:
        # the closing pair of the corpus may have no successors: end the tweet
        # early instead of raising KeyError
        followers = chain.get(key)
        if not followers:
            break
        w = random.choice(followers)
        tweet += ' ' + w
        key = (key[1], w)

    return tweet

# smoke test: generate one tweet from the second order chain and the corpus word list
markov_tweet(chain, words)

'we lose with trade when you read the art of the money but a tremendous waste of time, because as a result of trade imbalance with japan the $50'

In [30]:
markov_tweet(chain, words)

'did something that if youre there illegally and if i answer that question, wed have millions of dollars overseas theyre leaving our country needs'

In [31]:
markov_tweet(chain, words)

'is most peaceful and most dangerous they can buy them why are they going to go through the process complicated" and some phenomenal results you'

In [32]:
markov_tweet(chain, words)

'not domestic, production yet today, 240 years after the first sentence in the world was sort of like a fool now, what were doing actually great'

In [33]:
markov_tweet(chain, words)

'and prosperous, prosperous again together, we will win and ill tell you so, the wall one as sure as youre sitting there and theyre looking forward'