# EECS 126 MCMC Project - Generating Trump Tweets

In [2]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import tweepy

# Ensure that we can see full tweets
pd.set_option('max_colwidth', 280)

# 1) Building Tweet Dataset

### Testing API Access

In [4]:
# Loading previously generated API authentication keys
import json
key_file = 'keys.json'

with open(key_file) as f:
    keys = json.load(f)

In [5]:
from tweepy import TweepError
import logging

try:
    auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
    auth.set_access_token(keys["access_token"], keys["access_token_secret"])
    api = tweepy.API(auth)
    print("Your username is:", api.auth.get_username())
except TweepError as e:
    logging.warning("Tweepy error")
    logging.warning(e)

Your username is: gorblamski


In [6]:
from pathlib import Path

ds_tweets_save_path = "BerkeleyData_recent_tweets.json"

# Guarding against attempts to download the data multiple times:
if not Path(ds_tweets_save_path).is_file():
    example_tweets = [t._json for t in tweepy.Cursor(api.user_timeline, id="BerkeleyData", 
                                             tweet_mode='extended').items()]
    
    # Saving the tweets to a json file on disk for future analysis
    with open(ds_tweets_save_path, "w") as f:        
        json.dump(example_tweets, f)

# Re-loading the json file:
with open(ds_tweets_save_path, "r") as f:
    example_tweets = json.load(f)

### Functions for obtaining tweets

In [8]:
# Loads Twitter authentication keys
def load_keys(path):
    key_file = 'keys.json'
    with open(key_file) as f:
        keys = json.load(f)
    return keys

In [9]:
# Downloads tweets by one Twitter user.
def download_recent_tweets_by_user(user_account_name, keys):
    import tweepy
    auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
    auth.set_access_token(keys["access_token"], keys["access_token_secret"])
    api = tweepy.API(auth)
    return [t._json for t in tweepy.Cursor(api.user_timeline, id=user_account_name, 
                                             tweet_mode='extended').items()]

In [10]:
# Saves a list of tweets to a file
def save_tweets(tweets, path):
    with open(path, "w") as f:        
        json.dump(tweets, f)

In [11]:
# Loads tweets that have previously been saved.
def load_tweets(path):
    with open(path, "r") as f:
        return json.load(f)

In [12]:
# Get recent tweets from one user, loading from cache if possible
def get_tweets_with_cache(user_account_name, keys_path):
    keys = load_keys(keys_path)
    if not Path(user_account_name + "tweets.json").is_file():
        tweet_list = download_recent_tweets_by_user(user_account_name, keys)
        save_tweets(tweet_list, user_account_name + "tweets.json")
    else:
        tweet_list = load_tweets(user_account_name + "tweets.json")
    return tweet_list

### Scraping trump tweets

In [13]:
trump_tweets = get_tweets_with_cache("realdonaldtrump", key_file)
print("Number of tweets downloaded:", len(trump_tweets))

Number of tweets downloaded: 3191


### Merging scraped tweets with existing Trump tweet dataset

In [15]:
from utils import fetch_and_cache
data_url = 'http://www.ds100.org/fa18/assets/datasets/old_trump_tweets.json.zip'
file_name = 'old_trump_tweets.json.zip'

dest_path = fetch_and_cache(data_url=data_url, file=file_name)
print(f'Located at {dest_path}')

Using version already downloaded: Sat Oct 27 02:13:31 2018
MD5 hash of file: b6e33874de91d1a40207cdf9f9b51a09
Located at data/old_trump_tweets.json.zip


In [16]:
my_zip = zipfile.ZipFile(dest_path, 'r')
with my_zip.open("old_trump_tweets.json", "r") as f:
    old_trump_tweets = json.load(f)

In [18]:
old_trump_tweets[0].keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'extended_entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'lang'])

In [19]:
all_tweets = []
new_trump_tweet_ids = set([t['id'] for t in trump_tweets])
for old_tweet in old_trump_tweets:
    if old_tweet['id'] not in new_trump_tweet_ids:
        all_tweets.append(old_tweet)

all_tweets += trump_tweets

In [20]:
assert len(all_tweets) > len(trump_tweets)
assert len(all_tweets) > len(old_trump_tweets)

### Creating pandas dataframe of tweets

In [21]:
txt = [tweet['text'] if 'text' in tweet.keys() else tweet['full_text'] for tweet in all_tweets]

In [22]:
trump = pd.DataFrame(all_tweets)[['created_at', 'id', 'source', 'retweet_count']].set_index('id', inplace=False)

In [23]:
trump['text'] = pd.Series(data=txt, index=trump.index)

In [24]:
trump.sort_index(inplace=True)

In [25]:
trump.head()

Unnamed: 0_level_0,created_at,source,retweet_count,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
690171032150237184,Thu Jan 21 13:56:11 +0000 2016,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",1059,"""@bigop1: @realDonaldTrump @SarahPalinUSA https://t.co/3kYQGqeVyD"""
690171403388104704,Thu Jan 21 13:57:39 +0000 2016,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",1339,"""@AmericanAsPie: @glennbeck @SarahPalinUSA Remember when Glenn gave out gifts to ILLEGAL ALIENS at crossing the border? Me too!"""
690173226341691392,Thu Jan 21 14:04:54 +0000 2016,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",2006,So sad that @CNN and many others refused to show the massive crowd at the arena yesterday in Oklahoma. Dishonest reporting!
690176882055114758,Thu Jan 21 14:19:26 +0000 2016,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",2266,"Sad sack @JebBush has just done another ad on me, with special interest money, saying I won't beat Hillary - I WILL. But he can't beat me."
690180284189310976,Thu Jan 21 14:32:57 +0000 2016,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",2886,Low energy candidate @JebBush has wasted $80 million on his failed presidential campaign. Millions spent on me. He should go home and relax!


# 2) Additional data cleaning

In [26]:
trump.sort_index(inplace=True)
trump['time'] = pd.to_datetime(trump['created_at'])
trump = trump.drop('created_at', axis=1)
source_pat = "(>.*<)"
trump['source'] = trump['source'].str.extract(source_pat)
trump['source'] = trump['source'].str.replace(">", "")
trump['source'] = trump['source'].str.replace("<", "")

In [27]:
trump.head()

Unnamed: 0_level_0,source,retweet_count,text,time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
690171032150237184,Twitter for Android,1059,"""@bigop1: @realDonaldTrump @SarahPalinUSA https://t.co/3kYQGqeVyD""",2016-01-21 13:56:11
690171403388104704,Twitter for Android,1339,"""@AmericanAsPie: @glennbeck @SarahPalinUSA Remember when Glenn gave out gifts to ILLEGAL ALIENS at crossing the border? Me too!""",2016-01-21 13:57:39
690173226341691392,Twitter for Android,2006,So sad that @CNN and many others refused to show the massive crowd at the arena yesterday in Oklahoma. Dishonest reporting!,2016-01-21 14:04:54
690176882055114758,Twitter for Android,2266,"Sad sack @JebBush has just done another ad on me, with special interest money, saying I won't beat Hillary - I WILL. But he can't beat me.",2016-01-21 14:19:26
690180284189310976,Twitter for Android,2886,Low energy candidate @JebBush has wasted $80 million on his failed presidential campaign. Millions spent on me. He should go home and relax!,2016-01-21 14:32:57


### Removing links, newlines, quotes, 'amp'

In [30]:
trump['text'] = trump['text'].str.replace(r'(https){1}.*', '')
trump['text'] = trump['text'].str.replace(r'(\n){1}', '')
trump['text'] = trump['text'].str.replace(r'"', '')

In [34]:
trump['text'] = trump['text'].str.replace(r'(&amp){1}', '&')

In [35]:
trump['text'] = trump['text'].str.replace(r'(&;){1}', '&')

In [38]:
tcopy = trump['text'].str.split(' ')

# 3) Constructing Markov Chain

### Getting unique words

In [39]:
for i in tcopy.index:
    tcopy.loc[i] = [x for x in tcopy.loc[i] if x != '']

In [41]:
from collections import defaultdict

unique_words = defaultdict(int)

In [42]:
unique_words

defaultdict(int, {})

In [43]:
for i in tcopy.index[:1000]:
    for j in tcopy.loc[i]:
        unique_words[j] += 1

### Generating transition probabilities

In [44]:
def generate_tprobs_for_word(w):
    trans_dict = defaultdict(int)
    total_trans = 0
    for i in tcopy.index[:1000]:
        curr_tweet = tcopy.loc[i]
        for i in range(len(curr_tweet)):
            curr_word = curr_tweet[i]
            if curr_word == w and i != (len(curr_tweet) - 1):
                next_word = curr_tweet[i + 1]
                trans_dict[next_word] += 1
                total_trans += 1
    for k in trans_dict:
        trans_dict[k] /= total_trans
    return trans_dict

In [81]:
len(unique_words)

4819

In [82]:
markov_dict = defaultdict(int)
for u in unique_words:
    markov_dict[u] = generate_tprobs_for_word(u)

In [83]:
len(markov_dict)

4819

In [86]:
lst = []
for key, value in markov_dict['at'].items():
    lst.append((key, value))

In [87]:
lst

[('crossing', 0.013333333333333334),
 ('the', 0.22666666666666666),
 ('an', 0.013333333333333334),
 ('9pm', 0.02666666666666667),
 ('various', 0.02666666666666667),
 ('around', 0.02666666666666667),
 ('7:30', 0.013333333333333334),
 ('41%', 0.013333333333333334),
 ('my', 0.02666666666666667),
 ('8:00', 0.013333333333333334),
 ('Drake', 0.013333333333333334),
 ('7:00', 0.02666666666666667),
 ('their', 0.013333333333333334),
 ('minimum.', 0.013333333333333334),
 ('8pm', 0.013333333333333334),
 ('stake.', 0.013333333333333334),
 ('Clemson', 0.013333333333333334),
 ('me.', 0.013333333333333334),
 ('9am.', 0.013333333333333334),
 ('7:02', 0.013333333333333334),
 ('this', 0.013333333333333334),
 ('Jeb', 0.013333333333333334),
 ('bottom', 0.013333333333333334),
 ('8:40.', 0.013333333333333334),
 ('least', 0.013333333333333334),
 ('all!', 0.013333333333333334),
 ('it', 0.013333333333333334),
 ('meetings', 0.013333333333333334),
 ('5pm.', 0.013333333333333334),
 ('1%,', 0.013333333333333334),
 

# 4) Generating Tweets

### Function for random walk on Tweet Markov chain

In [157]:
import random

def gen_tweet_given_start(start, length):
    assert start in unique_words
    ct = 0
    prob = 1
    tweet = start + ' '
    state = [(key, value) for key, value in markov_dict[start].items()]
    while ct != length:
        br = False
        success = 0
        fail_count = 0
        while not success:
            s_ind = 0
            ran = random.uniform(0, 1)
            ran -= state[s_ind][1]
            while ran > 0:
                s_ind += 1
                ran -= state[s_ind][1]
            next_word = state[s_ind][0]
            if next_word in markov_dict and len(markov_dict[next_word]) >= 1:
                success = 1
                prob *= state[s_ind][1]
                tweet += (next_word + ' ')
                ct += 1
                state = [(key, value) for key, value in markov_dict[next_word].items()]
            fail_count += 1
            if fail_count > 100:
                br = True
                break
        if br:
            break
    return tweet, prob

# 5) Results

## Selected tweets

In [158]:
best = ['Ted Cruz lies and he runs ', 
        'Ted Cruz campaign is very sleazy ', 
        'Obama is working with Trump. Thank you people ',
        'Jeb Bush has ties to Trump and Cruz ',
        "Bush just on @FoxNews - I wasn't enough ",
        "Marco 'Amnesty' Rubio is thinking of little Mort ",
        'Marco Rubio was as if he is weak ',
        'Marco Rubio is jealous of Cruz & strong ',
        'Trump has a dinner in bed ',
        'Democrats numbers have done the dying ',
        'I did a WALL and ObamaCare, ',
        'I could be just another politician. ',
        'Trump won all the evangelical vote ',
        'ISIS is a person in Salem, ',
        'ISIS is better than a rock! ',
        "I'm going to church I WILL. ",
        'many believe @realDonaldTrump Wow! Thank you, ',
        'so easy to be president - ']

## Tests with various start words

In [155]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Ted", 5))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('Ted Cruz to me. I must ', 1.8470064073831216e-08),
 ('Ted may be happy to watching ', 2.9675379832991706e-08),
 ('Ted Cruz & not on my ', 8.269718196200094e-08),
 ('Ted Cruz lies and he runs ', 8.997584818509143e-08),
 ('Ted Cruz, and demanding a man ', 9.185935084099072e-08),
 ('Ted Cruz to donate and deceptive ', 9.383984166543278e-08),
 ("Ted can't beat him that @CNN ", 1.3882317282675216e-07),
 ('Ted Cruz, Rubio and special guy! ', 2.2755834747734885e-07),
 ('Ted Cruz be great people we ', 2.4715402469229096e-07),
 ('Ted Cruz campaign is very sleazy ', 3.6808936495531105e-07)]

In [166]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Obama", 7))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('Obama is such a big & the evening. ', 3.683733992282364e-11),
 ('Obama would beat you in lie to do ', 9.086065127868122e-11),
 ('Obama is underway @realDonaldTrump Trump will not careful, ',
  2.9046102973826e-10),
 ('Obama would not Trump will be a dinner ', 3.146953472772636e-10),
 ('Obama would the people I will do on ', 1.153588398491367e-09),
 ('Obama is repulsive that I stated on 3/8/2016. ', 1.2224064242014723e-09),
 ('Obama is working with Trump. Thank you people ', 2.7038665951790717e-09),
 ('Obama is a stacked RNC and the chaos ', 3.2710778028747256e-09),
 ('Obama so many more for the primary vote ', 3.5581226907221935e-09),
 ('Obama so big, Cruz, and and get out ', 5.184663615706427e-09)]

In [168]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Jeb", 7))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[("Jeb spent so out the media wasn't enough ", 1.7900228169339606e-10),
 ('Jeb and a weak on this morning. Glad ', 4.3783486184892456e-10),
 ('Jeb and other Republican Party see the highly ', 6.186185460489041e-10),
 ('Jeb Bush has done the winner of himself. ', 8.813934219230215e-10),
 ('Jeb Bush is all the new voters that ', 1.6448768147495032e-09),
 ('Jeb and vote total #Mediafraud. When you Kansas! ', 4.3396285286658715e-09),
 ('Jeb Bush has ties to Trump and Cruz ', 6.616259174406837e-09),
 ('Jeb Bush in NH GOP candidates are way ', 7.301429590708142e-09),
 ('Jeb --- Just in, big & many voters ', 7.737986574655196e-09),
 ('Jeb Bush was told to my forum with ', 7.902124613268439e-09)]

In [171]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Bush", 7))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('Bush and a major speeches and is by ', 7.58411795438427e-11),
 ('Bush is afraid of Iowa love to thank ', 2.2874980198094679e-10),
 ("Bush just on @FoxNews - I wasn't enough ", 2.9305667200989573e-10),
 ('Bush is at Trump winning debate tonight because ', 4.775252445212783e-10),
 ('Bush in America Needs: The people are spending ', 8.272555693905776e-10),
 ('Bush and Jeb spent very successful event is ', 1.0279785159398183e-09),
 ('Bush was great experience in his votes than ', 1.0519599368548003e-09),
 ('Bush is because of @StJude in Salt Lake ', 1.2214780985788866e-09),
 ('Bush spent against Trump has cancelled the special ',
  1.2332162961739611e-09),
 ('Bush is going to be president in Manchester, ', 1.2982888379067002e-09)]

In [177]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Marco", 7))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('Marco Rubio is jealous of Cruz & strong ', 1.9196447098869053e-10),
 ('Marco Rubio & the century the warm embrace ', 1.5202635131310505e-09),
 ('Marco Rubio was as if he is weak ', 1.7433880865613842e-09),
 ('Marco Rubio was so much to the advertisers ', 2.310660416910899e-09),
 ('Marco made so happy to help you for ', 2.8390806261989783e-09),
 ('Marco Rubio gave up a disaster for Rubio, ', 4.14364382714961e-09),
 ('Marco Rubio to the people of time we ', 4.869131977967355e-09),
 ('Marco Rubio 15.0 Cruz talks about to contribute ', 6.707848775107442e-09),
 ("Marco 'Amnesty' Rubio is thinking of little Mort ", 6.9312072825363165e-09),
 ('Marco Rubio, who will change in Canada Cruz ', 7.33268571053138e-09)]

In [183]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Trump", 5))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('Trump line and to Trump like ', 2.0360312376056597e-09),
 ('Trump is to compete in American ', 3.797385567304884e-09),
 ('Trump won that while the remaining ', 1.7602994128477303e-08),
 ('Trump had to get to do ', 2.520391648699075e-08),
 ('Trump is looking good for himself—a ', 3.2589910908494975e-08),
 ('Trump campaign & invest in Madison, ', 3.8283549405686184e-08),
 ('Trump has a dinner in bed ', 5.056043716576391e-08),
 ('Trump about @FoxNews Listening to Tampa ', 6.311696836577547e-08),
 ('Trump about Hillary would have killed ', 6.777381978670222e-08),
 ('Trump is Clinton and running in ', 7.709441290181607e-08)]

In [188]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Democrats", 5))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('Democrats vote. The great & replace. ', 3.538870958609365e-07),
 ('Democrats would be just got ZERO, ', 5.269167122324167e-07),
 ('Democrats would never run, a small ', 7.273192757063725e-07),
 ('Democrats numbers have done the dying ', 1.1156854213385992e-06),
 ('Democrats working so out from his ', 1.17039417705489e-06),
 ('Democrats would have the stage. Not ', 1.5213892109162715e-06),
 ('Democrats working with very good chance ', 1.6546648310918137e-06),
 ('Democrats numbers in Dayton & dry. ', 2.067311667907054e-06),
 ('Democrats working hard in the sports ', 2.119245358767894e-06),
 ('Democrats numbers are for Trump. DT ', 2.7056277056277056e-06)]

In [193]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("I", 5))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('I did great debate Cruz - ', 2.286321532713374e-09),
 ('I hope you win in Tampa! ', 7.791038124665959e-09),
 ('I do w/ a very good ', 1.1228788817922944e-08),
 ('I did a WALL and ObamaCare, ', 1.7205402221010963e-08),
 ('I must talk and a WALL ', 4.014593851569224e-08),
 ('I was able to him the ', 4.6780811847574754e-08),
 ('I would end this great President, ', 5.5581493585895636e-08),
 ('I can join the race, someone ', 8.772911005397534e-08),
 ('I would say in Iowa, he ', 8.98382193346222e-08),
 ('I did he was an even ', 9.301633366819214e-08)]

In [245]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("Trump", 5))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('Trump up to speak the warm ', 3.0274616199608057e-09),
 ('Trump held in Canada to unite ', 2.0433550909783423e-08),
 ('Trump winning the leaders in defending ', 2.3076227239917077e-08),
 ('Trump was amazing day for lightweight ', 2.7508197442837967e-08),
 ('Trump way that is a leader ', 3.501147928373572e-08),
 ('Trump is not as bad word. ', 3.623897429207164e-08),
 ('Trump has his family in Ohio. ', 3.6879377697380734e-08),
 ('Trump shows that there - TRUMP ', 5.107147964290821e-08),
 ('Trump for Kasich is incompetent Mitt ', 7.010938817289676e-08),
 ('Trump won all the evangelical vote ', 8.233400435546883e-08)]

In [250]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("ISIS", 5))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[('ISIS is that Jeb is way ', 1.042342524454919e-07),
 ('ISIS is amazing. We the founder ', 1.3101139222662245e-07),
 ('ISIS is he spent a choker, ', 2.099041157999026e-07),
 ('ISIS is about Trump National Zogby ', 2.142231305282957e-07),
 ('ISIS is lying is not caring ', 3.968648733313155e-07),
 ('ISIS is I put on the ', 4.312649864582795e-07),
 ('ISIS is losing jobs in many ', 5.684453701261493e-07),
 ('ISIS is a person in Salem, ', 5.776138438378616e-07),
 ('ISIS is so the only when ', 5.927174909329041e-07),
 ('ISIS is by Cruz apart for ', 6.79689517828256e-07)]

In [280]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("I'm", 5))
curr = sorted(tweets, key = lambda x: x[1])[0:10]
curr

[("I'm not watch her a morning ", 1.0887107661301211e-07),
 ("I'm a dog-over and didn't meet ", 1.659092357026057e-07),
 ("I'm going very incompetent and never ", 3.789716226048993e-07),
 ("I'm fed up and that when ", 4.521482467499584e-07),
 ("I'm like a @FoxNews only wish ", 4.850836769342712e-07),
 ("I'm going to church I WILL. ", 5.951028445915971e-07),
 ("I'm not controlled by his prize, ", 6.105319198298325e-07),
 ("I'm going to support & his ", 6.925765863781519e-07),
 ("I'm like a fool, is this ", 8.156274213939073e-07),
 ("I'm like Trump is at the ", 8.915109800065552e-07)]

In [292]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("many", 5))
curr = sorted(tweets, key = lambda x: x[1])[20:30]
curr

[('many more VOTE and spirited crowd! ', 5.874501842243778e-07),
 ('many others in Louisiana. Big and ', 6.588136084538962e-07),
 ('many people in it. Lindsey got ', 6.815313190902374e-07),
 ('many will repeal and I am ', 6.987170045057804e-07),
 ('many of #CommonCore and Twitter poll- ', 7.08159126188291e-07),
 ('many people saying the record, most ', 7.657389993935348e-07),
 ('many voters by @CNN he was ', 8.514572081934511e-07),
 ('many years. He is that shows ', 1.075019135340609e-06),
 ('many believe @realDonaldTrump Wow! Thank you, ', 1.2365884706310241e-06),
 ('many of New National Poll, where ', 1.5304876975572648e-06)]

In [294]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("so", 5))
curr = sorted(tweets, key = lambda x: x[1])[20:30]
curr

[('so happy & his fraudulent T.V. ', 1.1498852414529027e-06),
 ('so than a choker, always negative? ', 1.4694818019373646e-06),
 ('so good night. The New Hampshire. ', 1.51846452866861e-06),
 ('so out yesterday by Elton John ', 1.5526600171413663e-06),
 ('so easy to be president - ', 1.6079210909508772e-06),
 ('so much Jeb --- and I ', 1.6593014423892609e-06),
 ('so haltingly said that @oreillyfactor did ', 1.7353880327641257e-06),
 ('so impressive and then they like ', 1.8152422259226268e-06),
 ('so much more, he is 7,533,692-a ', 1.847603219818131e-06),
 ('so they say something much forward ', 2.0074919599947004e-06)]

In [307]:
tweets = []
for _ in range(100):
    tweets.append(gen_tweet_given_start("no", 5))
curr = sorted(tweets, key = lambda x: x[1])[10:20]
curr

[('no show that has never held ', 1.8590479164952853e-07),
 ('no money, no one will end ', 2.810033054418819e-07),
 ('no guts and payed for Cruz ', 2.980235080943185e-07),
 ('no buyers for lightweight from D.C. ', 3.14509914925068e-07),
 ('no chance in Cincinnati is why ', 3.8820659423249227e-07),
 ('no other politician and #VoteTrump tomorrow. ', 4.0597629585603754e-07),
 ('no clue & is a possible ', 4.5017283035302915e-07),
 ('no and said no buyer! Liabilities ', 4.79745159371342e-07),
 ('no chance in 4 or keeping ', 9.139030239223256e-07),
 ('no show first day to authorities ', 1.1083955420331298e-06)]