In [1]:
from getauth import get_auth
import tweepy

In [2]:
# Load the Twitter credentials from keys.txt into a dict.
# Each non-blank line is split on whitespace; the first token is the key name and the
# third token is the value (presumably lines look like `NAME = VALUE` — confirm).
auth_keys = {}
with open('keys.txt', 'r') as key_file:
    for line_number, line in enumerate(key_file, start=1):
        fields = line.split()
        if not fields:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no key.
            continue
        if len(fields) < 3:
            # Report only the line number: the line itself may contain a secret.
            raise ValueError(
                f"keys.txt line {line_number}: expected at least 3 whitespace-separated fields"
            )
        auth_keys[fields[0]] = fields[2]

In [3]:
# Build the OAuth handler from the application key pair loaded into auth_keys,
# then attach the user's access token pair to it.
auth = tweepy.OAuthHandler(auth_keys["API_KEY"], auth_keys["API_KEY_SECRET"])
auth.set_access_token(auth_keys["ACCESS_TOKEN"], auth_keys["ACCESS_TOKEN_SECRET"])

# Authenticated API client used by every Twitter request in the cells below.
api = tweepy.API(auth)

In [4]:
# Smoke test: request @naval's timeline with default paging and show the raw text
# of every status that came back.
naval_tweets = api.user_timeline(screen_name='naval')
print([status.text for status in naval_tweets])

['RT @micsolana: can a tech writer for @wired, @latimes, @techCrunch, or the @washingtonpost, all of which have published anti-proposition 22…', 'RT @nvk: 2020 will be known as the year Journalists finally burned down USA for the clicks.', '@vk3_prince @whoisaddison You’d think a fan would cite sources though. 🤷🏽\u200d♂️', 'Our temporary solution to a temporary problem has become a permanent problem.', '@StephenPiment @0x49fa98 GPT-3 is really a Turing Test for the reader.', '@NedLorde It’s definitely getting harder.', '@DylanoRepublic @visualizevalue 🙏', 'RT @atulrc: https://t.co/9jLS1NE6r9', "RT @galjudo: There's no opponent. You're fighting against yourself.", '@Algorithmexist Yes, similar setting, different point.', 'I wrote a story.\n\nhttps://t.co/StyzoQldYF', '@bizontheside Thank you. Not a "guru" though 😉.', 'RT @SvenSchnieders: By making modesty a virtue, we have made excellence a vice.', '@HustleScienceOn Credit: https://t.co/7VkyhUVTwC', '@spakhm @RationalAztec Love.', '@Rat

In [5]:
class GetUserTweets:
    """Fetch one user's timeline through a tweepy-style API object and keep cleaned text.

    Attributes:
        username: screen name whose timeline is requested.
        tweets: raw status objects returned by the most recent fetch_tweets call.
        tweets_text: cleaned, non-empty tweet texts derived from ``tweets``.
        max_pages: initialised to 1; not read by any method of this class.
        api: object exposing ``user_timeline(screen_name=..., page=..., tweet_mode=...)``.
    """

    def __init__(self, twitter_api, username):
        self.username = username
        self.tweets = []
        self.tweets_text = []
        self.max_pages = 1
        self.api = twitter_api

    def print_tweets(self):
        """Print the list of cleaned tweet texts."""
        print(self.tweets_text)

    def fetch_tweets(self, num_pages):
        """Download pages 1..num_pages of the user's timeline and rebuild the cleaned texts.

        The stored tweets are replaced, not extended, so calling this method again
        (for example when a notebook cell is re-run) does not duplicate every tweet.
        """
        fetched = []
        for page in range(1, num_pages + 1):
            fetched.extend(
                self.api.user_timeline(screen_name=self.username, page=page, tweet_mode='extended')
            )
        self.tweets = fetched

        # NOTE(review): for retweets, full_text may still be truncated by the API;
        # confirm whether retweeted_status.full_text should be read instead.
        cleaned = (self.clean_tweet(status.full_text) for status in fetched)
        # Tweets that consist only of mentions/links clean down to '' and are dropped.
        self.tweets_text = [text for text in cleaned if text != '']

    def get_tweets(self):
        """Return the list of cleaned tweet texts produced by the last fetch."""
        return self.tweets_text

    def clean_tweet(self, tweet):
        """Return ``tweet`` with newlines flattened and RT prefix, links and mentions removed.

        A blank-line break ('\\n\\n') and any remaining single newline each become one
        space. If the first word is 'RT', that word and the one after it (the
        retweeted account) are dropped. Every word starting with 'http://',
        'https://' or '@' is removed.
        """
        flattened = tweet.replace('\n\n', ' ').replace('\n', ' ')
        words = flattened.split(' ')

        # remove the "RT @account:" prefix of a retweet
        if words[0] == 'RT':
            words = words[2:]

        # str.startswith accepts a tuple, so both URL schemes and mentions are one check
        kept = [word for word in words if not word.startswith(('http://', 'https://', '@'))]

        return ' '.join(kept)

In [6]:
def get_clean_data(list_of_usernames, num_pages):
    """Collect cleaned tweets for several accounts into one flat list.

    For each name in ``list_of_usernames`` a GetUserTweets helper is created on the
    notebook-level ``api`` client, ``num_pages`` timeline pages are fetched, and the
    cleaned texts are appended to the result in the order the names were given.
    A one-line progress message is printed per account.
    """
    combined = []
    for username in list_of_usernames:
        fetcher = GetUserTweets(api, username)
        fetcher.fetch_tweets(num_pages)
        user_tweets = fetcher.get_tweets()
        print(f'got {len(user_tweets)} tweets from @{username}')
        combined.extend(user_tweets)

    return combined

In [87]:
# Helper bound to the @naval account; nothing is downloaded until fetch_tweets is called.
naval = GetUserTweets(twitter_api=api, username='naval')

In [78]:
# Pull ten timeline pages for @naval, show the cleaned texts, and keep a reference to them.
naval.fetch_tweets(num_pages=10)
naval.print_tweets()
sample = naval.get_tweets()

Status(_api=<tweepy.api.API object at 0x7f2cb403e1f0>, _json={'created_at': 'Mon Oct 12 20:07:20 +0000 2020', 'id': 1315745893199478784, 'id_str': '1315745893199478784', 'full_text': 'Our temporary solution to a temporary problem has become a permanent problem.', 'truncated': False, 'display_text_range': [0, 77], 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 745273, 'id_str': '745273', 'name': 'Naval', 'screen_name': 'naval', 'location': '', 'description': '', 'url': 'https://t.co/s6SQqD1ln0', 'entities': {'url': {'urls': [{'url': 'https://t.co/s6SQqD1ln0', 'expanded_url': 'https://nav.al', 'display_url': 'nav.al', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 

In [46]:
# Build the training corpus: 50 timeline pages from each of the accounts listed below.
lots_of_data = get_clean_data(
    list_of_usernames=[
        'waitbutwhy',
        'slatestarcodex',
        'davidgoggins',
        'ericrweinstein',
        'lexfridman',
        'paulg',
    ],
    num_pages=50,
)

got 981 tweets from @waitbutwhy
got 984 tweets from @slatestarcodex
got 554 tweets from @davidgoggins
got 1000 tweets from @ericrweinstein
got 998 tweets from @lexfridman
got 972 tweets from @paulg


In [47]:
# Total number of cleaned tweets collected across all accounts.
print(len(lots_of_data))

5489


In [48]:
import keras
import tensorflow as tf
import numpy as np

In [49]:
# Concatenate every cleaned tweet into a single space-separated corpus string,
# then report the corpus length in characters.
one_big_string = ' '.join(lots_of_data)
print(len(one_big_string))

744094


In [50]:
# Character-level tokenizer: maps each distinct character to an integer index.
# NOTE(review): fit_on_texts receives the raw string rather than a list of texts, so
# each character appears to be counted as its own "document" — the batch count in the
# training log is consistent with document_count equalling the corpus length; confirm.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(one_big_string)

In [51]:
# Encode the entire corpus as one sequence of character ids.
# The corpus is wrapped in a one-element list, so the result has a single row that is
# unpacked into `encoded`. The -1 shifts every id down by one (presumably because the
# tokenizer's ids start at 1 — confirm) so they fit a one-hot depth of max_id.
[encoded] = np.array(tokenizer.texts_to_sequences([one_big_string])) - 1
max_id = len(tokenizer.word_index)  # size of the tokenizer's vocabulary
dataset_size = tokenizer.document_count  # used below as the total corpus length

In [52]:
# Use the first 90% of the encoded corpus as training data (integer arithmetic, rounded
# down). The remaining 10% is not used by any of the visible cells.
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [53]:
# Slide a window over the character stream. Each window holds window_length items:
# n_steps input characters plus one extra, so that the inputs and the shifted-by-one
# targets can both be cut from the same window.
# shift=1 starts a new window at every position; drop_remainder=True discards the
# windows at the end that would be shorter than window_length.
# The result is a 'dataset of datasets' (one small dataset per window).
# NOTE: this cell reassigns `dataset`, so it must be run exactly once after the split cell.
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [54]:
# Flatten the dataset of window-DATASETS into a dataset of TENSORS, because the model
# only operates on tensors: batching each window by its own length yields exactly one
# tensor of window_length items per window.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [55]:
# Shuffle the windows (buffer of 10000) and group them into batches of batch_size.
# Each batch is then split into inputs X (every character but the last) and targets Y
# (every character but the first), so Y is X shifted one character ahead.
# NOTE(review): the shuffle is unseeded, so batch order differs between runs.
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [56]:
# One-hot encode the inputs X with depth max_id (one slot per vocabulary entry).
# The targets Y are left as integer ids, which is what the sparse loss used at
# compile time expects.
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [57]:
# Ask tf.data to keep one batch prepared ahead of the consumer.
dataset = dataset.prefetch(1)

In [58]:
# Two stacked 128-unit LSTM layers that emit an output at every time step, followed by
# a per-time-step softmax over the max_id vocabulary entries.
# NOTE(review): recurrent_dropout is reported in the Keras docs to rule out the fast
# cuDNN LSTM kernel — confirm whether the long epoch time is acceptable.
model = keras.models.Sequential()
model.add(keras.layers.LSTM(
    128,
    return_sequences=True,
    input_shape=[None, max_id],
    dropout=0.2,
    recurrent_dropout=0.2,
))
model.add(keras.layers.LSTM(
    128,
    return_sequences=True,
    dropout=0.2,
    recurrent_dropout=0.2,
))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")))

In [59]:
# The targets are integer character ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# Long-running: the recorded output below shows the run was interrupted during epoch 16.
history = model.fit(dataset, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
   36/20925 [..............................] - ETA: 55:04 - loss: 1.5825

KeyboardInterrupt: 

In [34]:
def preprocess(text):
    """Turn a list of strings into the one-hot tensor format the model was trained on.

    The strings are converted to character ids with the notebook-level ``tokenizer``,
    shifted down by one (the same shift applied to the training corpus) and one-hot
    encoded with depth ``max_id``.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(text))
    return tf.one_hot(char_ids - 1, max_id)

In [60]:
# Predict the most likely next character after the prompt "hey ther".
X_new = preprocess(["hey ther"])
# Sequential.predict_classes has been removed from recent TensorFlow/Keras releases;
# for a softmax output, the argmax over the last axis gives the same class ids.
y_pred = np.argmax(model.predict(X_new), axis=-1)
# +1 undoes the -1 shift applied in preprocess; [0][-1] picks the prediction that
# follows the last character of the first (only) prompt.
tokenizer.sequences_to_texts(y_pred + 1)[0][-1]

'e'

In [61]:
def finish_text(text, n_chars=50, temperature=0.2):
    """Extend ``text`` by ``n_chars`` characters sampled from the model, one at a time.

    Every step re-encodes the whole text so far, takes the model's probabilities for
    the last position, divides their logarithms by ``temperature`` and samples a
    single character id from the result. Returns the input text with all sampled
    characters appended.
    """
    for _step in range(n_chars):
        probabilities = model.predict(preprocess([text]))
        # Slice with -1: (not -1) so the row axis survives for tf.random.categorical.
        last_position = probabilities[0, -1:, :]
        sampled = tf.random.categorical(
            tf.math.log(last_position) / temperature,
            num_samples=1,
        )
        # +1 undoes the -1 shift applied in preprocess before decoding back to text.
        next_char = tokenizer.sequences_to_texts((sampled + 1).numpy())[0]
        text = text + next_char
    return text

In [62]:
# One tweet's worth of characters (140) from a single-letter seed at the default temperature.
finish_text('h', n_chars=140)

'he interesting the statement and the press in the problem is that the people would be the country of the great complete in the experience of '

In [41]:
# Longer sample (280 characters) at a higher temperature than the default.
finish_text('t', n_chars=280, temperature=0.4)

'to see that even making share service a little be more something on the show — zen, investing, mike tyson, artificial intelligence, and the dangers of security, which is probably doesn’t could be new technology from the training in about heavy for the latest of the typersign of th'

In [42]:
# Complete a full-sentence prompt at temperature 0.5.
finish_text('the thing about quantum computing is', n_chars=280, temperature=0.5)

'the thing about quantum computing is the production tasty need to see time? yeah, so we will be a little be the tesla will be the great of part. well we could probably doesn’t creative production tanks when i have such a best weeks. we’re a model 3 or the show to be a fairly energy cargo starlink satellites are pro'

In [68]:
# Numeric seed followed by a space.
finish_text('420 ', n_chars=140, temperature=0.4)

'420 in the statement startup in the most people are all the same thing in the and the other instead of a country and in a fast statement in the '

In [67]:
# Seed that stops in the middle of a word.
finish_text('starlink s', n_chars=100, temperature=0.4)

'starlink some power that a power to you the answer is not the best indiction is that i think that seems to the'

In [65]:
# Topic-word seed at temperature 0.3.
finish_text('investing ', n_chars=280, temperature=0.3)

"investing the second that i don't want to the people are and the state so i think the specific problems is that i think the world is the second people was a problem in the most because the second statements in the world of being the best the specific statements of the startup and i was a p"