In [1]:
import keras
import pandas as pd
import numpy as np
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from nltk.tokenize.casual import casual_tokenize
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, LSTM
from keras.optimizers import RMSprop

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

Using TensorFlow backend.


In [2]:
df_tweets = pd.read_csv("tiwari_unique_tweets.csv", index_col=0)

In [3]:
df_tweets.created_at = pd.to_datetime(df_tweets.created_at)
df_tweets.rename(index=str, columns={"created_at": "time"}, inplace=True)
df_noat = df_tweets.loc[df_tweets['text'].str.find('@') != 0].copy()

In [4]:
df_noat['text_lower'] = df_noat.text.str.lower()

In [5]:
nw_words = [
    casual_tokenize(
        tweet,
        reduce_len=True,
    ) for tweet in df_noat['text_lower']
]

In [6]:
nw_corp = [word for sub in nw_words for word in sub if 'http' not in word]
nw_chars = [char for word in nw_corp for char in word]

In [7]:
chars_in_corpus = Counter()
text = []
for tweet in nw_words:
    for word in tweet:
        if 'http' not in word:
            text.append(word + ' ')
    text.append(' ` ')

text = ''.join(text)

for char in text:
    chars_in_corpus.update(char)
    lexicon = list(chars_in_corpus.keys())
    char_to_index = {char: i for (i, char) in enumerate(lexicon)}
    index_to_char = {i: char for (i, char) in enumerate(lexicon)}

In [9]:
df_noat['text_lower'].str.len().describe()

count    1697.000000
mean       85.723630
std        38.764767
min         3.000000
25%        53.000000
50%        85.000000
75%       123.000000
max       143.000000
Name: text_lower, dtype: float64

In [50]:
CORPUS_LENGTH = len(text)
MAX_SEQ_LENGTH = 65
SEQ_STEP = 3
N_SEQS = None


def create_sequences(corpus):
    sequences, next_chars = [], []
    for i in range(0, CORPUS_LENGTH - MAX_SEQ_LENGTH, SEQ_STEP):
        sequences.append(corpus[i:i + MAX_SEQ_LENGTH])
        next_chars.append(corpus[i + MAX_SEQ_LENGTH])
    global N_SEQS
    N_SEQS = len(sequences)
    return np.array(sequences), np.array(next_chars)


sequences, next_chars = create_sequences(text)

In [51]:
cv = CountVectorizer(analyzer='char')
one_hot_X = cv.fit_transform(sequences)

X = np.zeros([len(sequences), MAX_SEQ_LENGTH, len(cv.get_feature_names())])
y = np.zeros([len(next_chars), len(cv.get_feature_names())])
char_to_index = {char: i for (i, char) in enumerate(cv.get_feature_names())}
index_to_char = {i: char for (i, char) in enumerate(cv.get_feature_names())}

for index, seq in enumerate(sequences):
    for i2, char in enumerate(seq):
        X[index, i2, char_to_index[char]] = 1
    y[index, char_to_index[char]] = 1

In [52]:
N_CHARS = len(cv.get_feature_names())


def build_model(hidden_layer_size=128,
                dropout=0.2,
                learning_rate=0.01,
                verbose=0):
    model = Sequential()
    model.add(
        LSTM(
            hidden_layer_size,
            return_sequences=True,
            input_shape=(MAX_SEQ_LENGTH, N_CHARS)))
    model.add(Dropout(dropout))
    model.add(LSTM(hidden_layer_size, return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(N_CHARS, activation='softmax'))
    optimizer = RMSprop(lr=0.01)
    model.compile(
        loss='categorical_crossentropy', optimizer=RMSprop(lr=learning_rate))
    if verbose:
        print('Model Summary:')
        model.summary()

    return model


model = build_model()

In [53]:
verbose = 1
def train_model(model, X, y, batch_size=128, epochs=1, verbose=0):
    checkpointer = ModelCheckpoint(
        filepath="weights.hdf5",
        monitor='loss',
        verbose=verbose,
        save_best_only=True,
        mode='min')
    model.fit(
        X,
        y,
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        callbacks=[checkpointer])


train_model(model, X, y, verbose=verbose)

Epoch 1/1

Epoch 00001: loss improved from inf to 0.67306, saving model to weights.hdf5


In [54]:
np.random.seed(42)

In [55]:
def sample(preds):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / 0.2
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [56]:
def generate_tweets(model, corpus, char_to_idx, idx_to_char, n_tweets=10, verbose=0): 
    model.load_weights('weights.hdf5')
    tweets = []
    spaces_in_corpus = np.array([idx for idx in range(CORPUS_LENGTH) if text[idx] == ' '])
    for i in range(1, n_tweets + 1):
        begin = np.random.choice(spaces_in_corpus)
        tweet = u''
        sequence = text[begin:begin + MAX_SEQ_LENGTH]
        tweet += sequence
        if verbose:
            print('Tweet no. %03d' % i)
            print('=' * 13)
            print('Generating with seed:')
            print(sequence)
            print('_' * len(sequence))
        for _ in range(1):
            x = np.zeros((1, MAX_SEQ_LENGTH, N_CHARS))
            for t, char in enumerate(sequence):
                x[0, t, char_to_idx[char]] = 1.0

            preds = model.predict(x, verbose=0)[0]
            next_idx = sample(preds)
            next_char = idx_to_char[next_idx]

            tweet += next_char
            sequence = sequence[1:] + next_char
        if verbose:
            print(tweet)
            print()
        tweets.append(tweet)
    return tweets

tweets = generate_tweets(model, text, char_to_index, index_to_char, verbose=verbose)


Tweet no. 001
Generating with seed:
 memphis is hanging out with pogba and lukaku . i'm not saying an
_________________________________________________________________
 memphis is hanging out with pogba and lukaku . i'm not saying ann

Tweet no. 002
Generating with seed:
 #freereus #mufc #transfers #deadlineday  ` 2 minutes to bring in
_________________________________________________________________
 #freereus #mufc #transfers #deadlineday  ` 2 minutes to bring inn

Tweet no. 003
Generating with seed:
 have bastian schweinsteiger and ander herrera and we play michea
_________________________________________________________________
 have bastian schweinsteiger and ander herrera and we play micheaa

Tweet no. 004
Generating with seed:
 . lvg , use your head . #mufc #facupsemifinal #lvgout  ` van gaa
_________________________________________________________________
 . lvg , use your head . #mufc #facupsemifinal #lvgout  ` van gaaa

Tweet no. 005
Generating with seed:
 apology from media 

In [30]:
for subtweets in tweets:
    temp = subtweets.split(' ` ')
    for tweet in temp:
        if 

SyntaxError: invalid syntax (<ipython-input-30-e2c1fc6d3fa3>, line 4)

In [28]:
len(tweets[2])

170