In [1]:
import keras
import pandas as pd
import numpy as np
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from nltk.tokenize.casual import casual_tokenize

from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, LSTM
from keras.optimizers import RMSprop

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

Using TensorFlow backend.


In [2]:
df_tweets = pd.read_csv("tiwari_unique_tweets.csv", index_col=0)

In [3]:
df_tweets.created_at = pd.to_datetime(df_tweets.created_at)
df_tweets.rename(index=str, columns={"created_at": "time"}, inplace=True)
df_noat = df_tweets.loc[df_tweets['text'].str.find('@') != 0].copy()

In [4]:
df_noat['text_lower'] = df_noat.text.str.lower()

In [5]:
nw_words = [
    casual_tokenize(
        tweet,
        reduce_len=True,
    ) for tweet in df_noat['text_lower']
]

In [6]:
nw_corp = [word for sub in nw_words for word in sub if 'http' not in word]
nw_chars = [char for word in nw_corp for char in word]

In [7]:
chars_in_corpus = Counter()
text = []
for tweet in nw_words:
    for word in tweet:
        if 'http' not in word:
            text.append(word + ' ')
    text.append(' ` ')

text = ''.join(text)

for char in text:
    chars_in_corpus.update(char)
    lexicon = list(chars_in_corpus.keys())
    char_to_index = {char: i for (i, char) in enumerate(lexicon)}
    index_to_char = {i: char for (i, char) in enumerate(lexicon)}

In [14]:
nw_corpus = create_lookup_dictionary_and_lexicon()

In [8]:
df_noat['text_lower'].str.len().describe()

count    1697.000000
mean       85.723630
std        38.764767
min         3.000000
25%        53.000000
50%        85.000000
75%       123.000000
max       143.000000
Name: text_lower, dtype: float64

In [9]:
CORPUS_LENGTH = len(text)
MAX_SEQ_LENGTH = 85
SEQ_STEP = 6
N_SEQS = None


def create_sequences(corpus):
    sequences, next_chars = [], []
    for i in range(0, CORPUS_LENGTH - MAX_SEQ_LENGTH, SEQ_STEP):
        sequences.append(corpus[i:i + MAX_SEQ_LENGTH])
        next_chars.append(corpus[i + MAX_SEQ_LENGTH])
    global N_SEQS
    N_SEQS = len(sequences)
    return np.array(sequences), np.array(next_chars)


sequences, next_chars = create_sequences(text)

In [36]:
df_sparse = pd.DataFrame(columns=range(len(lexicon))).to_sparse(fill_value=0)
for seq in sequences:
    label_encoded_text = list(map(lambda char: char_to_index[char], seq))
    identity = np.eye(len(lexicon))
    one_hot_encoded_text = list(
        map(lambda idx: identity[idx], label_encoded_text))
    temp = pd.DataFrame(one_hot_encoded_text).sum()
    df_sparse = df_sparse.append(temp, ignore_index=True)

KeyboardInterrupt: 

In [36]:
len(sequences)

25209

In [32]:
cv = CountVectorizer(analyzer='char')
one_hot_X = cv.fit_transform(sequences)
X = one_hot_X.todense()
X = [X]
X = np.array(X)
y = np.zeros([len(next_chars),1 , len(cv.get_feature_names())])
char_to_index = {char: i for (i, char) in enumerate(cv.get_feature_names())}

for index, char in enumerate(next_chars):
    y[index][0][char_to_index[char]] = 1


In [11]:
N_CHARS = len(next_chars)
def build_model(hidden_layer_size=128, dropout = 0.2, learning_rate=0.1):
    model = Sequential()
    model.add(LSTM(hidden_layer_size, return_sequences=True, input_shape=(X.shape[1],X.shape[2])))
    model.add(Dropout(dropout))
    model.add(LSTM(hidden_layer_size, return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(N_CHARS, activation='softmax'))
    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=learning_rate))
    return model
model = build_model()

In [29]:
print("Total words: ", len(nw_corp))
print("Unique words: ", len(set(nw_corp)))

print("Total words: ", len(clean_corp))
print("Unique words: ", len(set(clean_corp)))

Total words:  29131
Unique words:  3939
Total words:  28997
Unique words:  3822


In [27]:
pd.Series(nw_corp).value_counts()

.                          2859
,                           951
the                         770
#mufc                       746
to                          511
a                           468
and                         462
!                           385
is                          297
in                          287
of                          284
for                         265
we                          218
this                        203
be                          190
you                         185
on                          169
it                          143
but                         136
united                      130
#lvgout                     124
with                        123
have                        110
i                           109
what                        108
from                        107
not                         102
game                        102
good                         99
us                           97
                           ... 
nigel   

In [29]:
model.fit(x=X, y=y.reshape())

ValueError: Input arrays should have the same number of samples as target arrays. Found 1 input samples and 83 target samples.

In [143]:
char_to_index

{' ': 0,
 '!': 1,
 '"': 2,
 '#': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 '+': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '=': 28,
 '>': 29,
 '?': 30,
 '@': 31,
 '_': 32,
 '`': 33,
 'a': 34,
 'b': 35,
 'c': 36,
 'd': 37,
 'e': 38,
 'f': 39,
 'g': 40,
 'h': 41,
 'i': 42,
 'j': 43,
 'k': 44,
 'l': 45,
 'm': 46,
 'n': 47,
 'o': 48,
 'p': 49,
 'q': 50,
 'r': 51,
 's': 52,
 't': 53,
 'u': 54,
 'v': 55,
 'w': 56,
 'x': 57,
 'y': 58,
 'z': 59,
 '|': 60,
 '¬£': 61,
 '√ü': 62,
 '√†': 63,
 '√°': 64,
 '√ß': 65,
 '√©': 66,
 '√≠': 67,
 '√±': 68,
 '√∂': 69,
 '√º': 70,
 'ƒá': 71,
 '≈æ': 72,
 '‚Äô': 73,
 '‚Ä¶': 74,
 '‚Ç¨': 75,
 'üî•': 76,
 'üòî': 77,
 'üòò': 78,
 'üòú': 79,
 'üòù': 80,
 'üò§': 81,
 'üôå': 82}

In [13]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 25209, 128)        108544    
_________________________________________________________________
dropout_1 (Dropout)          (None, 25209, 128)        0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 25209)             3251961   
Total params: 3,492,089
Trainable params: 3,492,089
Non-trainable params: 0
_________________________________________________________________


In [61]:
len(cv.get_feature_names())
print(temp[0])
print(sequences[0])
print(np.unique([char for word in sequences[0] for char in word], return_counts = True))
print(char_to_index)

83

  (0, 51)	1
  (0, 35)	1
  (0, 39)	1
  (0, 36)	1
  (0, 57)	2
  (0, 49)	1
  (0, 14)	2
  (0, 37)	2
  (0, 38)	5
  (0, 54)	1
  (0, 53)	7
  (0, 45)	6
  (0, 59)	1
  (0, 40)	2
  (0, 47)	4
  (0, 42)	4
  (0, 55)	1
  (0, 41)	1
  (0, 58)	4
  (0, 48)	3
  (0, 43)	1
  (0, 0)	15
  (0, 52)	4
  (0, 34)	12
  (0, 56)	3
was a joy having zlatan at united . always played exciting football . always set extr
(array([' ', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l',
       'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'], 
      dtype='<U1'), array([15,  2, 12,  1,  1,  2,  5,  1,  2,  1,  4,  1,  6,  4,  3,  1,  1,
        4,  7,  1,  1,  3,  2,  4,  1]))
{'w': 0, 'a': 1, 's': 2, ' ': 3, 'j': 4, 'o': 5, 'y': 6, 'h': 7, 'v': 8, 'i': 9, 'n': 10, 'g': 11, 'z': 12, 'l': 13, 't': 14, 'u': 15, 'e': 16, 'd': 17, '.': 18, 'p': 19, 'x': 20, 'c': 21, 'f': 22, 'b': 23, 'r': 24, 'm': 25, '‚Ä¶': 26, '`': 27, '4': 28, '0': 29, '!': 30, '3': 31, 'k': 32, "'": 33, ',': 34, '8': 35, '#': 36, ':': 37, 

In [43]:
tokenizer = Tokenizer(num_words=1500)
corpus = tokenizer.fit_on_texts(clean_corp)
corpus = tokenizer.word_index

In [44]:
def convert_text_to_index_array(text):
    return [corpus[word] for word in text_to_word_sequence(text)]

In [45]:
allWordIndices = []
for text in clean_corp:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)
allWordIndices = np.asarray(allWordIndices)
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

In [24]:
inp = [X[0]]
model.predict(X[0][0].reshape(1,1,-1))

ValueError: Error when checking : expected lstm_1_input to have shape (25209, 83) but got array with shape (1, 83)

In [117]:
x = np.reshape(x, [2 ,-1 , 84])

ValueError: cannot reshape array of size 2092347 into shape (2,newaxis,84)

In [109]:
x.shape

(25209, 83)

In [None]:
x.reshape()