In [1]:
import numpy as np
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import backend

import json
import re

Using TensorFlow backend.


# Load Posts and Comments from data

In [2]:
def preproc(s):
    s = re.sub(r'\[.*\|.*\]', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[¬´¬ª]', '"', s)
    return s.lower()

In [3]:
with open('data/kalikfan.json', 'r') as f:
    kalikfan = json.load(f)
    
text = ''.join([preproc(x['text']) for x in kalikfan])
text

'–¥–µ–Ω—å–≥–∏ –≤ –∫–∞—Ä–º–∞–Ω–µ –∂–µ–Ω–∞ –Ω–∞ –¥–∏–≤–∞–Ω–µ —Å—ã–Ω –Ω–∞ –±–æ—Ä—å–±–µ –∂–∏—Ç—É—Ö–∞ –ø–æ –∫–∞–π—Ñ—É)-–º–∏–ª—ã–π —Å–º–æ—Ç—Ä–∏ –∫–∞–∫—É—é —è —Å–∫—É–ª—å–ø—Ç—É—Ä—É —Å–ª–µ–ø–∏–ª–∞ –∏–∑ –∫–∞–ª—å—è–Ω—á–∏–∫–∞) -–¥–æ—Ä–æ–≥–∞—è, –∞ —á—Ç–æ —ç—Ç–æüòÖ -–∫–∞–∫ —á—Ç–æ? –º–æ—Ç–æ–≥–æ–Ω—â–∏–∫ - –≤–æ–Ω —à–ª–µ–º —É –Ω–µ–≥–æ –Ω–∞–¥–µ—Ç) -–∞–∞–∞–∞–∞, –º–æ—Ç–æ–≥–æ–Ω—â–∏–∫) –∞ —è —É–∂ –Ω–∞–ø—Ä–∏–¥—É–º—ã–≤–∞–ª –≤—Å—è–∫–æ–≥–æü§£ –Ω–æ—Ä–º–∞–ª—å–Ω–∞—è –¥–µ–≤—É—à–∫–∞) –≤–æ–æ–±—Ä–∞–∂—É–ª—è –æ–¥–Ω–∏–º —Å–ª–æ–≤–æ–ºüòå –≤—ã—Å—Ç–∞–≤–∏–ª–∞ —Å–≤–æ—é –ø–æ–¥–µ–ª–∫—É –Ω–∞ , –Ω–∞–¥–µ—é—Å—å –µ–π –ø–æ—Å—Ç–∞–≤—è—Ç –≤—ã—Å—à–∏–π –±–∞–ª–ª)–≤–æ—Ç –≤—ã –¥—É–º–∞–µ—Ç–µ, –ø–æ—á–µ–º—É –≤–∞—à–∏ —Ä–æ–¥–∏—Ç–µ–ª–∏ —Å —Ç–∞–∫–∏–º —Ç–µ–ø–ª–æ–º –≤—Å–ø–æ–º–∏–Ω–∞—é—Ç –ø–∏–æ–Ω–µ—Ä—Å–∫–∏–µ –¥–µ–Ω—å–∫–∏ü§î –∏–∑-–∑–∞ —Ç–æ–≥–æ, —á—Ç–æ –µ–∑–¥–∏–ª–∏ –≤ –∫–∞–∫–æ–π-–Ω–∏–±—É–¥—å –ª–∞–≥–µ—Ä—å —Ç–∏–ø–∞ "–æ—Ä–ª–µ–Ω–æ–∫" –∏–ª–∏ "–∞—Ä—Ç–µ–∫"? –∏–ª–∏ –∏–∑-–∑–∞ –ø–∏–ª–æ—Ç–æ–∫? –Ω–µ–µ–µ–µ–µ–µ—ÇüòÄ –æ—à–∏–±–∞–µ—Ç–µ—Å—å) –≤—Å–µ –ø–æ—Ç–æ–º—É, —á—Ç–æ

# Shisha Learning ü§ôü§ôü§ô
From https://stackabuse.com/text-generation-with-python-and-tensorflow-keras/

In [4]:
chars = sorted(list(set(text)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

input_len = len(text)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 75950
Total vocab: 172


In [5]:
seq_length = 20
x_data = []
y_data = []

# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = text[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = text[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 75930


In [6]:
X = np.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [7]:
y = np_utils.to_categorical(y_data)

In [8]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [9]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [10]:
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [11]:
model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Train on 75930 samples
Epoch 1/20








Epoch 00001: loss improved from inf to 3.40842, saving model to model_weights_saved.hdf5
Epoch 2/20
Epoch 00002: loss improved from 3.40842 to 3.32395, saving model to model_weights_saved.hdf5
Epoch 3/20
Epoch 00003: loss improved from 3.32395 to 3.31547, saving model to model_weights_saved.hdf5
Epoch 4/20
Epoch 00004: loss improved from 3.31547 to 3.30973, saving model to model_weights_saved.hdf5
Epoch 5/20
Epoch 00005: loss improved from 3.30973 to 3.30735, saving model to model_weights_saved.hdf5
Epoch 6/20
Epoch 00006: loss improved from 3.30735 to 3.30526, saving model to model_weights_saved.hdf5
Epoch 7/20
Epoch 00007: loss improved from 3.30526 to 3.29921, saving model to model_weights_saved.hdf5
Epoch 8/20
Epoch 00008: loss improved from 3.29921 to 3.22438, saving model to model_weights_saved.hdf5
Epoch 9/20
Epoch 00009: loss improved from 3.22438 to 3.13835, saving model to model_weights_saved.hdf5
Epoch 10/20
Epoch 00010: loss improved from 3.13835 to 3.11290, saving model to

<tensorflow.python.keras.callbacks.History at 0x7f8b901c0c10>

In [12]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [13]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [14]:
def take_seed():
    start = np.random.randint(0, len(kalikfan) - 1)
    s = preproc(kalikfan[start]['text'])[:seq_length]
    return [char_to_num[x] for x in s]

In [15]:
def pred(pattern):
    result = []
    for i in range(100):
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = x / float(vocab_len)
        prediction = model.predict(x, verbose=0)
        index = np.argmax(prediction)
        result += num_to_char[index]
        seq_in = [num_to_char[value] for value in pattern]

        pattern.append(index)
        pattern = pattern[1:len(pattern)]
        
    return ''.join(result)

In [16]:
def to_input(s):
    return [char_to_num[x] for x in s]

In [17]:
def test():
    s = take_seed()
    p = pred(s)
    print(''.join([num_to_char[x] for x in s]) + "|" + p)

In [19]:
test()

-–¥–∞ —è —Ç–µ–±–µ –≥–æ–≤–æ—Ä—é - -|-–¥–∞ –ø–∞ –ø–æ –ø–∞ –ø–æ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ—Ä–∏ –ø–∞ –ø–æ–æ–µ
