In [1]:
import numpy as np
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import backend

import json
import re

Using TensorFlow backend.


# Load Posts and Comments from data

In [2]:
BEFORE_START = '\0'  # sentinel character prepended to every text
AFTER_END = '\1'     # sentinel character appended to every text


def preproc(s):
    """Normalize one raw text and wrap it in the start/end sentinels.

    Steps, in order:
      1. prepend BEFORE_START and append AFTER_END;
      2. replace every bracketed ``[left|right]`` span with a single space;
      3. collapse each run of whitespace into one space;
      4. turn the guillemets « and » into a plain double quote;
      5. lower-case the result.

    Args:
        s: the raw text (str).

    Returns:
        The normalized, sentinel-wrapped string.
    """
    s = BEFORE_START + s + AFTER_END
    # Each bracketed span is matched on its own. The previous greedy pattern
    # ran from the first '[' to the last ']' on a line and therefore also
    # deleted ordinary text sitting between two bracketed spans. As before, a
    # span never crosses a line break.
    s = re.sub(r'\[[^\[\]|\n]*\|[^\[\]\n]*\]', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[«»]', '"', s)
    return s.lower()

In [3]:
# Read the dataset as UTF-8 explicitly: without an encoding argument open()
# falls back to the platform default, which can fail or garble non-ASCII text.
with open('data/kalikfan.json', 'r', encoding='utf-8') as f:
    kalikfan = json.load(f)

# Keep only the normalized 'text' field of every record.
texts = [preproc(x['text']) for x in kalikfan]

# Shisha Learning 🤙🤙🤙
From https://stackabuse.com/text-generation-with-python-and-tensorflow-keras/

In [4]:
# Vocabulary: every distinct character that occurs in the texts, sorted.
chars = sorted(set(''.join(texts)))
vocab_len = len(chars)
input_len = len(texts)

# Two-way lookup between a character and its position in `chars`.
char_to_num = dict(zip(chars, range(vocab_len)))
num_to_char = dict(enumerate(chars))

print("Total number of text samples:", input_len)
print("Total vocab:", vocab_len)

Total number of text samples: 755
Total vocab: 174


In [5]:
SEQ_LENGTH = 10  # number of characters the network sees per prediction
x_data = []      # input windows, each a list of SEQ_LENGTH character indices
y_data = []      # index of the character that follows each window

for text in texts:
    # Pad on both sides so that windows exist for the very first character and
    # for the end-of-text marker.
    # NOTE(review): the trailing padding also yields windows that already
    # contain AFTER_END and predict AFTER_END again - confirm this is intended.
    padded = [BEFORE_START] * (SEQ_LENGTH - 1) + list(text) + [AFTER_END] * SEQ_LENGTH
    encoded = [char_to_num[ch] for ch in padded]
    for start in range(len(encoded) - SEQ_LENGTH):
        stop = start + SEQ_LENGTH
        x_data.append(encoded[start:stop])
        y_data.append(encoded[stop])

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 84255


In [6]:
# Arrange the windows as (patterns, SEQ_LENGTH, 1) and divide the character
# indices by the vocabulary size, which maps them into [0, 1).
X = np.asarray(x_data).reshape(n_patterns, SEQ_LENGTH, 1) / float(vocab_len)

In [7]:
# One-hot encode the targets. The width is pinned to the vocabulary size
# instead of being inferred from max(y_data): the output layer is sized from
# y.shape[1] and pred() samples over all of `chars`, so both must agree even
# if the highest-indexed character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [8]:
# Three stacked LSTM layers, each followed by dropout, and a softmax layer
# with one output per column of the one-hot target matrix.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [9]:
# Categorical cross-entropy matches the one-hot targets in `y` and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [10]:
# Write the weights to `filepath` each time the training loss reaches a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [68]:
# Train; the checkpoint callback saves the weights whenever the loss improves.
# NOTE(review): the saved output of this cell reads "Epoch N/10" although this
# call requests 100 epochs, and every epoch reports "loss did not improve" -
# presumably the output is from an earlier run in a kernel where the callback
# had already recorded a better loss. Re-run on a fresh kernel to refresh it.
model.fit(X, y, epochs=100, batch_size=256, callbacks=desired_callbacks)

Train on 84255 samples
Epoch 1/10
Epoch 00001: loss did not improve from 1.46251
Epoch 2/10
Epoch 00002: loss did not improve from 1.46251
Epoch 3/10
Epoch 00003: loss did not improve from 1.46251
Epoch 4/10
Epoch 00004: loss did not improve from 1.46251
Epoch 5/10
Epoch 00005: loss did not improve from 1.46251
Epoch 6/10
Epoch 00006: loss did not improve from 1.46251
Epoch 7/10
Epoch 00007: loss did not improve from 1.46251
Epoch 8/10
Epoch 00008: loss did not improve from 1.46251
Epoch 9/10
Epoch 00009: loss did not improve from 1.46251
Epoch 10/10
Epoch 00010: loss did not improve from 1.46251


<tensorflow.python.keras.callbacks.History at 0x7f5b9423dc50>

In [12]:
# Load the weights from the checkpoint file into the model, then compile it
# again with the same loss and optimizer used for training.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [83]:
def pred(pattern, max_len=500):
    """Generate text by sampling from ``model`` one character at a time.

    Starting from the seed window ``pattern``, the model's output distribution
    is sampled, the sampled character is emitted, and the window slides one
    position forward. Generation stops when AFTER_END is sampled; if more than
    ``max_len`` sampling steps pass without that happening, generation is cut
    off and '@' is appended to mark the truncation.

    Args:
        pattern: sequence of character indices used as the initial window.
            It is copied and never modified.
        max_len: cut-off threshold; at most ``max_len + 1`` characters are
            sampled before the text is truncated. Defaults to 500.

    Returns:
        The generated string, without the start/end sentinel characters.
    """
    # Private copy: the original code appended to the caller's list.
    window = list(pattern)
    result = []

    for _ in range(max_len + 1):
        x = np.reshape(window, (1, len(window), 1))
        x = x / float(vocab_len)
        prediction = model.predict(x, verbose=0)

        # Re-normalize in float64 so that rounding in the network output
        # cannot trip the sum-to-one check inside np.random.choice.
        probs = np.asarray(prediction[0], dtype=np.float64)
        probs = probs / probs.sum()

        # Sample an index rather than a character: turning `chars` into a
        # NumPy string array strips the trailing NUL of BEFORE_START ('\0'),
        # so a sampled start sentinel came back as '' and raised KeyError.
        idx = int(np.random.choice(len(probs), p=probs))
        res = num_to_char[idx]

        if res == AFTER_END:
            return ''.join(result)
        # The start sentinel is a control marker, not text: it is fed back to
        # the model but left out of the output.
        if res != BEFORE_START:
            result.append(res)

        window = window[1:] + [idx]

    result.append('@')  # the text was cut off before AFTER_END was sampled
    return ''.join(result)

In [84]:
def to_input(s):
    """Encode ``s`` as a list of vocabulary indices, one per character.

    Raises:
        KeyError: if ``s`` contains a character missing from ``char_to_num``.
    """
    encoded = []
    for ch in s:
        encoded.append(char_to_num[ch])
    return encoded

In [85]:
def test():
    """Print one text generated from a window made only of start sentinels."""
    start_idx = char_to_num[BEFORE_START]
    seed = [start_idx for _ in range(SEQ_LENGTH)]
    print(pred(seed))

In [100]:
# Generate and print one sample. pred() draws characters at random and no seed
# is set in this notebook, so the text differs from run to run.
test()

-аратиккшная пасонкоэйи покурить😈
