In [202]:
def tmpdir(path):
    return "{0}/tmp/{1}".format(os.environ['HOME'], path)

JA_TWEETS_TXT = "../../dont_remove_data/ja_20170407_084226.txt"
JA_TWEETS_CLEAN_TXT = tmpdir("ja_tweets_clean.saved")

JA_TWEETS_LEMMA_TXT = tmpdir("ja_tweets_lemma.saved")
JA_TWEETS_WORD_VEC_CACHE_PATH = tmpdir("ja_tweets_word_vec_catch.saved")

JA_QUESTION_LINES_VAL_PATH = tmpdir("ja_question_lines_val.saved")
JA_ANSWER_LINES_VAL_PATH = tmpdir("ja_answer_lines_val.saved")
JA_QUESTION_LINES_TRAIN_PATH = tmpdir("ja_question_lines_train.saved")
JA_ANSWER_LINES_TRAIN_PATH = tmpdir("ja_answer_lines_train.saved")
JA_SANITIZED_TEXT_PATH = tmpdir("ja_sanitized_text.saved")
JA_SANITIZED_VEC_MODEL_CACHE_PATH = tmpdir("ja_chat_bot_sanitized_word_vec_model.saved")
WORD_VEC_SIZE = 1024
SEQ_LEN = 20
MODEL_NAME = "chatbot_ja"

# Constant tokens
UNK = 'unk'
EOS = 'EOS'

# Build word_vec from file
import os
import re
import pickle
import MeCab
import subprocess
from gensim.models import word2vec
from seq2seq.models import SimpleSeq2Seq
from keras.callbacks import ModelCheckpoint
from keras.models import load_model


def log(msg):
    print(">{0}...".format(msg))

# Remove @username and URL
def cleanupTweets(fromFile, toFile):
    with open(fromFile, "r") as fin, open(toFile, 'w') as fout:
        for line in fin:
            text = re.sub(r"@([A-Za-z0-9_]+)", "", line)
            text2 = re.sub(r'https?:\/\/.*[\r\n]*', "", text)
            fout.write(text2)
            
def lemmalize(fromFile, toFile):
    subprocess.check_output('mecab -Owakati < "{0}" > {1}'.format(fromFile, toFile),
                            shell=True)

def build_word_vec(path, word_vec_size=WORD_VEC_SIZE, min_count=0, window=15):
    sentences = word2vec.LineSentence(path)
    word_vec = word2vec.Word2Vec(sentences, size=word_vec_size, min_count=min_count, window=window)    
    return word_vec

def load_or_build_word_vec(vocab_path, cache_path, enable_cache):
    if enable_cache and os.path.exists(cache_path):
        log("loading from exisiting model")
        return word2vec.Word2Vec.load(cache_path)
    else:
        log("creating word_vec model")
        word_vec = build_word_vec(vocab_path)
        word_vec.save(cache_path)
        return word_vec    
    
def read_lines(filename):
    return open(filename).read().split('\n')[:-1]

def load_or_build_sanitized_lines(lines, word_vec, enable_cache):
    sanitized_lines = False
    key = "sanitized_lines_ja"
    if enable_cache:
        sanitized_lines = load_obj(key, sanitized_lines)
    if sanitized_lines:
        return sanitized_lines
    ret = sanitize_lines(lines, word_vec)
    save_obj(key, ret)
    return ret    

def save_obj(key, obj):
    print(">saving {0} to file".format(tmpdir(key)))
    with open(tmpdir("{0}.saved".format(key)), mode='wb') as f:
        pickle.dump(obj, f, protocol=0)

def load_obj(key, obj):
    filepath = tmpdir("{0}.saved".format(key))
    if os.path.exists(filepath):
        print(">loading {0} from file".format(key))
        with open(filepath, mode='rb') as f:
            return pickle.load(f)
    else:
        print(">{0} not found.".format(key))        
        return False
    
def split_lines(lines):
    question_lines = []
    answer_lines = []
    print(">splinting data...")
    # 2n   lines are questions
    # 2n+1 lines are answers
    for i in range(0, len(lines) // 2, 2):
        question_lines.append(lines[i])
        answer_lines.append(lines[i + 1])
    return (question_lines, answer_lines)  

def sanitize_lines(lines, word_vec):
    sanitized_lines = []    
    for line in lines:
        sanitized_line = []
        for word in line.split():
            if word in word_vec.wv:
                sanitized_line.append(word)
            else:
                sanitized_line.append(UNK)
        sanitized_line.append(EOS)
        sanitized_lines.append(' '.join(sanitized_line))
    return sanitized_lines

def save_lines(path, lines):
    f = open(path, 'w')
    for line in lines:
        f.write(line)
        f.write('\n')
    f.close()    

def build_train_data(question_lines, answer_lines, nb_train, word_vec):
    return (shape_data(question_lines, nb_train, word_vec), 
            shape_data(answer_lines, nb_train, word_vec))

def weights_dir(model_version):
    return "{0}/Dropbox/{1}-{2}".format(os.environ['HOME'], MODEL_NAME, model_version)

def best_weights_path(model_version):
    files = os.listdir(weights_dir(model_version))
    files.sort()
    return "{0}/{1}".format(weights_dir(model_version), files[0])

def default_callback_list(model_version):
    os.makedirs(weights_dir(model_version), exist_ok=True)
    filepath = weights_dir(model_version) + "/{loss:.4f}"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    return [checkpoint]

In [203]:
log("removing @usname and url")
cleanupTweets(JA_TWEETS_TXT, JA_TWEETS_CLEAN_TXT)
log("lemmalizing")
lemmalize(JA_TWEETS_CLEAN_TXT, JA_TWEETS_LEMMA_TXT)


>removing @usname and url...
>lemmalizing...


In [204]:
log("building word_vec")
word_vec = load_or_build_word_vec(JA_TWEETS_LEMMA_TXT, JA_TWEETS_WORD_VEC_CACHE_PATH, True)


>building word_vec...
>creating word_vec model...


In [205]:
print(word_vec.most_similar("いつか", topn=10))

log("word_vec is working")

[('うん', 0.9979744553565979), ('する', 0.9979556202888489), ('様', 0.9979550242424011), ('」', 0.9979531764984131), ('お', 0.9979510307312012), ('時', 0.9979509115219116), ('（', 0.9979509115219116), ('けど', 0.9979502558708191), ('プレゼント', 0.9979496002197266), ('結果', 0.9979491233825684)]
>word_vec is working...


In [206]:
log("reading tweets")
data_lines = read_lines(JA_TWEETS_LEMMA_TXT)
print("data_lines[10]", data_lines[10])


>reading tweets...
data_lines[10] 頑張っ て ( ˇ ω ˇ ) 


In [207]:
print(">sanitizing lines")
sanitized_lines = load_or_build_sanitized_lines(data_lines, word_vec, enable_cache=True)
question_lines, answer_lines = split_lines(sanitized_lines)
print(len(sanitized_lines))
print("question_lines[15]=", question_lines[15])
print("answer_lines[15]=", answer_lines[15])


>sanitizing lines
>sanitized_lines_ja not found.
>saving /Users/higepon/tmp/sanitized_lines_ja to file
>splinting data...
701
question_lines[15]= めちゃくちゃ きつい よ ね www 朝 早い し 夜 遅い し つかれ も えぐい （ 笑 ） EOS
answer_lines[15]= う ン 知っ て る () EOS


In [208]:
# 10%
nb_validation = int(len(question_lines) * 0.1)

question_lines_train = question_lines[nb_validation:]
question_lines_val = question_lines[:nb_validation]

answer_lines_train = answer_lines[nb_validation:]
answer_lines_val = answer_lines[:nb_validation]

save_lines(JA_QUESTION_LINES_VAL_PATH, question_lines_val)
save_lines(JA_ANSWER_LINES_VAL_PATH, answer_lines_val)
save_lines(JA_QUESTION_LINES_TRAIN_PATH, question_lines_train)
save_lines(JA_ANSWER_LINES_TRAIN_PATH, answer_lines_train)


# Save the sanitized text to file, so that we can use it for WordVec
save_lines(JA_SANITIZED_TEXT_PATH, sanitized_lines)
print(JA_SANITIZED_TEXT_PATH, "wrritten as file")

sanitized_lines = None
data_lines = None
word_vec = None

/Users/higepon/tmp/ja_sanitized_text.saved wrritten as file


In [209]:
# Create word_vec again, so that UNK token can also have vector representation
print(">buidling word vec with unk")
word_vec_with_unk = load_or_build_word_vec(JA_SANITIZED_TEXT_PATH, JA_SANITIZED_VEC_MODEL_CACHE_PATH, enable_cache=True)
print("word_vec_with_unk", word_vec_with_unk)

>buidling word vec with unk
>creating word_vec model...
word_vec_with_unk Word2Vec(vocab=4782, size=1024, alpha=0.025)


In [210]:
def shape_line(line, word_vec):
    zero_vec = np.zeros((WORD_VEC_SIZE))
    ret = []
    sentence = []
    words = line.split()
    for i in range(0, SEQ_LEN):
        if i < len(words):
            sentence.append(word_vec.wv[words[i]])
        else:
            sentence.append(zero_vec)            
    ret.append(sentence)
    return np.array(ret)

def generate_train_from_file(questions_path, answers_path, word_vec):
    while 1:
        with open(questions_path) as qf, open(answers_path) as af: 
            for q, a in zip(qf, af):
                x = shape_line(q, word_vec)
                y = shape_line(a, word_vec)
                yield (x, y)

In [211]:
print(">initiating model")
seq2seq = SimpleSeq2Seq(
    input_dim=WORD_VEC_SIZE,
    input_length=SEQ_LEN,
    hidden_dim=WORD_VEC_SIZE,
    output_dim=WORD_VEC_SIZE,
    output_length=SEQ_LEN,
    unroll=True,
    depth=4)
print(seq2seq)

print(">compiling seq2seq...")
#seq2seq.compile(loss='categorical_crossentropy', optimizer='adam')

seq2seq.compile(loss='mse', optimizer='sgd')
print(">done")


>initiating model
<keras.models.Sequential object at 0x125902d30>
>compiling seq2seq...
>done


In [212]:
import numpy as np
# Train!
model_version = 9
train_generator = generate_train_from_file(JA_QUESTION_LINES_TRAIN_PATH, JA_ANSWER_LINES_TRAIN_PATH, word_vec_with_unk)
val_generator = generate_train_from_file(JA_QUESTION_LINES_VAL_PATH, JA_ANSWER_LINES_VAL_PATH, word_vec_with_unk)

samples_per_epoch = 300
nb_val_samples = int(samples_per_epoch * 0.1)

seq2seq.fit_generator(train_generator,
                      samples_per_epoch=samples_per_epoch,
                      nb_val_samples=nb_val_samples,
                      nb_epoch=1,
                      verbose=1,
                      validation_data=val_generator,
                      callbacks=default_callback_list(model_version))


Epoch 1/1


<keras.callbacks.History at 0x249d89a20>

In [172]:
filtered_texts = [
    "もうかえりたい"
]

def shape_data(lines, nb_lines, word_vec):
    zero_vec = np.zeros((WORD_VEC_SIZE))
    ret = []
    for line in lines[:nb_lines]:
        sentence = []
        words = line.split()
        for i in range(0, SEQ_LEN):
            if i < len(words):
                sentence.append(word_vec.wv[words[i]])
            else:
                sentence.append(zero_vec)            
        ret.append(sentence)
    return np.array(ret)

In [173]:
texts = []
for line in filtered_texts:
    sanitized_line = []
    for word in line.split():
        if word in word_vec_with_unk.wv:
            sanitized_line.append(word)
        else:
            sanitized_line.append(UNK)
    texts.append(' '.join(sanitized_line))
print("texts=", texts)
X_test = shape_data(texts, len(texts), word_vec_with_unk)
print("shaped\n")
Y_test = seq2seq.predict(X_test.reshape(-1, 20, WORD_VEC_SIZE))
for i, y in enumerate(Y_test):
    for word in y:
        print(word_vec_with_unk.similar_by_vector(word, topn=1)[0])

texts= ['unk']


KeyError: "word 'unk' not in vocabulary"