In [1]:
import pickle
import os.path

# Constants and Helpers
FILE_PATH = "../../dont_remove_data/twitter_en.txt"
WORD_VEC_MODEL = "chat_bot_word_vec_model.saved"
SANITIZED_TEXT_PATH = "sanitized_text.saved"
SANITIZED_VEC_MODEL = "chat_bot_sanitized_word_vec_model.saved"
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz ' # space is included in whitelist
word_vec_size = 1024
seq_len = 20

def read_lines(filename):
    return open(filename).read().split('\n')[:-1]

def filter_line(line, whitelist):
    return ''.join([ ch for ch in line if ch in whitelist ])

def namestr(obj):
    return [name for name in globals() if globals()[name] is obj][0]

def save_obj(obj):
    varname = namestr(obj)
    print(">saving {0} to file".format(varname))
    with open("{0}.saved".format(varname), mode='wb') as f:
        pickle.dump(obj, f, protocol=0)

def load_obj(obj):
    varname = namestr(obj)
    filepath = "{0}.saved".format(varname)
    if os.path.exists(filepath):
        print(">loading {0} from file".format(varname))
        with open(filepath, mode='rb') as f:
            return pickle.load(f)
    else:
        print(">{0} not found.".format(varname))        
        return False

In [2]:
# Read file as lines

print(">reading file...")
lines = read_lines(FILE_PATH)
print(lines[10])

print(">removing unnecesary characters")
lines = [filter_line(line, EN_WHITELIST) for line in lines]

>reading file...
jill stein &amp; her fan club can now officially go to hell -just vote trump &amp; be done with it
>removing unnecesary characters


In [3]:
# Create Word Vector for all vocaburaries in the whole data
from gensim.models import word2vec
import os.path

if os.path.exists(WORD_VEC_MODEL):
    print(">loading from exisiting model")
    word_vec = word2vec.Word2Vec.load(WORD_VEC_MODEL)
else:
    print(">creating word_vec model")
    print(">loading corpus for word vec...")
    sentences = word2vec.LineSentence(FILE_PATH)
    word_vec = word2vec.Word2Vec(sentences, size=word_vec_size, min_count=20, window=15)
    word_vec.save(WORD_VEC_MODEL)
 
print(">word_vec done", word_vec)

>loading from exisiting model
>word_vec done Word2Vec(vocab=22915, size=1024, alpha=0.025)


In [4]:
# Replace all unknown words with UNK token
sanitized_lines = False
sanitized_lines = load_obj(sanitized_lines)
UNK = 'unk'
EOS = 'EOS'

print("len(lines)", len(lines))

if not sanitized_lines:
    sanitized_lines = []
    for line in lines:
        sanitized_line = []
        for word in line.split():
            if word in word_vec.wv:
                sanitized_line.append(word)
            else:
                sanitized_line.append(UNK)
        sanitized_line.append(EOS)
        sanitized_lines.append(' '.join(sanitized_line))
    save_obj(sanitized_lines)

print("len(sanitized_lines)", len(sanitized_lines))
del word_vec

>loading sanitized_lines from file
len(lines) 754530
len(sanitized_lines) 754530


In [5]:
# Save the sanitized text to file, so that we can use it for WordVec
f = open(SANITIZED_TEXT_PATH, 'w')
for line in sanitized_lines:
    f.write(line)
    f.write('\n')
f.close()
print(SANITIZED_TEXT_PATH, "wrritten as file")

sanitized_text.saved wrritten as file


In [6]:
# Create word_vec again, so that UNK token can also have vector representation
if os.path.exists(SANITIZED_VEC_MODEL):
    print(">loading from exisiting model")
    word_vec_with_unk = word2vec.Word2Vec.load(SANITIZED_VEC_MODEL)
else:
    print(">creating word_vec model")
    print(">loading corpus for word vec...")
    sentences = word2vec.LineSentence(FILE_PATH)
    word_vec_with_unk = word2vec.Word2Vec(word2vec.LineSentence(SANITIZED_TEXT_PATH), size=word_vec_size, min_count=20, window=15)
    word_vec_with_unk.save(SANITIZED_VEC_MODEL)
 
print(">word_vec done", word_vec_with_unk)
print(word_vec_with_unk.wv[UNK][0])
wv = word_vec_with_unk.wv

>loading from exisiting model
>word_vec done Word2Vec(vocab=14369, size=1024, alpha=0.025)
0.135079


In [7]:
# Split the lines to question and answer
question_lines = []
answer_lines = []

print(">splinting data")

# 2n   lines are questions
# 2n+1 lines are answers
for i in range(0, len(sanitized_lines) // 2, 2):
    question_lines.append(sanitized_lines[i])
    answer_lines.append(sanitized_lines[i + 1])

print("question_lines[10]=", question_lines[10])
print("answer_lines[10]=", answer_lines[10])

>splinting data
question_lines[10]= unk operation in best response for any action pakistan and army EOS
answer_lines[10]= while unk corruption schemes before next unk unk gets the charge EOS


In [8]:
def shape_data(lines, nb_lines):
    zero_vec = np.zeros((word_vec_size))
    ret = []
    for line in lines[:nb_lines]:
        sentence = []
        words = line.split()
        for i in range(0, seq_len):
            if i < len(words):
                sentence.append(wv[words[i]])
            else:
                sentence.append(zero_vec)            
        ret.append(sentence)
    return np.array(ret)

def build_train_data(question_lines, answer_lines, nb_train):
    return (shape_data(question_lines, nb_train), 
            shape_data(answer_lines, nb_train))




In [9]:
# split train data
import numpy as np
TEST_DATA_RATIO = 0.70
print(">spliting train data")
nb_train = int(len(question_lines) * TEST_DATA_RATIO)
print("nb_train=", nb_train)



X_train = False
Y_train = False
X_train = load_obj(X_train)
Y_train = load_obj(Y_train)

if not X_train:
    print(">building X_train...")
    X_train, Y_train = build_train_data(question_lines, answer_lines, nb_train)
    print(">saving X_train...")
#    save_obj(X_train)
    print(">saving Y_train...")
#    save_obj(Y_train)

print("X_train.shape", X_train.shape)
print("Y_train.shape", Y_train.shape)


>spliting train data
nb_train= 132043
>X_train not found.
>X_train not found.
>building X_train...
>saving X_train...
>saving Y_train...
X_train.shape (132043, 20, 1024)
Y_train.shape (132043, 20, 1024)


In [10]:
# too large
# Should split this as functional data
#with open("{0}.saved".format("X_train2"), mode='wb') as f:
#        np.save(f, X_train)


In [11]:
from keras.callbacks import ModelCheckpoint
model_name = "chat_bot"

def weights_dir(model_version):
    return "{0}/Desktop/{1}-{2}".format(os.environ['HOME'], model_name, model_version)

def best_weights_path(model_version):
    files = os.listdir(weights_dir(model_version))
    files.sort()
    return "{0}/{1}".format(weights_dir(model_version), files[0])

def default_callback_list(model_version):
    os.makedirs(weights_dir(model_version), exist_ok=True)
    filepath = weights_dir(model_version) + "/{loss:.4f}"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    return [checkpoint]


Using TensorFlow backend.


In [12]:
# Keras model
from seq2seq.models import SimpleSeq2Seq

print(">initiating model")
seq2seq = SimpleSeq2Seq(
    input_dim=word_vec_size,
    input_length=seq_len,
    hidden_dim=word_vec_size,
    output_dim=word_vec_size,
    output_length=seq_len,
    unroll=True,
    depth=3) # todo change this to 3

print(word_vec_size)
print(seq_len)
#model = SimpleSeq2Seq(input_dim=5, input_length=8, hidden_dim=10, output_length=8, output_dim=8, depth=3, unroll=True)


print(seq2seq)

>initiating model
1024
20
<keras.models.Sequential object at 0x124dacba8>


In [13]:
print(">compiling seq2seq...")
#seq2seq.compile(loss='categorical_crossentropy', optimizer='adam')
seq2seq.compile(loss='mse', optimizer='adam')
print(">done")
model_version = 3


>compiling seq2seq...
>done


In [22]:

seq2seq.fit(X_train, Y_train, batch_size=100, nb_epoch=10, verbose=1, validation_split=0.05, callbacks=default_callback_list(model_version))
#seq2seq.fit(X_train, Y_train)

Train on 125440 samples, validate on 6603 samples
Epoch 1/10
 10900/125440 [=>............................] - ETA: 19132s - loss: 0.1435

KeyboardInterrupt: 

In [21]:
# you want to turn twitter followers into blog readers
tt = seq2seq.predict(X_train[1].reshape(1, 20, word_vec_size))[0]
print(tt.shape)
for t in tt:
    print(word_vec_with_unk.similar_by_vector(t, topn=1)[0])


(20, 1024)
('i', 0.8755702972412109)
('i', 0.6418577432632446)
('it', 0.6855287551879883)
('it', 0.6676929593086243)
('it', 0.6593996286392212)
('it', 0.6500342488288879)
('it', 0.6462546586990356)
('it', 0.6440309286117554)
('it', 0.6432981491088867)
('it', 0.6429035663604736)
('it', 0.6425304412841797)
('it', 0.6419748067855835)
('EOS', 0.6438758373260498)
('EOS', 0.6474924087524414)
('EOS', 0.6516628265380859)
('EOS', 0.6564445495605469)
('EOS', 0.6617050766944885)
('EOS', 0.6668707132339478)
('EOS', 0.6715269684791565)
('EOS', 0.6755284070968628)


In [15]:
print("loading", best_weights_path(model_version))
seq2seq.load_weights(best_weights_path(model_version))  
print("loaded")

IndexError: list index out of range

In [16]:
texts = [
    'When is good for you today?',
    "I'm leaving soon."
]
filtered_texts = [filter_line(line.lower(), EN_WHITELIST) for line in texts]

print(filtered_texts)



['when is good for you today', 'im leaving soon']


In [17]:
texts = []
for line in filtered_texts:
    sanitized_line = []
    for word in line.split():
        if word in word_vec_with_unk.wv:
            sanitized_line.append(word)
        else:
            sanitized_line.append(UNK)
    texts.append(' '.join(sanitized_line))
print("texts=", texts)
X_test = shape_data(texts, len(texts))
print("shaped\n")
Y_test = seq2seq.predict(X_test.reshape(-1, 20, word_vec_size))
for i, y in enumerate(Y_test):
    for word in y:
        print(word_vec_with_unk.similar_by_vector(word, topn=1)[0])

texts= ['when is good for you today', 'im leaving soon']
shaped

('i', 0.8358275890350342)
('i', 0.6553348302841187)
('it', 0.6553231477737427)
('it', 0.6335973739624023)
('EOS', 0.6429638862609863)
('EOS', 0.6631112694740295)
('EOS', 0.6733455657958984)
('EOS', 0.6790584921836853)
('EOS', 0.6820131540298462)
('EOS', 0.6836193799972534)
('EOS', 0.6847196817398071)
('EOS', 0.6858545541763306)
('EOS', 0.6873364448547363)
('EOS', 0.689303457736969)
('EOS', 0.6917437314987183)
('EOS', 0.6944006681442261)
('EOS', 0.6969164609909058)
('EOS', 0.6991458535194397)
('EOS', 0.7008798122406006)
('EOS', 0.7021517753601074)
('i', 0.8621678352355957)
('i', 0.6555297374725342)
('it', 0.670153021812439)
('it', 0.6468616127967834)
('EOS', 0.6353713870048523)
('EOS', 0.6575458645820618)
('EOS', 0.6684443354606628)
('EOS', 0.6744173765182495)
('EOS', 0.6773630976676941)
('EOS', 0.6789718866348267)
('EOS', 0.680191159248352)
('EOS', 0.6815271973609924)
('EOS', 0.6832265257835388)
('EOS', 0.6853458881378174

In [18]:
## EOS
## refactoring