# Preprocessing
Scripts that convert the data into raw text files suitable for the Google seq2seq library,

following the tutorial at https://google.github.io/seq2seq/nmt/

In [1]:
import pandas as pd
import random
from tqdm import tqdm_notebook as tqdm
import numpy as np
from tensorflow.contrib import learn


## load data

In [None]:
def read_lines(filenames):
    """Build (question, answer) text pairs from consecutive tweet rows.

    Every entry of ``filenames`` is expanded to
    ``./Tweets_processed_<name>.txt`` and read as a tab-separated file whose
    first row is skipped and whose columns are named ``index``, ``tweet_id``,
    ``text`` and ``user_id`` (all read as strings). The frames are
    concatenated in the given order and scanned once: whenever a row's
    ``index`` is exactly one greater than the index of the last row whose
    index could be parsed, the earlier text becomes a question and the
    current text its answer.

    Args:
        filenames: iterable of file-name suffixes, e.g. ``["Train", "Valid"]``.

    Returns:
        Tuple ``(q_lines, a_lines)`` of two equally long lists of strings.
    """
    data_files = []
    for filename in filenames:
        print("Reading Tweets_processed_%s.txt" % filename)
        # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and
        # removed in 2.0 (replaced by `on_bad_lines="skip"`); it is kept here
        # because the rest of the notebook targets an older stack.
        f = pd.read_csv('./Tweets_processed_%s.txt' % filename,
                        sep="\t",
                        skiprows=[0],
                        error_bad_lines=False,
                        names=["index", "tweet_id", "text", "user_id"],
                        dtype={"text": str, "user_id": str, "tweet_id": str, "index": str})
        data_files.append(f)

    print("Appending all the files")
    data_files = pd.concat(data_files)

    q_lines = []
    a_lines = []
    previous = None  # (index, text) of the last row with a parseable index
    for _, row in tqdm(data_files.iterrows()):
        try:
            index = int(row['index'])
        except ValueError:
            # Unparseable index: report it and leave `previous` untouched.
            print("ValueError %s" % row['index'])
            continue
        # Index 0 is a valid row. The former truthiness test (`if index:`)
        # skipped it, so the pair formed by rows 0 and 1 was never emitted.
        if previous is not None and previous[0] == index - 1:
            q_lines.append(str(previous[1]))
            a_lines.append(str(row['text']))
        previous = (index, row['text'])

    print("%s Q&A pairs added" % len(q_lines))
    return q_lines, a_lines


In [None]:
# Build the question/answer pair lists from the Train, Valid and Test files.
q, a = read_lines(["Train", "Valid", "Test"])

In [None]:
# Shuffle the pair indices and split them 90% / 10% into train and test.
# NOTE: the shuffle is unseeded, so the split differs from run to run.
random_idx = list(range(len(q)))
random.shuffle(random_idx)
split_at = int(len(random_idx) * 0.9)
train_idx = random_idx[:split_at]
# The test slice starts exactly at the split point; the former "+1" offset
# silently dropped one pair from both sets.
test_idx = random_idx[split_at:]
print(len(train_idx))
print(len(test_idx))

In [None]:
# Print the first ten pairs of the shuffled test split as a sanity check.
for i in test_idx[:10]:
    print("Q: %s A: %s\n" % (q[i], a[i]))

In [None]:
# Questions that belong to the test split.
test_q = [q[pair_idx] for pair_idx in test_idx]
len(test_q)

In [None]:
# Questions that belong to the training split.
train_q = [q[pair_idx] for pair_idx in tqdm(train_idx)]
len(train_q)

In [None]:
# Answers that belong to the test split.
test_a = [a[pair_idx] for pair_idx in test_idx]
len(test_a)

In [None]:
# Answers that belong to the training split.
train_a = [a[pair_idx] for pair_idx in tqdm(train_idx)]
len(train_a)

In [None]:
def save_file(name, q, a):
    """Write questions and answers to two line-aligned text files.

    Line ``i`` of ``<name>.question`` holds ``q[i]`` and line ``i`` of
    ``<name>.answer`` holds ``a[i]``. Line breaks inside a text are replaced
    by a single space so that every pair occupies exactly one line per file.

    Args:
        name: path prefix; ``.question`` / ``.answer`` is appended to it.
        q: sequence of question strings.
        a: sequence of answer strings, indexed in parallel with ``q``.

    Raises:
        IndexError: if ``a`` is shorter than ``q``.
    """
    def _one_line(text):
        # Besides "\n", a bare "\r" (and "\r\n") is read back as a line break
        # when the file is opened in text mode, which would shift every
        # following line and misalign the two files. Flatten all three.
        return text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")

    with open("%s.question" % name, "w") as f1, open("%s.answer" % name, "w") as f2:
        for i in range(len(q)):
            f1.write(_one_line(q[i]) + "\n")
            f2.write(_one_line(a[i]) + "\n")

In [2]:
def load_file(name):
    """Read back a question/answer file pair.

    Opens ``<name>.question`` and ``<name>.answer``, strips trailing
    whitespace (including the newline) from every line and returns both
    lists.

    Args:
        name: path prefix shared by the two files.

    Returns:
        Tuple ``(questions, answers)`` of lists of strings.

    Raises:
        AssertionError: if the two files differ in their number of lines.
    """
    question_path = "%s.question" % name
    answer_path = "%s.answer" % name
    with open(question_path, "r") as question_file, open(answer_path, "r") as answer_file:
        questions = [raw.rstrip() for raw in question_file]
        answers = [raw.rstrip() for raw in answer_file]
    assert len(questions) == len(answers)
    return questions, answers

In [3]:
# Reload the test pairs from test_large.question / test_large.answer
# (only needed when starting from previously saved files).
test_q, test_a = load_file("test_large")

In [4]:
# Reload the training pairs from train_large.question / train_large.answer.
train_q, train_a = load_file("train_large")

## create vocabulary

In [39]:
# Tokenize the test questions and answers on single spaces.
tokens_test_q = [sentence.split(" ") for sentence in tqdm(test_q)]
tokens_test_a = [sentence.split(" ") for sentence in tqdm(test_a)]





In [40]:
# Tokenize the training questions and answers on single spaces.
tokens_train_q = [sentence.split(" ") for sentence in tqdm(train_q)]
tokens_train_a = [sentence.split(" ") for sentence in tqdm(train_a)]





## Save only responses with 2~30 tokens

In [None]:
# Target length of every token list once <EOS> and <PAD> have been appended.
max_len = 30

In [7]:
# Pair every tokenized question with the tokenized answer at the same position.
train = [(tokens_train_q[k], tokens_train_a[k]) for k in range(len(tokens_train_q))]
test = [(tokens_test_q[k], tokens_test_a[k]) for k in range(len(tokens_test_q))]

In [None]:
# Keep only test pairs whose question and answer each have 2 to max_len - 1 tokens.
_test = [
    pair for pair in test
    if 2 <= len(pair[0]) <= max_len - 1 and 2 <= len(pair[1]) <= max_len - 1
]

In [None]:
# Number of test pairs before and after the length filter.
len(test), len(_test)

In [None]:
# Keep only training pairs whose question and answer each have 2 to max_len - 1 tokens.
_train = [
    pair for pair in train
    if 2 <= len(pair[0]) <= max_len - 1 and 2 <= len(pair[1]) <= max_len - 1
]

In [None]:
# Number of training pairs before and after the length filter.
len(train), len(_train)

In [None]:
# Inspect one filtered training pair.
_train[10]

In [None]:
# Terminate every test question/answer with <EOS> and right-pad it with <PAD>
# up to max_len tokens. The token lists are modified in place.
# The loop names differ from the notebook-level `q` / `a` lists so this cell
# no longer overwrites them.
for q_tokens, a_tokens in tqdm(_test):
    for tokens in (q_tokens, a_tokens):
        # Skip the <EOS> when the list is already terminated/padded, so that
        # re-running the cell does not append a second <EOS>.
        if not tokens or tokens[-1] not in ("<EOS>", "<PAD>"):
            tokens.append("<EOS>")
        # A non-positive difference yields an empty list, i.e. no padding.
        tokens.extend(["<PAD>"] * (max_len - len(tokens)))

In [None]:
# Terminate every training question/answer with <EOS> and right-pad it with
# <PAD> up to max_len tokens. The token lists are modified in place.
# The loop names differ from the notebook-level `q` / `a` lists so this cell
# no longer overwrites them.
for q_tokens, a_tokens in tqdm(_train):
    for tokens in (q_tokens, a_tokens):
        # Skip the <EOS> when the list is already terminated/padded, so that
        # re-running the cell does not append a second <EOS>.
        if not tokens or tokens[-1] not in ("<EOS>", "<PAD>"):
            tokens.append("<EOS>")
        # A non-positive difference yields an empty list, i.e. no padding.
        tokens.extend(["<PAD>"] * (max_len - len(tokens)))

In [None]:
# Inspect the training pair at position 70.
_train[70]

Merge all the words into a single list.

In [41]:
# Rebind _test / _train to the full pair lists built from the token lists.
# NOTE(review): this replaces the length-filtered lists created in earlier
# cells with the unfiltered ones; presumably meant for data reloaded through
# load_file that was already filtered and padded - confirm before a full run.
_test = test
_train = train

In [42]:
# Collect every question and answer token of the test pairs in one flat list.
total_list = []
# The loop names differ from the notebook-level `q` / `a` lists so this cell
# no longer overwrites them.
for q_tokens, a_tokens in tqdm(_test):
    total_list.extend(q_tokens)
    total_list.extend(a_tokens)
len(total_list)




5769660

In [43]:
# Append every question and answer token of the training pairs to total_list.
# The loop names differ from the notebook-level `q` / `a` lists so this cell
# no longer overwrites them.
for q_tokens, a_tokens in tqdm(_train):
    total_list.extend(q_tokens)
    total_list.extend(a_tokens)
len(total_list)




57642542

In [14]:
# Peek at the first ten collected tokens.
total_list[:10]

['u',
 'mean',
 'the',
 'live',
 'broadcast',
 'in',
 'movie',
 'theaters',
 '?',
 'nah']

In [44]:
# Count how often each token occurs in total_list.
from collections import Counter
vocab = Counter(total_list)

In [45]:
# Ten most frequent tokens together with their counts.
vocab.most_common(10)

[('<PAD>', 29999158),
 ('<EOS>', 1921418),
 ('.', 906950),
 ('i', 770699),
 ('!', 766157),
 (',', 572899),
 ('?', 552936),
 ('you', 546690),
 ('the', 478095),
 ('to', 432458)]

### create vocab processor

In [46]:
# Number of most frequent tokens that receive their own id ("UNK" is extra).
vocab_size = 25000

In [47]:
# Token -> id and id -> token lookup tables. Id 0 is reserved for "UNK"; the
# most frequent tokens receive ids from 1 upwards in order of decreasing count.
vocabulary = {"UNK": 0}
reverse_vocabulary = {0: "UNK"}
for count, (word, _) in enumerate(vocab.most_common(vocab_size), start=1):
    vocabulary[word] = count
    reverse_vocabulary[count] = word

In [48]:
# Number of entries in the vocabulary, including "UNK".
len(vocabulary.keys())

25001

In [16]:
# Turn each training token list back into one space-separated line.
_train_q = [" ".join(pair[0]) for pair in tqdm(_train)]
_train_a = [" ".join(pair[1]) for pair in tqdm(_train)]
len(_train_q), len(_train_a)





(864548, 864548)

In [None]:
# Turn each test token list back into one space-separated line.
_test_q = [" ".join(pair[0]) for pair in tqdm(_test)]
_test_a = [" ".join(pair[1]) for pair in tqdm(_test)]
len(_test_q), len(_test_a)

## Save preprocessed data

### raw text

In [None]:
# Write the test pairs to test_large.question / test_large.answer.
save_file("test_large", _test_q, _test_a)

In [None]:
# Write the training pairs to train_large.question / train_large.answer.
save_file("train_large", _train_q, _train_a)

Test that it loads correctly.

### vocabulary & idx matrix

In [49]:
def transform(qa_pairs):
    """Encode tokenized (question, answer) pairs as arrays of vocabulary ids.

    Each token is looked up in the notebook-level ``vocabulary`` mapping;
    tokens that are not in it are encoded as 0 (the id the notebook assigns
    to "UNK"). Pairs whose question and answer differ in length are counted
    as errors and left out of the result.

    Args:
        qa_pairs: sized sequence of ``(question_tokens, answer_tokens)``.

    Returns:
        Tuple ``(q_ids, a_ids)`` of numpy arrays with one row per kept pair.
    """
    q_vocab = []
    a_vocab = []

    error = 0
    for q, a in tqdm(qa_pairs):
        if len(q) == len(a):
            q_vocab.append([vocabulary.get(token, 0) for token in q])
            a_vocab.append([vocabulary.get(token, 0) for token in a])
        else:
            error += 1
    # Guard the ratio so that an empty input reports 0.000 instead of
    # raising ZeroDivisionError.
    total = len(qa_pairs)
    error_rate = float(error) / total if total else 0.0
    print("transform complete with error rate %.3f" % error_rate)
    return np.array(q_vocab), np.array(a_vocab)

In [50]:
# Encode the test pairs as id arrays and show their shapes.
test_q_vocab, test_a_vocab = transform(_test)
test_q_vocab.shape, test_a_vocab.shape


transform complete with error rate 0.000


((96161, 30), (96161, 30))

In [51]:
# Encode the training pairs as id arrays and show their shapes.
train_q_vocab, train_a_vocab = transform(_train)
train_q_vocab.shape, train_a_vocab.shape

515315/|/ 60%|| 515315/864548 [00:25<00:17, 20400.33it/s]
transform complete with error rate 0.000


((864546, 30), (864546, 30))

In [52]:
def reverse(row, reverse_vocab):
    """Translate a sequence of token ids back into words.

    Args:
        row: iterable of ids.
        reverse_vocab: mapping from id to word.

    Returns:
        List with the word for each id, in the order of ``row``.

    Raises:
        KeyError: if an id is missing from ``reverse_vocab``.
    """
    words = []
    for token_id in row:
        words.append(reverse_vocab[token_id])
    return words

In [53]:
# Decode one encoded training question back into tokens as a sanity check.
reverse(train_q_vocab[100], reverse_vocabulary)

["won't",
 'eat',
 'either',
 '.',
 'he',
 'is',
 'so',
 'picky',
 'these',
 'days',
 '.',
 '<EOS>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>']

In [54]:
# Persist the encoded training questions and answers as .npy files.
np.save('train_q_large.npy', train_q_vocab)
np.save('train_a_large.npy', train_a_vocab)

In [55]:
# Persist the encoded test questions and answers as .npy files.
np.save('test_q_large.npy', test_q_vocab)
np.save('test_a_large.npy', test_a_vocab)

In [56]:
import pickle

In [57]:
# Bundle both lookup tables so they can be stored in a single file.
vocab_ = {"word2id": vocabulary, "id2word":reverse_vocabulary}

In [58]:
# Serialize the vocabulary bundle to large.vocab with pickle.
with open('large.vocab', 'wb') as f:
    pickle.dump(vocab_, f)