# Fast Text

In [None]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

## Data Preprocessing

To download the Stanford Large Movie Review Dataset, run these commands:
```sh
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xvf aclImdb_v1.tar.gz
```

In [None]:
# Maximum number of tokens kept per review: longer reviews are truncated to
# this length and shorter ones are zero-padded up to it.
max_time = 500

In [None]:
import unicodedata
import re

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with combining diacritical marks removed.

    The string is NFD-normalised so that accented characters split into a
    base character plus combining marks; every character whose Unicode
    category is 'Mn' (nonspacing mark) is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalise raw review text into space-separated lowercase tokens.

    Steps: lowercase and strip surrounding whitespace, drop diacritics via
    `unicodeToAscii`, put a space in front of each '.', '!' or '?', then
    collapse every run of other non-letter characters into a single space.
    """
    text = unicodeToAscii(s.lower().strip())
    spaced = re.sub(r"([.!?])", r" \1", text)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)

In [None]:
from os import listdir
import collections

def load_reviews(dir_path):
    """Load every review file in `dir_path` as a list of normalised tokens.

    Only the first line of each file is read. The text is passed through
    `normalizeString`, split on whitespace and truncated to `max_time`
    tokens.

    Args:
        dir_path: directory containing one text file per review.

    Returns:
        A list with one token list per file, in sorted filename order.
    """
    reviews = []
    # Sort the names: listdir() order is arbitrary, and a stable order keeps
    # the data (and the vocabulary built from it) reproducible across runs.
    for f_name in sorted(listdir(dir_path)):
        # Explicit encoding so decoding does not depend on the platform
        # default (assumes the dataset files are UTF-8 - TODO confirm).
        with open(dir_path + '/' + f_name, encoding='utf-8') as f:
            # readline() yields '' for an empty file instead of raising.
            reviews.append(normalizeString(f.readline()).split()[:max_time])
    return reviews


train_path = 'aclImdb/train'
data_tr_pos = load_reviews(train_path + '/pos')
data_tr_neg = load_reviews(train_path + '/neg')

test_path = 'aclImdb/test'
data_tst_pos = load_reviews(test_path + '/pos')
data_tst_neg = load_reviews(test_path + '/neg')

In [None]:
# Count every token across all four splits, one sentence at a time.
word_counts = collections.Counter()
for split in (data_tr_pos, data_tr_neg, data_tst_pos, data_tst_neg):
    for sentence in split:
        word_counts.update(sentence)

# Id 0 is reserved for the '<Nothing>' padding token; real words get ids in
# descending order of frequency.
dictionary = {'<Nothing>': 0}
for word, _ in word_counts.most_common():
    dictionary[word] = len(dictionary)

# Inverse mapping: id -> word.
reversed_dictionary = {idx: word for word, idx in dictionary.items()}

# The raw counts are no longer needed once the ids are assigned.
del word_counts

In [None]:
def encode_and_pad(sentences):
    """Convert token lists to vocabulary ids, right-padded with 0.

    Args:
        sentences: list of token lists, each at most `max_time` tokens long.

    Returns:
        A list of id lists, each exactly `max_time` long; positions past the
        end of a sentence hold 0, the id of the '<Nothing>' padding token.
    """
    return [
        [dictionary[w] for w in sentence] + [0] * (max_time - len(sentence))
        for sentence in sentences
    ]


# data
# Plain list concatenation: the reviews have different lengths, and building
# a NumPy array from such ragged lists raises ValueError on recent NumPy.
data_tr_concat = data_tr_pos + data_tr_neg
data_tst_concat = data_tst_pos + data_tst_neg

data_tr = encode_and_pad(data_tr_concat)
data_tr_len = [len(x) for x in data_tr_concat]

data_tst = encode_and_pad(data_tst_concat)
data_tst_len = [len(x) for x in data_tst_concat]

# labels: 1 for positive reviews, 0 for negative, in the same order as the
# concatenated data above.
label_tr = [1] * len(data_tr_pos) + [0] * len(data_tr_neg)
label_tst = [1] * len(data_tst_pos) + [0] * len(data_tst_neg)

# Drop the per-class lists; everything downstream uses the combined data.
del data_tr_pos, data_tr_neg, data_tst_pos, data_tst_neg

In [None]:
def batch_iter(x, x_len, y, batch_size=128, shuffle=True):
    """Yield aligned mini-batches of `(x, x_len, y)`.

    Every sample is yielded exactly once per call; when the number of
    samples is not a multiple of `batch_size`, the final batch is smaller
    rather than being dropped.

    Args:
        x: sequence of samples (converted to a NumPy array).
        x_len: per-sample lengths, aligned with `x`.
        y: per-sample labels, aligned with `x`.
        batch_size: maximum number of samples per batch.
        shuffle: if True, visit the samples in a random order.

    Yields:
        Tuples `(x_batch, x_len_batch, y_batch)` of NumPy arrays.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    x_len = np.asarray(x_len)

    ids = np.arange(len(x))
    if shuffle:
        ids = np.random.permutation(ids)

    # Step through the index array in strides of batch_size; the last slice
    # is simply shorter when the sizes do not divide evenly.
    for idx_str in range(0, len(ids), batch_size):
        batch_ids = ids[idx_str:idx_str + batch_size]
        yield x[batch_ids], x_len[batch_ids], y[batch_ids]

## Model

In [None]:
vocab_size = len(dictionary)
hidden_dim = 10

# Trainable parameters, initialised uniformly in [0, 1):
#   A - embedding lookup table, one hidden_dim-sized row per vocabulary id
#   B - projection from the hidden vector to a single logit
A = tf.Variable(
    np.random.rand(vocab_size, hidden_dim),
    dtype=tf.float32,
    name='lookup_table',
)
B = tf.Variable(
    np.random.rand(hidden_dim, 1),
    dtype=tf.float32,
    name='output',
)

# Inputs fed per batch: padded token ids, sentence lengths and 0/1 labels.
stn = tf.placeholder(dtype=tf.int32, shape=[None, max_time], name='stn')
stn_len = tf.placeholder(dtype=tf.float32, shape=[None], name='stn_len')
label = tf.placeholder(dtype=tf.float32, shape=[None], name='label')

In [None]:
# One embedding per token position: [batch, max_time, hidden_dim].
stn_emb = tf.nn.embedding_lookup(A, stn)

# Id 0 is the '<Nothing>' padding token. Zero out its embedding so padded
# positions do not leak into the sentence representation; otherwise A[0]
# would be added (max_time - length) times before dividing by the length.
pad_mask = tf.cast(tf.not_equal(stn, 0), tf.float32)
stn_emb_masked = stn_emb * tf.expand_dims(pad_mask, -1)

# Mean of the real word embeddings. The divisor is clamped to 1 so that a
# sentence with no tokens yields a zero vector instead of dividing by zero.
hidden = tf.reduce_sum(stn_emb_masked, 1) / tf.maximum(tf.reshape(stn_len, [-1, 1]), 1.0)

# Linear projection of the averaged embedding to one logit per sentence.
logit = tf.matmul(hidden, B)

In [None]:
# Mean sigmoid cross-entropy over the batch; labels are reshaped to a column
# so they line up with the [batch, 1] logits.
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.reshape(label, [-1, 1]),
        logits=logit,
    )
)

## Training & Test & Logging

In [None]:
# Scalar learning rate, fed on every step so it can change during training.
a = tf.placeholder(dtype=tf.float32, shape=())

# Plain gradient descent on the cross-entropy loss.
train = tf.train.GradientDescentOptimizer(learning_rate=a).minimize(loss)

# Create the session and initialise all variables.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
# logging: record the learning rate and the training loss as scalar
# summaries, merged into a single op.
tf.summary.scalar(name='learning_rate', tensor=a)
tf.summary.scalar(name='cost', tensor=loss)
summaries = tf.summary.merge_all()

# Event files are written to the current working directory.
summary_writer = tf.summary.FileWriter(logdir='./')

In [None]:
def learning_rate_for(epoch):
    """Return the step-decayed learning rate for a zero-based epoch index."""
    # (exclusive upper epoch bound, learning rate) pairs, checked in order.
    schedule = ((1000, .1), (1500, .03), (2000, .01), (2500, .003))
    for boundary, rate in schedule:
        if epoch < boundary:
            return rate
    return .001


epochs = 3000
cnt = 0  # global training-step counter, used as the x-axis of the loss curve
for epoch in tqdm(range(epochs), desc='epoch'):
    lrn_rate = learning_rate_for(epoch)

    # One pass over the shuffled training data.
    batches = batch_iter(data_tr, data_tr_len, label_tr, batch_size=128)

    for x, x_len, y in batches:
        _, s = sess.run([train, summaries], feed_dict={a: lrn_rate, stn: x, stn_len: x_len, label: y})

        summary_writer.add_summary(s, cnt)
        cnt += 1

    # test
    batches_tst = batch_iter(data_tst, data_tst_len, label_tst, batch_size=128, shuffle=False)

    rets = []
    ys = []
    for x, x_len, y in batches_tst:
        # The logit does not depend on the label placeholder, so it is not fed.
        ret = sess.run(logit, feed_dict={stn: x, stn_len: x_len})
        rets.extend(ret.reshape(-1))
        ys.extend(y)
    rets = np.array(rets).reshape(-1)

    # sigmoid(logit) > 0.5 exactly when logit > 0, so threshold the logit
    # directly; this avoids overflow in exp() for large-magnitude logits.
    # Note: this is the fraction of correct predictions (accuracy); the
    # 'precision' name and tag are kept so existing logs stay comparable.
    precision = np.mean((rets > 0) == np.array(ys))

    precision_summ = tf.Summary()
    precision_summ.value.add(tag='precision', simple_value=float(precision))
    summary_writer.add_summary(precision_summ, epoch)

# Make sure buffered events reach disk once training is done.
summary_writer.flush()

## Results

Although this simple linear model uses only 10 hidden dimensions, it achieves almost the same results as an RNN (http://domkaukinen.com/sentiment-analysis-with-tensorflow/).

![title](logs/fasttext/precision.png)