In [13]:
import os
import pandas as pd
import numpy as np
import nltk
import collections
from sklearn.preprocessing import normalize

In [2]:
word_freq = collections.Counter()
max_len = 0
num_rec = 0

with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        label, sentence = line.decode('utf8').strip().split('\t')
        words = nltk.word_tokenize(sentence.lower())
        if len(words) > max_len:
            max_len = len(words)
        for word in words:
            word_freq[word] += 1
        num_rec += 1

In [3]:
sentence

'Oh, and Brokeback Mountain was a terrible movie.'

In [4]:
MAX_FEATURES = 2000
MAX_SENTENCE_LENGTH = 40
# most_common output -> list
word2idx = {x[0]: i+2 for i, x in enumerate(word_freq.most_common(MAX_FEATURES - 2))}
word2idx ['PAD'] = 0
word2idx['UNK'] = 1

In [5]:
idx2word= {i:v for v, i in word2idx.items()}
vocab_size = len(word2idx)

In [6]:
y = []
x = []
origin_txt = []
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _label, _sentence = line.decode('utf8').strip().split('\t')
        origin_txt.append(_sentence)
        y.append(int(_label))
        words = nltk.word_tokenize(_sentence.lower())
        _seq = []
        for word in words:
            if word in word2idx.keys():
                _seq.append(word2idx[word])
            else:
                _seq.append(word2idx['UNK'])
        if len(_seq) < MAX_SENTENCE_LENGTH:
            _seq.extend([0] * ((MAX_SENTENCE_LENGTH) - len(_seq)))
        else:
            _seq = _seq[:MAX_SENTENCE_LENGTH]
        x.append(_seq)

In [8]:
def one_hot(x, vocab_size):
    res = np.zeros(shape = (vocab_size))
    res[x] = 1
    return res

#### Data Iterator 만들기
  * x :$ (batch \times vocab \times word)$

In [9]:
x_1 = np.array([np.array([one_hot(word, MAX_FEATURES) for word in example]) for example in x])

In [10]:
tr_idx = np.random.choice(range(x_1.shape[0]), int(x_1.shape[0] * .8))
va_idx = [x for x in range(x_1.shape[0]) if x not in tr_idx]

In [11]:
tr_x = x_1[tr_idx, :]
tr_y = [y[i] for i in tr_idx]
va_x = x_1[va_idx, :]
va_y = [y[i] for i in va_idx]

In [14]:
import mxnet as mx
batch_size = 16
train_data = mx.io.NDArrayIter(data=[tr_x, tr_y], batch_size=batch_size, shuffle = False)
valid_data = mx.io.NDArrayIter(data=[va_x, va_y], batch_size=batch_size, shuffle = False)

#### float, int

* int, float, (unicode) -> 4 Byte
* list overhead -> 64 Byte
* int, float overhead -> 24 Byte
* 모든 primative도 객체로 인식해서 모두 20 Byte의 overhead가 붙음
  - np.float16: 26 Byte (24 + 2 Byte)
  - np.float32: 28 Byte (24 + 4 Byte)
  - np.float64: 32 Byte (24 + 8 Byte)
  
 * np.float32([5]).nbytes => 4 Byte
 * np.float32([5, 5]).nbytes => 8 Byte

 #### Byte
 
 * sys.getsizeof(b'') => 33 Byte
 * sys.getsizeof(b'5') => 34 Byte (원소가 1개 추가될 때마다 1 Byte씩 증가)
 
 #### Unicode
 
 * sys.getsizeof(u'') => 49 Byte
 * sys.getsizeof(u'5') => 58 Byte (원소가 1개 추가될 때 9 Byte 증가)
 * sys.getsizeof(u'56') => 51 Byte (원소가 1개 추가될 때마다 1 Byte씩 증가)
 * sys.getsizeof(u'123') => 52 Byte (원소가 1개 추가될 때마다 1 Byte씩 증가)
 * sys.getsizeof(u'가') => 76 Byte (27 Byte 증가)
 * sys.getsizeof(u'가나') => 78 Byte (원소가 1개 추가될 때마다 2 Byte씩 증가)
 
 



### Sentence Representation
  * 모든 단어의 조합에 대해 다음과 같은 네트워크를 적용시킴
  
  $$ f(x_i, x_j ) =W \phi(U_{left} e_i + U_{right} e_j)$$
  
  * Sentence representation과 classification을 구분짓기가 어려움
    - BOW는 단순히 word vector의 합, 혹은 평균을 sentence representation으로 보고 이를 input으로 classify를 구성하였음
      - 문장을 1개의 vector로 압축.
    - CBOW는 one-hot representation을 보다 작은 차원의 embedding space로 mapping시킨 후에 embedding 결과를 classification의 input으로 사용
      - 문장을 여러개의 vector로 압축. pre-trained embedding을 이용하면, classification과 sentence representation을 구분시킬 수 있으나,
      - 만약 network에 녹여 동시에 학습을 시키려 한다면, 둘은 구분이 되지 않음
    - 하나의 network로 두가지 task를 연속적으로 학습시켜야 함

In [15]:
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn
context = mx.gpu()

In [13]:
class RN_Classifier(nn.HybridBlock):
    def __init__(self, SENTENCE_LENGTH, VOCABULARY, **kwargs):
        super(RN_Classifier, self).__init__(**kwargs)
        self.SENTENCE_LENGTH = SENTENCE_LENGTH
        self.VOCABULARY = VOCABULARY
        with self.name_scope():
            self.g_fc1 = nn.Dense(256,activation='relu')
            self.g_fc2 = nn.Dense(256,activation='relu')
            self.g_fc3 = nn.Dense(256,activation='relu')
            self.g_fc4 = nn.Dense(256,activation='relu')


            self.fc1 = nn.Dense(128, activation = 'relu') # 256 * 128
            self.fc2 = nn.Dense(2) # 128 * 2
            # 1253632 param : 약 20MB
    def hybrid_forward(self, F, x):
        # (x_i, x_j)의 pair를 만들기
        # 64 배치를 가정하면
        print('x shape = {}'.format(x.shape))
        x_i = x.expand_dims(1) # 64 * 1* 40 * 2000* : 0.02GB
        print('x shape = {}'.format(x.shape))
        x_i = F.repeat(x_i,repeats= self.SENTENCE_LENGTH, axis=1) # 64 * 40 * 40 * 2000: 1.52GB
        print('x shape = {}'.format(x.shape))
        x_j = x.expand_dims(2) # 64 * 40 * 1 * 2000
        x_j = F.repeat(x_j,repeats= self.SENTENCE_LENGTH, axis=2) # 64 * 40 * 40 * 2000: 1.52GB
        x_full = F.concat(x_i,x_j,dim=3) # 64 * 40 * 40 * 4000: 3.04GB
        
        # batch*sentence_length*sentence_length개의 batch를 가진 2*VOCABULARY input을 network에 feed
        _x = x_full.reshape((-1, 2 * self.VOCABULARY))
        _x = self.g_fc1(_x) # (64 * 40 * 40) * 256: .1GB 추가메모리는 안먹나?
        _x = self.g_fc2(_x) # (64 * 40 * 40) * 256: .1GB (reuse)
        _x = self.g_fc3(_x) # (64 * 40 * 40) * 256: .1GB (reuse)
        _x = self.g_fc4(_x) # (64 * 40 * 40) * 256: .1GB (reuse)
        
        # sentence_length*sentence_length개의 결과값을 모두 합해서 sentence representation으로 나타냄
        x_g = _x.reshape((-1, self.SENTENCE_LENGTH * self.SENTENCE_LENGTH,256)) # (64, 40*40, 256) : .1GB
        sentence_rep = x_g.sum(1) # (64, 256): ignorable
        
        # 여기서부터는 classifier
        clf = self.fc1(sentence_rep)
        clf = self.fc2(clf)
        return clf

In [None]:
rn = RN_Classifier(MAX_FEATURES, 50)
z = np.random.uniform(size = (64, 40, 2000))
z1 = nd.array(z, ctx =context)
rn(z1)

x shape = (64, 40, 2000)


In [118]:
rn = RN_Classifier(MAX_FEATURES, 50)

rn.collect_params().initialize(mx.init.Xavier(), ctx = context)
%memit(rn.g_fc1) # 4000 * 256 )
%memit(rn.g_fc2) # 256 * 256
%memit(rn.g_fc3) # 256 * 256
%memit(rn.g_fc4) # 256 * 256
loss = gluon.loss.SoftmaxCELoss()
trainer = gluon.Trainer(rn.collect_params(), 'adam', {'learning_rate': 1e-3})

peak memory: 14219.05 MiB, increment: -0.01 MiB
peak memory: 14219.08 MiB, increment: 0.02 MiB
peak memory: 14219.08 MiB, increment: 0.00 MiB
peak memory: 14219.08 MiB, increment: 0.00 MiB


In [31]:
n_epoch = 10
from tqdm import tqdm, tqdm_notebook

In [None]:
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(train_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        with autograd.record():
            print(_dat.shape)
            _out = rn(_dat)
            _los = nd.sum(loss(_out, _label)) # 배치의 크기만큼의 loss가 나옴
            _los.backward()
        trainer.step(_dat.shape[0])
        n_obs += _dat.shape[0]
        _total_los += nd.sum(_los).asnumpy()
        # Epoch loss를 구하기 위해서 결과물을 계속 쌓음
        pred.extend(nd.softmax(_out)[:,1].asnumpy()) # 두번째 컬럼의 확률이 예측 확률
        label.extend(_label.asnumpy())
    tr_acc = accuracy_score(label, [round(p) for p in pred])
    tr_loss = _total_los/n_obs
    
    ### Evaluate training
    valid_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(valid_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        _out = mlp(_dat)
        _pred_score = nd.softmax(_out)
        n_obs += _dat.shape[0]
        _total_los += nd.sum(loss(_out, _label))
        pred.extend(nd.softmax(_out)[:,1].asnumpy())
        label.extend(_label.asnumpy())
    va_acc = accuracy_score(label, [round(p) for p in pred])
    va_loss = _total_los/n_obs
    tqdm.write('Epoch {}: tr_loss = {}, tr_acc= {}, va_loss = {}, va_acc= {}'.format(epoch, tr_loss, tr_acc, va_loss, va_acc))

HBox(children=(IntProgress(value=0, description='epoch', max=10), HTML(value='')))

In [186]:
y_pred_mlp = mlp(nd.array(va_x, ctx = context))
# softmax를 적용하고
# 두번째 열을 뽑아와서
# nd.round 함수를 적용해서 0/1 예측값을 얻고
# numpy array로 바꾸고
# 첫번째 원소를 뽑아서 예측 label로 사용
pred_mlp = [nd.round(val).asnumpy()[0] for val in nd.softmax(y_pred_mlp)[:, 1]] 

In [188]:
accuracy_mlp = accuracy_score(va_y, pred_mlp)
print('Accuracy: %.2f%%'%(accuracy_rf * 100.0))

Accuracy: 98.47%


In [17]:
a = nd.array([1,2,2])

In [27]:
print(np.finfo(a[0]).dtype)

float32


#### DNN without embedding

In [28]:
class MLP(nn.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense1 = nn.Dense(64)
            #self.dense2 = nn.Dense(32, activation = 'relu')
            self.bn = nn.BatchNorm()
            self.dense2 = nn.Dense(2)
            
    def forward(self, x):
        x = self.dense1(x)
        x = self.bn(x)
        x = nd.relu(x)
        x = self.dense2(x)
        return x

In [59]:
g_fc1 = nn.Dense(256, activation='relu')

In [195]:
n_epoch = 10
batch_size = 64
from tqdm import tqdm, tqdm_notebook

In [55]:
mlp_no_embedding = MLP()
mlp_no_embedding.collect_params().initialize(mx.init.Xavier(), ctx = context)
loss = gluon.loss.SoftmaxCELoss()
trainer = gluon.Trainer(mlp_no_embedding.collect_params(), 'adam', {'learning_rate': 1e-3})

In [56]:
[mlp_no_embedding.collect_params()[x].data() for x in mlp_no_embedding.collect_params()]

DeferredInitializationError: Parameter mlp1_dense0_weight has not been initialized yet because initialization was deferred. Actual initialization happens during the first forward pass. Please pass one batch of data through the network before accessing Parameters. You can also avoid deferred initialization by specifying in_units, num_features, etc., for network layers.

In [53]:
mlp_no_embedding.collect_params().values
critic.collect_params()['critic1_dense0_weight'].data()

odict_values([Parameter mlp0_dense0_weight (shape=(64, 0), dtype=<class 'numpy.float32'>), Parameter mlp0_dense0_bias (shape=(64,), dtype=<class 'numpy.float32'>), Parameter mlp0_batchnorm0_gamma (shape=(0,), dtype=<class 'numpy.float32'>), Parameter mlp0_batchnorm0_beta (shape=(0,), dtype=<class 'numpy.float32'>), Parameter mlp0_batchnorm0_running_mean (shape=(0,), dtype=<class 'numpy.float32'>), Parameter mlp0_batchnorm0_running_var (shape=(0,), dtype=<class 'numpy.float32'>), Parameter mlp0_dense1_weight (shape=(2, 0), dtype=<class 'numpy.float32'>), Parameter mlp0_dense1_bias (shape=(2,), dtype=<class 'numpy.float32'>)])

In [200]:
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(train_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        with autograd.record():
            _out = mlp_no_embedding(_dat)
            _los = nd.sum(loss(_out, _label)) # 배치의 크기만큼의 loss가 나옴
            _los.backward()
        trainer.step(_dat.shape[0])
        n_obs += _dat.shape[0]
        #print(n_obs)
        _total_los += nd.sum(_los).asnumpy()
        # Epoch loss를 구하기 위해서 결과물을 계속 쌓음
        pred.extend(nd.softmax(_out)[:,1].asnumpy()) # 두번째 컬럼의 확률이 예측 확률
        label.extend(_label.asnumpy())
    #print(pred)
    #print([round(p) for p in pred]) # 기본이 float임
    #print(label)
    #print('**** ' + str(n_obs))
    #print(label[:10])
    #print(pred[:10])
    #print([round(p) for p in pred][:10])
    tr_acc = accuracy_score(label, [round(p) for p in pred])
    tr_loss = _total_los/n_obs
    
    ### Evaluate training
    valid_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(valid_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        _out = mlp(_dat)
        _pred_score = nd.softmax(_out)
        n_obs += _dat.shape[0]
        _total_los += nd.sum(loss(_out, _label)).asnumpy()
        pred.extend(nd.softmax(_out)[:,1].asnumpy())
        label.extend(_label.asnumpy())
    va_acc = accuracy_score(label, [round(p) for p in pred])
    va_loss = _total_los/n_obs
    tqdm.write('Epoch {}: tr_loss = {}, tr_acc= {}, va_loss = {}, va_acc= {}'.format(epoch, tr_loss, tr_acc, va_loss, va_acc))

HBox(children=(IntProgress(value=0, description='epoch', max=10), HTML(value='')))

Epoch 0: tr_loss = [0.00022346], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 1: tr_loss = [0.00019266], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 2: tr_loss = [0.00016774], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 3: tr_loss = [0.0001472], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 4: tr_loss = [0.00013019], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 5: tr_loss = [0.00011583], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 6: tr_loss = [0.00010363], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 7: tr_loss = [9.320182e-05], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 8: tr_loss = [8.417194e-05], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353
Epoch 9: tr_loss = [7.6324715e-05], tr_acc= 1.0, va_loss = [0.03824826], va_acc= 0.9880514705882353



In [202]:
y_pred_mlp_no_embedding = mlp_no_embedding(nd.array(va_x, ctx = context))
# softmax를 적용하고
# 두번째 열을 뽑아와서
# nd.round 함수를 적용해서 0/1 예측값을 얻고
# numpy array로 바꾸고
# 첫번째 원소를 뽑아서 예측 label로 사용
pred_mlp_no_embedding = [nd.round(val).asnumpy()[0] for val in nd.softmax(y_pred_mlp)[:, 1]] 

In [203]:
accuracy_mlp_no_embedding = accuracy_score(va_y, pred_mlp_no_embedding)
print('Accuracy: %.2f%%'%(accuracy_rf * 100.0))

Accuracy: 98.47%


In [206]:
va_txt = pd.DataFrame(np.array([origin_txt[idx] for idx in va_idx]), columns = ['txt'])
pred_mlp_no_embedding_pd = pd.DataFrame(pred_mlp_no_embedding, columns  = ['pred_mlp_no_embedding'])
label_pd = pd.DataFrame(va_y, columns = ['label'])
result = pd.concat([va_txt, pred_mlp_no_embedding_pd, label_pd], axis = 1)

In [210]:
result[result['pred_mlp_no_embedding'] != result['label']].shape

(33, 3)

In [194]:
_pred_score[:, 0]


[9.9985087e-01 9.9978369e-01 9.9972004e-01 9.9981624e-01 9.9978369e-01
 4.8423914e-04 2.4478373e-01 9.1124475e-02 1.6645603e-03 4.0846759e-01
 6.2343909e-04 3.0040858e-02 1.7331500e-03 1.4756978e-03 7.7909184e-01
 2.6120720e-08 4.5528898e-01 1.1758124e-03 1.3725077e-02 1.5767918e-04
 2.0312884e-06 2.2029674e-04 2.0878112e-04 9.5378027e-06 5.9853699e-02
 1.5292705e-04 1.4807166e-04 3.5203213e-04 8.5062282e-05 8.4243221e-03
 3.5203213e-04 8.5656675e-06]
<NDArray 32 @gpu(0)>