In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import collections
from sklearn.preprocessing import normalize

In [2]:
word_freq = collections.Counter()
max_len = 0
num_rec = 0

with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        label, sentence = line.decode('utf8').strip().split('\t')
        words = nltk.word_tokenize(sentence.lower())
        if len(words) > max_len:
            max_len = len(words)
        for word in words:
            word_freq[word] += 1
        num_rec += 1

In [3]:
MAX_FEATURES = 2000
MAX_SENTENCE_LENGTH = 40
# most_common output -> list
word2idx = {x[0]: i+2 for i, x in enumerate(word_freq.most_common(MAX_FEATURES - 2))}
word2idx ['PAD'] = 0
word2idx['UNK'] = 1

In [4]:
idx2word= {i:v for v, i in word2idx.items()}
vocab_size = len(word2idx)

In [5]:
y = []
x = []
origin_txt = []
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _label, _sentence = line.decode('utf8').strip().split('\t')
        origin_txt.append(_sentence)
        y.append(int(_label))
        words = nltk.word_tokenize(_sentence.lower())
        _seq = []
        for word in words:
            if word in word2idx.keys():
                _seq.append(word2idx[word])
            else:
                _seq.append(word2idx['UNK'])
        if len(_seq) < MAX_SENTENCE_LENGTH:
            _seq.extend([0] * ((MAX_SENTENCE_LENGTH) - len(_seq)))
        else:
            _seq = _seq[:MAX_SENTENCE_LENGTH]
        x.append(_seq)

In [6]:
pd.DataFrame(y, columns = ['yn']).reset_index().groupby('yn').count().reset_index()

Unnamed: 0,yn,index
0,0,3091
1,1,3995


## Sentence representation: Average of BOW

In [7]:
def one_hot(x, vocab_size):
    res = np.zeros(shape = (vocab_size))
    res[x] = 1
    return res

In [8]:
x_1 = np.array([np.sum(np.array([one_hot(word, MAX_FEATURES) for word in example]), axis = 0) for example in x])

## Data process - tr/va split and define iterator

In [9]:
tr_idx = np.random.choice(range(x_1.shape[0]), int(x_1.shape[0] * .8))
va_idx = [x for x in range(x_1.shape[0]) if x not in tr_idx]

In [10]:
tr_x = x_1[tr_idx, :]
tr_y = [y[i] for i in tr_idx]
va_x = x_1[va_idx, :]
va_y = [y[i] for i in va_idx]

In [12]:
tr_x.shape

(5668, 2000)

## Classification

* If we transform sentence into machine-understandable form via average of BOW, we can separate representation and classification
* Here, we will apply various classifiers

### XGBoost

In [11]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:
xgb = XGBClassifier()
xgb.fit(tr_x, tr_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [13]:
y_pred_xgb = xgb.predict(va_x)
pred_xgb = [round(val) for val in y_pred_xgb]

# Check predictions
#pred_pd= pd.DataFrame(pred_xgb, columns = ['pred']).reset_index()
#pred_pd.groupby(['pred']).count()

  if diff:


In [14]:
accuracy_xgb = accuracy_score(va_y, pred_xgb)
print('Accuracy: %.2f%%'%(accuracy_xgb * 100.0))

Accuracy: 97.76%


### Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rf = RandomForestClassifier()
rf.fit(tr_x, tr_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
y_pred_rf = rf.predict(va_x)
pred_rf = [round(val) for val in y_pred_rf]

In [18]:
accuracy_rf = accuracy_score(va_y, pred_rf)
print('Accuracy: %.2f%%'%(accuracy_rf * 100.0))

Accuracy: 98.08%


### SVM

In [19]:
from sklearn import svm

In [20]:
models = (svm.SVC(kernel = 'linear', C = 1.0), # C: SVM Regularization parameter
          svm.LinearSVC(C = 1.0),
          svm.SVC(kernel = 'rbf', gamma = .7, C = 1.0),
          svm.SVC(kernel = 'poly', degree = 3, C = 1.0)
)

models = (mdl.fit(tr_x, tr_y) for mdl in models)

In [21]:
y_pred_svm = (mdl.predict(va_x) for mdl in models)
pred_svm = [[round(val) for val in _pred] for _pred in y_pred_svm]

In [22]:
accuracy_svm = [accuracy_score(va_y, pred) for pred in pred_svm]
print('Accuracy: {}'.format(np.round(accuracy_svm, 4)*100))

Accuracy: [99.34 99.27 93.79 57.41]


## Check results

In [23]:
va_txt = pd.DataFrame(np.array([origin_txt[idx] for idx in va_idx]), columns = ['txt'])
pred_rf_pd = pd.DataFrame(pred_rf, columns  = ['pred_rf'])
pred_xgb_pd = pd.DataFrame(pred_xgb, columns  = ['pred_xgb'])
pred_svm_svc_pd = pd.DataFrame(pred_svm[2], columns  = ['pred_svm'])
label_pd = pd.DataFrame(va_y, columns = ['label'])
result = pd.concat([va_txt, pred_rf_pd, pred_xgb_pd, pred_svm_svc_pd, label_pd], axis = 1)

In [24]:
result.head()

Unnamed: 0,txt,pred_rf,pred_xgb,pred_svm,label
0,I liked the Da Vinci Code but it ultimatly did...,0,1,1,1
1,that's not even an exaggeration ) and at midni...,1,0,1,1
2,"I loved the Da Vinci Code, but now I want some...",1,1,1,1
3,"i thought da vinci code was great, same with k...",1,0,1,1
4,The Da Vinci Code is actually a good movie...,1,0,1,1


In [25]:
print('# of error case {}'.format(result[result['pred_rf'] != result['label']].shape[0]))
print('# of error case {}'.format(result[result['pred_xgb'] != result['label']].shape[0]))
print('# of error case {}'.format(result[result['pred_svm'] != result['label']].shape[0]))


# of error case 61
# of error case 71
# of error case 197


### DNN with embedding layer

In [26]:
import mxnet as mx
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn
context = mx.gpu()

In [27]:
class MLP(nn.Block):
    def __init__(self, input_dim, emb_dim, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.embed = nn.Embedding(input_dim = input_dim, output_dim = emb_dim)
            self.dense1 = nn.Dense(64)
            #self.dense2 = nn.Dense(32, activation = 'relu')
            self.bn = nn.BatchNorm()
            self.dense2 = nn.Dense(2)
            
    def forward(self, x):
        x = self.embed(x)
        x = self.dense1(x)
        x = self.bn(x)
        x = nd.relu(x)
        x = self.dense2(x)
        return x

In [28]:
def acc_f(label, pred):
    pred = pred.ravel()
    label = label.ravel()
    #print('pred = {}'.format(pred))
    #print('label = {}'.format(label))
    corr = ((pred > 0.5) == label)*1.
    return (((pred > 0.5) == label)*1.).mean()
tr_metric = mx.metric.CustomMetric(acc_f)
va_metric = mx.metric.CustomMetric(acc_f)

In [29]:
n_epoch = 10
batch_size = 64
from tqdm import tqdm, tqdm_notebook
os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'

In [30]:
train_data = mx.io.NDArrayIter(data=[tr_x, tr_y], batch_size=batch_size, shuffle = False)
valid_data = mx.io.NDArrayIter(data=[va_x, va_y], batch_size=batch_size, shuffle = False)

In [31]:
mlp = MLP(input_dim = MAX_FEATURES, emb_dim = 50)
mlp.collect_params().initialize(mx.init.Xavier(), ctx = context)
loss = gluon.loss.SoftmaxCELoss()
trainer = gluon.Trainer(mlp.collect_params(), 'adam', {'learning_rate': 1e-3})

MXNetError: [21:45:50] src/ndarray/ndarray.cc:1233: GPU is not enabled

Stack trace returned 10 entries:
[bt] (0) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1d00c2) [0x7f7ba24440c2]
[bt] (1) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1d06c8) [0x7f7ba24446c8]
[bt] (2) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2a0d883) [0x7f7ba4c81883]
[bt] (3) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28d4ce8) [0x7f7ba4b48ce8]
[bt] (4) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28d9e07) [0x7f7ba4b4de07]
[bt] (5) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28da7e3) [0x7f7ba4b4e7e3]
[bt] (6) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x27fd86b) [0x7f7ba4a7186b]
[bt] (7) /home/kookmin/py_libs/lib/python3.6/site-packages/mxnet/libmxnet.so(MXImperativeInvokeEx+0x6f) [0x7f7ba4a71e2f]
[bt] (8) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f7d0bee8e40]
[bt] (9) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x2eb) [0x7f7d0bee88ab]



In [None]:
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(train_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        with autograd.record():
            _out = mlp(_dat)
            _los = nd.sum(loss(_out, _label)) # 배치의 크기만큼의 loss가 나옴
            _los.backward()
        trainer.step(_dat.shape[0])
        n_obs += _dat.shape[0]
        #print(n_obs)
        _total_los += nd.sum(_los).asnumpy()
        # Epoch loss를 구하기 위해서 결과물을 계속 쌓음
        pred.extend(nd.softmax(_out)[:,1].asnumpy()) # 두번째 컬럼의 확률이 예측 확률
        label.extend(_label.asnumpy())
    #print(pred)
    #print([round(p) for p in pred]) # 기본이 float임
    #print(label)
    #print('**** ' + str(n_obs))
    #print(label[:10])
    #print(pred[:10])
    #print([round(p) for p in pred][:10])
    tr_acc = accuracy_score(label, [round(p) for p in pred])
    tr_loss = _total_los/n_obs
    
    ### Evaluate training
    valid_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(valid_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        _out = mlp(_dat)
        _pred_score = nd.softmax(_out)
        n_obs += _dat.shape[0]
        _total_los += nd.sum(loss(_out, _label)).asnumpy()
        pred.extend(nd.softmax(_out)[:,1].asnumpy())
        label.extend(_label.asnumpy())
    va_acc = accuracy_score(label, [round(p) for p in pred])
    va_loss = _total_los/n_obs
    tqdm.write('Epoch {}: tr_loss = {}, tr_acc= {}, va_loss = {}, va_acc= {}'.format(epoch, tr_loss, tr_acc, va_loss, va_acc))

In [None]:
y_pred_mlp = mlp(nd.array(va_x, ctx = context))
# softmax를 적용하고
# 두번째 열을 뽑아와서
# nd.round 함수를 적용해서 0/1 예측값을 얻고
# numpy array로 바꾸고
# 첫번째 원소를 뽑아서 예측 label로 사용
pred_mlp = [nd.round(val).asnumpy()[0] for val in nd.softmax(y_pred_mlp)[:, 1]] 

In [None]:
accuracy_mlp = accuracy_score(va_y, pred_mlp)
print('Accuracy: %.2f%%'%(accuracy_rf * 100.0))

#### DNN without embedding

In [None]:
class MLP(nn.Block):
    def __init__(self, **kwargs):
        super(MLP, self).__init__(**kwargs)
        with self.name_scope():
            self.dense1 = nn.Dense(64)
            #self.dense2 = nn.Dense(32, activation = 'relu')
            self.bn = nn.BatchNorm()
            self.dense2 = nn.Dense(2)
            
    def forward(self, x):
        x = self.dense1(x)
        x = self.bn(x)
        x = nd.relu(x)
        x = self.dense2(x)
        return x

In [None]:
n_epoch = 10
batch_size = 64
from tqdm import tqdm, tqdm_notebook

In [None]:
mlp_no_embedding = MLP()
mlp_no_embedding.collect_params().initialize(mx.init.Xavier(), ctx = context)
loss = gluon.loss.SoftmaxCELoss()
trainer = gluon.Trainer(mlp_no_embedding.collect_params(), 'adam', {'learning_rate': 1e-3})

In [None]:
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(train_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        with autograd.record():
            _out = mlp_no_embedding(_dat)
            _los = nd.sum(loss(_out, _label)) # 배치의 크기만큼의 loss가 나옴
            _los.backward()
        trainer.step(_dat.shape[0])
        n_obs += _dat.shape[0]
        #print(n_obs)
        _total_los += nd.sum(_los).asnumpy()
        # Epoch loss를 구하기 위해서 결과물을 계속 쌓음
        pred.extend(nd.softmax(_out)[:,1].asnumpy()) # 두번째 컬럼의 확률이 예측 확률
        label.extend(_label.asnumpy())
    #print(pred)
    #print([round(p) for p in pred]) # 기본이 float임
    #print(label)
    #print('**** ' + str(n_obs))
    #print(label[:10])
    #print(pred[:10])
    #print([round(p) for p in pred][:10])
    tr_acc = accuracy_score(label, [round(p) for p in pred])
    tr_loss = _total_los/n_obs
    
    ### Evaluate training
    valid_data.reset()
    n_obs = 0
    _total_los = 0
    pred = []
    label = []
    for i, batch in enumerate(valid_data):
        _dat = batch.data[0].as_in_context(context)
        _label = batch.data[1].as_in_context(context)
        _out = mlp(_dat)
        _pred_score = nd.softmax(_out)
        n_obs += _dat.shape[0]
        _total_los += nd.sum(loss(_out, _label)).asnumpy()
        pred.extend(nd.softmax(_out)[:,1].asnumpy())
        label.extend(_label.asnumpy())
    va_acc = accuracy_score(label, [round(p) for p in pred])
    va_loss = _total_los/n_obs
    tqdm.write('Epoch {}: tr_loss = {}, tr_acc= {}, va_loss = {}, va_acc= {}'.format(epoch, tr_loss, tr_acc, va_loss, va_acc))

In [None]:
y_pred_mlp_no_embedding = mlp_no_embedding(nd.array(va_x, ctx = context))
# softmax를 적용하고
# 두번째 열을 뽑아와서
# nd.round 함수를 적용해서 0/1 예측값을 얻고
# numpy array로 바꾸고
# 첫번째 원소를 뽑아서 예측 label로 사용
pred_mlp_no_embedding = [nd.round(val).asnumpy()[0] for val in nd.softmax(y_pred_mlp)[:, 1]] 

In [None]:
accuracy_mlp_no_embedding = accuracy_score(va_y, pred_mlp_no_embedding)
print('Accuracy: %.2f%%'%(accuracy_rf * 100.0))

## Errors

In [None]:
va_txt = pd.DataFrame(np.array([origin_txt[idx] for idx in va_idx]), columns = ['txt'])
pred_mlp_no_embedding_pd = pd.DataFrame(pred_mlp_no_embedding, columns  = ['pred_mlp_no_embedding'])
label_pd = pd.DataFrame(va_y, columns = ['label'])
result = pd.concat([va_txt, pred_mlp_no_embedding_pd, label_pd], axis = 1)

In [None]:
result[result['pred_mlp_no_embedding'] != result['label']].shape

In [None]:
_pred_score[:, 0]