In [1]:
import sys
import torch
sys.path.insert(0, '/anything/git/asa/CERT/source')

#### reference
- https://blog.cambridgespark.com/50-free-machine-learning-datasets-natural-language-processing-d88fb9c5c8da
- https://tensorflowkorea.gitbooks.io/tensorflow-kr/content/g3doc/tutorials/word2vec/

In [2]:
import os
import re
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm, trange, tqdm_notebook
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score, confusion_matrix
from utils import fix_torch_randomness
from feature import InputExample, InputFeatures
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from torch.nn import CrossEntropyLoss, MSELoss
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam

#### movie 데이터셋을 pd.DataFrame으로 로드 (https://ai.stanford.edu/~amaas/data/sentiment/에서 download 가능)

In [3]:
positive_filename = os.listdir('/data/asa/imdb/aclImdb/train/pos')
negative_filename = os.listdir('/data/asa/imdb/aclImdb/train/neg')
positive_filename[:5], negative_filename[:5]

(['127_7.txt', '126_10.txt', '125_7.txt', '124_10.txt', '123_10.txt'],
 ['127_4.txt', '126_1.txt', '125_1.txt', '124_2.txt', '123_1.txt'])

In [4]:
positive_filename = [os.path.join('/data/asa/imdb/aclImdb/train/pos', f) for f in positive_filename]
negative_filename = [os.path.join('/data/asa/imdb/aclImdb/train/neg', f) for f in negative_filename]

In [5]:
positive_filename[:5]

['/data/asa/imdb/aclImdb/train/pos/127_7.txt',
 '/data/asa/imdb/aclImdb/train/pos/126_10.txt',
 '/data/asa/imdb/aclImdb/train/pos/125_7.txt',
 '/data/asa/imdb/aclImdb/train/pos/124_10.txt',
 '/data/asa/imdb/aclImdb/train/pos/123_10.txt']

In [6]:
def files_to_df(files, label):
    df = {
        'text':[open(f, 'r').readline() for f in files],
        'label': [label]*len(files)
    }
    df = pd.DataFrame(df)[['text','label']]
    return df

In [7]:
pos_df = files_to_df(positive_filename, 1)
neg_df = files_to_df(negative_filename, 0)
df = pd.concat([pos_df, neg_df])
df = df.sample(frac=1.0)

In [8]:
df.head(15)

Unnamed: 0,text,label
12386,"A masterpiece of comedy, a masterpiece of horr...",1
3950,I had hoped this movie was going to be mildly ...,0
5356,"Apparently Ruggero Deodato figured out, early ...",1
8972,The Slackers as titled in this movie are three...,0
9674,First love is a desperately difficult subject ...,1
541,"I purchased this film on the cheap in a sale, ...",0
9542,This is one of the best made movies from 2002....,1
2914,"New York, I Love You finally makes it to our s...",1
1870,This could be looked at in many different ways...,0
4399,I happily admit that I'm a sucker for a beauti...,1


#### 전처리 함수 정의 및 테스트

In [9]:
br = re.compile(r'<br \/>', flags=re.IGNORECASE)
nt = re.compile(r'n\'t', flags=re.IGNORECASE)
wd = re.compile(r'\'d', flags=re.IGNORECASE)
nb = re.compile(r'(\s(\-|)\d+)')
special_chars = [';',':','"','\'','(',')','{','}','\.',',','\?','/','!','@','#','$','%','^','&','\*','\-','\+','=','\r\n','\n','\\\\','`','>','<','~']
sp_pattern = '[{}]'.format('|'.join(special_chars))
sp = re.compile(sp_pattern)
def preprocess(x):
    x = x.lower()
    x = re.sub(br, ' ', x)
    x = re.sub(nb, ' [number] ', x)
    x = re.sub(nt, ' not ', x)
    x = re.sub(wd, ' would ', x)
    x = re.sub(sp, ' ', x)
    return x

In [10]:
txt = 'Zentropa has much in 333 common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn\'t really understand, and whose naivety is all the more striking in contrast with the natives.<br /><br />But I\'d have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. <br /><br />This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.'
preprocess(txt)

'zentropa has much in [number]  common with the third man  another noir like film set among the rubble of postwar europe  like ttm  there is much inventive camera work  there is an innocent american who gets emotionally involved with a woman he does not  really understand  and whose naivety is all the more striking in contrast with the natives   but i would  have to say that the third man has a more well crafted storyline  zentropa is a bit disjointed in this respect  perhaps this is intentional  it is presented as a dream nightmare  and making it too coherent would spoil the effect    this movie is unrelentingly grim   noir  in more than one sense  one never sees the sun shine  grim  but intriguing  and frightening '

#### 데이터셋 전처리

In [11]:
tqdm.pandas()
df['text'] = df.text.progress_apply(preprocess)

100%|██████████| 25000/25000 [00:01<00:00, 13815.72it/s]


In [12]:
df.head(10)

Unnamed: 0,text,label
12386,a masterpiece of comedy a masterpiece of horr...,1
3950,i had hoped this movie was going to be mildly ...,0
5356,apparently ruggero deodato figured out early ...,1
8972,the slackers as titled in this movie are three...,0
9674,first love is a desperately difficult subject ...,1
541,i purchased this film on the cheap in a sale ...,0
9542,this is one of the best made movies from [numb...,1
2914,new york i love you finally makes it to our s...,1
1870,this could be looked at in many different ways...,0
4399,i happily admit that i m a sucker for a beauti...,1


#### vocabulary 만들기

In [13]:
def make_word(texts):
    vocab = {}
    for text in tqdm(texts):
        text = text.split()
        for word in text:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [14]:
vocab = make_word(df.text.values)

100%|██████████| 25000/25000 [00:00<00:00, 29571.48it/s]


In [15]:
vocab_df = pd.DataFrame(data=[(k, v) for k, v in vocab.items()], columns=['word','cnt'])
vocab_df.head(10)

Unnamed: 0,word,cnt
0,a,163113
1,masterpiece,613
2,of,145864
3,comedy,3246
4,horror,3591
5,romance,694
6,if,16807
7,there,18858
8,is,110502
9,anything,2947


In [16]:
vocab_df.shape

(75280, 2)

## 1) Naive-Bayes로 간단한 classifier 만들기

#### feature(단어) 선택해서 추출하기

In [24]:
hand_picked_word = 'happy,sad,gloomy,funny,beautiful,ugly,handsome,great,good,bad,awesome,worst,best'.split(',')
len(hand_picked_word)

13

In [25]:
for w in tqdm(hand_picked_word):
    df[w] = df.text.apply(lambda x: 1 if w in x.split() else 0)

100%|██████████| 13/13 [00:03<00:00,  4.01it/s]


In [26]:
df[hand_picked_word].head(5)

Unnamed: 0,happy,sad,gloomy,funny,beautiful,ugly,handsome,great,good,bad,awesome,worst,best
9424,0,0,0,0,0,0,0,1,0,1,0,0,0
4972,0,0,0,0,0,0,0,0,0,0,0,0,0
10195,0,0,0,0,0,0,0,1,0,0,0,0,0
9991,0,0,0,0,0,0,0,1,0,0,0,0,0
11735,0,0,0,1,0,0,0,0,1,0,0,0,1


In [27]:
df.head(5)

Unnamed: 0,text,label,happy,sad,gloomy,funny,beautiful,ugly,handsome,great,good,bad,awesome,worst,best
9424,my kids picked this out at the video store i...,0,0,0,0,0,0,0,0,1,0,1,0,0,0
4972,we as a family were so delighted with the l...,1,0,0,0,0,0,0,0,0,0,0,0,0,0
10195,this great movie has failed to register a high...,1,0,0,0,0,0,0,0,1,0,0,0,0,0
9991,the performance by om puri smita patil and s...,1,0,0,0,0,0,0,0,1,0,0,0,0,0
11735,down to earth is the best movie it is so fu...,1,0,0,0,1,0,0,0,0,1,0,0,0,1


In [28]:
df.shape

(25000, 15)

In [29]:
features = hand_picked_word

In [30]:
n_train = int(df.shape[0] * 0.8)
train_df = df[:n_train]
test_df = df[n_train:]
train_df.shape, test_df.shape

((20000, 15), (5000, 15))

#### RandomForestClassifier와 AdaBoostClassifier 학습하기

In [31]:
m1 = RandomForestClassifier(n_estimators=11)

In [32]:
m1.fit(train_df[features], train_df['label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=11,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
pred = m1.predict(test_df[features])
rec = recall_score(test_df['label'], pred)
pre = precision_score(test_df['label'], pred)
auc = roc_auc_score(test_df['label'], pred)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

rec: 0.6151
pre: 0.6587
auc: 0.6486


In [34]:
m2 = AdaBoostClassifier(n_estimators=100, random_state=0)

In [35]:
m2.fit(train_df[features], train_df['label'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=0)

In [36]:
pred = m2.predict(test_df[features])
rec = recall_score(test_df['label'], pred)
pre = precision_score(test_df['label'], pred)
auc = roc_auc_score(test_df['label'], pred)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

rec: 0.6039
pre: 0.6708
auc: 0.6541


#### 두 개의 모델 ensemble 하기

In [37]:
prob_1 = m1.predict_proba(test_df[features])
prob_2 = m2.predict_proba(test_df[features])
prob = (prob_1 + prob_2) / 2

In [38]:
pred = (prob[:,1]>0.5).astype(int)
rec = recall_score(test_df['label'], pred)
pre = precision_score(test_df['label'], pred)
auc = roc_auc_score(test_df['label'], pred)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

rec: 0.6151
pre: 0.6589
auc: 0.6488


## 2) deep leanrning + machine learning approach

#### 워드벡터 만들기

In [39]:
sentences = list(df.text.apply(lambda x: x.split()))

In [40]:
%%time
w2v = Word2Vec(sentences)

CPU times: user 32.3 s, sys: 67.9 ms, total: 32.4 s
Wall time: 11.7 s


#### 워드벡터를 이용해서 문장을 벡터화하기

In [41]:
def generate_vector(text):
    text = text.split()
    vector = []
    for word in text:
        try:
            vector.append(w2v[word])
        except KeyError:
            pass
    vector = np.array(vector)
    return vector.mean(axis=0)

In [42]:
#generate_vector('i am so happy with it')
x = df.text.progress_apply(generate_vector).values.tolist()
x = np.array(x).reshape((len(x), 100))
y = df.label

  
100%|██████████| 25000/25000 [00:20<00:00, 1205.37it/s]


In [43]:
x.shape, y.shape

((25000, 100), (25000,))

In [44]:
n_train = int(len(x) * 0.8)
train_x = x[:n_train]
train_y = y[:n_train]
test_x = x[n_train:]
test_y = y[n_train:]

#### 새로운 피처를(워드벡터) 이용해서 RandomForestClassifier와 AdaBoostClassifier 만들기

In [45]:
m1 = RandomForestClassifier(n_estimators=11)

In [46]:
m1.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=11,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [47]:
pred = m1.predict(test_x)
rec = recall_score(test_y, pred)
pre = precision_score(test_y, pred)
auc = roc_auc_score(test_y, pred)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

rec: 0.7765
pre: 0.7461
auc: 0.7564


In [48]:
m2 = AdaBoostClassifier(n_estimators=100, random_state=0)

In [49]:
m2.fit(train_x, train_y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=0)

In [50]:
pred = m2.predict(test_x)
rec = recall_score(test_y, pred)
pre = precision_score(test_y, pred)
auc = roc_auc_score(test_y, pred)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

rec: 0.8022
pre: 0.7933
auc: 0.7968


In [51]:
prob_1 = m1.predict_proba(test_x)
prob_2 = m2.predict_proba(test_x)
prob = (prob_1 + prob_2) / 2

In [52]:
pred1 = (prob[:,1]>0.5).astype(int)

In [53]:
rec = recall_score(test_y, pred1)
pre = precision_score(test_y, pred1)
auc = roc_auc_score(test_y, pred1)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

rec: 0.7765
pre: 0.7461
auc: 0.7564


## 3) only deep learning approach

#### vocabulary 선택하기 및 패딩/UNK 추가하기

In [54]:
my_vocab = vocab_df[vocab_df.cnt>5].word.values
my_vocab_dict = {v:i+1 for i,v in enumerate(my_vocab)}

In [55]:
len(my_vocab_dict)

26416

In [56]:
my_vocab_dict['UNK'] = len(my_vocab) + 1
my_vocab_dict['PAD'] = 0

#### 모델 정의하기

In [57]:
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

In [58]:
def real_length(batches):
    padidx = my_vocab_dict['PAD']
    reallen_list = [(batch.tolist()+[padidx]).index(padidx) for batch in batches]
    reallen_list = np.array(reallen_list)
    return reallen_list

def load_embeddings(word2index, n_dimen):
    word_embeddings = {}
    for word in word2index:
        word_embeddings[word] = np.random.uniform(-0.25, 0.25, n_dimen)
    return word_embeddings

In [59]:
class TextCNNBIRNN(object):
    def __init__(self, hdnsize, maxword, mxpsize, n_class, n_dimen, fltsize, n_filts, vocab_size, l2regld=0.0):

        # input placeholders
        self.input_x = tf.placeholder(tf.int32, [None, maxword], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, n_class], name='input_y')
        self.keep_pr = tf.placeholder(tf.float32, name='keep_pr')
        self.n_input = tf.placeholder(tf.int32, [], name='n_input')
        self.reallen = tf.placeholder(tf.int32, [None], name='reallen')
        self.padding = tf.placeholder(tf.float32, [None, 1, n_dimen, 1], name='padding')

        self.a = tf.Variable(tf.random_uniform([vocab_size, n_dimen], -0.1, 0.1), name='embedded_matrix')
        self.b = tf.nn.dropout(self.a, self.keep_pr, name='embedded_dropout_matrix')
        self.c = tf.nn.embedding_lookup(self.b, self.input_x, name='embedded_w')
        self.embedded_w = tf.expand_dims(self.c, -1, name='embedded_expanded_w')

        reduced = np.int32(np.ceil((maxword) * 1.0 / mxpsize))

        # construct TextCNN
        pooled_outputs = []
        for flt in fltsize:
            # zero paddings
            num_prio = (flt-1) // 2
            num_post = (flt-1) - num_prio
            pad_prio = tf.concat([self.padding] * num_prio, 1)
            pad_post = tf.concat([self.padding] * num_post, 1)
            #pad_prio = tf.concat([padding] * num_prio, 1)
            #pad_post = tf.concat([padding] * num_post, 1)
            emb_pad = tf.concat([pad_prio, self.embedded_w, pad_post], 1)

            # convolution
            filter_shape = [flt, n_dimen, 1, n_filts]
            conv_w = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='conv_w')
            conv_b = tf.Variable(tf.constant(0.1, shape=[n_filts]), name='conv_b')
            conv = tf.nn.conv2d(emb_pad, conv_w, strides=[1, 1, 1, 1], padding='VALID', name='conv')

            # relu activation
            h = tf.nn.relu(tf.nn.bias_add(conv, conv_b), name='relu')

            # maxpooling over the outputs
            pooled = tf.nn.max_pool(
                    h, ksize=[1, mxpsize, 1, 1],\
                    strides=[1, mxpsize, 1, 1],\
                    padding='SAME', name='pool'
            )
            pooled = tf.reshape(pooled, [-1, reduced, n_filts])
            pooled_outputs.append(pooled)

        pooled_outputs = tf.concat(pooled_outputs,2)
        self.pooled_outputs = tf.nn.dropout(pooled_outputs, self.keep_pr, name='pooled_outputs')

        # construct Bidirectional LSTM
        lstm_fwcell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hdnsize)
        lstm_bwcell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hdnsize)
        #lstm_fwcell = tf.contrib.rnn.BasicLSTMCell(num_units=hdnsize)
        #lstm_bwcell = tf.contrib.rnn.BasicLSTMCell(num_units=hdnsize)
        lstm_fwcell = tf.nn.rnn_cell.DropoutWrapper(lstm_fwcell, output_keep_prob=self.keep_pr)
        lstm_bwcell = tf.nn.rnn_cell.DropoutWrapper(lstm_bwcell, output_keep_prob=self.keep_pr)
        _fwinitial_state = lstm_fwcell.zero_state(self.n_input, tf.float32)
        _bwinitial_state = lstm_bwcell.zero_state(self.n_input, tf.float32)

        inputs = [tf.squeeze(input_, [1]) for input_ in\
                    tf.split(self.pooled_outputs,num_or_size_splits=int(reduced),axis=1)]

        self.outputs, _, _ = tf.nn.static_bidirectional_rnn(\
            lstm_fwcell, lstm_bwcell,\
            inputs,\
            initial_state_fw=_fwinitial_state,
            initial_state_bw=_bwinitial_state,
            dtype=tf.float32
        )

        # collect the appropriate last words into variable output (dimension = batch x n_dimen)
        output = self.outputs[0]
        with tf.variable_scope('Output'):
            tf.get_variable_scope().reuse_variables()
            one = tf.ones([1, hdnsize*2], tf.float32, name='one')
            for i in range(1,len(self.outputs)):
                ind = self.reallen < (i+1)
                ind = tf.to_float(ind, name='ind_tofloat')
                ind = tf.expand_dims(ind, -1, name='ind_expanded')
                mat = tf.matmul(ind, one, name='output_matmul')
                output = tf.add(tf.multiply(output, mat),tf.multiply(self.outputs[i], 1.0 - mat), name='output')

        # define l2loss
        l2loss = tf.constant(0.0)

        # define output weights and bias for prediction
        output_w = tf.Variable(tf.truncated_normal([hdnsize*2, n_class], stddev=0.1), name='output_w')
        output_b = tf.Variable(tf.constant(0.1, shape=[n_class]), name='output_b')
        l2loss = tf.nn.l2_loss(output_w) + l2loss
        l2loss = tf.nn.l2_loss(output_b) + l2loss

        # predict data
        self.scores = tf.nn.xw_plus_b(output, output_w, output_b, name='scores')
        #self.scores = tf.matmul(output, output_w, name='scores')
        self.predictions = tf.argmax(self.scores, 1, name='predictions')

        # calculate loss and accuracy
        losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y, logits=self.scores)
        #losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.input_y, logits=self.scores)
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.loss = tf.reduce_mean(losses) + l2regld*l2loss
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')

In [60]:
vocab_size = (vocab_df.cnt>5).sum()
model = TextCNNBIRNN(
    hdnsize=100,
    maxword=200,
    mxpsize=5,
    n_class=2,
    n_dimen=100,
    fltsize=[3,4,5],
    n_filts=128,
    vocab_size=vocab_size,
    l2regld=0.01)

W1017 17:18:20.060177 140220720973568 deprecation.py:506] From /usr/local/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W1017 17:18:20.060735 140220720973568 deprecation.py:506] From <ipython-input-59-fb5cb373895c>:13: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W1017 17:18:20.156929 140220720973568 deprecation.py:323] From <ipython-input-59-fb5cb373895c>:53: BasicLSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is eq

#### optimizer 정의하기

In [61]:
# define Training procedure
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.RMSPropOptimizer(0.001, decay=0.9)
grads_and_vars = optimizer.compute_gradients(model.loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

W1017 17:18:26.559751 140220720973568 deprecation.py:506] From /usr/local/lib/python3.6/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


#### 텐서플로우 초기화 및 학습/검증에 필요한 함수 정의

In [62]:
init = tf.global_variables_initializer()

In [63]:
def embed_text(text):
    text = text.split()
    emb = []
    for word in text[:200]:
        try:
            emb.append(my_vocab_dict[word])
        except KeyError:
            emb.append(my_vocab_dict['UNK'])
            
    if len(text) < 200:
        emb += [my_vocab_dict['PAD']] * (200-len(text))
    emb = np.array(emb)    
    return emb

In [64]:
n_dimen = 100
def train_model(images, labels):
    # single training step
    feed_dict = {
        model.input_x: images,
        model.input_y: labels,
        model.padding: np.zeros([len(images), 1, n_dimen, 1]),
        model.reallen: real_length(images),
        model.n_input: len(images),
        model.keep_pr: 0.6
    }
    _, step, loss, accuracy = sess.run(
        [train_op, global_step, model.loss, model.accuracy],
        feed_dict)
    time_str = datetime.datetime.now().isoformat()
    return loss, accuracy
    #print('{}: step {}, loss {:.6f}, acc {:.4f}'.format(time_str, step, loss, accuracy))

In [65]:
def dev_model(images, labels, writer=None, rwords=None):
    # evaluates model on a dev set
    feed_dict = {
        model.input_x: images,
        model.input_y: labels,
        model.padding: np.zeros([len(images), 1, n_dimen, 1]),
        model.reallen: real_length(images),
        model.n_input: len(images),
        model.keep_pr: 1.0
    }
    step, predictions = sess.run(
        [global_step, model.predictions],
        feed_dict)
    return predictions

#### 실제 학습 실행하기

In [None]:
batch_size = 256

with tf.Session() as sess:
    sess.run(init)
    
    ################################
    batch_size = 256
    start = 0
    tbar = trange(1, 2000)
    for step in tbar:
        if start >= train_df.shape[0]:
            start = start % train_df.shape[0]
        batch_x, batch_y = train_df[start:start+batch_size].text.apply(embed_text).values.tolist(), train_df[start:start+batch_size].label
        batch_x = np.array(batch_x).reshape((len(batch_x), 200))
        batch_y = np.eye(2)[batch_y]
        loss, acc = train_model(batch_x, batch_y)
        tbar.set_description('Tranining at {}: {:.4f} {:.4f}'.format(step, loss, acc))
        start += batch_size
        #break
        
    start = 0
    batch_size = 1000
    pred_list = []
    while(1):
        if start >= test_df.shape[0]:
            break
        batch_x, batch_y = test_df[start:start+batch_size].text.apply(embed_text).values.tolist(), test_df[start:start+batch_size].label
        batch_x = np.array(batch_x).reshape((len(batch_x), 200))
        batch_y = np.eye(2)[batch_y]
        pred = dev_model(batch_x, batch_y)
        pred_list += pred.tolist()
        start += batch_size
    pred_list = np.array(pred_list)
    print(pred_list)

#### 모델 검증하기

In [None]:
pred = pred_list
rec = recall_score(test_y, pred)
pre = precision_score(test_y, pred)
auc = roc_auc_score(test_y, pred)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

## 4) BERT 모델

`3) only deep learning approach`에서 GPU를 사용하기 때문에 GPU를 Free한 후 실행해야 함

In [17]:
n_train = int(df.shape[0] * 0.8)
train_df = df[:n_train]
test_df = df[n_train:]

In [18]:
tok = BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
# load config and define pretrained model
config = BertConfig(
    len(tok.vocab),
    type_vocab_size=2,
    half=False,
    gpu=0)

In [20]:
# define model
fix_torch_randomness()
model = BertForSequenceClassification(config, num_labels=2)

In [22]:
# load model
model.cuda()
state_dict = torch.load('/data/asa/imdb_bert.bin')['state_dict']
model.load_state_dict(state_dict)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [23]:
df.shape

(25000, 2)

In [24]:
test_y = test_df.label.values.copy()

In [27]:
def get_input_example(txt, lbl, i):
    return InputExample(guid=i, tokens_a=txt, tokens_b=None, label=lbl)

examples = []
for i, (txt, lbl) in enumerate(test_df[['text','label']].values):
    examples.append(get_input_example(txt, lbl, i))

In [28]:
def text_to_ids(tokens, vocab):
    ids = []
    for t in tokens:
        try:
            ids.append(vocab[t])
        except KeyError:
            ids.append(vocab['[UNK]'])
    return ids

In [34]:
def convert_imdb_example_to_features(example, max_seq_length, tokenizer, do_display=False, do_mask=False):
    pad_idx = tokenizer.vocab['[PAD]']
    cls_idx = tokenizer.vocab['[CLS]']
    sep_idx = tokenizer.vocab['[SEP]']

    tokens = example.tokens_a
    input_ids = [cls_idx] + text_to_ids(tokens.split(), tok.vocab)[:max_seq_length-2] + [sep_idx]
    segment_ids = [1]*len(input_ids)
    input_mask = [1]*len(input_ids)

    if len(input_ids) < max_seq_length:
        n_pad = max_seq_length - len(input_ids)
        input_ids = input_ids + [pad_idx]*n_pad
        segment_ids = segment_ids + [0]*n_pad
        input_mask = input_mask + [0]*n_pad
    else:
        input_ids = input_ids[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
        input_mask = input_mask[:max_seq_length]

    assert len(input_ids) == max_seq_length, 'input_ids has invalid length of {}'.format(len(input_ids))
    assert len(segment_ids) == max_seq_length, 'segment_ids has invalid length of {}'.format(len(segment_ids))
    assert len(input_mask) == max_seq_length, 'input_mask has invalid length of {}'.format(len(input_mask))

    features = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            lm_label_ids=None,
                            is_next=None,
                            label=example.label)
    return features

In [35]:
features = list(map(lambda x: convert_imdb_example_to_features(x, 100, tok), examples))

In [37]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

In [38]:
data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [39]:
dataloader = DataLoader(data, batch_size=16)

In [40]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
def validate_classifier_model(model, valid_dataloader, threshold=0.5):
    accuracy = AverageMeter()

    # transform to evaluating mode
    model.eval()

    label_list = []
    pred_list = []
    tk0 = tqdm(valid_dataloader, desc='Evaluating')
    for step, batch in enumerate(tk0):
        batch = tuple(t.cuda() for t in batch)
        input_ids, segment_ids, input_mask, labels = batch

        pred = model(
            input_ids=input_ids,
            token_type_ids=segment_ids,
            attention_mask=input_mask,
            labels=None)

        if isinstance(pred, tuple):
            _, pred = pred
        pred = torch.sigmoid(pred) > threshold

        labels = labels.cpu().numpy()
        label_list.extend(labels.astype(int))
        pred = pred.cpu().numpy()
        pred_list.extend(pred)

    return label_list, pred_list

In [41]:
labels, predictions = validate_classifier_model(model, dataloader)

Evaluating: 100%|██████████| 313/313 [00:14<00:00, 20.87it/s]


In [42]:
predictions = np.array(predictions).reshape(len(predictions), 2)
test_df['prediction'] = predictions[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [43]:
print(test_df.shape, predictions.shape)
if isinstance(test_df.index, pd.MultiIndex):
    print('re-indexed')
    test_df = test_df.reset_index(drop=True)

(5000, 3) (5000, 2)


In [44]:
test_df.head(5)

Unnamed: 0,text,label,prediction
3725,damn i thought i would seen some bad western...,0,0
10477,this could have been interesting  a japan set...,0,0
3663,n b spoilers within assigning an artistic d...,0,0
8815,all of you who despaired looking at the emptin...,1,1
1242,the last hunt is one of the few westerns ever ...,1,1


In [45]:
pred = np.array(predictions).reshape(test_df.shape[0], 2)[:,1]
test_y = test_df['label']
rec = recall_score(test_y, pred)
pre = precision_score(test_y, pred)
auc = roc_auc_score(test_y, pred)

print('rec: {:.4f}'.format(rec))
print('pre: {:.4f}'.format(pre))
print('auc: {:.4f}'.format(auc))

rec: 0.9762
pre: 0.9754
auc: 0.9760
