In [2]:
from __future__ import division

In [3]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

In [4]:
import numpy as np
import operator
import time
import glob

In [5]:
from random import shuffle

In [6]:
from collections import defaultdict

### 計算 POS, NEG 詞頻，觀察一下訓練與測試資料（都 random shuffle 過了）

#### Training data (POS)

In [7]:
train_size = 0

with open('training-data/train_POS.txt') as f:
    words = []
    for line in f:
        train_size += 1
        words += utils.to_unicode(line).split()

vocab = defaultdict(lambda: 0)
for word in words:
    vocab[word] += 1

train_POS_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

# for key, val in train_POS_vocab:
#     print key, val

#### Training data (NEG)

In [8]:
with open('training-data/train_NEG.txt') as f:
    words = []
    for line in f:
        train_size += 1
        words += utils.to_unicode(line).split()

vocab = defaultdict(lambda: 0)
for word in words:
    vocab[word] += 1

train_NEG_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

# for key, val in train_NEG_vocab:
#     print key, val

#### Testing data (POS)

In [9]:
test_size = 0

with open('training-data/test_POS.txt') as f:
    words = []
    for line in f:
        test_size += 1
        words += utils.to_unicode(line).split()

vocab = defaultdict(lambda: 0)
for word in words:
    vocab[word] += 1

test_POS_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

# for key, val in test_POS_vocab:
#     print key, val

#### Testing data (NEG)

In [10]:
with open('training-data/test_NEG.txt') as f:
    words = []
    for line in f:
        test_size += 1
        words += utils.to_unicode(line).split()

vocab = defaultdict(lambda: 0)
for word in words:
    vocab[word] += 1

test_NEG_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

# for key, val in test_NEG_vocab:
#     print key, val

In [12]:
print 'POS, NEG 訓練資料各有：{} 筆。訓練資料共有：{} 筆'.format(int(train_size/2), train_size)
print 'POS, NEG 測試資料各有：{} 筆。測試資料共有：{} 筆'.format(int(test_size/2), test_size)

POS, NEG 訓練資料各有：658 筆。訓練資料共有：1316 筆
POS, NEG 測試資料各有：165 筆。測試資料共有：330 筆


### 開始訓練 doc2vec

In [13]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [14]:
sources = {
    'training-data/train_POS.txt':'TRAIN_POS',
    'training-data/train_NEG.txt':'TRAIN_NEG',
    'training-data/test_POS.txt': 'TEST_POS',
    'training-data/test_NEG.txt': 'TEST_NEG'
}

sentences = LabeledLineSentence(sources)

In [15]:
d2v = Doc2Vec(min_count=10, window=20, size=100, sample=1e-5, negative=15, workers=20)
d2v.build_vocab(sentences.to_array())

In [16]:
for epoch in range(20):
    print 'Epoch: {}'.format(epoch+1)
    start_time = time.time()
    d2v.train(sentences.sentences_perm())
    print '--- {} seconds ---'.format(time.time() - start_time)

Epoch: 1
--- 0.864768981934 seconds ---
Epoch: 2
--- 0.781358957291 seconds ---
Epoch: 3
--- 1.02417707443 seconds ---
Epoch: 4
--- 0.989632844925 seconds ---
Epoch: 5
--- 0.89649605751 seconds ---
Epoch: 6
--- 0.956152200699 seconds ---
Epoch: 7
--- 0.956073999405 seconds ---
Epoch: 8
--- 0.990180015564 seconds ---
Epoch: 9
--- 0.909914016724 seconds ---
Epoch: 10
--- 1.01475381851 seconds ---
Epoch: 11
--- 0.826818943024 seconds ---
Epoch: 12
--- 0.78685092926 seconds ---
Epoch: 13
--- 0.872048854828 seconds ---
Epoch: 14
--- 1.13171219826 seconds ---
Epoch: 15
--- 1.13373208046 seconds ---
Epoch: 16
--- 0.992563962936 seconds ---
Epoch: 17
--- 0.960308074951 seconds ---
Epoch: 18
--- 0.773078918457 seconds ---
Epoch: 19
--- 0.788988113403 seconds ---
Epoch: 20
--- 0.978390216827 seconds ---


### 看看 doc2vec 的 vocabulary (到時候要 handle unknown word)

In [19]:
VOCAB = [key for key, _ in d2v.vocab.items()]
# for vocab in VOCAB: print vocab

In [18]:
# vocab size
len(d2v.vocab.items())

1865

### 觀察 word vector 品質

In [20]:
for key, val in d2v.most_similar(u'前方'):
    print key, val

高能 0.92427611351
预警 0.840725779533
要来 0.827927708626
神曲 0.764733314514
背影 0.758471131325
非战斗 0.729398429394
离开 0.728282749653
请速 0.723563849926
人员 0.723447680473
一段 0.696435213089


### Save and load doc2vec model

In [1209]:
# d2v.save('model/doc2vec_model.d2v')
# d2v = Doc2Vec.load('model/doc2vec_model.d2v')

### Train a Classifier

In [21]:
document_dict = {} # key: document tag, value: list of words of document
for tagged_doc in sentences.to_array():
    document_dict[tagged_doc.tags[0]] = tagged_doc.words

In [62]:
# 我們改以 document 中每個 word 的 vector 乘上該字出現在該 document 的字數權重
# 再 weighted average 來表示一個 document 的 vector

def convert_words_to_vector(words):
    word_count = defaultdict(lambda: 0)
    for word in words:
        word_count[word] += 1
    doc_vec = np.zeros(d2v.vector_size)
    for word in words:
        try:
            doc_vec += d2v[word] #* (word_count[word]/len(words))
        except KeyError:
            doc_vec += d2v[u'UNK']
    return doc_vec

In [24]:
# 這樣來取得 tag 為 'TRAIN_POS_2' 的 document (同樣等於是 'TRAIN_POS.txt' 內第二個 document)
# for word in document_dict['TRAIN_POS_2']:
#     print word

### 把 training data, testing data 的 vectors, label 都各自放進 array (處理成 scikit-learn model input format)
第 0 個 document 的 vector 就是 train_arrays[0]，其對應的 label 就是 train_labels[0]

In [27]:
def get_train_test_data(option='doc2vec'):
    # key: 第幾個 document; value: document vector (100-D)
    train_arrays = np.zeros((train_size, d2v.vector_size), dtype='f')
    # POS: 1; NEG: 0
    train_labels = np.zeros(train_size)

    half_train_size = int(train_size/2)

    for i in range(half_train_size):
        prefix_train_pos = 'TRAIN_POS_' + str(i)
        prefix_train_neg = 'TRAIN_NEG_' + str(i)
        if option == 'doc2vec':
            train_arrays[i] = d2v.docvecs[prefix_train_pos]
            train_arrays[half_train_size + i] = d2v.docvecs[prefix_train_neg]
        if option == 'word2vec':
            train_arrays[i] = convert_words_to_vector(document_dict[prefix_train_pos])
            train_arrays[half_train_size + i] = convert_words_to_vector(document_dict[prefix_train_neg])
        train_labels[i] = 1
        train_labels[half_train_size + i] = 0

    test_arrays = np.zeros((test_size, d2v.vector_size))
    test_labels = np.zeros(test_size)

    half_test_size = int(test_size/2)

    for i in range(half_test_size):
        prefix_test_pos = 'TEST_POS_' + str(i)
        prefix_test_neg = 'TEST_NEG_' + str(i)
        if option == 'doc2vec':
            test_arrays[i] = d2v.docvecs[prefix_test_pos]
            test_arrays[half_test_size + i] = d2v.docvecs[prefix_test_neg]
        if option == 'word2vec':
            test_arrays[i] = convert_words_to_vector(document_dict[prefix_test_pos])
            test_arrays[half_test_size + i] = convert_words_to_vector(document_dict[prefix_test_neg])
        test_labels[i] = 1
        test_labels[half_test_size + i] = 0
    
    return (train_arrays, train_labels, test_arrays, test_labels)

### Train a classification model (need keras)

In [39]:
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Dropout
from keras.metrics import precision, recall, fmeasure

In [72]:
classifier = Sequential()
classifier.add(Dense(500, input_dim=d2v.vector_size, init='normal', activation='relu'))
classifier.add(Dropout(0.5))
classifier.add(Dense(500, init='normal', activation='relu'))
classifier.add(Dropout(0.5))
classifier.add(Dense(1, activation='sigmoid'))

classifier.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=[precision, recall, fmeasure])

In [73]:
train_arrays, train_labels, test_arrays, test_labels = get_train_test_data(option='word2vec')

classifier.fit(train_arrays, train_labels,
          nb_epoch=100,
          batch_size=128)
score = classifier.evaluate(test_arrays, test_labels, batch_size=50)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Save and load model  (need h5py)

In [42]:
classifier.save('model/classifier_model.h5')
# classifier = load_model('model/classifier_model.h5')

### Make predictions

In [1437]:
import os
import json

In [1439]:
"""
載入前面訓練好的 models
"""
print 'Loading classifier model ...'
classifier = load_model('model/classifier_model.h5')
print 'Load classifier model success.'

Loading classifier model ...
Load classifier model success.


In [1440]:
from math import floor

def secondTohhmmss(second):
  h = int(floor(second / 3600))
  m = int(floor(second % 3600 / 60))
  s = int(floor(second % 3600 % 60))
  return (str(h) if (h > 9) else '0' + str(h)) + ':' + (str(m) if (m > 9) else '0' + str(m)) + ':' + (str(s) if (s > 9) else '0' + str(s))

In [1441]:
"""
使用 './processed-script/split.py' 的 'toClip()'
依照指定的時間間隔（30秒）將處理完的彈幕切割為數個片段，將每個片段中的所有彈幕串成同一列（視為一個 document）
"""
def toClips(filename):
    interval = 30
    overlap = 15
    res = defaultdict(lambda: [])

    try:
        with open(filename) as f:
            data = json.load(f)
            for comment in data['comments']:
                time = int(comment['time'])
                # 0~30, 31~60, 61~90, ...
                if time%interval == 0 and time != 0:
                    res[time].append(comment)
                else:
                    res[(time//interval+1)*interval].append(comment)
                # 15~45, 46~75, 76~105, ...
                if time > 15:
                    if (time-overlap)%interval == 0:
                        res[time].append(comment)
                    else:
                        res[(time//interval+1)*interval + overlap].append(comment)
                        
            # Merge words to document for every comment
            document_dict = {}
            for time, comments in sorted(res.items()):
                document = ' '.join([' '.join(comment['words']) for comment in comments])
                document_dict[time] = document
            return document_dict
        
    except FileNotFoundError:
        print('Error. Missing {}.'.format(filename))
        return None

In [1477]:
"""
剛下載未處理過的彈幕(.json)必須放在 './data/' 中
使用 os.system 執行系統指令執行彈幕的前處理後，會把處理完的彈幕存到'./processed-data'

filename: 要highlight的新電影彈幕資料
preprocess_script_dirname: 放前處理程式(preprocess.js)的資料夾路徑
processed_data_dirname: 放前處理完後的彈幕資料後的資料夾路徑
"""
filename = '逃學威龍3.json'
preprocess_script_dirname = 'preprocess-script/'
processed_data_dirname = 'processed-data/'

os.system('cd {} && node preprocess.js -f {}'.format(preprocess_script_dirname, filename))

0

In [1478]:
document = toClips('{}{}'.format(processed_data_dirname, filename))

In [1495]:
highlight_time_ranges = []
for time, doc in sorted(document.items()):
    # reload the d2v model to solve different prediction output when input the same document.
    d2v = Doc2Vec.load('model/doc2vec_model.d2v')
    doc_vec = d2v.infer_vector(doc.split())
    predict = classifier.predict(np.array([doc_vec]))[0][0]
    if int(round(predict)) == 1:
        highlight_time_ranges.append((time-30, time))

In [1496]:
for start, end in highlight_time_ranges:
    print '{} -- {}'.format(secondTohhmmss(start), secondTohhmmss(end))

00:05:30 -- 00:06:00
00:06:15 -- 00:06:45
00:09:45 -- 00:10:15
00:20:30 -- 00:21:00
00:20:45 -- 00:21:15
00:21:15 -- 00:21:45
00:23:30 -- 00:24:00
00:25:30 -- 00:26:00
00:29:30 -- 00:30:00
00:36:30 -- 00:37:00
00:56:00 -- 00:56:30
00:56:15 -- 00:56:45
01:02:15 -- 01:02:45
01:04:00 -- 01:04:30
01:04:15 -- 01:04:45
01:05:30 -- 01:06:00
01:05:45 -- 01:06:15
01:14:00 -- 01:14:30
01:14:15 -- 01:14:45
01:15:00 -- 01:15:30
01:19:45 -- 01:20:15
01:23:00 -- 01:23:30
01:25:15 -- 01:25:45
