In [1]:
import os,sys
# Locate the Spark installation; nothing below can run without it.
spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Prepend Spark's Python sources, then the bundled py4j archive, so that the
# py4j archive ends up first on sys.path.
for relative_path in ('python', 'python/lib/py4j-0.10.3-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

# Execute PySpark's shell start-up script in this namespace (Python 2 `execfile`).
# Later cells rely on the `sc` name, presumably created by that script.
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.0.2
      /_/

Using Python version 2.7.9 (default, Jun 29 2016 13:08:31)
SparkSession available as 'spark'.


## 載入 Word2Vec Embeddings

In [2]:
class PixWord2Vec:
    """Plain attribute container for a Word2Vec model.

    Every field defaults to ``None`` at class level. The notebook reads these
    attributes from the unpickled ``pixword`` object, which is presumably an
    instance of this class (confirm against the training notebook).
    """

    # Vocabulary indexing: index -> word and word -> index lookups.
    index2word = word2indx = None

    # Raw embedding vectors.
    embeddings = None

    # Normalized embedding vectors.
    final_embeddings = None

    # Hidden layer's weight and bias.
    softmax_weights = softmax_biases = None
    


In [3]:
# This model file must be produced beforehand by training Word2Vec.
# NOTE(review): unpickling presumably requires the PixWord2Vec class from the
# previous cell to be defined first -- confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
import pickle

# Open in binary mode and close the handle deterministically; the original
# call left the file object open for the lifetime of the kernel.
with open("./pixword_cnn_word2vec.pk", "rb") as model_file:
    pixword = pickle.load(model_file)

## 資料前處理

In [21]:
import numpy as np
import random
import tensorflow as tf
import json
from pyspark import StorageLevel

In [5]:
# Number of entries in the pretrained vocabulary. Later cells use it as the
# offset at which the appended document/tag rows start in the embedding matrix.
vocabulary_size = len(pixword.index2word)
print "vocabulary_size" , vocabulary_size

vocabulary_size 500000


# 設計 Graph

In [6]:
# Inspect the pretrained embedding matrix; the graph below reshapes it to
# (vocabulary_size + append_size, 128), so it must have 128 columns.
pixword.embeddings.shape

(500000, 128)

In [94]:
import math

# Number of extra embedding rows reserved for document / tag-set vectors.
# tags2vec() rejects batches larger than this, so the batch size cannot be too large.
append_size = 1000

batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.

# Number of negative classes sampled per step by sampled_softmax_loss.
# NOTE(review): this name was used below but never defined anywhere in the
# notebook, so a fresh kernel failed with NameError. 64 is the value used by the
# TensorFlow word2vec tutorial -- confirm it matches the original experiment.
num_sampled = 64

graph = tf.Graph()

with graph.as_default():
    # Fixed seed so the randomly initialised appended rows are reproducible.
    np.random.seed(0)

    # Input data: embedding-row ids to train, and the word id each one should predict.
    train_dataset = tf.placeholder(tf.int32, shape=[None])
    train_labels = tf.placeholder(tf.int32, shape=[None, 1])

    # Variables: the pretrained rows followed by `append_size` random rows.
    embeddings = tf.Variable(np.append(pixword.embeddings,
                         np.random.randn(append_size,embedding_size)).reshape(
                             vocabulary_size+append_size,embedding_size).astype('float32'))
    # NOTE(review): the output weights are initialised from pixword.embeddings,
    # not pixword.softmax_weights, while the biases do come from
    # pixword.softmax_biases. This may be a copy-paste slip -- confirm intent.
    softmax_weights = tf.Variable(np.append(pixword.embeddings,
                         np.random.randn(append_size,embedding_size)).reshape(
                             vocabulary_size+append_size,embedding_size).astype('float32'))
    softmax_biases = tf.Variable(np.append(pixword.softmax_biases,[0]*append_size).astype('float32'))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))

    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Divide every row by its L2 norm so that dot products between rows of
    # `normalized_embeddings` are cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    init = tf.global_variables_initializer()


In [95]:
# Open a session bound to the graph built above and run its initializer op so
# every variable starts from its initial value.
session = tf.Session(graph=graph)
session.run(init)

# Build Category2Vec 

* 參考論文: https://cs.stanford.edu/~quocle/paragraph_vector.pdf
* 演算法核心概念圖
<img width="50%" src="./doc/doc2vec_concept.png">
* 基本概念說明: 將 Document (or Category or Tag Set) 也視為一個 embedding vector，而且這個 embedding vector 的概念就是在出現這些關鍵字的情況下，用來代表該 Document (or Category or Tag Set)
* 得到一個小結論
 * 在計算 Tag2Vec 時，如果想要正確表達 Tag2Vec 與原本 Vocabulary 之間的關係，原本的 final_embeddings 必須重新更新一次，其程式碼如下
```python
return (final_embeddings[vocabulary_size:vocabulary_size+index+1],final_embeddings[:vocabulary_size])
```
 * 以肉眼隨機抽樣觀察，AVG Vector 的效果似乎比 Tag2Vec 來得好，實驗結果呈現在下面的 Block 中


In [61]:
def train(batch_data,batch_labels):
    """Run one optimizer step on a batch and return the loss of that step.

    batch_data is fed to the ``train_dataset`` placeholder and batch_labels to
    the ``train_labels`` placeholder. The step runs in the module-level
    ``session``.
    """
    _, step_loss = session.run(
        [optimizer, loss],
        feed_dict={train_dataset: batch_data, train_labels: batch_labels})
    return step_loss

In [62]:
def searchByVec(vec,final_embeddings,scope=5):
    sim = np.dot(final_embeddings,vec)
    for index in sim.argsort()[-scope:][::-1][1:]:
        print pixword.index2word[index],sim[index]

# 測試 Category Vec

In [96]:
cate_vec = []
count = 0

def tags2vec(words_set, num_steps=20):
    """Learn one embedding per tag set and return it with the vocabulary embeddings.

    Tag set ``i`` is mapped to embedding row ``vocabulary_size + i`` and trained
    to predict each of its in-vocabulary words. Words missing from
    ``pixword.word2indx`` are ignored. All graph variables are re-initialised on
    every call, so results do not depend on earlier calls.

    Parameters:
        words_set: sequence of tag sets, each an iterable of words. It may hold
            at most ``append_size`` entries.
        num_steps: number of optimizer steps run on the whole batch (default 20,
            the value previously hard-coded).

    Returns:
        A tuple ``(tag_vectors, vocabulary_vectors)``, both sliced from the
        normalized embedding matrix. ``tag_vectors`` has ``len(words_set)`` rows
        and ``vocabulary_vectors`` has ``vocabulary_size`` rows.

    Raises:
        ValueError: if ``words_set`` has more entries than ``append_size``.
    """
    np.random.seed(0)
    # Reset every variable to its initial value before training this batch.
    session.run(init)
    if len(words_set) > append_size:
        # The original used a bare `raise` with no active exception, which
        # produced an unrelated error instead of a useful message.
        raise ValueError(
            'tags2vec received %d tag sets but only %d rows are reserved'
            % (len(words_set), append_size))

    cat_data = []
    cat_label = []
    for index, words in enumerate(words_set):
        for w in words:
            if w not in pixword.word2indx:
                continue
            cat_data.append(vocabulary_size + index)
            cat_label.append([pixword.word2indx[w]])

    # Skip training when there is nothing to feed (empty input, or every tag
    # out of vocabulary) instead of feeding empty lists to the graph.
    if cat_data:
        for _ in range(num_steps):
            train(cat_data, cat_label)
    final_embeddings = session.run(normalized_embeddings)

    # Use len(words_set) rather than the loop variable, which is unbound when
    # words_set is empty.
    doc_count = len(words_set)
    return (final_embeddings[vocabulary_size:vocabulary_size+doc_count],
            final_embeddings[:vocabulary_size])
    


    

In [141]:
words = [u'旅遊',u'台東']
avg_vec = np.average([pixword.final_embeddings[pixword.word2indx[w]]  for w in words],0)

for w in words:
    print "#{}#".format(w.encode('utf-8'))
    searchByVec(pixword.final_embeddings[pixword.word2indx[w]] ,pixword.final_embeddings)
print

# 單純取這此字的 Vector Mean
print "AVG Vector"
searchByVec(avg_vec,pixword.final_embeddings,scope=20)

print 


# 假設有個一 document 包含這些 tag 字 ，所產生的新的 vecotr 所找的新的關鍵字如下
print "Tag Vector"
result = tags2vec([words])
searchByVec(result[0][0],result[1],scope=20)

#旅遊#
旅行 0.658322
遊玩 0.549489
小旅行 0.481131
行時 0.457654
#台東#
花蓮 0.641304
花東 0.633475
池上 0.614713
台東市 0.612805

AVG Vector
台東 0.583593
花東 0.444601
台東旅遊 0.419452
花蓮旅遊 0.416617
旅行 0.394489
花蓮 0.390234
鹿野 0.381428
單車環島 0.373653
台東市 0.366209
臺東縣 0.357951
小旅行 0.353606
池上 0.347728
花東縱谷 0.343686
台東縣 0.340992
鹿野高台 0.340063
台東鹿野 0.332205
臺東 0.32835
日本旅行 0.327987
三仙台 0.322876

Tag Vector
同遊 0.3913
面待 0.373453
繳付 0.370212
這不喝 0.370032
盲射 0.368177
內自 0.367993
麗星 0.362303
天人峽 0.360436
低薪 0.359077
社福 0.354792
徐大英 0.349344
PowerFalcon 0.349034
床墊尺寸 0.348453
北秋田 0.346732
地铁 0.343437
英哩 0.342608
電啊 0.341504
每隊 0.341301
不壓 0.340378


In [65]:
# read raw data
def checkInVoc(tlist):
    """Return, in order, the items of ``tlist`` that are keys of ``pixword.word2indx``."""
    return [t for t in tlist if t in pixword.word2indx]
def merge(x):
    """Store ``x[1]`` under the ``'tags'`` key of ``x[0]`` and return ``x[0]``.

    ``x`` is a (document, tags) pair. The document is modified in place, and
    the same object is returned.
    """
    doc, tags = x[0], x[1]
    doc['tags'] = tags
    return doc


In [22]:
# Build (document, in-vocabulary tags) pairs from the JSON-lines test data and
# keep only documents that still have more than one tag after filtering.
test_set = (
    sc.textFile("./data/cuted_test/")
    .map(json.loads)
    .map(lambda doc: (doc, doc['tags']))
    .mapValues(checkInVoc)
    .filter(lambda pair: len(pair[1]) > 1)
)
test_set.persist(StorageLevel.DISK_ONLY)

PythonRDD[11] at RDD at PythonRDD.scala:48

In [24]:
# Delete any previous output directory before writing it again.
!rm -rvf ./data/cuted_and_tags/
import json
# Overwrite each document's 'tags' with its filtered tag list (see merge), then
# write the documents out as JSON, one per line.
test_set.map(merge).map(json.dumps).saveAsTextFile("./data/cuted_and_tags/")

In [66]:
class MySentences(object):
    """Iterable over every line of the data files inside a directory.

    Files whose name contains ``'crc'`` or starts with ``'_'`` are skipped.
    These are presumably the checksum and marker files Spark writes next to the
    part files. Lines are yielded unmodified, including the trailing newline.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # Sorted so the line order is reproducible; os.listdir order is arbitrary.
        for fname in sorted(os.listdir(self.dirname)):
            if 'crc' in fname or fname.startswith('_'):
                continue
            # `with` closes each file once it is exhausted; the original left
            # every file handle open.
            with open(os.path.join(self.dirname, fname)) as handle:
                for line in handle:
                    yield line

In [26]:
# Sanity check: number of lines (one JSON document each) in the saved directory.
sc.textFile("./data/cuted_and_tags/").count()

109486

# 開始轉換成向量

In [72]:
def toVector(docs,tags_set,f):
    """Attach a tag vector to every document and append the batch to ``f``.

    Parameters:
        docs: list of document dicts. Each one gains a ``'tag_vec'`` key holding
            a list of Python floats (modified in place).
        tags_set: list of tag lists, aligned with ``docs``, passed to ``tags2vec``.
        f: writable file-like object; one JSON document is written per line.

    Raises:
        ValueError: if ``tags2vec`` returns a different number of vectors than
            there are documents.
    """
    # An empty batch (e.g. the tail when the corpus size is an exact multiple of
    # the batch size) has nothing to embed or write.
    if not docs:
        return

    doc_vecs = tags2vec(tags_set)[0]
    if len(docs) != len(doc_vecs):
        # The original printed len() of the returned tuple (always 2) and then
        # used a bare `raise`; report the real counts instead.
        raise ValueError('expected %d tag vectors but got %d'
                         % (len(docs), len(doc_vecs)))

    # Convert to plain floats so json.dumps can serialise the vectors.
    for d, vec in zip(docs, doc_vecs):
        d['tag_vec'] = [float(i) for i in vec]
    for d in docs:
        f.write(json.dumps(d) + '\n')


In [98]:
!rm ./data/cuted_and_vec.json
f = open('./data/cuted_and_vec.json','w')
docs = []
tags_set = []
for doc in MySentences("./data/cuted_and_tags/"):

    js_objects = json.loads(doc)
    
    docs.append(js_objects)
    tags_set.append(js_objects['tags'])
    
    if len(docs) == 1000:
        toVector(docs,tags_set,f)
        docs = []
        tags_set = []
        print '*',
    
toVector(docs,tags_set,f)


* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *


In [99]:
def loadjson(x):
    """Parse one JSON string; return None when it cannot be decoded.

    Only decoding failures are swallowed: ValueError for malformed JSON and
    TypeError for a non-string input. Anything else, such as KeyboardInterrupt,
    propagates instead of being hidden by a bare ``except``.
    """
    try:
        return json.loads(x)
    except (ValueError, TypeError):
        return None

In [100]:
# Parse the vector file line by line, dropping lines that loadjson could not
# decode (it returns None for those).
jsondoc = (
    sc.textFile("./data/cuted_and_vec.json")
    .map(loadjson)
    .filter(lambda parsed: parsed is not None)
)

In [101]:
from operator import add

# Load TagVectors

In [102]:
import json
# loadjson is already defined in an earlier cell. The identical second
# definition that used to live here only shadowed it and has been removed.

# Collect every document's tag vector to the driver and stack them into one
# array, one row per document.
url_vecs = np.array(jsondoc.map(
    lambda x: np.array(x['tag_vec'])).collect())

In [103]:
# Check how many document vectors were loaded and their dimensionality.
url_vecs.shape

(109481, 128)

In [104]:
# Collect the full documents to the driver. Later cells index `urls` and
# `url_vecs` with the same position, which assumes both collects return the
# documents in the same order -- TODO confirm.
urls = jsondoc.collect()

In [130]:
def search(wvec,final_embeddings,cate):
#     wvec = final_embeddings[windex]
    sim = np.dot(final_embeddings,wvec)
    result = []
    for index in sim.argsort()[-1000:][::-1][1:]:
        if urls[index]['category'] == cate and sim[index]>0.9 :
            print urls[index]['url'],sim[index],
            for tag in urls[index]['tags']:
                print tag,
            print
    return sim

# 進行隨機抽樣驗證

In [140]:
index = np.random.randint(10000)
print urls[index]['url'],urls[index]['category'],
for tag in urls[index]['tags']:
    print tag,
print
print 
print "########以下是用 Tag Vecotr 所找出來的 URL #########"

sim = search(url_vecs[index],url_vecs,urls[index]['category'])
print 
print 

print "########以下是直接用第一個 Tag 直接作比對的結果,效果好非常多 #########"

count = 0 
for _,u in  enumerate(urls):
    for t in u['tags']:
        if t  == urls[index]['tags'][0] :
            count = count + 1
            print u['url']
            
            for tt in u['tags']:
                print tt,
            print 
            break
    if count > 500 : break
        

http://eva6955.pixnet.net/blog/post/4991299 美味食記 中壢美食 大江購物中心 明太子

########以下是用 Tag Vecotr 所找出來的 URL #########
http://dream3s.pixnet.net/blog/post/172972230 0.999890313273 義大利麵 燉飯
http://tintin82477.pixnet.net/blog/post/43065037 0.999873135888 旺旺 雪餅
http://yyliu79529.pixnet.net/blog/post/21882688 0.999868960527 大巨人 鐵板燒
http://chin119.pixnet.net/blog/post/408348865 0.999868605276 桃園美食 下午茶 人妻
http://wings30222.pixnet.net/blog/post/186486225 0.999858740366 茶寮都路里 祇園
http://yenju670810.pixnet.net/blog/post/203805937 0.999856525826 高雄 大遠百 八坂丼屋
http://weiyingliao.pixnet.net/blog/post/115078261 0.999852852727 星巴克 咖啡 艋舺
http://emily710223.pixnet.net/blog/post/422684488 0.999847399745 台中美食 SweetsPURE
http://kelly09308250.pixnet.net/blog/post/219993081 0.999844465043 台東 美食 排隊 環島
http://anitavs50.pixnet.net/blog/post/43141645-%e5%a5%bd%e5%96%9d%e5%a5%bd%e5%90%83%7e%e7%ab%b9%e5%8d%97-nina-cafe-%28%e9%9b%a2%e9%a0%ad%e4%bb%bd%e4%b9%9f%e5%a5%bd%e8%bf%91%29 0.99983957455 頭份 竹南
http://alwa1919.pixnet.net

## 結論
* 由於 TagVec 一次只能算一千筆，為求 Random Initial Weight 初始值一致，故要設定 random seed
* TagVec 效果似乎沒有那麼好，但是加上 Category 作為篩選條件，似乎有稍微改善一些
