# Parse JSON

In [1]:
def parseRaw(json_map):
    """Reduce one decoded JSON record to the two fields used downstream.

    Args:
        json_map: mapping that provides the 'url' and 'html' keys.

    Returns:
        A ``(url, html)`` tuple, in that order.

    Raises:
        KeyError: if 'url' or 'html' is absent from ``json_map``.
    """
    return json_map['url'], json_map['html']

# 使用 BeautifulSoup 及 Jieba 來處理文章內容

In [2]:
def getContent(x):
    """Extract the Chinese terms found in an HTML document.

    The markup is stripped with BeautifulSoup, newlines / carriage returns /
    spaces / tabs are removed from the remaining text, and the text is
    segmented with jieba. Only terms longer than one character that pass
    ``checkword`` are kept.

    Args:
        x: HTML source of one page.

    Returns:
        A list of the retained terms, in the order jieba produced them
        (duplicates are kept).
    """
    # Imports stay inside the function as in the original; presumably so the
    # modules are imported wherever Spark runs this function -- TODO confirm.
    from bs4 import BeautifulSoup
    import jieba

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so the extracted text can differ between machines.
    soup = BeautifulSoup(x, 'html.parser')
    text = soup.get_text().replace('\n', '').replace('\r', '').replace(' ', '').replace('\t', '')
    return [term for term in jieba.cut(text) if len(term) > 1 and checkword(term)]
def checkword(x):
    """Tell whether every character of ``x`` lies in U+4E00..U+9FFF.

    Args:
        x: the (unicode) string to inspect.

    Returns:
        True when no character falls outside the range -- which includes the
        empty string -- and False as soon as one character does.
    """
    for ch in x:
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True

# 載入原始 RAW Data

In [3]:
import json
def _load_pages(path):
    """Read a text file of JSON lines into an RDD of (url, html) tuples.

    Each line is decoded with ``json.loads`` and then reduced by ``parseRaw``.
    ``sc`` is not defined in the visible cells; presumably it is the
    SparkContext supplied by the notebook environment -- TODO confirm.
    """
    return sc.textFile(path).map(json.loads).map(parseRaw)


# One RDD per category, both built by the same pipeline.
travel_content = _load_pages("./pixnet.txt")
makeup_content = _load_pages("./makeup.txt")

# 建立詞庫對照表

In [4]:
# Segment every page and flatten straight into one RDD of terms per corpus
# (flatMap does the map-then-flatten of the term lists in a single step).
tr_terms = travel_content.flatMap(lambda page: getContent(page[1]))
mk_terms = makeup_content.flatMap(lambda page: getContent(page[1]))

# De-duplicate across both corpora and bring the vocabulary back to the driver.
all_terms = tr_terms.union(mk_terms).distinct().collect()

# term -> feature index, numbered in the order collect() returned the terms.
# Built with enumerate so no scratch counter or loop variable is left behind
# in the notebook namespace.
all_terms_map = {term: position for position, term in enumerate(all_terms)}
all_terms_map

{u'\u65c5\u820d': 782,
 u'\u541b\u9d3b\u570b\u969b': 643,
 u'\u897f\u65bd': 3065,
 u'\u767d\u6885': 3,
 u'\u5bae\u8cde\u6afb': 4,
 u'\u798f\u5bb9': 3435,
 u'\u897f\u65b9': 2440,
 u'\u5206\u5e97': 3642,
 u'\u4e0a\u9996': 2442,
 u'\u904a\u8eca\u6cb3': 41,
 u'\u5305\u88dd': 8,
 u'\u56de\u6709': 1267,
 u'\u5275\u610f': 2448,
 u'\u6478\u5f69': 3651,
 u'\u89c0\u5149\u884c': 10,
 u'\u5de6\u4eac\u5340': 1274,
 u'\u5730\u9435\u8c37': 3655,
 u'\u9084\u6c92': 842,
 u'\u6557\u5bb6\u6587': 1276,
 u'\u670d\u98fe\u5e97': 3658,
 u'\u9019\u5929': 16,
 u'\u4e00\u6a23': 1279,
 u'\u8c93\u7ad9': 2458,
 u'\u5742\u5167': 3659,
 u'\u6dfa\u8349': 1281,
 u'\u5e36\u6709': 1284,
 u'\u4eac\u90fd\u5e02': 2462,
 u'\u5f88\u8cb4': 2465,
 u'\u653e\u5165': 2049,
 u'\u91ce\u7dda': 2467,
 u'\u6210\u5206': 227,
 u'\u4e0d\u5f97\u4e0d': 3130,
 u'\u5929\u969b\u7dda': 1293,
 u'\u96d9\u8272': 4760,
 u'\u7b2c\u4e8c': 2473,
 u'\u53ef\u4ee3\u8cfc': 1896,
 u'\u97d3\u7cfb': 233,
 u'\u53e4\u57ce': 996,
 u'\u5403\u4e0d\u5b8c': 3676,
 

In [5]:
from pyspark.mllib.linalg import Vectors ,SparseVector
from pyspark.mllib.regression import LabeledPoint

def mapFeature(terms):
    """Count how often each known term occurs, keyed by its feature index.

    Args:
        terms: iterable of terms; it is walked exactly once.

    Returns:
        A dict mapping feature index (looked up in the module-level
        ``all_terms_map``) to the number of occurrences. Terms missing from
        ``all_terms_map`` are ignored.
    """
    counts = dict()
    for word in terms:
        if word in all_terms_map:
            slot = all_terms_map[word]
            counts[slot] = counts.get(slot, 0) + 1
    return counts

def buildFeature(label,terms):
    """Wrap the term counts of one document in a LabeledPoint.

    Args:
        label: class label to attach to the document.
        terms: iterable of terms belonging to the document.

    Returns:
        A LabeledPoint whose features are a SparseVector of size
        ``len(all_terms_map)`` holding the counts from ``mapFeature``.
    """
    counts = mapFeature(terms)
    dimension = len(all_terms_map)
    return LabeledPoint(label, SparseVector(dimension, counts))

def _to_labeled_points(pages, label):
    """Map an RDD of (url, html) tuples to LabeledPoints carrying ``label``."""
    return pages.map(lambda page: buildFeature(label, getContent(page[1])))


# Label 0 marks the travel corpus, label 1 the makeup corpus.
tr_fs = _to_labeled_points(travel_content, 0)
mk_fs = _to_labeled_points(makeup_content, 1)

In [6]:
# Combine both labelled corpora into the training set. It is cached because
# it is evaluated more than once (count() here, NaiveBayes.train later);
# without caching each action would redo the HTML parsing and segmentation.
all_fs = tr_fs.union(mk_fs).cache()
all_fs.count()

10

# 建立 Naive Bayes Classifier

In [7]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
# Second positional argument of NaiveBayes.train; per the MLlib API this is
# the smoothing parameter (lambda_).
smoothing = 1.0
model = NaiveBayes.train(all_fs, smoothing)

# 來問一下電腦吧，這個句子是哪一類的？

In [8]:
import jieba
# Materialise the tokens: jieba.cut returns a one-shot generator, which would
# be empty if `doc` were reused after mapFeature has consumed it. This also
# matches how the next prediction cell builds `doc`.
doc = list(jieba.cut("我想要去馬來西亞來去旅遊"))
f = SparseVector(len(all_terms_map), mapFeature(doc))
# Label 1 was given to the makeup corpus and label 0 to the travel corpus
# when the training features were built.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
if model.predict(f) == 1:
    print("這是美妝類")
else:
    print("這是旅遊類")

Building prefix dict from /usr/local/lib/python2.7/dist-packages/jieba/dict.txt ...
DEBUG:jieba:Building prefix dict from /usr/local/lib/python2.7/dist-packages/jieba/dict.txt ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.303 seconds.
DEBUG:jieba:Loading model cost 0.303 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


這是旅遊類


In [9]:
# Segment the query sentence and project it onto the training vocabulary.
doc = list(jieba.cut("我想要買化妝品，且變漂亮"))
f = SparseVector(len(all_terms_map), mapFeature(doc))
# Label 1 was given to the makeup corpus; any other prediction is reported
# as travel.
print("這是美妝類" if model.predict(f) == 1 else "這是旅遊類")

這是美妝類
