Json Parsing

In [1]:
import copy
from pyspark.sql import SQLContext
import json

sqlContext = SQLContext(sc)
df = sqlContext.jsonFile("./spark_tutorial_article.json")

gf = df.map(lambda x : (x[2],x[5],x[12]))

print type(gf)
#spark.read.json(sc.wholeTextFiles('./spark_tutorial_article.json').values())

<class 'pyspark.rdd.PipelinedRDD'>


In [2]:
#sc.textFile("./spark_tutorial_article.json").map(json.loads).take(1)[0][u'author']

用BeautifulSoup擷取內容，並套用Jieba斷詞

In [3]:
## getContent: for input aritcle, get it own word set via jieba.cut()
def getContent(x):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(x)
    text = soup.getText().replace('\n','').replace('\r','').replace(' ','').replace('\t','')
    import jieba
    r = list()
    for term in jieba.cut(text):
        if len(term) > 1 and checkword(term): r.append(term)
    return r

def checkword(x):
    return all(u'\u4e00' <= c <= u'\u9fff' for c in x)

In [4]:
text_token = gf.map(lambda x: (x[0], getContent(x[1]), x[2]))

In [5]:
#check text_token
#text_token.first()
#text_token.count()

計算每篇文章的TF-IDF Vector

In [6]:
def cal_tf(tokens):
    d = {}
    for word in tokens:
        if not word in d:
            d[word] = 1
        else:
            d[word] += 1
    for word in d:
        d[word] = float(d[word])/len(tokens)
    return d

text_token_tf = text_token.map(lambda x: cal_tf(x[1]))

In [7]:
#check text_token_tf
#text_token_tf.first()

In [8]:
def cal_idf(docs):
    N = docs.count()
    uniqueTokens = docs.map(lambda x : list(set(x[1])))
    token_sum_tuples = uniqueTokens.flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x,y: x+y)
    return token_sum_tuples.map(lambda x : (x[0], float(N)/x[1]))

In [9]:
def TFIDF(tokens, idfs):
    tfs = cal_tf(tokens)
    for tk in tfs:
        tfs[tk] = tfs[tk]*idfs[tk]
    tfidf_Dict = tfs
    return tfidf_Dict

In [10]:
doc_idfs = cal_idf(text_token)

doc_c = doc_idfs.collectAsMap()  #my idf dict

text_tfidf =  TFIDF(text_token.collect()[0][1], doc_c)

print text_token.collect()[0][0]

美味食記


In [11]:
#check text_tfidf
#text_tfidf
#text_token.collect()[0][1]

Cosine similarity

In [12]:
import math

def dotprod(a, b):
    dotsum = 0
    for tk in a:
        if tk in b:
            dotsum += a[tk]*b[tk]
    return dotsum

def norm(a):
    return math.sqrt(dotprod(a,a))

def cossim(a, b):
    return dotprod(a,b)/(norm(a) * norm(b))

In [13]:
def cosineSimilarity(string1, string2, idfsDictionary):
    w1 = tfidf(string1, idfsDictionary)
    w2 = tfidf(string2, idfsDictionary)
    return cossim(w1, w2)

Rule One - top words in a text

In [14]:
def showTopWord(link):
    tokens = text_token.filter(lambda x: x[2] == link).collect()[0][1]
    tokens_weights = TFIDF(tokens, doc_c)
    print type(tokens_weights)
    tokens_weights_sorted = sorted(tokens_weights, key=tokens_weights.get, reverse=True)
    for index in range(0,9):
        print tokens_weights_sorted[index], tokens_weights[tokens_weights_sorted[index]]
    print tokens_weights_sorted[:14]
    return tokens_weights_sorted[:14]

In [33]:
link = u'http://lovecc6.pixnet.net/blog/post/73513867'
#showTopWord(link)

In [16]:
urls = text_token.map(lambda x : x[2])

#top_word_list = [showTopWord(i) for i in urls]
#top_word_list = urls.map(lambda x: showTopWord(x))
#top_word_list

Rule Two - Query in text

In [17]:
query_input = [u'淺草', u'雷門', u'和服']

def check_in(query, text):
    count = 0
    for q in query:
        if q in text:
            count += 1
    return count

def query_points(query):
    query_points_table = text_token.map(lambda x : check_in(query, x[1]))
    return query_points_table

In [18]:
query_pts = query_points(query_input).collect()

len(query_pts)

2228

Rule 3 - Term Weights

In [19]:
def term_weights(tokens):
    d = {}
    for word in tokens:
        if not word in d:
            d[word] = 1
        else:
            d[word] += 1
    return d

In [20]:
def term_points(query, point_dict):
    points = 0
    for i in query:
        if i in point_dict:
            points += point_dict[i]
                
    return points

tf_list = text_token.map(lambda x : term_weights(x[1])).collect()

In [21]:
term_pts = [term_points(query_input, i) for i in tf_list]
len(term_pts)

2228

In [22]:
def doc_points(term_weight_pts, query_pts):
#    tw_dict = text_token.map(lambda x: term_weights(x[1])).collect()
#    doc_point = text_token.map(lambda x : (((term_points(query_input, tw_dict))*(check_in(query_input, x[1])) , x[2])))
    doc_point = [i*j for i,j in zip(term_weight_pts, query_pts)]
    
    return doc_point

In [23]:
total_pts = doc_points(term_pts, query_pts)

In [32]:
#print type(total_pts)
total_pts_sort = sorted(total_pts, reverse=True)
#total_pts_sort

In [25]:
url_list = text_token.map(lambda x : (x[2]))

In [29]:
#print url_list 
#url_list.collect()

In [30]:
top_text = zip(total_pts_sort, url_list.collect())
#print top_text
top_text.sort(key=lambda x: x[0], reverse=True)

In [31]:
top_text[:10]

[(8, u'http://louis740321.pixnet.net/blog/post/373737533'),
 (3, u'http://jennifersec7.pixnet.net/blog/post/344940716'),
 (2, u'http://jill7708.pixnet.net/blog/post/236934860'),
 (2, u'http://castor0605.pixnet.net/blog/post/371068016'),
 (2, u'http://lovecc6.pixnet.net/blog/post/73513867'),
 (2, u'http://jende168.pixnet.net/blog/post/74962897'),
 (2,
  u'http://merecat.pixnet.net/blog/post/222227141-%e5%96%ab%ef%bc%8a%e5%8f%b0%e5%8c%97%ef%bc%8a%e5%96%9c%e4%be%86%e7%99%bb%e2%80%a7kitchen12%ef%bd%9e%e7%94%9c%e9%a3%9f%e7%9a%84%e5%a4%a9%e5%a0%82%e2%97%95%e2%80%bf'),
 (1, u'http://vilo92.pixnet.net/blog/post/434651582'),
 (1, u'http://abby0318.pixnet.net/blog/post/370358651'),
 (1, u'http://findse.pixnet.net/blog/post/426084965')]