## Article Searching and Restaurant Matching

實作簡易的文章搜尋系統，以及標題餐廳推薦

## 第一部分  文章搜尋系統

### 引入 JSON 檔，並且取出所需資料（類別，URL，名稱）

In [2]:
import copy
from pyspark.sql import SQLContext
import json

# `sc` is not defined in this cell; presumably the SparkContext provided by
# the pyspark shell / notebook kernel -- TODO confirm.
sqlContext = SQLContext(sc)
# Load the article dump into a DataFrame.
df = sqlContext.jsonFile("./spark_tutorial_article.json")

# Keep columns 2, 5 and 12 of every row, selected by position. Later cells use
# them as: [0] a label that is printed for inspection, [1] the HTML body passed
# to getContent, [2] the article URL.
# NOTE(review): positional indexing depends on the inferred column order --
# confirm against the JSON schema.
gf = df.map(lambda x : (x[2],x[5],x[12]))

print type(gf)
# Alternative loader kept for reference (not executed):
#spark.read.json(sc.wholeTextFiles('./spark_tutorial_article.json').values())

<class 'pyspark.rdd.PipelinedRDD'>


In [3]:
#sc.textFile("./spark_tutorial_article.json").map(json.loads).take(1)[0][u'author']

### 用BeautifulSoup擷取內容，並套用Jieba斷詞

In [4]:
## getContent: for an input article, return its own word list via jieba.cut()
def getContent(x):
    """Tokenize the visible text of one HTML article body.

    The markup is parsed with BeautifulSoup; newlines, carriage returns,
    spaces and tabs are removed from the extracted text, and the remainder
    is segmented with ``jieba.cut``.  Only terms longer than one character
    that pass ``checkword`` are returned, in segmentation order.
    """
    # Both third-party imports stay function-local, as in the original cell.
    from bs4 import BeautifulSoup
    raw_text = BeautifulSoup(x).getText()
    for blank in ('\n', '\r', ' ', '\t'):
        raw_text = raw_text.replace(blank, '')
    import jieba
    return [term for term in jieba.cut(raw_text)
            if len(term) > 1 and checkword(term)]

def checkword(x):
    """Return True when every character of ``x`` lies in U+4E00..U+9FFF.

    An empty ``x`` yields True because there is no character to reject.
    """
    for ch in x:
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True

In [5]:
# Tokenized corpus: one (label, [jieba tokens of the body], url) tuple per article.
# This RDD feeds many separate actions in later cells (count, collect, filter,
# several maps). Without persistence each of them re-runs the BeautifulSoup +
# jieba pipeline over the whole corpus, so keep the result in memory.
text_token = gf.map(lambda x: (x[0], getContent(x[1]), x[2])).cache()

In [6]:
#check text_token
#text_token.first()
#text_token.count()

### 計算每篇文章的TF-IDF Vector

In [7]:
def cal_tf(tokens):
    """Return the relative frequency of every distinct item in ``tokens``.

    The result maps each item to ``occurrences / len(tokens)`` as a float.
    An empty ``tokens`` gives an empty dict.
    """
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    total = len(tokens)
    return {tok: float(n) / total for tok, n in counts.items()}

# Term-frequency dict of every article (token list is element 1 of each record).
text_token_tf = text_token.map(lambda x: cal_tf(x[1]))

In [8]:
#check text_token_tf
#text_token_tf.first()

In [9]:
def cal_idf(docs):
    """Build ``(token, N / document_frequency)`` pairs for a tokenized corpus.

    ``docs`` is an RDD whose records carry their token list at index 1.
    ``N`` is the number of records; a token's document frequency is the
    number of records that contain it at least once.  The ratio is returned
    as-is (no logarithm is applied) in a new RDD of ``(token, weight)``.
    """
    total_docs = docs.count()
    # De-duplicate inside each record so a token counts once per document.
    doc_frequency = (docs
                     .flatMap(lambda rec: list(set(rec[1])))
                     .map(lambda tok: (tok, 1))
                     .reduceByKey(lambda left, right: left + right))
    return doc_frequency.map(lambda pair: (pair[0], float(total_docs) / pair[1]))

In [10]:
def TFIDF(tokens, idfs):
    """Weight the term frequencies of ``tokens`` by the matching ``idfs`` entry.

    Returns a dict mapping each distinct token to ``tf * idfs[token]``, where
    ``tf`` comes from ``cal_tf``.  A token missing from ``idfs`` raises
    ``KeyError``.
    """
    return {term: tf * idfs[term] for term, tf in cal_tf(tokens).items()}

In [11]:
# IDF weights of the whole tokenized corpus, as an RDD of (token, weight).
doc_idfs = cal_idf(text_token)

# Materialize the weights on the driver so plain functions can index them.
doc_c = doc_idfs.collectAsMap()  #my idf dict

# Only the first article is inspected here, so fetch that single record once
# instead of collecting the entire tokenized corpus twice.
first_doc = text_token.first()

text_tfidf = TFIDF(first_doc[1], doc_c)

print(first_doc[0])

美味食記


In [12]:
#check text_tfidf
#text_tfidf
#text_token.collect()[0][1]

### 計算Cosine similarity

In [13]:
import math

def dotprod(a, b):
    """Return the dot product of two sparse vectors stored as dicts.

    Only keys present in both ``a`` and ``b`` contribute; the result is 0
    when they share no key.
    """
    return sum(a[tk] * b[tk] for tk in a if tk in b)

def norm(a):
    """Return the Euclidean length of the sparse vector ``a``."""
    return math.sqrt(dotprod(a, a))

def cossim(a, b):
    """Return the cosine similarity of the sparse vectors ``a`` and ``b``.

    A vector with no entries (or only zero weights) has length 0, which
    previously raised ``ZeroDivisionError``.  Such a vector has no direction,
    so the similarity is reported as 0.0 instead.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return dotprod(a, b) / denominator

In [14]:
def cosineSimilarity(string1, string2, idfsDictionary):
    """Return the cosine similarity of two documents under TF-IDF weighting.

    ``string1`` and ``string2`` are handed straight to ``TFIDF``, which
    iterates them as token sequences; ``idfsDictionary`` maps token -> IDF
    weight.

    The original body called ``tfidf`` (lower case), but the weighting
    function defined in this notebook is ``TFIDF``, so the call raised
    ``NameError``.
    """
    w1 = TFIDF(string1, idfsDictionary)
    w2 = TFIDF(string2, idfsDictionary)
    return cossim(w1, w2)

### Rule One - top words in a text

In [15]:
def showTopWord(link, shown=9, kept=14):
    """Print and return the highest-weighted TF-IDF terms of one article.

    link  -- URL of the article; the first ``text_token`` record whose URL
             equals it is used (``IndexError`` if there is none).
    shown -- how many "term weight" lines to print.
    kept  -- how many top terms to print as a list and return.

    The original indexed positions 0..8 unconditionally and therefore raised
    ``IndexError`` for an article with fewer than nine distinct terms;
    slicing prints whatever is available instead.
    """
    tokens = text_token.filter(lambda x: x[2] == link).collect()[0][1]
    tokens_weights = TFIDF(tokens, doc_c)
    print(type(tokens_weights))
    # Terms ordered by descending weight.
    tokens_weights_sorted = sorted(tokens_weights, key=tokens_weights.get, reverse=True)
    for term in tokens_weights_sorted[:shown]:
        print('%s %s' % (term, tokens_weights[term]))
    top_terms = tokens_weights_sorted[:kept]
    print(top_terms)
    return top_terms

In [16]:
# Sample article URL for trying showTopWord by hand (call left disabled).
link = u'http://lovecc6.pixnet.net/blog/post/73513867'
#showTopWord(link)

In [17]:
# URL column (element 2) of the tokenized corpus, still as an RDD.
urls = text_token.map(lambda x : x[2])


# Disabled attempts at computing the top words of every article.
# NOTE(review): showTopWord filters the text_token RDD internally, so the
# variants that call it from inside an RDD map would nest RDD operations, and
# the list-comprehension variant iterates `urls` without collecting it first --
# presumably why these were abandoned; confirm before reviving.
#top_word_list = text_token.map(lambda x : showTopWord(x[2]))
#top_word_list = [showTopWord(i) for i in urls]
#top_word_list = urls.map(lambda x: showTopWord(x))
#top_word_list

In [18]:
#top_word_list

### Rule Two - Query in text

In [19]:
# Example query terms; passed to query_points and term_points in later cells.
query_input = [u'蝦球', u'辣味', u'泰式']

def check_in(query, text):
    """Count the entries of ``query`` that are contained in ``text``.

    Each entry is tested with ``in``; repeated entries are counted every
    time they appear in ``query``.
    """
    return sum(1 for q in query if q in text)

def query_points(query):
    """Score every ``text_token`` record with ``check_in`` for ``query``.

    The token list at index 1 of each record is the text that is searched.
    Returns the mapped RDD; collecting it is left to the caller.
    """
    return text_token.map(lambda record: check_in(query, record[1]))

In [20]:
# One query-hit count per article, pulled to the driver as a list.
query_pts = query_points(query_input).collect()

# Sanity check: number of scored articles.
len(query_pts)

2228

### Rule 3 - Term Weights

In [21]:
def term_weights(tokens):
    """Return a dict of raw occurrence counts for the items of ``tokens``."""
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts

In [22]:
def term_points(query, point_dict):
    """Add up the ``point_dict`` values of the ``query`` terms it contains.

    Terms absent from ``point_dict`` contribute nothing; a term repeated in
    ``query`` is added each time it appears.
    """
    return sum(point_dict[term] for term in query if term in point_dict)

# Raw term-count dict of every article, collected to the driver as a list.
tf_list = text_token.map(lambda x : term_weights(x[1])).collect()

In [23]:
# Per-article sum of the query terms' occurrence counts.
term_pts = [term_points(query_input, i) for i in tf_list]
# Sanity check: one score per article.
len(term_pts)

2228

### 計算文章分數

In [24]:
def doc_points(term_weight_pts, query_pts):
    """Multiply the two score sequences element by element.

    Pairs are formed positionally with ``zip``, so the result is as long as
    the shorter input.  Returns a new list.
    """
    products = []
    for weight_score, hit_score in zip(term_weight_pts, query_pts):
        products.append(weight_score * hit_score)
    return products

In [25]:
# URL of every article (same projection as `urls` in the earlier cell).
url_list = text_token.map(lambda x : (x[2]))

# Pair each article score with its URL.
# NOTE(review): the pairing is purely positional -- it assumes every collect()
# on an RDD derived from text_token returns records in the same order; confirm.
total_pts = zip(doc_points(term_pts, query_pts) , url_list.collect())

In [26]:
#print type(total_pts)
# Highest score first; tuples compare by score, then by URL for equal scores.
total_pts_sort = sorted(total_pts, reverse=True)
#total_pts_sort

In [27]:
# Show the ten best-scoring (score, url) pairs.
total_pts_sort[:10]

[(60, u'http://changfong.pixnet.net/blog/post/40749658'),
 (34, u'http://evisko.pixnet.net/blog/post/258052708'),
 (29, u'http://justnike.pixnet.net/blog/post/61919500'),
 (28, u'http://wonderfood.pixnet.net/blog/post/198089649'),
 (28, u'http://lemonadellen.pixnet.net/blog/post/32114431'),
 (25, u'http://changfong.pixnet.net/blog/post/41828851'),
 (24,
  u'http://sedo888.pixnet.net/blog/post/341765034-%5b%e5%8f%b0%e5%8c%97%5d-%e5%96%9c%e4%be%86%e7%99%bb%e5%a4%a7%e9%a3%af%e5%ba%97-%e2%80%a7-sukhothai%e8%98%87%e5%8f%af%e6%b3%b0%e6%b3%b0%e5%bc%8f%e6%96%99'),
 (24, u'http://lemonadellen.pixnet.net/blog/post/40637716'),
 (20, u'http://protozoa.pixnet.net/blog/post/29765279'),
 (20, u'http://infinite520visa.pixnet.net/blog/post/148834846')]

## 第二部分 - 實作餐廳的 matching

### 用函式尋找完全配對，回傳分數

In [28]:
def exact_match(restaurant, title):
    """Return 1 if ``restaurant`` occurs in ``title``, otherwise 0.

    The two locals the original assigned (``exact_match_flag`` and
    ``exact_match_pts``) were never read and have been removed.
    """
    return 1 if restaurant in title else 0

### 將標題斷詞，剔除不需要字元

In [29]:
def title_checkword(x):
    """Return True when ``x`` holds only CJK ideographs, ASCII letters or digits.

    Accepted characters are U+4E00..U+9FFF, ``A``-``Z``, ``a``-``z`` and
    ``0``-``9``.  An empty ``x`` is accepted.
    """
    allowed_ranges = ((u'\u4e00', u'\u9fff'), ('A', 'Z'), ('a', 'z'), ('0', '9'))
    for ch in x:
        if not any(low <= ch <= high for low, high in allowed_ranges):
            return False
    return True

def cut_title(title):
    """Segment ``title`` with jieba and keep the terms ``title_checkword`` accepts.

    Returns the accepted terms as a list, in segmentation order.
    """
    import jieba  # kept function-local, as in the original cell

    return [term for term in jieba.cut(title) if title_checkword(term)]

In [30]:
# (column 11, column 12) of every row; column 11 is what later cells tokenize
# as the article title, and column 12 is the one used as the URL earlier.
# NOTE(review): positional column indices -- confirm against the JSON schema.
gcf = df.map(lambda x : (x[11],x[12]))

# The value of this expression is discarded (it is not the last line of the cell).
gcf.first() 
# Tokenized title of every article.
gdf = gcf.map(lambda x : cut_title(x[0]))
print gdf.first()
#def title_scoring():
#print gcf.flatMap(lambda x: x).collect()

[u'\u559c\u4f86', u'\u767b', u'\u4e4b', u'\u5341\u4e8c', u'\u5eda', u'All', u'u', u'can', u'eat']


In [31]:
# Tokenize the first title on the driver (this is what triggers the jieba
# dictionary-loading log lines shown in the output).
cut_title(gcf.first()[0])

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.279 seconds.
DEBUG:jieba:Loading model cost 0.279 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


[u'\u559c\u4f86',
 u'\u767b',
 u'\u4e4b',
 u'\u5341\u4e8c',
 u'\u5eda',
 u'All',
 u'u',
 u'can',
 u'eat']

In [32]:
# Tokenized titles of the whole corpus (same transformation as `gdf` above).
all_cut_title = gcf.map(lambda x : cut_title(x[0]))

In [33]:
# Inspect the first tokenized title.
all_cut_title.first()

[u'\u559c\u4f86',
 u'\u767b',
 u'\u4e4b',
 u'\u5341\u4e8c',
 u'\u5eda',
 u'All',
 u'u',
 u'can',
 u'eat']

### 將斷過的詞兩兩相接

In [34]:
def bio_wordset(words):
    """Join every pair of adjacent items in ``words`` into one string.

    ``['a', 'b', 'c']`` becomes ``['ab', 'bc']``; fewer than two items give
    ``[]``.  Input that cannot be sliced or concatenated (for example
    ``None``) also gives ``[]``, as before.

    Changes from the original: the bare ``except`` is narrowed to the errors
    slicing and concatenation can raise, so unrelated failures are no longer
    hidden; and the pairs are built eagerly with ``zip``, so a real list is
    returned and any error surfaces inside the ``try`` on Python 3 as well
    as Python 2.
    """
    try:
        return [left + right for left, right in zip(words[:-1], words[1:])]
    except (TypeError, UnicodeError):
        return []

In [35]:
#gcf2 = map(lambda x, y: x[0]+y[0], )
# Try bio_wordset on the tokens of the first title.
strstr = cut_title(gcf.first()[0])
print type(strstr)
strstrstr = bio_wordset(strstr)
print strstrstr

<type 'list'>
[u'\u559c\u4f86\u767b', u'\u767b\u4e4b', u'\u4e4b\u5341\u4e8c', u'\u5341\u4e8c\u5eda', u'\u5edaAll', u'Allu', u'ucan', u'caneat']


In [36]:
# Adjacent-token bigrams of every tokenized title.
biogram_title = all_cut_title.map(lambda title: bio_wordset(title))

In [37]:
import pandas as pd

#res_data = sc.textFile('./restaurant.csv').map(lambda line: line.split(',')).map(
#    lambda line: line[2]).collect()[1:]

# Restaurant names: third comma-separated field of each line, truncated at the
# first '-', with the first collected element dropped (presumably the header
# row -- confirm).
# NOTE(review): a plain split(',') does not understand quoted CSV fields. The
# name list printed below contains an entry that starts with a stray '"',
# which suggests a quoted field was cut in half; consider a real CSV parser.
res = sc.textFile('./restaurant.csv').map(lambda line: line.split(',')).map(
    lambda line: line[2]).map(lambda line: line.split('-')[0]).collect()[1:]

# De-duplicate the names; set() does not keep the file order.
res_data = list(set(res))

In [38]:
#for i in res_data:
#    print i

HOWFUN 好飯食堂 (台北內湖店)
NakedFood 裸食私廚
HERDOR Tea House
凱越越南風味火鍋
漁聞樂
夏慕尼(台北中山北店)
淡水榕堤水灣餐廳
參和院 台灣風格飲食 (桃園華泰店)
沐 創作 季節料理
極野宴燒肉專門店 (大食代)
NAPOLI’S PIZZA & COFFE
陶板屋 (豐原向陽店)
帕莎蒂娜義大利屋
寬心園精緻蔬食料理(文心公益店)
石頭日式燒肉(漢口總館)
陶板屋 (台中大里德芳南店)
欣葉小聚今品 (環球店)
饗樂 Shabu 精緻鍋品
小蒙牛頂級麻辣養生鍋(內湖店)
添好運(高雄漢神巨蛋店)
Osteria by Angie（竹北店）
蝦老爹 The Shrimp Daddy
Origines
香頌私宅洋樓
MVSA Spanish Restaurant & Bar 沐紗西班牙酒莊餐廳
品花苑
Osteria by angie (光復店)
富呷一方 健康料理 (板橋新埔店)
水舞饌
小蒙牛頂級麻辣養生鍋(家樂福鳳山店)
MW 時尚義法料理&酒品(大直店)
炭火工廠96 C_Factory96
一膳食堂 (微風信義店)
西堤牛排 (台北復興南店)
西堤牛排 (草屯碧山店)
The Green ROOM
黑浮咖啡 (高雄楠梓加盟店)
FORE restaurant
L’Atelier de Patrick 法式派翠克餐廳
遠東cafe 
西堤牛排 (新莊新泰店)
Beluga restaurant & bar
"栢金 Birking
食蔬茶齋 · 蔬食料理
舞蔬弄果(竹北店)
Que 原木燒烤餐廳 
丸本陣 鮨割烹 日式景觀餐廳
汎塔莎西餐廳 Fantasia Western Reataurant
原燒 (台中台糖東海店)
娘子韓食 (竹北店)
這一鍋皇室秘藏鍋物(崇德殿)
金鍋盃小火鍋
糖朝(高雄大立精品形象店)
你回來了 型男食堂居酒屋
Oracle Coffee
Afternoon Tea ‧ 統一午茶風光 (統一時代台北門市)
台北文華東方酒店 文華Café (Café Un Deux Trois)
Office By Mastro
小蒙牛頂級麻辣養生鍋(天母店)
樂軒和牛專門店
潮坊港式飲茶(內湖InBase)
小蒙牛頂級麻辣養生鍋(中和店)
G12地中海咖啡餐酒館
50樓Cafe 自助餐廳 
囍聚精緻鍋物


In [39]:
# Tokenized form of every restaurant name.
# NOTE(review): predict_restaurant re-tokenizes res_data itself and does not
# read this list.
cut_name = [cut_title(name) for name in res_data]

In [40]:
# Adjacent-token bigrams of every tokenized restaurant name.
biogram_name =  [bio_wordset(cutted) for cutted in cut_name]

### 整理出長度至少為 2 的詞（len > 1）

In [41]:
def creat_long(short_str):
    """Return the items of ``short_str`` whose length is greater than 1.

    Order is preserved and a new list is returned.
    """
    kept = []
    for piece in short_str:
        if len(piece) > 1:
            kept.append(piece)
    return kept

In [42]:
# Try creat_long on the tokens of the first title.
creat_long(cut_title(gcf.first()[0]))

[u'\u559c\u4f86', u'\u5341\u4e8c', u'All', u'can', u'eat']

### 將英文詞分割出來

In [43]:
def separate_eng(input_str):
    """Pick the alphabetic terms that begin with an ASCII letter.

    ``input_str`` is iterated as a sequence of terms.  A term is kept when
    ``isalpha()`` is true for it and its first character is in ``A``-``Z``
    or ``a``-``z``; order is preserved.
    """
    return [term for term in input_str
            if term.isalpha() and (('A' <= term[0] <= 'Z') or ('a' <= term[0] <= 'z'))]

In [44]:
# English terms found in the first title (result is not displayed).
aa = separate_eng(cut_title(gcf.first()[0]))
#print aa
#print gcf.collect()

### 長詞的比較

In [45]:
def long_term_compare(title, name):
    """Count the items of ``title`` that are contained in ``name``.

    Repeated items in ``title`` are counted once per occurrence.
    """
    return sum(1 for term in title if term in name)

In [46]:
def bio_long_term_compare(title, name):
    """Count the items of ``title`` that are contained in ``name``.

    Repeated items in ``title`` are counted once per occurrence.
    """
    shared = [pair for pair in title if pair in name]
    return len(shared)

### 包含全部詞的比較

In [47]:
def term_compare(title, name):
    """Count the items of ``title`` that are contained in ``name``.

    Repeated items in ``title`` are counted once per occurrence.
    """
    hits = 0
    for term in title:
        hits += 1 if term in name else 0
    return hits

In [48]:
def bio_term_compare(title, name):
    """Count the items of ``title`` that are contained in ``name``.

    Repeated items in ``title`` are counted once per occurrence.
    """
    return len([1 for pair in title if pair in name])

In [49]:
# Smoke test for term_compare: q2 holds three of the five q1 terms.
q1 = ['大便','好臭','誰的','肥宅','滴油']
q2 = ['大便','好臭','誰的']
term_compare(q1, q2)

3

### 導入計分function

In [50]:
def calculate_pts(short_uni, short_bio, long_uni, long_bio, eng_name, exact):
    """Combine the six match counts into one weighted score.

    Weights: short unigram 1, short bigram 2, long unigram 1, long bigram 4,
    English term 4, exact match 100000.
    """
    score = 1 * short_uni
    score += 2 * short_bio
    score += 1 * long_uni
    score += 4 * long_bio
    score += 4 * eng_name
    score += 100000 * exact
    return score

### 實作預測函式

In [51]:
def predict_restaurant(title, use_exact_match=False, top_n=3):
    """Rank the restaurants in ``res_data`` against one article title.

    title           -- raw title string.
    use_exact_match -- when true, score ``exact_match(name, title)`` for each
                       restaurant.  The original left that call commented
                       out, so the 100000-point term of ``calculate_pts``
                       could never fire; the default keeps that behaviour.
    top_n           -- number of candidates to return (previously the
                       hard-coded 3).

    Returns the ``top_n`` highest ``(score, restaurant_name)`` pairs, best
    first; equal scores are ordered by name, descending.
    """
    # Title-side features do not depend on the restaurant: compute them once.
    title_token = cut_title(title)
    biogram_title_token = bio_wordset(title_token)
    long_title_token = creat_long(title_token)
    long_biogram_title_token = bio_wordset(long_title_token)
    Eng_title_token = separate_eng(title_token)

    pts_list = list()
    for i in res_data:
        # The same five feature sets, this time for the restaurant name.
        # NOTE(review): these are identical for every title; precomputing
        # them once per restaurant would avoid re-running jieba here.
        name_token = cut_title(i)
        biogram_name_token = bio_wordset(name_token)
        long_name_token = creat_long(name_token)
        long_biogram_name_token = bio_wordset(long_name_token)
        Eng_name_token = separate_eng(name_token)

        # Overlap counts between the title features and the name features.
        short_uni = term_compare(title_token, name_token)
        short_bio = bio_term_compare(biogram_title_token, biogram_name_token)
        long_uni = long_term_compare(long_title_token, long_name_token)
        long_bio = bio_long_term_compare(long_biogram_title_token, long_biogram_name_token)
        eng_name = term_compare(Eng_title_token, Eng_name_token)
        exact_pts = exact_match(i, title) if use_exact_match else 0

        pts_list.append(calculate_pts(short_uni, short_bio, long_uni, long_bio, eng_name, exact_pts))

    rank_list = zip(pts_list, res_data)
    rank_list_sorted = sorted(rank_list, reverse=True)
    return rank_list_sorted[:top_n]

### 對所有文章的標題進行預測

In [52]:
# Top restaurant candidates for every article title (element 0 of each gcf
# record); still an RDD at this point.
last_test = gcf.map(lambda x: x[0]).map(lambda x : predict_restaurant(x))

In [53]:
# Literal (score, restaurant name) triples; presumably pasted from one
# predict_restaurant result for display -- confirm.
aa = [(13, u'\u65b0\u7af9\u559c\u4f86\u767b\u5927\u98ef\u5e97'),
  (13, u'\u5341\u4e8c\u5eda\u81ea\u52a9\u9910\u5ef3 '),
  (6, u'TKK BUFFET \u9802\u5471\u5471\u81ea\u52a9\u5427')]

In [54]:
# Print only the restaurant name of each (score, name) pair.
for ii in aa:
    print ii[1]

新竹喜來登大飯店
十二廚自助餐廳 
TKK BUFFET 頂呱呱自助吧


In [55]:
# Run the prediction for every title and pull all results to the driver.
qqq = last_test.collect()

In [56]:
#print qqq