# Build search engine

In [1]:
import requests
import re
import string
import jieba
import collections
import networkx as nx
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

class MovieCrawler(object):
    def __init__(self):
        self.movies = []
        self.i = 1
        self.graph = nx.DiGraph()
        self.index = collections.defaultdict(list)
        self.links = collections.defaultdict(list)
    
    def tokenize_sentence(self, sentence):
        stop_words = stopwords.words("chinese")
        stop_words2 = ("\n", ", ", " ", "\r\n", "，", "。", "…", "★", "、", "《", "》")
        
        tokens = jieba.cut(sentence, cut_all = False, HMM = True)
        tokens = [i for i in tokens if i not in string.punctuation]
        tokens = [i for i in tokens if i not in stop_words]
        tokens = [i for i in tokens if i not in stop_words2]

        return tokens
    
    def query(self, target):
        print("您的搜尋結果(Sorting by PageRank Value):")
        print(f"共{len(self.index[target])}筆，符合\"{target}\"---共indexing{len(self.movies)}筆電影資料")
        pagerank = nx.pagerank(self.graph, alpha = 1, tol = 1.0e-3, max_iter = 100000)
        rank = []
        for i, value in enumerate(self.index[target]):
            rank.append((i, pagerank[value]))
        rank = sorted(rank, key = lambda x: x[1], reverse = True)
        
        for i, r in rank:
            index = self.index[target][i]
            print(f'{self.movies[index]["doc_id"]}({r}): {self.movies[index]["cname"]} {self.movies[index]["ename"]}\n{self.movies[index]["intro"]}\n')
            
        n1 = n2 = 0
        for movie in self.movies:
            if (re.search(rf"{target}", movie["cname"] + movie["ename"] + movie["intro"], re.IGNORECASE)):
                n1 += 1
        for i in self.index[target]:
            if (re.search(rf"{target}", self.movies[i]["cname"] + self.movies[i]["ename"] + self.movies[i]["intro"], re.IGNORECASE)):
                n2 += 1
        
        print(f"\nPrecision = {n2 / len(self.index[target]) * 100}%")
        print(f"Recall = {n2 / n1 * 100}%")
    
    def add_movies(self, page_url):
        resp = requests.get(page_url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        movie = soup.find("div", class_ = "movie_intro_info_r")
        
        if (movie == None):
            return
        
        temp = {}
        
        temp["doc_id"] = self.i
        temp["cname"] = movie.find("h1").text
        temp["ename"] = movie.find("h3").text
        
        label_info = movie.find("div", class_ = "level_name_box").find_all("div", class_ = "level_name")
        temp["labels"] = []
        for l in label_info:
            label = l.text
            while (label[0] in [" ", "\n"]):
                label = label[1: ]
            while (label[-1] in [" ", "\n"]):
                label = label[: -1]
                
            temp["labels"].append(label)
            
        intro = soup.find("span", {"id": "story"}).text
        if (intro != None and intro != "" and len(intro) != 0):
            while (intro and intro[0] in [" ", "\n"]):\
                intro = intro[1: ]
            while (intro and intro[-1] in [" ", "\n"]):
                intro = intro[: -1]
            temp["intro"] = intro
        else:
            temp["intro"] = ""
        
        temp["release_date"] = movie.find("span").text[5: ]
        
        tokens = self.tokenize_sentence(temp["cname"]) + self.tokenize_sentence(temp["ename"]) + self.tokenize_sentence(temp["intro"])
        for token in set(tokens):
            self.index[token].append(self.i - 1)
        
        if (self.i != 1):
            self.links[self.i].append(self.i - 1)
            self.links[self.i - 1].append(self.i)
            self.graph.add_edge(self.i, self.i - 1)
            self.graph.add_edge(self.i - 1, self.i)
        
        self.i += 1
        
        self.movies.append(temp)

        return self.movies
      
movie_url = "https://movies.yahoo.com.tw/movieinfo_main/"

crawler = MovieCrawler()
for i in range(1, 15059):
    if (i % 100 == 0):
        print(f"Crawling {movie_url}{i}...")
    crawler.add_movies(movie_url + str(i))
    
movies = crawler.movies

print(f"The length of all movies: {len(movies)}")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/ly/4jgkxntx463ghnpv_mjkrb600000gn/T/jieba.cache
Loading model cost 0.256 seconds.
Prefix dict has been built successfully.


Crawling https://movies.yahoo.com.tw/movieinfo_main/100...
Crawling https://movies.yahoo.com.tw/movieinfo_main/200...
Crawling https://movies.yahoo.com.tw/movieinfo_main/300...
Crawling https://movies.yahoo.com.tw/movieinfo_main/400...
Crawling https://movies.yahoo.com.tw/movieinfo_main/500...
Crawling https://movies.yahoo.com.tw/movieinfo_main/600...
Crawling https://movies.yahoo.com.tw/movieinfo_main/700...
Crawling https://movies.yahoo.com.tw/movieinfo_main/800...
Crawling https://movies.yahoo.com.tw/movieinfo_main/900...
Crawling https://movies.yahoo.com.tw/movieinfo_main/1000...
Crawling https://movies.yahoo.com.tw/movieinfo_main/1100...
Crawling https://movies.yahoo.com.tw/movieinfo_main/1200...
Crawling https://movies.yahoo.com.tw/movieinfo_main/1300...
Crawling https://movies.yahoo.com.tw/movieinfo_main/1400...
Crawling https://movies.yahoo.com.tw/movieinfo_main/1500...
Crawling https://movies.yahoo.com.tw/movieinfo_main/1600...
Crawling https://movies.yahoo.com.tw/movieinfo_ma

Crawling https://movies.yahoo.com.tw/movieinfo_main/13800...
Crawling https://movies.yahoo.com.tw/movieinfo_main/13900...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14000...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14100...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14200...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14300...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14400...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14500...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14600...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14700...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14800...
Crawling https://movies.yahoo.com.tw/movieinfo_main/14900...
Crawling https://movies.yahoo.com.tw/movieinfo_main/15000...
The length of all movies: 12230


# Display query result

In [2]:
crawler.query("音樂")

您的搜尋結果(Sorting by PageRank Value):
共771筆，符合"音樂"---共indexing12230筆電影資料
10(8.176614881439084e-05): 北京樂與路 Beijing Rocks
北京正處於風雲匯集、新舊傳統的規律與現代的思潮所產生的強烈對比當中。在香港長大並在外國受教育的創作樂手Michael，因一次事故滯留北京，遇上從別省來到這個中國首都碰運氣的搖滾樂手平路及反叛又深情的艷舞女郎楊穎。三個背景完全不同的年青人，不由自主的互相吸引著。

這三個人產生了一段錯縱複雜的關係，展開了一段奇異的走穴(下鄉演唱）旅程。故事以北京的流行音樂壇為背景，穿插著平路中西合璧的搖滾樂，隨著音樂的旋律，我們看見一個既浪漫又色彩繽紛，既被中國傳統文化牽引著，又具備尖端現代動感的世界級都市，我們更看見這個都市裏背景不同、性格各異、但都滿懷希望與憧憬的年青人，各自為了實現心中遙遠的夢想，付出了無比的努力，有人成功，也有人失敗。但無論結果怎樣，這三個活在社會邊緣的年青人，為了愛情、事業、理想，都無悔地把生命燦爛的燃燒著....

64(8.176614881439084e-05): 逐夢鬱金香 Bread & Tulips
＊金馬影展觀眾票選最受歡迎影片
＊坎城影展獲邀參展影片
＊德國藝術電影協會最佳外語片
＊香港國際電影節參展影片
＊義大利奧斯卡九項大獎
＊歐洲電影獎提名最佳導演、最佳男、女主角繼《美麗人生》之後，義大利電影蟬連美國熱門排行榜達五個月

蘿莎是個平凡的主婦，多年來，做為一個妻子和母親，不但丈夫、孩子忽略她，連自己都遺忘了自己的的存在。在一次前往希臘的家庭旅行中，途中休息時，遊覽車開走了，而她的丈夫換孩子竟然都沒發現她沒上車！於是，她、被、放、鴿、子、了。

生命最偉大的轉捩點通常以最平淡無奇的形式發生---或許是一場邂逅，或是一段令人匪夷所思的巧合---命運，因而扭轉。

蘿莎從來沒有想過會有這種事情發生，直到這一刻，她一個人站在高速公路上，很荒謬地被全然遺忘。然而，她一點也不灰心，事實上，她已經準備展開一個人的旅程。搭上前往威尼斯的便車，突獲的自由與自信帶領她走向一個繽紛絢爛的新世界：她找到一個花店的工作，被一個從冰島來的美食主義厭世詩人收留，並且有個美艷的按摩女郎為鄰。在那裡，她重新尋獲對於音樂的喜愛與天份，在手風琴悠

# Write json file

In [3]:
import json

pagerank = nx.pagerank(crawler.graph, alpha = 1, tol = 1.0e-3, max_iter = 100000)

for i in range(len(crawler.movies)):
    crawler.movies[i]["pagerank"] = pagerank[crawler.movies[i]["doc_id"]]
    crawler.movies[i]["links"] = crawler.links[crawler.movies[i]["doc_id"]]

with open("hw2.json", "w", encoding = "utf-8") as f:
    json.dump(crawler.movies, f)

# Display result by loading json file

In [4]:
with open("hw2.json", "r") as f:
    data = json.load(f)

    print(data)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

