# Simplified Page Rank 

This project is a simplified version of the Google Page Rank algorithm. The simplified version consists of three steps

## Rank pages based on the number of referrals to the page

The user builds a graph consisting of all URLs and the set of incoming links to each node (URL) in the graph. The pages are then ranked based on the number of incoming links. 

## Create an index of all words to pages

Traverse all web pages in the graph and build an index of keywords to web pages

## Combine rank and index to deliver a search

When a search keyword is provided, first search the index to get the list of web pages. Next order the pages based on their rank and provide this result to the user.

### Get NLTK stopwords

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kent\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Get URLs from a web page

In [2]:
import urllib

In [3]:
# make sure to install bs4 (beautiful soup)
# You can use the below command to install
# pip install bs4
from bs4 import BeautifulSoup
import urllib


def getURLsList(url):

    #resp = urllib3.request.urlopen(url)
    resp = urllib.request.urlopen(url)
    charset = resp.headers.get_content_charset()
    #soup has html
    soup = BeautifulSoup(resp, from_encoding=charset)
    pages = []
    for link in soup.find_all('a', href=True):
        page = link['href']
        
        if page[:4] != "http":
            page = url + "/" + page
            
        pages.append(page)
        
    return pages

### Get keywords from a web page

In [4]:
import urllib3
import re
from nltk.corpus import stopwords

def getKeyWords(url):

    resp = urllib.request.urlopen(url)
    charset = resp.headers.get_content_charset()
    #soup = BeautifulSoup(resp, 'from_encoding=charset')
    soup = BeautifulSoup(resp, 'lxml')
    
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()
    
    regex = r'(\w*) '
    words = filter(lambda w: w != '',re.findall(regex,visible_text))
    
    filtered_words = [w for w in words if w not in stopwords.words('english')]
    
    return filtered_words

In [5]:
getURLsList("http://www.facebook.com")



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


['http://www.facebook.com/#',
 'http://www.facebook.com/#',
 'https://www.facebook.com/',
 'https://www.facebook.com/recover/initiate?lwv=110&ars=royal_blue_bar',
 'http://www.facebook.com/#',
 'http://www.facebook.com//legal/terms/update',
 'http://www.facebook.com//about/privacy/update',
 'http://www.facebook.com//policies/cookies/',
 'http://www.facebook.com/#',
 'http://www.facebook.com//pages/create/?ref_type=registration_form',
 'https://es-la.facebook.com/',
 'https://fr-fr.facebook.com/',
 'https://zh-cn.facebook.com/',
 'https://ar-ar.facebook.com/',
 'https://pt-br.facebook.com/',
 'https://it-it.facebook.com/',
 'https://ko-kr.facebook.com/',
 'https://de-de.facebook.com/',
 'https://hi-in.facebook.com/',
 'https://ja-jp.facebook.com/',
 'http://www.facebook.com/#',
 'http://www.facebook.com//r.php',
 'http://www.facebook.com//login/',
 'https://messenger.com/',
 'http://www.facebook.com//lite/',
 'http://www.facebook.com//mobile/?ref=pf',
 'http://www.facebook.com//find-fri

### Get a list of all web pages and their corresponding keywords

In [6]:
keywordList = getKeyWords("http://www.crunchyroll.com")

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kent\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
print(keywordList)

['Premium', 'Try', 'Summer', '2018', 'Simulcasts', 'How', 'Not', 'Summon', 'Demon', 'Lord', 'Thursdays', '30am', 'PDT', 'Episode', '2', 'Episode', '3', 'Angels', 'Death', 'Fridays', '00am', 'PDT', 'Episode', '2', 'Episode', '3', 'Overlord', 'III', 'Tuesdays', '30am', 'PDT', 'Episode', '1', 'Episode', '2', 'My', 'Hero', 'Academia', 'Season', '3', 'Saturdays', '30am', 'PDT', 'Episode', '52', 'Episode', '53', 'The', 'Master', 'Ragnarok', 'Blesser', 'Einherjar', 'Saturdays', '00am', 'PDT', 'Episode', '2', 'Episode', '3', 'Sundays', '00am', 'PDT', 'Episode', '2', 'Episode', '3', 'Black', 'Clover', 'Tuesdays', '25am', 'PDT', 'Episode', '40', 'Episode', '41', 'ISLAND', 'Sundays', '30am', 'PDT', 'Episode', '2', 'Episode', '3', 'One', 'Whole', 'Cake', 'Island', 'Saturdays', '00pm', 'PDT', 'Episode', '845', 'Episode', '846', 'Gintama', 'Season', '4', 'Sundays', '35am', 'PDT', 'Episode', '354', 'Episode', '355', 'NARUTO', 'NEXT', 'GENERATIONS', 'Thursdays', '00am', 'PDT', 'Episode', '64', 'Episod

In [9]:
newList = []
for x in keywordList:
    a = x.lower()
    try:
        newList.index(a)
    except:
        newList.append(a)

In [111]:
sorted(newList,reverse=True)

['zunbera',
 'zombie',
 'zen',
 'zack',
 'yuki',
 'yui',
 'yuga',
 'your',
 'young',
 'you',
 'years',
 'yami',
 'yamato',
 'xbox',
 'x',
 'worth',
 'worried',
 'world',
 'works',
 'working',
 'work',
 'word',
 'women',
 'woman',
 'wizard',
 'without',
 'within',
 'with',
 'windows',
 'wind',
 'wild',
 'wii',
 'whose',
 'whole',
 'who',
 'white',
 'while',
 'where',
 'what',
 'week',
 'wednesdays',
 'website',
 'wearing',
 'we',
 'way',
 'water',
 'watching',
 'watched',
 'was',
 'war',
 'wants',
 'want',
 'waku',
 'wakes',
 'wake',
 'waiting',
 'wait',
 'wage',
 'vrv',
 'vrains',
 'volleyball',
 'visuals',
 'viruses',
 'village',
 'vigilante',
 'view',
 'videos',
 'vanguard',
 'usual',
 'uses',
 'used',
 'use',
 'uphill',
 'updated',
 'upcoming',
 'up',
 'unrest',
 'unicorn',
 'underlings',
 'unbeatable',
 'ultimate',
 'tyrant',
 'two',
 'twitter',
 'twin',
 'twilight',
 'tv',
 'turned',
 'turn',
 'tuesdays',
 'trying',
 'try',
 'trumps',
 'truly',
 'troop',
 'trip',
 'trial',
 'trans

In [112]:
userInput = input("Enter a keyword: ")
for x in newList:
    if userInput.lower() == x:
        print("Found")

Enter a keyword: Anime
Found


In [75]:
dictA = {}

In [76]:
def addURL(url):
    if dictA.get(url) == None:
        dictA[url] = getURLsList(url)
   # else:
    #    for x in dictA:
     #       for y in range(0, 1): #len(dictA[x])):
      #          keyMap = dictA[x][y]
       #         tempList = getURLsList(keyMap)
        #        dictA[keyMap] = tempList
                

In [77]:
addURL("http://www.crunchyroll.com")
dictA



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


{'http://www.crunchyroll.com': ['http://www.crunchyroll.com//',
  'http://www.crunchyroll.com//videos/anime',
  'http://www.crunchyroll.com//comics/manga',
  'http://www.crunchyroll.com//news',
  'http://www.crunchyroll.com//forum',
  'http://www.crunchyroll.com//store',
  'http://www.crunchyroll.com//freetrial?from=topbar',
  'https://www.crunchyroll.com/login?next=%2F',
  'http://www.crunchyroll.com//home',
  'http://www.crunchyroll.com//random/anime?random_ref=topbar',
  'https://vrv.app.link/RuaDcesjeO',
  'http://www.crunchyroll.com//videos/anime/simulcasts',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/episode-1-the-demon-lord-act-774790',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/episode-2-strongest-newcomer-774791',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/episode-1-the-demon-lord-act-774790',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/epi

In [92]:
for x in range(0,6):
    addURL(dictA["http://www.crunchyroll.com"][x])

In [93]:
##dictA[dictA["http://www.crunchyroll.com"][1]]
dictA

{'http://www.crunchyroll.com': ['http://www.crunchyroll.com//',
  'http://www.crunchyroll.com//videos/anime',
  'http://www.crunchyroll.com//comics/manga',
  'http://www.crunchyroll.com//news',
  'http://www.crunchyroll.com//forum',
  'http://www.crunchyroll.com//store',
  'http://www.crunchyroll.com//freetrial?from=topbar',
  'https://www.crunchyroll.com/login?next=%2F',
  'http://www.crunchyroll.com//home',
  'http://www.crunchyroll.com//random/anime?random_ref=topbar',
  'https://vrv.app.link/RuaDcesjeO',
  'http://www.crunchyroll.com//videos/anime/simulcasts',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/episode-1-the-demon-lord-act-774790',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/episode-2-strongest-newcomer-774791',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/episode-1-the-demon-lord-act-774790',
  'http://www.crunchyroll.com//how-not-to-summon-a-demon-lord/epi