### Wiki Web crawler

* Web crawler that is designed to crawl through wikipedia links
* Recursive calls are made to the internal links of the current page
* keeps track of the previous visited pages
* csv file with fragments,internal & external links along with last modified timestamp of the crawled page.

In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import random

In [2]:
# initial crawl link
URL = "https://en.wikipedia.org/wiki/The_Doors"
df = pd.DataFrame(columns=['Pagecount','INTcount','EXTcount','URLfragments','timestamp'])

visitedLinks = []
#Crawl limit
PageLimit = 10
pageCounter = 0


#### Last modified date
* code specific for wiki json format
* changes to be made for other domain

In [3]:
# get last modified timestamp
def getTimestamp(content):
    scriptTags = content.find_all('script')
    for i in range(0,len(scriptTags)):
        if "json" in str(scriptTags[i]):
            JsonObject = json.loads(scriptTags[i].get_text())
            timestamp = JsonObject['dateModified']
            return timestamp

In [4]:
# get all links of the current page
def getAllhyperlinks(content):
    validLinks=[]
    links = content.find_all('a')
    for i in range(0,len(links)):        
        if "href" in str(links[i]):
            validLinks.append(str(links[i]))
    return validLinks

In [5]:
# convert internal links to wikipedia links
def getWikiLinks(inLinks):    
    https=[]
    wikilinks=[]

    for i in range(0,len(inLinks)):
        content = BeautifulSoup(inLinks[i],'html.parser')
        Nextlink = content.get_text()
        if "https" in str(Nextlink):
            https.append(Nextlink)
        else:
            wikilinks.append("https://en.wikipedia.org/wiki/"+str(Nextlink))
    return wikilinks

In [6]:
def webCrawl(url):
    AllLinks=[]
    inLinks=[]
    extern,intern,fragments=0,0,0

    global pageCounter
    print(url)

    # html Parser
    MainPage = requests.get(url)
    HtmlContent = BeautifulSoup(MainPage.content,'html.parser')
    timestamp=getTimestamp(HtmlContent)
    AllLinks = getAllhyperlinks(HtmlContent)

    # differentiate external,internal and fragments of current page
    global visitedLinks
    for index in range(0,len(AllLinks)):
        text = AllLinks[index]
        if "/wiki/" in text:
            intern+=1
            inLinks.append(text)
        elif "#" in text:
            fragments+=1
        else:
            extern+=1

    # dataframe for csv file
    df.loc[pageCounter] = [pageCounter,intern,extern,fragments,timestamp]
    pageCounter+=1

    #fetch all wiki links of current crawl
    Wikilinks = getWikiLinks(inLinks)

    # check condition for previous visited links
    x= random.randint(0, len(Wikilinks)-1)
    while(Wikilinks[x] in visitedLinks):
        x= random.randint(0, len(Wikilinks)-1)   
    visitedLinks.append(Wikilinks[x])    

    # check maximum limit          
    if(pageCounter>=PageLimit):
        return ;
    else:
        return webCrawl(visitedLinks[-1])

In [7]:
def main():
    try:
        webCrawl(URL)

    except:
        print("An Exception Occurred \n")

    finally:
        print(df)
        df.to_csv("./WebCrawler.csv")

In [8]:
if __name__ == "__main__":
    main()

https://en.wikipedia.org/wiki/The_Doors
https://en.wikipedia.org/wiki/keyboard bass
https://en.wikipedia.org/wiki/MIDI
https://en.wikipedia.org/wiki/sequencer
https://en.wikipedia.org/wiki/What links here
https://en.wikipedia.org/wiki/Cookie statement
https://en.wikipedia.org/wiki/Commons
https://en.wikipedia.org/wiki/S2CID
https://en.wikipedia.org/wiki/Essay
https://en.wikipedia.org/wiki/Македонски
  Pagecount INTcount EXTcount URLfragments             timestamp
0         0     1026      168          478  2022-04-10T22:26:27Z
1         1      173       37           15  2022-02-04T14:49:53Z
2         2      911      314          559  2022-04-09T06:06:38Z
3         3       61       19            3  2019-07-13T07:33:35Z
4       4.0     41.0     14.0          3.0                   NaN
5       5.0     41.0     14.0          3.0                   NaN
6         6      639      127          161  2022-04-02T20:10:22Z
7         7      213       54           62  2022-03-04T18:03:40Z
8         8 