## JITEN MISHRA DSC430_Assignment0602_WebCrawl
### I have not given or received any unauthorized assistance on this assignment.

In [2]:
# Libraries and import statements first
import logging
import string
from html.parser import HTMLParser
from urllib.request import urlopen, urljoin, urlparse, Request

In [9]:
# URLs already handed to the crawler; shared by crawl2() and Collector
visited = set()
# running word -> occurrence count accumulated over every analyzed page
freqdict = dict()

class Collector(HTMLParser):
    """
    Parses a single HTML page and collects two things from it:

      * links       -- absolute URLs of the CDM hyperlinks that have not
                       been crawled yet (checked against the module-level
                       set ``visited``)
      * dataContent -- the page text, one cleaned string per text node:
                       lower-cased, punctuation removed, stop words and
                       non-alphabetic tokens dropped

    The stop words are read from the file 'M6_stopwords.txt' (one word
    per line) every time an instance is created.
    """

    # Text found directly inside these tags is not counted.  The void
    # entries (input, meta, img) can never contain text; they are kept
    # so the list stays compatible with code that reads it.
    ignore_tags = ['script', 'noscript', 'input', 'meta', 'title',
                   'style', 'form', 'img']

    # Void elements have no closing tag and no text content, so they
    # must not replace the tag the parser is currently inside of.
    void_tags = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img',
                 'input', 'link', 'meta', 'source', 'track', 'wbr')

    # Links ending in one of these extensions are not followed.
    img_ext = ['.jpg', '.png', '.gif', '.pdf', '.zip']

    # Links whose href contains one of these fragments are not followed.
    exclude = ['action=download', 'mailto', 'course-evaluation', ' ', 'img']

    # Only URLs that start with this prefix are collected.
    domain_prefix = 'http://www.cdm.depaul.edu/'

    # Translation table that deletes every string.punctuation character;
    # built once instead of once per text node.
    _punctTable = str.maketrans('', '', string.punctuation)

    def __init__(self, url):
        """
        Initializes the parser for the page located at url.

        url -- address of the page being parsed; relative hrefs are
               resolved against it

        Raises OSError if 'M6_stopwords.txt' cannot be opened.
        """

        # initialize the super class
        super().__init__()
        self.url = url
        self.links = []

        # cleaned text nodes collected so far
        self.dataContent = []

        # name of the element the parser is currently inside of
        self.currentTag = ''

        # load the stop words; the with-block closes the file even if
        # reading fails
        with open('M6_stopwords.txt', 'r') as stopWordFile:
            self.lsStopWords = stopWordFile.read().splitlines()

    def handle_starttag(self, tag, attrs):
        """
        Remembers the tag that was just opened and, for anchor tags,
        collects the href as an absolute URL when it is a CDM page that
        still has to be crawled.
        """

        # a void element cannot hold text, so the enclosing tag stays
        # the current one
        if tag not in self.void_tags:
            self.currentTag = tag

        if tag != 'a':
            return

        for name, value in attrs:
            # an href without a value (<a href>) is reported as None
            if name != 'href' or value is None:
                continue

            # surrounding whitespace is not part of the address; strip
            # it before the exclusion test so the ' ' rule only rejects
            # addresses with a space inside them
            href = value.strip()
            lowered = href.lower()
            if any(exc in lowered for exc in self.exclude):
                continue

            # make the address absolute and drop the #fragment, which
            # points inside a page and would make one page look like
            # several different URLs
            absolute = urljoin(self.url, href).split('#', 1)[0]

            # skip files, whatever the letter case of the extension
            if absolute.lower().endswith(tuple(self.img_ext)):
                continue

            # keep CDM pages only; the module-level visited set is
            # consulted here so the list stays short
            if absolute.startswith(self.domain_prefix) and absolute not in visited:
                self.links.append(absolute)

    def handle_endtag(self, tag):
        """
        Forgets the current tag once an element is closed, so text that
        follows e.g. </script> is not mistaken for script content.
        """

        if tag not in self.void_tags:
            self.currentTag = ''

    def handle_data(self, data):
        """
        Cleans one text node and stores it in dataContent, unless the
        text belongs to one of the ignore_tags or nothing is left after
        cleaning.
        """

        if self.currentTag in self.ignore_tags:
            return

        # remove the punctuation !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
        # and convert to lower case
        data = data.translate(self._punctTable).lower()

        # keep alphabetic words that are not stop words
        lsData = [word for word in data.split()
                  if word not in self.lsStopWords and word.isalpha()]

        if lsData:
            self.dataContent.append(' '.join(lsData))

    def getLinks(self):
        """
        Returns the list of collected absolute URLs
        """

        return self.links

    def getData(self):
        """
        Returns the list of cleaned text nodes collected so far
        """

        return self.dataContent
    

In [4]:
def crawl2(url):
    """
    A depth-first web crawler that calls analyze() on every
    visited web page.

    Pages are visited in the same order as a recursive crawl, but the
    links still to follow are kept on an explicit stack, so a long
    chain of links cannot exhaust the interpreter's recursion limit.

    url -- address of the page the crawl starts from

    An exception raised while analyzing the start page propagates to
    the caller.  A page reached through a link that fails to load or
    parse is logged and skipped; it stays in visited so it is not
    tried again.
    """

    # global variables warning
    global visited

    # add url to set of visited pages
    visited.add(url)

    # one iterator per page on the current path; the top of the stack
    # yields the links of the page explored right now
    pending = [iter(analyze(url))]

    while pending:
        link = next(pending[-1], None)

        # the page on top has no links left: go back to its parent
        if link is None:
            pending.pop()
            continue

        # follow link only if not visited
        if link in visited:
            continue
        visited.add(link)

        try:
            # analyze() returns a list of hyperlink URLs in web page link
            pending.append(iter(analyze(link)))
        except Exception as e:
            # best effort: one broken page must not stop the crawl
            logging.getLogger(__name__).warning("Skipping %s: %s", link, e)
        

In [5]:
def analyze(url, timeout=30):
    """
    Function to analyze the url and capture frequency of words
    to the global dictionary

    url     -- address of the page to download and parse
    timeout -- seconds to wait for the server before giving up, so a
               single unresponsive page cannot stall the whole crawl

    Returns the list of links collected from the page.
    Errors raised while downloading the page (e.g. by urlopen)
    propagate to the caller.
    """

    # send a browser-like User-Agent header; the request was being
    # rejected with an HTTP error without it
    user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
    request = Request(url)
    request.add_header("User-Agent", user_agent)

    # the with-block closes the connection as soon as the page is read
    with urlopen(request, timeout=timeout) as response:
        content = response.read().decode('utf8', errors='ignore')

    # obtain links in the web page
    collector = Collector(url)
    collector.feed(content)
    collector.close()                    # flush text still buffered by the parser
    urls = collector.getLinks()          # get list of links

    # compute word frequencies
    frequency(collector.getData())

    # return the urls
    return urls

In [6]:
def frequency(sents):
    """
    Adds the words of every string in sents to the global
    word-count dictionary freqdict.

    sents -- iterable of strings holding whitespace separated words
    """

    # global variable warning
    global freqdict

    # a word seen for the first time starts from a count of zero
    for sentence in sents:
        for token in sentence.split():
            freqdict[token] = freqdict.get(token, 0) + 1
                

In [7]:
def writeOutput():
    """
    Function to write the data to the output file

    Writes the number of crawled pages (size of the global visited
    set) and the 50 most frequent words of the global freqdict to
    'crawl1.txt', replacing any previous content.
    """

    # sort the words by frequency, highest first, and keep the first 50
    topWords = sorted(freqdict.items(), key=lambda x: x[1], reverse=True)[:50]

    # utf-8 is set explicitly because the crawled words may contain
    # non-ASCII letters that the platform default encoding cannot
    # represent; the with-block closes the file even if a write fails
    with open('crawl1.txt', 'w', encoding='utf-8') as outFile:
        outFile.write("Total Number of page crawled:::: {}\n".format(len(visited)))
        outFile.write("The 50 most common words and their frequencies are::::\n")

        # one line per word: serial number, the word cut to
        # 15 characters, and its frequency
        for sl, (word, count) in enumerate(topWords, start=1):
            outFile.write("\n{:5}: {:15} {:10}".format(sl, str(word)[:15], str(count)))

    print("End of Code run\n")

In [10]:
# Driver: crawl starting from the CDM home page, then write the
# word-frequency report.
print("\nWeb Crawling Started....")
# crawl2() fills the global visited set; the word counts end up in freqdict
crawl2('http://www.cdm.depaul.edu')
print("\nWeb Crawling Ended....")
print("\nWritting Output to file....")
# writes the page count and the 50 most common words to crawl1.txt
writeOutput()


Web Crawling Started....

Web Crawling Ended....

Writting Output to file....
End of Code run



#### Few Important points on Approach
   * How You Extended HTML Parser ?
       * HTMLParser is extended by the class Collector: the init method of the Collector class initializes the HTMLParser object, and the class overrides parser methods to obtain the desired output.
       
   * Which methods you overwrote ?
       * The methods handle_starttag and handle_data are overridden.
       
   * How you restricted your search to webpages at CDM ?
       * In the handle_starttag method we have logic to capture each link in its absolute form whenever the tag is an anchor tag. While doing so, we check whether the first 26 characters of the absolute URL are 'http://www.cdm.depaul.edu/'; only then do we add the link to the list of links to be searched.
       * We also have logic to skip files with a few extensions stored in a list, and we skip download links and a few other exclusions so that the program runs smoothly.