In [2]:
import pandas as pd
import numpy as np

References:

http://doc.scrapy.org/en/latest/intro/tutorial.html

https://www.data-blogger.com/2016/08/18/scraping-a-website-with-python-scrapy/

https://cs7083.wordpress.com/2013/01/31/demystifying-the-pagerank-and-hits-algorithms/

#### Steps to write a simple webscraper in Python using the Scrapy framework

The purpose of Scrapy is to extract content and links from a website by recursively following all the links on the given website. 

Install Scrapy using command: pip install scrapy

A. Create the folder structure for your project by using Command: scrapy startproject datablogger_scraper and Update the below files as shown 

In [None]:
# items.py
# %load homework7/datablogger_scraper/datablogger_scraper/items.py
import scrapy

class DatabloggerScraperItem(scrapy.Item):
    # The source URL
    url_from = scrapy.Field()
    # The destination URL
    url_to = scrapy.Field()

Creating first spider,Datablogger.py
You can customize this file as much as you want. I ended up with the following code,
Start_url is "data-blogger.com" and setting the DEPTH_LIMIT to 200.
In the parse function, parent url, links present in the parent web page and title are extracted.
Next if the extracted links are not already visited, then they are recursively called to scrap a website.

In [None]:
#Creating first spider,Datablogger.py
# %load homework7/datablogger_scraper/datablogger_scraper/spiders/Datablogger.py
import scrapy
import csv
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem
from scrapy.conf import settings

class DatabloggerSpider(scrapy.Spider):

    custom_settings = {'DEPTH_LIMIT': 200,}
    
    
    # The name of the spider
    name = "datablogger"
    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ["data-blogger.com"]
    
    # to keep track of already visited web pages
    linksvisited=[]
    linksvisited.append("https://www.data-blogger.com")
    linksvisited.append("https://www.data-blogger.com/")
    

    # The URLs to start with
    start_urls = ["https://www.data-blogger.com/"]
       
    
    # Method for parsing items
    def parse(self, response):
        items=[]
        links = response.css('a::attr(href)').extract()
        # append the link as visited
        DatabloggerSpider.linksvisited.append(response.url)
        linksn=[]
        if links:
           linksn= list(set(links)- set(DatabloggerSpider.linksvisited)) 
        
        for url in linksn:
           
           if url not in DatabloggerSpider.linksvisited  :
              # extract necessary information  from response
              item = []
              item.append(response.url)
              item.append(url)
              item.append(response.css('title::text').extract())
              yield scrapy.Request( url,callback=self.parse)     
             # store the information into csv
              with open("datablogger_scraper/links.csv", 'a') as myfile:
                 wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                 wr.writerow(item) 
                 

#### Executing the Spider
scrapy crawl datablogger -o links.csv -t csv

#### Data preparation for Pagerank algorithm

1. Read the collect data from 'links.csv' file into a pandas DataFrame
2. create the Stochastic matrix to perfrom the analyis

In [3]:
# read the data into DataFrame
data = pd.read_csv("datablogger_scraper/links.csv")
data.head()

Unnamed: 0,from_url,to_url,title
0,https://www.data-blogger.com/,http://twitter.com/kmjjacobs/status/8754672718...,['Data Blogger - An interesting blog with tuto...
1,https://www.data-blogger.com/,https://twitter.com/share?text=A%20Comparative...,['Data Blogger - An interesting blog with tuto...
2,https://www.data-blogger.com/,https://twitter.com/search?q=%23euler&src=hash,['Data Blogger - An interesting blog with tuto...
3,https://www.data-blogger.com/,https://www.data-blogger.com/2017/07/17/a-comp...,['Data Blogger - An interesting blog with tuto...
4,https://www.data-blogger.com/,https://www.data-blogger.com/2017/11/01/pokerb...,['Data Blogger - An interesting blog with tuto...


In [9]:
# Dertermine the unique parent links to build stochastic matrix
unique_fromurl=data['from_url'].unique()


In [10]:
# calculate the number of out going links from each parent link
links_out=data[['from_url','to_url']].groupby(['from_url'],as_index=False).count()

In [11]:
links_out.head()

Unnamed: 0,from_url,to_url
0,https://www.data-blogger.com/,154
1,https://www.data-blogger.com/2016/01/20/spam-d...,60
2,https://www.data-blogger.com/2016/01/20/the-ma...,58
3,https://www.data-blogger.com/2016/01/21/buildi...,64
4,https://www.data-blogger.com/2016/01/21/facebo...,58


In [12]:
# create stochastic matrix with nodes as links/pages and values of matrix indicating link between two website pages
H=np.zeros((links_out.shape[0],links_out.shape[0]))
url_from=links_out['from_url']


Below code fills the stocastic matrix

Example:

create the Stostic matrix to perfrom the analyis
<img src="https://cs7083.files.wordpress.com/2013/01/b.png">
create the Stostic matrix to perfrom the analyis
<img src="https://cs7083.files.wordpress.com/2013/01/a.png">    

In [13]:
# creating Stochastic matrix
for i in range (len(url_from)-1):
    prob=1/int(links_out.loc[[i]]['to_url']) 
    temp = data[data['from_url']==url_from[i]]['to_url']
    index = list(links_out[links_out['from_url'].isin(temp)].index)
    H[i][index]=prob
    H[i][i]=0

Below is the pagerank algorithm, for which will we pass will pass created stochastic matrix to get top 5 urls
Refer below link to understand pagerank algorithm
https://cs7083.wordpress.com/2013/01/31/demystifying-the-pagerank-and-hits-algorithms/

In [14]:
from numpy import *
 
def pagerank(H):
    n= len(H)
    w = zeros(n)
    rho = 1./n * ones(n);
    for i in range(n):
      if multiply.reduce(H[i]== zeros(n)):
        w[i] = 1
    newH = H + outer((1./n * w),ones(n))
 
    theta=0.85
    G = (theta * newH) + ((1-theta) * outer(1./n * ones(n), ones(n)))
    print (rho)
    for j in range(10):
        rho = dot(rho,G)
        print (rho)
        if j==9:
            return rho

#### call function pagerank on stochastic matrix and store the 10th iteration result in 'result'

In [15]:
result=pagerank(H)

[ 0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358  0.00377358
  0.00377358  0.00377358  0.00377358  0.00377358  0

#### get the top 5 urls from the sample using argsort function

In [16]:
links_out.loc[np.argsort(result)[-5:]]['from_url']

18    https://www.data-blogger.com/2016/08/13/apache...
58       https://www.data-blogger.com/become-a-blogger/
60    https://www.data-blogger.com/category/data-sci...
77    https://www.data-blogger.com/category/uncatego...
32    https://www.data-blogger.com/2017/03/15/pi-day...
Name: from_url, dtype: object