<h1>STA 220 ASSIGNMENT 2</h1>

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests_cache import CachedSession
import concurrent.futures
import re
from queue import Queue
import numpy as np
import operator
import pandas as pd
import multiprocessing
from lxml import html as lx
import itertools
from scipy.sparse import lil_matrix, csr_matrix

(a) Use the special AllPages page and understand its logic to retrieve the url of all articles in the sinhalese wikipedia. Make sure to skip redirections.

How many articles are there?

In [2]:
base_url = 'https://si.wikipedia.org'

In [3]:
session = CachedSession()

In [4]:
def count_articles_in_a_page(url):
    response = session.get(url, allow_redirects=False)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the 'ul' element with the class 'mw-allpages-chunk'
    unordered_list_articles = soup.find('ul', class_='mw-allpages-chunk')

    # Count the number of 'a' elements (links) in the 'ul' element
    hyper_links = unordered_list_articles .find_all('a')
    article_urls = set()
    for link in hyper_links:
        href = link.get('href')
        article_url = base_url + href
        # Check if the link is a redirection link or a link to a special page, image, or external website
        if href and href.startswith('/wiki/') and not ':' in href and 'mw-redirect' not in link.get('class', ''):
            # This is a non-redirection link to a regular article
            article_urls.add(article_url)
    return article_urls

In [5]:
def fetch_all_articles(url):
    all_article_urls = set()
    while url:
        article_urls = count_articles_in_a_page(url)
        all_article_urls.update(article_urls)
        response = session.get(url, allow_redirects=False)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the 'next page' link
        next_page_div = soup.find('div', class_='mw-allpages-nav')
        next_page_link = next_page_div.find('a', string=lambda value: value and "මීළඟ පිටුව" in value)
        if next_page_link:
            url = base_url + next_page_link.get('href')
        else:
            url = None
    return all_article_urls

Calculating the number of articles in Sinhala Wikipedia

In [6]:
all_page_url = 'https://si.wikipedia.org/wiki/%E0%B7%80%E0%B7%92%E0%B7%81%E0%B7%9A%E0%B7%82:%E0%B7%83%E0%B7%92%E0%B6%BA%E0%B7%85%E0%B7%94_%E0%B6%B4%E0%B7%92%E0%B6%A7%E0%B7%94'
all_article_urls = fetch_all_articles(all_page_url)
print(f'The Sinhala Wikipedia contains {len(all_article_urls)} articles.')

The Sinhala Wikipedia contains 24186 articles.


In [7]:
def fetch_url_content(url):
    response = session.get(url)
    if response.status_code == 200:
        return lx.fromstring(response.content)
    else:
        return None

def extract_links_from_content(content, article_url, all_article_urls):
    other_links = set()
    body_content_div = content.xpath('//div[@id="bodyContent"]')[0]
    a_tags = body_content_div.xpath('.//a[@href]')
    href_pattern = re.compile(r'^/wiki/[^:]*$')

    for a_tag in a_tags:
        href = a_tag.get('href')
        if href_pattern.match(href) and not href.endswith('.jpg'):
            full_url = urljoin(article_url, href)
            if full_url in all_article_urls:
                other_links.add(full_url)
            elif 'mw-redirect' in a_tag.get('class', ''):
                redirect_content = fetch_url_content(full_url)
                if redirect_content is not None:
                    redirected_link = redirect_content.xpath('//span[@class="mw-redirectedfrom"]/a/@href')
                    if redirected_link:
                        new_url = urljoin(full_url, redirected_link[0])
                        if new_url in all_article_urls:
                            other_links.add(new_url)
    return other_links

def fetch_article_and_extract_links(article_url, all_article_urls):
    try:
        content = fetch_url_content(article_url)
        if content is not None:
            other_links = extract_links_from_content(content, article_url, all_article_urls)
            return article_url, other_links
    except Exception as e:
        print(f"Error processing {article_url}: {e}")
        return article_url, set()


In [8]:
def fetch_all_articles_and_extract_links(all_article_urls):
    dict_links_other_articles = {}
    num_threads = 6

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_url = {executor.submit(fetch_article_and_extract_links, url, all_article_urls): url for url in all_article_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                _, other_links = future.result()
                if other_links:
                    dict_links_other_articles[url] = other_links
            except Exception as e:
                print(f"Error processing {url}: {e}")

    return dict_links_other_articles

dict_links_other_articles = fetch_all_articles_and_extract_links(all_article_urls)

total_articles = sum(len(links) for links in dict_links_other_articles.values())
print("Total number of other articles:", total_articles)

Total number of other articles: 268642


(c) Compute the transition matrix (see here and here for step-by-step instructions). Make sure to tread dangling nodes

In [11]:
def compute_transition_matrix(adjacency_csr):
    # Compute row sums
    row_sums = np.array(adjacency_csr.sum(axis=1)).flatten()
    
    # Compute inverse row sums
    inverse_row_sums = np.divide(1, row_sums, where=row_sums!=0)
    
    # Compute row stochastic matrix
    row_stochastic_matrix = csr_matrix(np.diag(inverse_row_sums)) @ adjacency_csr

    # Handle dangling nodes
    dangling_nodes = np.where(row_sums == 0)[0]
    num_nodes = adjacency_csr.shape[0]
    dangling_matrix = np.full(num_nodes, 1 / num_nodes)
    row_stochastic_matrix = row_stochastic_matrix.tolil()
    row_stochastic_matrix[dangling_nodes, :] = dangling_matrix
    row_stochastic_matrix = row_stochastic_matrix.tocsr()

    return row_stochastic_matrix


In [12]:
all_article_urls = list(set(all_article_urls))

# Creating a mapping from URLs to indices
url_to_index = {url: index for index, url in enumerate(all_article_urls)}

# Initializing an empty adjacency matrix
adjacency_matrix = lil_matrix((len(all_article_urls), len(all_article_urls)), dtype=int)

# Filling in the adjacency matrix
for article_url, linked_articles in dict_links_other_articles.items():
    from_index = url_to_index.get(article_url)
    for linked_article in linked_articles:
        to_index = url_to_index.get(linked_article)
        if from_index is not None and to_index is not None:
            adjacency_matrix[from_index, to_index] = 1

# Converting to CSR format for efficient operations
adjacency_csr = adjacency_matrix.tocsr()

# Computing transition matrix
transition_matrix = compute_transition_matrix(adjacency_csr)

print(transition_matrix)

  (0, 0)	4.134623335814107e-05
  (0, 1)	4.134623335814107e-05
  (0, 2)	4.134623335814107e-05
  (0, 3)	4.134623335814107e-05
  (0, 4)	4.134623335814107e-05
  (0, 5)	4.134623335814107e-05
  (0, 6)	4.134623335814107e-05
  (0, 7)	4.134623335814107e-05
  (0, 8)	4.134623335814107e-05
  (0, 9)	4.134623335814107e-05
  (0, 10)	4.134623335814107e-05
  (0, 11)	4.134623335814107e-05
  (0, 12)	4.134623335814107e-05
  (0, 13)	4.134623335814107e-05
  (0, 14)	4.134623335814107e-05
  (0, 15)	4.134623335814107e-05
  (0, 16)	4.134623335814107e-05
  (0, 17)	4.134623335814107e-05
  (0, 18)	4.134623335814107e-05
  (0, 19)	4.134623335814107e-05
  (0, 20)	4.134623335814107e-05
  (0, 21)	4.134623335814107e-05
  (0, 22)	4.134623335814107e-05
  (0, 23)	4.134623335814107e-05
  (0, 24)	4.134623335814107e-05
  :	:
  (24185, 24161)	4.134623335814107e-05
  (24185, 24162)	4.134623335814107e-05
  (24185, 24163)	4.134623335814107e-05
  (24185, 24164)	4.134623335814107e-05
  (24185, 24165)	4.134623335814107e-05
  (24185,

(d, i) Set the damping factor to 0.85 and compute the PageRank for each article, using fourty iterations and starting with a vector with equal entries

In [13]:
# Setting the damping factor
damping_factor = 0.85

# Initializing the PageRank vector with equal entries
pagerank_vector = np.full((len(all_article_urls), 1), 1 / len(all_article_urls))

# Computing the PageRank for each article using 40 iterations
for _ in range(40):
    pagerank_vector = (1 - damping_factor) / len(all_article_urls) + damping_factor * transition_matrix @ pagerank_vector

print(pagerank_vector)

[[4.13462334e-05]
 [4.13462334e-05]
 [4.13462334e-05]
 ...
 [4.13462334e-05]
 [4.13462334e-05]
 [4.13462334e-05]]


(d,ii) Obtain the top ten articles in terms of PageRank, and, retrieving the articles again, find the correponding English article, if available.

Return the corresponding English article titles of the top ten articles from the Sinhalese wikipedia.

In [15]:
# Getting the indices of the top 10 articles
top_10_indices = np.argsort(pagerank_vector, axis=0)[-10:]

# Getting the URLs of the top 10 articles
top_10_urls = [all_article_urls[i] for i in top_10_indices.flatten()]

df = pd.DataFrame(top_10_urls, columns=['URL'])

print(df)

                                                 URL
0  https://si.wikipedia.org/wiki/%E0%B7%83%E0%B7%...
1  https://si.wikipedia.org/wiki/%E0%B6%A2%E0%B7%...
2  https://si.wikipedia.org/wiki/%E0%B6%89%E0%B6%...
3  https://si.wikipedia.org/wiki/%E0%B6%B6%E0%B7%...
4  https://si.wikipedia.org/wiki/%E0%B6%B4%E0%B7%...
5  https://si.wikipedia.org/wiki/%E0%B6%B8%E0%B7%...
6  https://si.wikipedia.org/wiki/%E0%B6%A0%E0%B7%...
7  https://si.wikipedia.org/wiki/%E0%B6%B4%E0%B6%...
8  https://si.wikipedia.org/wiki/%E0%B7%81%E0%B7%...
9  https://si.wikipedia.org/wiki/%E0%B6%86%E0%B6%...


In [16]:
# Initializing a dictionary to store the English versions of the articles
english_versions = {}

# Fetching the English version for each article
for url in top_10_urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    english_link = soup.find('a', {'hreflang': 'en'})
    if english_link is not None:
        english_versions[url] = english_link['href']

# Printing the English versions
for url, english_version in english_versions.items():
    print(f"URL: {url}")
    print(f"English version: {english_version}")
    print()

URL: https://si.wikipedia.org/wiki/%E0%B6%A2%E0%B7%94%E0%B6%BD%E0%B7%92%E0%B6%BA%E0%B7%8F_%E0%B6%B8%E0%B7%8F%E0%B6%9C%E0%B6%BB%E0%B6%A7%E0%B7%8A_%E0%B6%9A%E0%B7%90%E0%B6%B8%E0%B6%BB%E0%B6%B1%E0%B7%8A
English version: https://en.wikipedia.org/wiki/Julia_Margaret_Cameron

URL: https://si.wikipedia.org/wiki/%E0%B6%B4%E0%B7%93%E0%B6%B1%E0%B7%92%E0%B7%82%E0%B7%92%E0%B6%BA%E0%B7%8F%E0%B7%80
English version: https://en.wikipedia.org/wiki/Phoenicia

URL: https://si.wikipedia.org/wiki/%E0%B6%B8%E0%B7%8F%E0%B6%BB%E0%B7%8A%E0%B6%9C%E0%B6%9C%E0%B6%AD_%E0%B6%B4%E0%B7%94%E0%B7%80%E0%B6%AD%E0%B7%8A%E0%B6%B4%E0%B6%AD%E0%B7%8A
English version: https://en.wikipedia.org/wiki/Online_newspaper

URL: https://si.wikipedia.org/wiki/%E0%B6%86%E0%B6%BD%E0%B7%9D%E0%B6%9A%E0%B6%BA%E0%B7%9A_%E0%B7%80%E0%B7%9A%E0%B6%9C%E0%B6%BA
English version: https://en.wikipedia.org/wiki/Speed_of_light

