Building a crawler to collect, index, and analyze Wikipedia pages

In [None]:
pip install pyserini

In [None]:
import pandas as pd
import numpy as np
from pyserini.search.lucene import LuceneSearcher

In [None]:
pip install scrapy

In [None]:
import scrapy
class WikiSpider(scrapy.Spider):
    """Crawl English Wikipedia outward from the 'Law' article.

    For every downloaded page the spider extracts the page title and the
    paragraph text of the main content area, yields one item per page whose
    text is long enough, and then follows the in-article links to other
    articles. Items are written to ``wiki_data2.jsonl`` via the FEEDS setting.
    """

    name = "wiki_spider"
    start_urls = ['https://en.wikipedia.org/wiki/Law']

    # A page is saved only if its body text has MORE than this many
    # whitespace-separated words. Override on a subclass to tune the filter.
    min_words = 500

    # scrapy settings. scrapy runspider looks for a variable called "custom_settings" as a dictionary with specific keys
    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 100000, # stops running after reaching pagecount value
        'CONCURRENT_REQUESTS': 16, # allows the spider to download up to 16 pages at the same time
        'FEEDS': {'wiki_data2.jsonl': {'format': 'jsonlines', 'overwrite': True}}, # scrapy writes every yielded item to this file, so no open()/write() is needed here
        # NOTE(review): this disables robots.txt handling entirely. Confirm this is
        # intentional and acceptable for the target site before running a large crawl.
        'ROBOTSTXT_OBEY': False,
        'DEPTH_PRIORITY': 1, # prioritize shallow links to keep the content as relevant to 'law' as possible
        # Force first-in-first-out queues so that, together with DEPTH_PRIORITY,
        # pages are visited breadth-first (closest to the start page first).
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue', # FIFO queue class for requests persisted on disk (presumably only used when a JOBDIR is configured - verify against the Scrapy docs)
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue', # FIFO queue class for requests held in memory
    }

    def parse(self, response):
        """Extract one page item and schedule the article links found on the page.

        Args:
            response: the downloaded page handed over by scrapy.

        Yields:
            A dict with keys ``url``, ``title`` and ``text`` when the page has a
            title and more than ``min_words`` words of paragraph text, followed
            by one follow-up request per in-article link to another article.
        """
        # Title: 'string()' flattens the whole <h1 id="firstHeading"> subtree into
        # one string, so the text is captured even when it sits inside a <span>.
        title = response.xpath('string(//h1[@id="firstHeading"])').get()

        # Body text: take every <p> inside the main content container
        # (div#mw-content-text), flatten each to a string, then collapse all
        # runs of whitespace into single spaces.
        raw_text = [p.xpath('string(.)').get() for p in response.css('div#mw-content-text p')]
        words = " ".join(raw_text).split()
        text_content = " ".join(words)

        # Save only pages with a title and more than `min_words` words.
        # The word list is compared (not len(text_content), which would count
        # characters and let through pages of well under 100 words).
        if title and len(words) > self.min_words:
            yield {
                'url': response.url,
                'title': title,
                'text': text_content
            }

        # Follow links found in the article body only. Keeping just hrefs that
        # start with '/wiki/' and contain no ':' restricts the crawl to internal
        # article links and drops namespaced pages such as 'File:' or 'Category:'.
        for link in response.css('div#mw-content-text a::attr(href)').getall():
            if link.startswith('/wiki/') and ':' not in link:
                yield response.follow(link)

In [None]:
import json
import os
import subprocess
import sys
from pyserini.search.lucene import LuceneSearcher

# Creates a folder called "input" to store the json file being indexed using pyserini
os.makedirs("input", exist_ok=True)

# Convert the crawler output (url, title, text) into pyserini's document
# format (id, contents, title), one JSON object per line.
with open("wiki_data2.jsonl", "r", encoding="utf-8") as wiki_file, \
     open("input/wiki_docs2.jsonl", "w", encoding="utf-8") as dest_file:

    # Apply to each scraped Wikipedia page (one page per line)
    for line in wiki_file:
        # Tolerate blank lines instead of crashing in json.loads
        if not line.strip():
            continue
        data = json.loads(line)

        # Create Pyserini document
        pyserini_doc = {
            "id": data["url"],            # url acts as unique identifier
            "contents": data["text"],     # contents to be indexed
            "title": data["title"]        # title
        }

        # Write JSON object per line
        dest_file.write(json.dumps(pyserini_doc) + "\n")


# Build Lucene index.
# sys.executable is used instead of a bare "python" so the indexer runs with the
# same interpreter (and therefore the same installed pyserini) as this kernel.
command = [
    sys.executable, "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection", # input format
    "--input", "input", # directory containing jsonl docs
    "--index", "indexes/wiki_index", # output into indexes/wiki_index folder
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "1",
    "--storePositions",
    "--storeDocvectors",
    "--storeRaw"
]

# Run the indexer; check=True raises CalledProcessError if it exits non-zero
subprocess.run(command, check=True)

In [None]:
# Open the Lucene index built in the previous step
searcher = LuceneSearcher("indexes/wiki_index")

# Keyword query to run against the index
query = "constitutional law"
hits = searcher.search(query)

# Walk the ranked hits; for each one fetch the stored raw document to recover
# its title, then print rank, score, url (the doc id) and title on one line.
for i, hit in enumerate(hits):
    doc = searcher.doc(hit.docid)

    if doc is None:
        # The index holds no stored document for this id
        title = "No doc found"
    else:
        # Raw document is the JSON that was indexed; fall back if it has no title
        title = json.loads(doc.raw()).get("title", "No title")

    # Print results
    print(
        f"{i+1:<3} "
        f"Score: {hit.score:.4f} "
        f"URL: {hit.docid} "
        f"Title: {title}"
    )

In [None]:
import json
import os
import shutil
import subprocess
import sys
import time
import matplotlib.pyplot as plt

# Define source, input directory, and index for the benchmark.
# - A dedicated input directory is used because the indexer reads EVERY file in
#   the directory it is given; sharing "input" with the full-corpus file
#   (input/wiki_docs2.jsonl) would index the whole corpus on every run and make
#   the timings meaningless.
# - A dedicated index path is used so the benchmark does not delete the index
#   that the search step reads ("indexes/wiki_index").
# - The directory variable is called input_dir so the builtin input() is not shadowed.
source = "wiki_data2.jsonl"
input_dir = "input_benchmark"
index = "indexes/wiki_index_benchmark"

# Creates the benchmark input directory that holds the subset file indexed by pyserini
os.makedirs(input_dir, exist_ok=True)
subset_path = os.path.join(input_dir, "wiki_docs_subset.jsonl")

# Document counts to test
# Test in increments to measure how indexing time grows as document count increases
doc_counts = [100, 500, 1000, 2000, 5000, 10000]

# Define variables to store times and lines
times = []
lines = []

# Read source file and store the parsed documents in memory
with open(source, "r", encoding="utf-8") as fin:
    for line in fin:
        # Tolerate blank lines instead of crashing in json.loads
        if line.strip():
            lines.append(json.loads(line))

# Only benchmark sizes the crawl can actually supply; a larger n would
# otherwise fail with IndexError halfway through the experiment.
available = len(lines)
tested_counts = [n for n in doc_counts if n <= available]
skipped_counts = [n for n in doc_counts if n > available]
if skipped_counts:
    print(f"Skipping doc counts {skipped_counts}: only {available} documents in {source}")

# Loop through document counts
for n in tested_counts:

    # Write first n documents to pyserini input file
    # The file is overwritten for each value of n, so the directory always
    # contains exactly the n documents being timed
    with open(subset_path, "w", encoding="utf-8") as fout:
        for data in lines[:n]:

            # convert original wikipedia doc to pyserini format
            pyserini_doc = {
                "id": data["url"],                 # url acts as unique identifier
                "contents": f"{data['title']}\n\n{data['text']}",  # title + text combined for indexing
                "title": data["title"]             # title
            }

            # writes one JSON object per line
            fout.write(json.dumps(pyserini_doc) + "\n")

    # Remove previous benchmark index if it exists so indexing starts from scratch
    # (shutil.rmtree is portable and raises on failure, unlike an unchecked `rm -rf`)
    if os.path.exists(index):
        shutil.rmtree(index)

    # Build Lucene index and record time.
    # sys.executable runs the indexer with the same interpreter as this kernel.
    cmd = [
        sys.executable, "-m", "pyserini.index.lucene",
        "--collection", "JsonCollection",   # input format
        "--input", input_dir,           # directory containing only the subset file
        "--index", index,               # output into the benchmark index folder
        "--generator", "DefaultLuceneDocumentGenerator",
        "--threads", "1",
        "--storePositions",
        "--storeDocvectors",
        "--storeRaw"
    ]

    # Starts timer before indexing
    start_time = time.time()

    # Run indexing commands
    subprocess.run(cmd, check=True)

    # Stops timer after indexing finishes
    end_time = time.time()

    # Append runtime for current document count
    times.append(end_time - start_time)
    print(f"Indexed {n} docs in {times[-1]:.2f} seconds")

# Plot runtime vs document count (only the counts that were actually run,
# so the x and y series always have the same length)
plt.figure(figsize=(8, 5))
plt.plot(tested_counts, times, marker="x", color="red")
plt.title("Lucene Indexing Runtime vs Number of Documents")
plt.xlabel("Number of Documents")
plt.ylabel("Runtime (seconds)")
plt.grid(True)
plt.show()
