# Web Scraper for docs.logrhythm.com

This notebook crawls the LogRhythm SIEM 7.15.0 documentation hosted on docs.logrhythm.com, keeping only pages whose content is detected as English. For each page it stores the URL, title, and section-by-section text in MongoDB, and skips URLs that are already stored to avoid duplicate entries.

In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from langdetect import detect, DetectorFactory
import pymongo
import time
from datetime import datetime

## MongoDB Connection Setup

Establish a connection to MongoDB and define the database and collection to be used.

In [2]:
# MongoDB connection
# Connects to a MongoDB server on localhost:27017; the URI carries no credentials.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["WS_Data_DB"]  # Database name
collection = db["LogRhythm7_15Docs"]  # Collection name; crawl() queries and inserts into this global

## Initial Setup for Language Detection

Set the seed for the language detection library to ensure consistent results.

In [3]:
# Set seed for consistent language detection
# Must run before any detect() call so that repeated runs classify the same page the same way.
DetectorFactory.seed = 0

## Function Definitions

Defining the functions needed for web crawling. This includes functions to get page content, check robots.txt compliance, extract information from HTML, detect if the content is English, and extract valid links.

In [4]:
# Function to fetch page content with retry logic
def get_page_content(url, max_retries=3, delay=1, timeout=10):
    """Download the raw body of ``url``, retrying when an attempt fails.

    Args:
        url: Absolute URL to fetch.
        max_retries: Maximum number of attempts before giving up.
        delay: Seconds to wait between two consecutive attempts.
        timeout: Seconds ``requests`` may wait for the server before the
            attempt is counted as failed.

    Returns:
        The response body as ``bytes`` for the first attempt that answers
        with HTTP 200, or ``None`` when every attempt fails.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Without a timeout a single stalled connection blocks the crawl forever.
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.content
            print(f"Failed to fetch content. Status code: {response.status_code}")
        except Exception as e:
            # Deliberately broad: one bad page must never abort the whole crawl.
            print(f"Error fetching content: {e}")
        # Only wait when another attempt will actually follow.
        if attempt < max_retries:
            time.sleep(delay)
    return None

# Function to check robots.txt compliance
# Parsed robots.txt rules keyed by robots.txt URL, so each site is downloaded
# once instead of once per crawled page.
_ROBOTS_PARSERS = {}


def _load_robots_parser(robots_txt_url, timeout):
    """Fetch and parse one robots.txt; return None when the fetch failed transiently."""
    try:
        response = requests.get(robots_txt_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching robots.txt: {e}")
        return None

    status = response.status_code
    rp = RobotFileParser()
    if status < 400:
        rp.parse(response.text.splitlines())
    elif status in (401, 403):
        # Access to robots.txt itself is refused: treat the site as off limits.
        rp.parse(["User-agent: *", "Disallow: /"])
    elif status < 500:
        # No robots.txt published (e.g. 404) means there are no restrictions.
        rp.parse([])
    else:
        # Server-side trouble: stay conservative and let a later call retry.
        print(f"Error fetching robots.txt: unexpected status code {status}")
        return None
    return rp


def is_allowed_by_robots(url, timeout=10):
    """Report whether robots.txt of ``url``'s site permits fetching ``url``.

    The rules are evaluated for the wildcard user agent ``*``. A parsed
    robots.txt is cached per site for the lifetime of the kernel session.

    Args:
        url: Absolute URL about to be crawled.
        timeout: Seconds to wait for the robots.txt download.

    Returns:
        ``True`` when crawling is permitted. ``False`` when it is disallowed,
        or when robots.txt could not be retrieved because of a network error
        or a 5xx response.
    """
    robots_txt_url = urljoin(url, '/robots.txt')
    rp = _ROBOTS_PARSERS.get(robots_txt_url)
    if rp is None:
        rp = _load_robots_parser(robots_txt_url, timeout)
        if rp is None:
            return False
        _ROBOTS_PARSERS[robots_txt_url] = rp
    return rp.can_fetch("*", url)

# Function to extract information from HTML content
def extract_information(html_content, url):
    """Build the MongoDB document describing one crawled page.

    Args:
        html_content: Raw HTML of the page (``bytes`` or ``str``).
        url: URL the HTML was retrieved from.

    Returns:
        A dict with the keys ``url``, ``title``, ``content_sections`` (the
        mapping produced by ``extract_sectioned_content``) and
        ``retrieved_on`` (naive UTC timestamp of the extraction).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style elements so their source never ends up in the text
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    # get_text() copes with empty or nested <title> tags, where .string is None
    title_text = soup.title.get_text(strip=True) if soup.title else ''
    title = title_text or 'No Title'

    # Hand over the cleaned markup; passing the raw HTML would discard the cleanup above
    sectioned_content = extract_sectioned_content(str(soup))

    # Create the document to be stored in MongoDB
    return {
        "url": url,
        "title": title,
        "content_sections": sectioned_content,
        "retrieved_on": datetime.utcnow()
    }

# Function to extract sectioned content
def extract_sectioned_content(html_content):
    """Split a page's text into sections keyed by heading text.

    Walks the ``h1``/``h2``/``h3``/``p``/``li``/``table`` elements in document
    order. Each heading starts (or resumes) the section named after its text;
    paragraphs, list items and tables are appended to the current section.
    Content that appears before any heading is collected under ``'General'``.

    Args:
        html_content: HTML markup of the page.

    Returns:
        A dict mapping heading text to the accumulated section text.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    header_tags = ['h1', 'h2', 'h3']
    content_tags = ['p', 'li', 'table']
    # An element nested in one of these has its text emitted with that ancestor already.
    container_tags = ['p', 'li', 'td', 'th']

    sections = {}
    current_header = 'General'  # Default section if no headers are found
    for element in soup.find_all(header_tags + content_tags):
        if element.name in header_tags:
            current_header = element.get_text(strip=True)
            # setdefault keeps earlier text when the same heading occurs twice
            sections.setdefault(current_header, '')
            continue

        # find_all also yields descendants (e.g. <li><p>..</p></li>, <td><p>..</p></td>);
        # skipping them prevents the same text from being stored twice.
        if element.find_parent(container_tags) is not None:
            continue

        if element.name == 'table':
            text = process_table(element)
        else:
            text = element.get_text(separator='\n', strip=True) + '\n'
        sections[current_header] = sections.get(current_header, '') + text

    return sections

# Function to process found table information
def process_table(table_tag):
    """Flatten a table element into plain text.

    Every ``tr`` found under ``table_tag`` becomes one line; the stripped
    texts of its ``th``/``td`` cells are joined with ``', '`` and the line is
    terminated with a newline.

    Args:
        table_tag: Parsed ``<table>`` element.

    Returns:
        The concatenated row lines, or ``''`` when the table has no rows.
    """
    lines = []
    for tr in table_tag.find_all('tr'):
        cell_texts = [td.get_text(strip=True) for td in tr.find_all(['th', 'td'])]
        lines.append(', '.join(cell_texts) + '\n')
    return ''.join(lines)

# Function to check if content is in English
def is_english_content(html_content):
    """Decide whether a page's visible text is English.

    Language detection runs on the first 1000 characters of the ``<body>``
    text.

    Args:
        html_content: HTML markup of the page.

    Returns:
        ``True`` when the detected language code is ``'en'``; ``False`` for
        any other language, for pages without body text, or when detection
        fails.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    meaningful_text = soup.body.get_text(separator=' ', strip=True) if soup.body else ''
    if not meaningful_text:
        # Nothing to classify; avoid using the detector's exception as control flow.
        return False
    try:
        return detect(meaningful_text[:1000]) == 'en'  # Sample first 1000 characters
    except Exception:
        # Detection failure means "not English", but Ctrl-C / SystemExit must
        # still be able to stop a long crawl (a bare except swallowed them).
        return False

# Function to extract valid links
def extract_links(html_content, base_url, allowed_netloc='docs.logrhythm.com',
                  path_prefix='/lrsiem/7.15.0/'):
    """Collect the crawlable links found on a page.

    Every ``<a href>`` is resolved against ``base_url``, stripped of its
    ``#fragment`` (fragments address positions inside the same page, so
    keeping them would make one page look like many), and kept only when it
    is an http(s) URL on ``allowed_netloc`` whose path starts with
    ``path_prefix``.

    Args:
        html_content: HTML markup of the page.
        base_url: URL of the page, used to resolve relative links.
        allowed_netloc: Host the crawl is restricted to.
        path_prefix: Path prefix the crawl is restricted to.

    Returns:
        Absolute URLs in order of first appearance, without duplicates.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    valid_links = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        # urljoin leaves absolute URLs untouched and resolves everything else,
        # including relative names that merely start with "http".
        target = urlparse(urljoin(base_url, anchor.get('href').strip()))
        if target.scheme not in ('http', 'https'):
            continue
        if target.netloc != allowed_netloc or not target.path.startswith(path_prefix):
            continue
        normalized = target._replace(fragment='').geturl()
        if normalized not in seen:
            seen.add(normalized)
            valid_links.append(normalized)
    return valid_links

## Crawl Function

Define the crawl function to recursively crawl the website, adhering to the specified rules such as staying within the docs.logrhythm.com domain and processing only English content.

In [5]:
# Crawl function with rate limiting (iterative depth-first traversal)
def crawl(url, visited, delay=0.5):
    """Crawl docs.logrhythm.com starting at ``url`` and store English pages.

    Each allowed page is fetched, checked for English content, converted to a
    document and inserted into ``collection`` unless a document with the same
    URL already exists. Links found on the page are then followed.

    The traversal uses an explicit stack instead of recursion: following one
    link per call level would exceed Python's recursion limit on long link
    chains.

    Args:
        url: Absolute URL to start from.
        visited: Set of URLs already handled; updated in place so the caller
            can inspect it or resume with it.
        delay: Seconds to pause before contacting the server for each URL
            (rate limiting).
    """
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited or not current.startswith('http'):
            continue
        visited.add(current)

        # Cheap local check first: no robots.txt download for foreign hosts.
        if urlparse(current).netloc != 'docs.logrhythm.com':
            print(f"Crawling not allowed or outside domain for: {current}")
            continue

        time.sleep(delay)  # Rate limiting
        if not is_allowed_by_robots(current):
            print(f"Crawling not allowed or outside domain for: {current}")
            continue

        content = get_page_content(current)
        if not (content and is_english_content(content)):
            continue

        document = extract_information(content, current)
        if collection.count_documents({"url": current}) == 0:
            collection.insert_one(document)

        # Push in reverse so links are visited in page order, as recursion did.
        for link in reversed(extract_links(content, current)):
            if link not in visited:
                pending.append(link)

## Start Crawling

Start the web crawler at the root of the LogRhythm SIEM 7.15.0 documentation on docs.logrhythm.com. The crawler follows every link that stays on docs.logrhythm.com under the `/lrsiem/7.15.0/` path, and stores only pages whose content is detected as English.

In [6]:
# Start Crawling
# Entry point: root of the LogRhythm SIEM 7.15.0 documentation.
start_url = "https://docs.logrhythm.com/lrsiem/7.15.0/"
visited_urls = set()  # Set of visited URLs to avoid revisiting; crawl() adds to it in place
crawl(start_url, visited_urls)  # Start the web scraping process (uses crawl's default delay of 0.5 s per URL)