In [4]:
import sys

required_packages = [
    "requests",
    "beautifulsoup4",
    "langdetect",
    "pymongo"
]

# Check if each package is installed, install missing ones
missing_packages = [package for package in required_packages if package not in sys.modules]
if missing_packages:
    print("The following packages are missing and will be installed:")
    print(missing_packages)
    for package in missing_packages:
        try:
            __import__(package)
        except ImportError:
            !pip install {package}

    # Check again if all packages are installed
    missing_packages = [package for package in required_packages if package not in sys.modules]
    if missing_packages:
        print("Error: The following packages could not be installed:")
        print(missing_packages)
        sys.exit(1)
else:
    print("All required packages are already installed. You can proceed with running the program.")

The following packages are missing and will be installed:
['beautifulsoup4']
Error: The following packages could not be installed:
['beautifulsoup4']


SystemExit: 1

# Web Crawler for docs.logrhythm.com
This notebook contains a script to crawl `docs.logrhythm.com`, focusing on English content within the domain.

# Web Scraping and Data Storage in MongoDB
This notebook contains a script to scrape data from `docs.logrhythm.com` and store it in MongoDB, avoiding duplicate entries.


In [1]:
# Install necessary libraries (uncomment if not already installed)
#!pip install requests beautifulsoup4 langdetect pymongo

# Import libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from langdetect import detect, DetectorFactory
import pymongo


**MongoDB Connection Setup**

Establishing a connection to MongoDB and defining the database and collections to be used.


In [2]:
# MongoDB connection
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["WS_Data_DB"]  # Database name
collection = db["LogRhythmDocs"]  # Collection name


**Initial Setup for Language Detection**

Set the seed for the language detection library to ensure consistent results.

In [3]:
# Set seed for consistent language detection
DetectorFactory.seed = 0

**Function Definitions**

Define the functions needed for web crawling. This includes functions to get page content, check `robots.txt` compliance, extract information from HTML, detect if the content is English, and extract valid links.

In [4]:
# Function to fetch page content
def get_page_content(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content
        else:
            print(f"Failed to fetch content. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching content: {e}")
        return None

# Function to check robots.txt compliance
def is_allowed_by_robots(url):
    base_url = urljoin(url, '/')
    if not base_url.endswith('/'):
        base_url += '/'
    robots_txt_url = urljoin(base_url, 'robots.txt')
    try:
        response = requests.get(robots_txt_url)
        response.raise_for_status()
        robots_txt_content = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching robots.txt: {e}")
        return False
    rp = RobotFileParser()
    rp.parse(robots_txt_content.splitlines())
    return rp.can_fetch("*", url)

# Function to extract information from HTML content
def extract_information(html_content, url):
    soup = BeautifulSoup(html_content, 'html.parser')
    text_content = soup.get_text(separator='\n', strip=True)

    # Check if the document already exists to avoid duplicates
    if collection.count_documents({"url": url}) == 0:
        document = {
            "url": url,
            "content": text_content
        }
        collection.insert_one(document)
        
# Function to check if content is in English
def is_english_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text_samples = soup.get_text(separator=' ', strip=True)
    try:
        return detect(text_samples[:1000]) == 'en'  # Sample first 1000 characters
    except:
        return False

# Function to extract valid links
def extract_links(html_content, base_url):
    soup = BeautifulSoup(html_content, 'html.parser')
    links = [link.get('href') for link in soup.find_all('a', href=True)]
    valid_links = []
    for link in links:
        if not link.startswith('http'):
            link = urljoin(base_url, link)
        parsed_link = urlparse(link)
        if parsed_link.netloc == 'docs.logrhythm.com':
            valid_links.append(link)
    return valid_links


**Crawl Function**

Define the `crawl` function to recursively crawl the website, adhering to the specified rules such as staying within the `docs.logrhythm.com` domain and processing only English content.

In [5]:
# Recursive crawl function
def crawl(url, visited):
    if url in visited or not url.startswith('http'):
        return
    visited.add(url)
    if is_allowed_by_robots(url) and urlparse(url).netloc == 'docs.logrhythm.com':
        content = get_page_content(url)
        if content and is_english_content(content):
            extract_information(content, url)
            for link in extract_links(content, url):
                crawl(link, visited)
    else:
        print(f"Crawling not allowed or outside domain for: {url}")

**Start Crawling**

Initiate the web crawler from the main page of `docs.logrhythm.com`. The crawler will recursively visit each link, staying within the specified domain and processing only English content.

In [6]:
# Start URL
start_url = "https://docs.logrhythm.com/"

# Set of visited URLs to avoid revisiting
visited_urls = set()

# Start the web scraping process
crawl(start_url, visited_urls)


Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status co

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

Failed to fetch content. Status code: 404


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to fetch content. Status code: 404


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to fetch content. Status code: 404


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
Error fetching content: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error fetching robots.txt: HTTPSConnectionPool(host='docs.logrhythm.com', port=443): Max retries exceeded with url: /robots.txt (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001D211A946D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Crawling not allowed or outside domain for: https://docs.logrhythm.com/lrsiem/7.13.0/object
Failed to fetch content. Status code: 404
Failed to fetch content. Status code: 404
