# Web Scraping Script for Data Extraction

This script scrapes docs.logrhythm.com and extracts information about version 7.15 of the software. It includes various utility functions and a crawling mechanism to navigate and process web pages. The script uses the `requests`, `BeautifulSoup` (`bs4`), `langdetect`, and `pandas` libraries, along with the standard-library `urllib.parse` and `urllib.robotparser` modules. Please ensure that the third-party libraries are installed in your Jupyter environment before running the script.

The scraped data is stored in a CSV file named 'section_data.csv' for later processing.

### Imports and set Seed for Language Detection

In [None]:
# Import necessary libraries
import time
from datetime import datetime
from urllib.parse import urldefrag, urljoin, urlparse
from urllib.robotparser import RobotFileParser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# Set seed for consistent language detection
DetectorFactory.seed = 0

### Utility Functions

The following cells define several utility functions that are essential for the web scraping process. These include functions to fetch page content, check compliance with `robots.txt`, extract and filter sectioned content, check if the content is in English, and extract valid links from a page.

In [None]:
# Function to fetch page content with retry logic
def get_page_content(url, max_retries=3, delay=1, timeout=10):
    """Fetch the raw body of ``url``, retrying when an attempt fails.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts before giving up.
        delay: Seconds to wait between two consecutive attempts.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout a single unresponsive server would block the crawl
            indefinitely.

    Returns:
        The response body as bytes when an attempt returns HTTP 200,
        otherwise ``None`` once every attempt has failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Connection errors, timeouts, invalid URLs, ... count as a
            # failed attempt and are retried like a non-200 status.
            response = None
        if response is not None and response.status_code == 200:
            return response.content
        # Only wait when another attempt will actually follow.
        if attempt < max_retries:
            time.sleep(delay)
    return None

# Function to check if crawling a URL is allowed by the site's robots.txt
# Parsed robots.txt rules keyed by site root, so that each site's robots.txt
# is downloaded once per session instead of once per crawled URL.
ROBOTS_PARSER_CACHE = {}


def is_allowed_by_robots(url, timeout=10):
    """Report whether the site's robots.txt permits fetching ``url``.

    The rules are evaluated for the wildcard user agent ``"*"``. The parsed
    robots.txt of each site root is kept in ``ROBOTS_PARSER_CACHE`` and reused
    by later calls.

    Args:
        url: Absolute URL that is about to be crawled.
        timeout: Seconds to wait for the robots.txt download.

    Returns:
        ``True`` when the rules allow the URL. ``False`` when they forbid it
        or when robots.txt could not be retrieved (request error or HTTP
        error status).
    """
    # urljoin with '/' always yields the site root, ending in a slash.
    site_root = urljoin(url, '/')
    parser = ROBOTS_PARSER_CACHE.get(site_root)
    if parser is None:
        try:
            response = requests.get(urljoin(site_root, 'robots.txt'), timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Fail closed. The failure is not cached, so a later call retries.
            return False
        parser = RobotFileParser()
        parser.parse(response.text.splitlines())
        ROBOTS_PARSER_CACHE[site_root] = parser
    return parser.can_fetch("*", url)

# List of keywords to exclude in content sections, to filter out irrelevant sections
exclude_keywords = ['general', 'release notes', 'key highlights', 'maintenance', 'retrived']


# Function to parse and extract relevant sections from HTML content
def extract_sectioned_content(html_content):
    """Group a page's paragraph and list text under its headings.

    The document is walked in order over ``h1``/``h2``/``h3``/``p``/``li``
    elements. Each heading starts a new section keyed by its lower-cased
    text; the ``p`` and ``li`` elements that follow are appended to that
    section. A heading containing any entry of ``exclude_keywords`` switches
    collection off until the next accepted heading. Text that appears before
    the first accepted heading is dropped.

    Args:
        html_content: HTML markup of the page (str or bytes).

    Returns:
        dict mapping lower-cased heading text to the section's text, with one
        newline-terminated chunk per collected element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    excluded = [keyword.lower() for keyword in exclude_keywords]

    chunks = {}
    current_header = None
    for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'li']):
        if element.name in ('h1', 'h2', 'h3'):
            header_text = element.get_text(strip=True).lower()
            if any(keyword in header_text for keyword in excluded):
                current_header = None
            else:
                current_header = header_text
                # A heading that occurs twice on a page keeps the text that
                # was already collected instead of being reset to empty.
                chunks.setdefault(current_header, [])
        elif current_header and element.find_parent(['p', 'li']) is None:
            # An element nested inside another p/li is already covered by its
            # ancestor's get_text(); adding it again would duplicate the text.
            chunks[current_header].append(
                element.get_text(separator='\n', strip=True) + '\n'
            )

    return {header: ''.join(parts) for header, parts in chunks.items()}

# Function to determine if the content is in English for processing
def is_english_content(html_content):
    """Report whether the page body is detected as English.

    Language detection runs on the first 1000 characters of the text inside
    ``<body>``.

    Args:
        html_content: HTML markup of the page (str or bytes).

    Returns:
        ``True`` when the detected language code is ``'en'``. ``False`` for
        any other language, for a page without body text, or when detection
        fails.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    if soup.body is None:
        return False
    sample = soup.body.get_text(separator=' ', strip=True)[:1000]
    if not sample:
        # Nothing to analyse; detect() would only raise on empty input.
        return False
    try:
        return detect(sample) == 'en'
    except LangDetectException:
        # Catch only detection failures so that KeyboardInterrupt and
        # SystemExit still stop a long-running crawl.
        return False

# Function to extract and validate hyperlinks from the HTML content
def extract_links(html_content, base_url, allowed_netloc='docs.logrhythm.com',
                  path_prefix='/lrsiem/7.15.0/'):
    """Collect the in-scope hyperlinks found on a page.

    Every ``<a href=...>`` is resolved against ``base_url`` and stripped of
    its ``#fragment``, so anchors into the same page collapse to one URL.
    Each URL is returned at most once, in order of first appearance.

    Args:
        html_content: HTML markup of the page (str or bytes).
        base_url: URL of the page, used to resolve relative links.
        allowed_netloc: Host a link must point to in order to be kept.
        path_prefix: Prefix the link's path must start with to be kept.

    Returns:
        List of absolute URLs on ``allowed_netloc`` whose path starts with
        ``path_prefix``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    valid_links = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        absolute, _fragment = urldefrag(urljoin(base_url, anchor['href']))
        if absolute in seen:
            continue
        seen.add(absolute)
        parsed = urlparse(absolute)
        if parsed.netloc == allowed_netloc and parsed.path.startswith(path_prefix):
            valid_links.append(absolute)
    return valid_links

### Crawling Function

The `crawl` function is the core of the script. Starting from a URL, it follows links depth-first, checks for `robots.txt` compliance, and processes pages within the specified domain. Extracted content is collected in a list, which is later converted into a pandas DataFrame.

In [None]:
# Function to extract and concatenate content from different sections of a webpage
def extract_information(html_content, url):
    """Return the text of all retained sections of a page as one string.

    The page is split into sections by ``extract_sectioned_content`` and the
    section texts are joined with a newline, in the order the sections were
    found. ``url`` is accepted for the caller's convenience and is not used.
    """
    section_texts = extract_sectioned_content(html_content).values()
    return '\n'.join(section_texts)

# Initialize a list to store the scraped content
scraped_data = []


# Function to crawl and process webpages depth-first
def crawl(url, visited, delay=0.5):
    """Crawl pages reachable from ``url`` and collect their section text.

    Pages are visited depth-first in the order their links appear. An
    explicit stack is used instead of recursion so that long link chains
    cannot exceed Python's recursion limit. The text of every accepted page
    is appended to the module-level ``scraped_data`` list.

    Args:
        url: URL to start from. Non-``http`` URLs are ignored.
        visited: Set of URLs already seen. It is updated in place, so it can
            be shared between calls.
        delay: Seconds to pause before each page is requested (rate limiting).

    Returns:
        None.
    """
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited or not current.startswith('http'):
            continue
        visited.add(current)
        print(f"Visiting and processing: {current}")

        # Check the domain first: it is free, whereas the robots.txt check
        # and the rate-limit pause cost time and a network round trip.
        if urlparse(current).netloc != 'docs.logrhythm.com':
            print(f"Crawling not allowed or outside domain for: {current}")
            continue

        time.sleep(delay)  # Rate limiting
        if not is_allowed_by_robots(current):
            print(f"Crawling not allowed or outside domain for: {current}")
            continue

        content = get_page_content(current)
        if content and is_english_content(content):
            scraped_data.append(extract_information(content, current))
            # Push in reverse so the first link on the page is popped first.
            pending.extend(reversed(extract_links(content, current)))

# Set that records every URL seen during the crawl, and the page to begin at
visited_urls = set()
start_url = "https://docs.logrhythm.com/lrsiem/7.15.0/"

# Kick off the crawl at the documentation root
crawl(url=start_url, visited=visited_urls)

### Data Processing and Output

After crawling, the script processes the scraped data, saves it to a CSV file, and displays samples from the DataFrame. The pandas settings are adjusted for better display of the DataFrame.

In [None]:
# Check if any data was scraped and process it
if scraped_data:
    section_data = pd.DataFrame(scraped_data, columns=['content'])
    section_data.to_csv('section_data.csv', index=False, encoding='utf-8')
    print("Dataframe created and .csv file saved.")

    # Show full cell contents for the preview only. option_context restores
    # both display options when the block exits, even if printing fails.
    with pd.option_context('display.max_colwidth', None, 'display.max_rows', 10):
        # Displaying the first and last few documents in the DataFrame
        print('Examples from start of the dataframe:\n')
        print(section_data.head(3))
        print('\nExamples from end of the dataframe:')
        print(section_data.tail(3))
else:
    # Nothing to preview: section_data does not exist when the crawl
    # returned no content.
    print("No data was scraped.")