In [1]:

from bs4 import BeautifulSoup

def extract_article_links(file_path, output_file_path,
                          base_url="https://www.gnu.org", path_prefix='/philosophy/'):
    """Collect site-relative article links from a saved HTML page into a text file.

    Every ``<a>`` element whose ``href`` starts with ``path_prefix`` is turned
    into an absolute URL by prepending ``base_url``. The URLs are written to
    ``output_file_path`` one per line, in the order they were found, with
    repeated links written only once.

    Args:
        file_path: Path of the HTML file to read.
        output_file_path: Path of the text file to create or overwrite.
        base_url: Scheme and host prepended to each matching href.
        path_prefix: Only hrefs that start with this string are kept.
    """
    # Read raw bytes and let BeautifulSoup detect the document encoding,
    # instead of decoding with whatever the platform's default locale is.
    with open(file_path, 'rb') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Anchors without an href are passed to the filter as None, hence the guard.
    articles = soup.find_all(
        'a', href=lambda href: href and href.startswith(path_prefix)
    )

    # dict.fromkeys drops repeated links while preserving first-seen order,
    # so the same page is not listed (and later downloaded) more than once.
    full_urls = list(dict.fromkeys(base_url + article['href'] for article in articles))

    # Write as UTF-8 so non-ASCII hrefs cannot fail on a narrow default encoding.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for url in full_urls:
            file.write(url + '\n')

# Example usage: point the two paths below at your own files.
input_file_path = './gnu-source.html'  # HTML file to read links from
output_file_path = './extracted_urls.txt'  # text file that receives the URLs
extract_article_links(file_path=input_file_path, output_file_path=output_file_path)

In [3]:
import requests
from bs4 import BeautifulSoup
import os
import re  # Importing the regular expressions module

def extract_text_from_url(url, timeout=30):
    """Download a page and return the text of its main content on a single line.

    The text of every ``<p>`` inside ``<div id="content">`` is joined with
    spaces, and every run of whitespace is collapsed to one space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The cleaned text; ``"Main content not found."`` when the page has no
        ``<div id="content">``; or ``"Error: <details>"`` when the download or
        parsing fails. Failures are reported as text, never raised.
    """
    try:
        # Without a timeout a stalled server would block the whole run forever.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        main_content = soup.find('div', {'id': 'content'})  # Adjust this selector based on the common container
        if main_content is None:
            return "Main content not found."
        text = ' '.join(p.get_text().strip() for p in main_content.find_all('p'))
        # Collapse newlines and repeated spaces into single spaces.
        return re.sub(r'\s+', ' ', text)
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a batch run.
        return f"Error: {e}"

def read_urls_from_file(file_path):
    """Return the URLs listed in a text file, one per line.

    Surrounding whitespace is stripped from each line, and blank lines are
    skipped so callers never receive an empty "URL".

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of non-empty strings in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file.read().splitlines())
        return [line for line in stripped_lines if line]

def save_text_to_file(folder, filename, text):
    """Write ``text`` to ``folder/filename`` as UTF-8, creating ``folder`` if needed.

    An existing file with the same name is overwritten.

    Args:
        folder: Directory that will hold the file; missing parents are created.
            An empty string means the current working directory.
        filename: Name of the file inside ``folder``.
        text: Content to write.
    """
    if folder:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(folder, exist_ok=True)
    # Explicit UTF-8: scraped text often contains characters that the
    # platform's default encoding cannot represent.
    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as file:
        file.write(text)

# Main process: download every listed URL and store its text under articles/.
urls_file_path = './extracted_urls.txt'  # Change this to your file path
urls = read_urls_from_file(urls_file_path)
articles_folder = 'articles'

for url in urls:
    url = url.strip()
    if not url:
        # A blank entry would only produce an error file named ".txt".
        continue
    content = extract_text_from_url(url)
    # Build the file name from the last path segment. The fragment and query
    # are dropped first so they cannot leak into the name, and a trailing "/"
    # is removed so directory-style URLs still yield a usable segment.
    page_name = url.split('#')[0].split('?')[0].rstrip('/').split('/')[-1]
    file_name = (page_name or 'index') + '.txt'
    # NOTE: extract_text_from_url reports failures as text, so an error message
    # is saved to disk exactly like an article body would be.
    save_text_to_file(articles_folder, file_name, content)


In [2]:
import requests
from bs4 import BeautifulSoup
import os
import re

def extract_text_from_url(url, timeout=30):
    """Download a page and return its main content as simple Markdown.

    Each non-empty ``<p>`` inside ``<div id="content">`` becomes one Markdown
    paragraph. Whitespace inside a paragraph is collapsed to single spaces, and
    paragraphs are separated by a blank line. A paragraph that contains a
    ``<strong>`` element is wrapped in ``**`` as a whole.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The Markdown text; ``"Main content not found."`` when the page has no
        ``<div id="content">``; or ``"Error: <details>"`` when the download or
        parsing fails. Failures are reported as text, never raised.
    """
    try:
        # Without a timeout a stalled server would block the whole run forever.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        main_content = soup.find('div', {'id': 'content'})
        if main_content is None:
            return "Main content not found."

        paragraphs = []
        for p in main_content.find_all('p'):
            # Normalise whitespace per paragraph. Doing it after the join would
            # also collapse the blank lines that separate Markdown paragraphs.
            text = re.sub(r'\s+', ' ', p.get_text()).strip()
            if not text:
                # Skip empty paragraphs; they would otherwise appear as "****".
                continue
            paragraphs.append('**' + text + '**' if p.find('strong') else text)
        return '\n\n'.join(paragraphs)
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a batch run.
        return f"Error: {e}"

def read_urls_from_file(file_path):
    """Return the URLs listed in a text file, one per line.

    Surrounding whitespace is stripped from each line, and blank lines are
    skipped so callers never receive an empty "URL".

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of non-empty strings in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file.read().splitlines())
        return [line for line in stripped_lines if line]

def save_text_to_file(folder, filename, text):
    """Write ``text`` to ``folder/filename.md`` as UTF-8, creating ``folder`` if needed.

    The ``.md`` extension is appended to ``filename`` here, so callers pass the
    bare name. An existing file with the same name is overwritten.

    Args:
        folder: Directory that will hold the file; missing parents are created.
            An empty string means the current working directory.
        filename: File name without the ``.md`` extension.
        text: Markdown content to write.
    """
    if folder:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(folder, exist_ok=True)
    # Explicit UTF-8: scraped text often contains characters that the
    # platform's default encoding cannot represent.
    with open(os.path.join(folder, filename + '.md'), 'w', encoding='utf-8') as file:  # Save as Markdown file
        file.write(text)

# Main process: download every listed URL and store it as Markdown under articles/.
urls_file_path = './extracted_urls.txt'
urls = read_urls_from_file(urls_file_path)
articles_folder = 'articles'

for url in urls:
    url = url.strip()
    if not url:
        # A blank entry would only produce an error file named ".md".
        continue
    content = extract_text_from_url(url)
    # Build the file name from the last path segment. The fragment and query
    # are dropped first so they cannot leak into the name, and a trailing "/"
    # is removed so directory-style URLs still yield a usable segment.
    file_name = url.split('#')[0].split('?')[0].rstrip('/').split('/')[-1] or 'index'
    # save_text_to_file appends the ".md" extension itself.
    # NOTE: extract_text_from_url reports failures as text, so an error message
    # is saved to disk exactly like an article body would be.
    save_text_to_file(articles_folder, file_name, content)
