### HTML to PDF converter
This script is designed to convert saved HTML files from ScienceDirect into PDF format.

In [2]:
import os
from bs4 import BeautifulSoup
from fpdf import FPDF
import html2text
import re

In [None]:
DEMO = True
NUMBER_OF_PAPERS = 3

#### Functions

parsing, text cleaning, etc

In [3]:
def clean_text(text):
    # Remove multiple newlines and spaces
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

def html_to_pdf(html_file, output_pdf):
    # Read HTML file
    with open(html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract title from the specific span class
    title = soup.find('span', class_='title-text')
    title_text = title.text if title else "No title found"
    title_text = f"## Title: {title_text}"
    
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # Convert HTML to plain text
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    text = h.handle(str(soup))
    
    # Clean the text
    text = clean_text(text)

    # Create PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    
    # Add title with larger font size and bold
    pdf.set_font("Arial", 'B', size=16)  # 'B' for bold
    pdf.multi_cell(0, 10, txt=f"{title_text}")
    pdf.ln(5)  # Add some space after title
    
    # Set font back to normal for main text
    pdf.set_font("Arial", size=11)
    
    # Add text to PDF
    # Split text into lines to avoid overflow
    lines = text.split('\n')
    for line in lines:
        if line.strip():  # Only add non-empty lines
            try:
                if title_text not in line:
                    try:
                        pdf.multi_cell(0, 5, txt=line.encode('latin-1', 'replace').decode('latin-1'))
                    except:
                        pdf.multi_cell(0, 5, txt=line.encode('latin-1', 'replace').decode('latin-1', 'ignore'))

            except:
                # If encoding fails, try to clean the text further
                clean_line = ''.join(char for char in line if ord(char) < 128)
                # clean_line = line.encode('ascii', 'replace').decode('ascii')
                # clean_line = line.encode('ascii', 'ignore').decode('ascii')
                # clean_line = ''.join(char if ord(char) < 128 else '-' for char in line)
                if title_text not in clean_line:
                    pdf.multi_cell(0, 5, txt=clean_line)

    # Save PDF
    pdf.output(output_pdf)

def process_html_files(input_directory, output_directory):
    # Create output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    # Process all HTML files in the input directory
    c = 0
    for filename in os.listdir(input_directory):
        if DEMO and c > NUMBER_OF_PAPERS:
            break
        filename = filename.replace(' ', '_')
        if filename.endswith('.html'):
            html_path = os.path.join(input_directory, filename)
            pdf_path = os.path.join(output_directory, filename.replace('.html', '.pdf'))
            print(f"Converting {filename} to PDF...")
            try:
                html_to_pdf(html_path, pdf_path)
                print(f"Successfully created {pdf_path}")
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

        c += 1


### Main script

This script will convert raw html into pdf

In [4]:
input_dir = os.path.join(os.path.dirname(os.path.abspath('.')), "x_research_papers_save", "1_raw_files")
output_dir = os.path.join(os.path.dirname(os.path.abspath('.')), "x_research_papers_save", "1_raw_files")
process_html_files(input_dir, output_dir)

Converting science_direct_102.html to PDF...
Successfully created e:\ITU-BOOK\Data in Wild\Project\DataInWild\SUBMISSION\1_scraping\x_research_papers_save\1_raw_files\science_direct_102.pdf
Converting science_direct_135.html to PDF...
Successfully created e:\ITU-BOOK\Data in Wild\Project\DataInWild\SUBMISSION\1_scraping\x_research_papers_save\1_raw_files\science_direct_135.pdf
Converting science_direct_166.html to PDF...
Successfully created e:\ITU-BOOK\Data in Wild\Project\DataInWild\SUBMISSION\1_scraping\x_research_papers_save\1_raw_files\science_direct_166.pdf
Converting science_direct_173.html to PDF...
Successfully created e:\ITU-BOOK\Data in Wild\Project\DataInWild\SUBMISSION\1_scraping\x_research_papers_save\1_raw_files\science_direct_173.pdf
Converting science_direct_20.html to PDF...
Successfully created e:\ITU-BOOK\Data in Wild\Project\DataInWild\SUBMISSION\1_scraping\x_research_papers_save\1_raw_files\science_direct_20.pdf
Converting science_direct_234.html to PDF...
Success