<a href="https://colab.research.google.com/github/ivanmladek/Sentinel-Intelligence-Codex/blob/main/process_refactor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library Processing Pipeline

## 1. Setup and Dependencies

In [1]:
#@title Install System Dependencies
!apt-get install -y poppler-utils tesseract-ocr libmagic-dev unrar

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
libmagic-dev is already the newest version (1:5.41-3ubuntu0.1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.8).
unrar is already the newest version (1:6.1.5-1ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [2]:
#@title Install Python Libraries (Part 1)
!pip install numpy==1.26.4



In [3]:
#@title Install Python Libraries (Part 2)
!pip install transformers==4.38.2 pyarrow==14.0.1 timm==0.5.4 requests==2.31.0 albumentations==1.0.0 git+https://github.com/facebookresearch/nougat
!pip install textblob langdetect beautifulsoup4 huggingface_hub tqdm pandas

Collecting git+https://github.com/facebookresearch/nougat
  Cloning https://github.com/facebookresearch/nougat to /tmp/pip-req-build-zdsuwx5n
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/nougat /tmp/pip-req-build-zdsuwx5n
  Resolved https://github.com/facebookresearch/nougat to commit 5a92920d342fb6acf05fc9b594ccb4053dbe8e7a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.38.2
  Using cached transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
Collecting pyarrow==14.0.1
  Using cached pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting timm==0.5.4
  Using cached timm-0.5.4-py3-none-any.whl.metadata (36 kB)
Collecting requests==2.31.0
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting albumentations==1.0.0
  Using cached albumentations-1.0.0-py3-none-any.whl.metadata (31 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.2)
  Using cached to

## 2. Imports and Configuration

In [19]:
import os
import re
import json
import logging
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
from langdetect import detect, LangDetectException
from nltk.corpus import words, brown
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
from tqdm import tqdm
from google.colab import drive

# --- Configuration ---
BASE_URL = "https://the-eye.eu/public/Books/Bibliotheca%20Alexandrina/"
HUGGING_FACE_REPO = "ivanmladek/Sentinel-Intelligence-Codex"  # Replace with your Hugging Face repo
GARBAGE_THRESHOLD = 0.8
LENWORD = 50

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.propagate = True # Ensure messages are propagated to the root logger

# --- Mount Google Drive ---
#drive.mount('/content/drive')

# --- Download NLTK Data ---
nltk.download('punkt')
nltk.download('words')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## 3. Helper Functions

### 3.1. File and Web Operations

In [14]:
import requests
from bs4 import BeautifulSoup
import os
import re
import subprocess
import logging

logger = logging.getLogger(__name__)

def get_file_list(url):
    """Recursively get a list of files from a URL and its subdirectories."""
    rar_files = []
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                # Handle relative and absolute links
                full_url = requests.compat.urljoin(url, href)
                if full_url.endswith('.rar'):
                    rar_files.append(full_url)
                elif full_url.endswith('/'):
                    # Recursively call for subdirectories, avoiding infinite loops
                    if url != full_url: # Avoid processing the same directory again
                         rar_files.extend(get_file_list(full_url))
    except requests.exceptions.RequestException as e:
        logger.error(f"Error accessing URL {url}: {e}")
    return rar_files

def download_file(url, output_path):
    """Download a file from a URL."""
    if os.path.exists(output_path):
        logger.info(f"{output_path} already exists. Skipping download.")
        return
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        logger.info(f"Downloaded {url} to {output_path}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error downloading file from {url}: {e}")


def extract_rar(file_path, output_path):
    """Extract a RAR file."""
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    try:
        subprocess.run(['unrar', 'x', '-o+', file_path, output_path], check=True, capture_output=True, text=True) # Added -o+ to overwrite without prompting
        logger.info(f"Extracted {file_path} to {output_path}")
    except subprocess.CalledProcessError as e:
        logger.error(f"Error extracting {file_path}: {e.stderr}")

def sanitize_filename(filename):
    """Sanitize a filename."""
    return re.sub(r'[^a-zA-Z0-9_.-]', '_', filename)

### 3.2. PDF Processing (Nougat)

In [6]:
def process_pdf(pdf_path, output_dir):
    """Process a single PDF file with Nougat."""
    sanitized_filename = sanitize_filename(os.path.basename(pdf_path))
    mmd_path = os.path.join(output_dir, f"{os.path.splitext(sanitized_filename)[0]}.mmd")
    if os.path.exists(mmd_path):
        logger.info(f"{mmd_path} already exists. Skipping Nougat processing.")
        return mmd_path

    try:
        subprocess.run(['nougat', pdf_path, '-o', output_dir, '--no-skipping', '--recompute'], check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Error processing {pdf_path} with Nougat: {e.stderr}")
        return None
    return mmd_path

### 3.3. Text Cleaning and Quality Control

In [7]:
def clean_text(text):
    """Clean the extracted text."""
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'\(\d+\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[[A-Za-z0-9]+\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\([\w\s]+et\s+al\., \d{4}\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\(\w+\s+and\s+\w+\s+\d{4}\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\(see\s+equations\s+\(\d+\)\s+and\s+\(\d+\)\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\(\w+\s+et\s+al\., \d{4};\s*\w+\s+et\s+al\., \d{4}\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Table\s+\d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[FIGURE:[^]]+\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[\d+(,\s*\d+)*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\[.*arxiv.*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\x00-\x7F]+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[\.,;:!?]{2,}', '', text, flags=re.IGNORECASE)
    return text

def is_garbage(text, threshold=GARBAGE_THRESHOLD, lenword=LENWORD):
    """Check if the text is garbage."""
    if not text or len(text.split()) < 10:
        return True
    try:
        if detect(text) != 'en':
            return True
    except LangDetectException:
        return True
    return False

### 3.4. Text Chunking

In [10]:
def chunk_text(content, max_size=8192):
    """Chunk the text into smaller segments."""
    segments = []
    current_segment = ""
    lines = content.split('\n')

    for line in lines:
        if line.startswith(("#", "##", "###")):
            if current_segment:
                segments.extend(split_segment(current_segment.strip(), max_size))
            current_segment = ""
        else:
            current_segment += line + " "

    if current_segment:
        segments.extend(split_segment(current_segment.strip(), max_size))

    return segments

def split_segment(segment, max_size):
    """Split a segment into smaller chunks."""
    sentences = sent_tokenize(segment)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def process_and_chunk_mmd(mmd_path, output_dir):
    """Process and chunk an MMD file."""
    if not mmd_path or not os.path.exists(mmd_path):
        logger.warning(f"MMD file not found: {mmd_path}. Skipping.")
        return None, None

    cleaned_jsonl_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(mmd_path))[0]}_cleaned.jsonl")
    garbage_jsonl_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(mmd_path))[0]}_garbage.jsonl")

    with open(mmd_path, 'r', encoding='utf-8') as f:
        content = f.read()

    chunks = chunk_text(content)
    with open(cleaned_jsonl_path, 'w') as cleaned_f, open(garbage_jsonl_path, 'w') as garbage_f:
        for chunk in chunks:
            cleaned_chunk = clean_text(chunk)
            if is_garbage(cleaned_chunk):
                garbage_f.write(json.dumps({"text": cleaned_chunk}) + '\n')
            else:
                cleaned_f.write(json.dumps({"text": cleaned_chunk}) + '\n')

    return cleaned_jsonl_path, garbage_jsonl_path

### 3.5. Hugging Face Integration

In [11]:
def upload_to_huggingface(file_path, repo_id):
    """Upload a file to a Hugging Face repository."""
    api = HfApi()
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=os.path.basename(file_path),
        repo_id=repo_id,
        repo_type="dataset",
    )
    logger.info(f"Uploaded {file_path} to {repo_id}")

## 4. Main Processing Loop

In [None]:
def main():
    """Main function to process the library."""
    logger.info(f"Scanning for RAR files at {BASE_URL}")
    rar_files = get_file_list(BASE_URL)
    logger.info(f"Found {len(rar_files)} RAR files.")

    with tqdm(total=len(rar_files), desc="Processing RAR Files") as pbar:
        for rar_file_url in rar_files:
            rar_filename = rar_file_url.split('/')[-1]
            rar_path = sanitize_filename(rar_filename)
            extract_path = os.path.splitext(rar_path)[0]

            logger.info(f"Downloading {rar_file_url} to {rar_path}")
            download_file(rar_file_url, rar_path)

            logger.info(f"Extracting {rar_path} to {extract_path}")
            extract_rar(rar_path, extract_path)

            pdf_files = [os.path.join(root, file) for root, _, files in os.walk(extract_path) for file in files if file.endswith('.pdf')]
            logger.info(f"Found {len(pdf_files)} PDF files in {extract_path}")

            for pdf_path in pdf_files:
                logger.info(f"Processing PDF: {pdf_path}")
                mmd_path = process_pdf(pdf_path, extract_path)
                cleaned_jsonl, garbage_jsonl = process_and_chunk_mmd(mmd_path, extract_path)

                if cleaned_jsonl and os.path.exists(cleaned_jsonl):
                    upload_to_huggingface(cleaned_jsonl, HUGGING_FACE_REPO)
                ##if garbage_jsonl and os.path.exists(garbage_jsonl):
                ##    upload_to_huggingface(garbage_jsonl, HUGGING_FACE_REPO)

            os.remove(rar_path)
            shutil.rmtree(extract_path)
            pbar.update(1)

if __name__ == "__main__":
    main()