Helper to create image files from PDF files and organize them into folders


In [1]:
from pdf2image import convert_from_path
import os
from PIL import Image
import os
from tqdm import tqdm

In [None]:
import unidecode


def img_preprocessing(img):
    """
    Hook for optional image preprocessing, invoked right before an image is saved.

    It is currently a no-op: Azure services do not require any preprocessing,
    so the image is handed back untouched.

    :param img:
        Image about to be saved.

    :return:
        The very same object that was passed in.
    """
    preprocessed = img
    return preprocessed


def _save_pages(pages, target_dir):
    """Saves every image of ``pages`` as ``<target_dir>/page_<index>.jpg`` after preprocessing."""
    if not pages:
        # Nothing to write: do not leave an empty folder behind.
        return
    os.makedirs(target_dir, exist_ok=True)
    for index, page in enumerate(pages):
        img_preprocessing(page).save(
            os.path.join(target_dir, f"page_{index}.jpg"))


def move_dataset(folder, destination):
    """
    Converts every file of the source folder into JPEG pages stored in the destination folder.

    For each file ``<name>.<ext>`` found directly inside ``folder``, a sub-folder
    ``<destination>/<name>`` is created, where ``<name>`` is transliterated to ASCII
    with ``unidecode``:

    * PDF files (extension matched case-insensitively) are rendered with
      ``convert_from_path`` and saved as ``page_0.jpg``, ``page_1.jpg``, ...
    * Any other file is opened as an image with PIL and saved as ``page_0.jpg``.

    ``img_preprocessing`` is applied to each image before it is saved. The source
    files are left in place. Processing is best-effort: when a file fails, its name
    and the error are printed and the loop moves on to the next file.

    :param folder: str
        Path of the source folder containing the files to be converted.

    :param destination: str
        Path of the destination folder where the converted files will be saved.

    Example:
    >>> move_dataset("input_folder", "output_folder")
    """
    for file in tqdm(os.listdir(folder)):
        file_path = os.path.join(folder, file)
        if not os.path.isfile(file_path):
            # Sub-folders and other non-regular entries cannot be rendered.
            continue

        # splitext keeps inner dots ("a.b.pdf" -> "a.b"), so two different file
        # names cannot collapse into the same output folder.
        stem, ext = os.path.splitext(file)

        # Transliterate the name BEFORE creating the folder, so the folder that
        # is created is the same one the pages are saved into.
        target_dir = os.path.join(destination, unidecode.unidecode(stem))

        try:
            if ext.lower() == ".pdf":
                # NOTE(review): pdf2image documents `jpegopt` as a dict that only
                # applies with fmt="jpeg"; the arguments are kept unchanged here
                # -- confirm they have the intended effect.
                pages = convert_from_path(
                    file_path, jpegopt="optimize", size=(1500, 2000), thread_count=-1)
                _save_pages(pages, target_dir)
            else:
                # The context manager releases the file handle once saved.
                with Image.open(file_path) as im:
                    _save_pages([im], target_dir)
        except Exception as e:
            # Per-file boundary: report and continue with the remaining files.
            print(file)
            print(e)


def convert_newspaper(of, destination):
    """
    Converts the files of one newspaper and stores the result in a new location.

    Makes sure the destination folder exists (creating missing parent folders as
    well) and then delegates to `move_dataset`, which converts the files of the
    source folder into the destination folder.

    :param of: str
        Path of the source folder containing the newspaper files.

    :param destination: str
        Path of the destination folder where the converted files will be saved.

    Example:
    >>> convert_newspaper("input_folder", "output_folder")
    """
    try:
        # exist_ok=True makes an already existing folder a non-event, so only
        # genuine failures (e.g. permissions) reach the except branch.
        os.makedirs(destination, exist_ok=True)
    except OSError as e:
        # Stay best-effort, but make the failure visible instead of hiding it.
        print(f"Could not create {destination}: {e}")
    move_dataset(of, destination)

In [None]:
"""
Processes and converts the files of each newspaper in the 'NEWSPAPERS' folder and saves the result in the 'data' folder.
"""

FOLDER = "NEWSPAPERS"
OUTPUT_FOLDER = "data"
# Number of leading entries of the folder listing that are skipped.
# NOTE(review): presumably left over from resuming an interrupted run -- set it
# to 0 to process every newspaper. os.listdir returns entries in arbitrary
# order, so confirm which newspapers this actually skips.
SKIP_FIRST = 11

# An already existing output folder is fine; any other failure should surface
# here rather than as one error per file later on.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

periodicos = os.listdir(FOLDER)
for newspaper in tqdm(periodicos[SKIP_FIRST:]):
    print(newspaper)
    # Output folder names are transliterated to ASCII.
    dest = f"./{OUTPUT_FOLDER}/{unidecode.unidecode(newspaper)}"
    convert_newspaper(f"{FOLDER}/{newspaper}", dest)