<a href="https://colab.research.google.com/github/kshakib22/Newspaper-Template-Matching/blob/main/Extract_Shipping_report_Star_of_Chile_v_1107.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing external libraries through `!pip`, connecting Google Drive storage, and importing modules.

In [2]:
%%capture
# The %%capture cell magic above silences the (long) installer output of this cell.
# OpenCV provides the template-matching routines used later in the notebook.
# NOTE(review): both opencv-python and opencv-contrib-python are installed here;
# the OpenCV packaging notes recommend installing only one of them — confirm which is needed.
!pip install opencv-python
!pip install opencv-contrib-python
# pdf2image + Pillow are used to rasterise the input PDFs into images.
!pip install pdf2image Pillow
# System package; presumably the PDF rendering backend that pdf2image relies on — confirm.
!apt-get install poppler-utils
# Mount Google Drive at /content/drive so the notebook can read/write the project folders.
# force_remount=True re-mounts even if the drive is already mounted in this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import cv2
import os
import time
import tensorflow as tf
import sys
import numpy as np
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from pdf2image import convert_from_path


# Conversion of input files (.pdf) to .jpg format at 300 dpi

In [None]:
def convert_pdf_to_images(pdf_folder, output_folder):
    """
    Convert every PDF file in a folder to one JPEG image per page.

    Parameters:
        pdf_folder (str): The path to the folder containing the PDF files.
        output_folder (str): The path to the folder where the converted images will be saved.
            It is created (including missing parents) if it does not exist.

    Returns:
        None

    Dependencies:
        This function requires the 'os' module and the 'pdf2image' library to be installed.
        You can install 'pdf2image' using the following command: !pip install pdf2image

    Description:
        Every regular file in pdf_folder whose extension is '.pdf' (matched
        case-insensitively, so '.PDF' is accepted too) is rendered at 300 dpi in
        colour with pdf2image's convert_from_path. Page i (0-based) of
        '<name>.pdf' is written to '<output_folder>/<name>_<i>.jpg'.
        Files are processed in sorted name order so repeated runs behave the same,
        and a progress line is printed after each PDF.
    """
    # os.listdir returns entries in arbitrary order and may include directories,
    # so filter to regular files and sort for a reproducible run.
    pdf_files = sorted(
        filename
        for filename in os.listdir(pdf_folder)
        if filename.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_folder, filename))
    )

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for filename in pdf_files:
        # Construct the full paths for input PDF and the output image prefix
        pdf_path = os.path.join(pdf_folder, filename)
        output_prefix = os.path.join(output_folder, os.path.splitext(filename)[0])

        # Convert PDF pages to images using pdf2image (all pages are held in memory)
        pages = convert_from_path(pdf_path, dpi=300, grayscale=False)

        # Save each page as JPEG in the output folder
        for page_index, page in enumerate(pages):
            page.save(f"{output_prefix}_{page_index}.jpg", "JPEG")

        print(f"PDF '{filename}' converted to images.")

    print("Conversion completed.")

In [None]:
# Google Drive folder holding the source PDFs (adjust to your own Drive layout).
pdf_path = "/content/drive/MyDrive/Summer 2023 project/complete_pdfs"
# Destination folder for the per-page JPEGs written by convert_pdf_to_images.
output= "/content/drive/MyDrive/Summer 2023 project/Images/converted_pdf_images"

# Render every PDF found in pdf_path and save the page images into output.
convert_pdf_to_images(pdf_path, output)

# Template Matching

In [None]:
def perform_template_matching(image, template):
    """
    Locate the best match of a template inside an image.

    The match is scored with OpenCV's normalised correlation-coefficient
    method (cv2.TM_CCOEFF_NORMED).

    Parameters:
        image (numpy array): The image that is searched.
        template (numpy array): The patch to look for inside the image.

    Returns:
        Tuple: (best_score, best_location) — the maximum value reported by
        cv2.minMaxLoc for the correlation map, and the location of that maximum.
    """
    correlation_map = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc yields (min_val, max_val, min_loc, max_loc); only the maximum is of interest.
    extrema = cv2.minMaxLoc(correlation_map)
    best_score, best_location = extrema[1], extrema[3]
    return best_score, best_location

def crop_and_export(image, top_left, bottom_right, output_path):
    """
    Crop a specified region from the input image and export it as a new image.

    Parameters:
        image (numpy array): The input image from which the region will be cropped.
        top_left (tuple): The (x, y) coordinates of the top-left corner of the region.
            Negative values are clamped to 0 so they cannot wrap around to the
            opposite edge of the array.
        bottom_right (tuple): The (x, y) coordinates of the bottom-right corner
            (exclusive) of the region to be cropped.
        output_path (str): The path of the image file to write. Missing parent
            folders are created.

    Returns:
        None

    Raises:
        ValueError: If the requested region is empty (e.g. bottom above top).
        IOError: If cv2.imwrite reports that the file could not be written.
    """
    # NumPy treats negative slice bounds as "count from the end"; clamp so a
    # slightly out-of-frame corner does not silently select the wrong region.
    left, top = max(top_left[0], 0), max(top_left[1], 0)
    right, bottom = max(bottom_right[0], 0), max(bottom_right[1], 0)

    cropped_image = image[top:bottom, left:right]
    if cropped_image.size == 0:
        raise ValueError(
            f"Invalid cropping dimensions: top_left={top_left}, bottom_right={bottom_right}"
        )

    # cv2.imwrite does not create directories; without this a missing folder
    # makes every save fail.
    parent_folder = os.path.dirname(output_path)
    if parent_folder:
        os.makedirs(parent_folder, exist_ok=True)

    # imwrite signals failure through its return value instead of raising,
    # so check it before claiming success.
    if not cv2.imwrite(output_path, cropped_image):
        raise IOError(f"Could not write image: {output_path}")
    print(f"Image saved: {output_path}")


def second_scenario(image, template_path, output_folder):
    """
    Perform the second scenario for template matching and crop the image if a match is found.
    Second scenario is considered when shipping report (template 1) is only a decent match but
    charters (template 2) is a pretty good match.

    The crop is a band centred horizontally on the matched template, 38.5% of
    the page width wide, running from the top of the page down to the top edge
    of the match.

    Parameters:
        image (numpy array): The input (BGR) image on which template matching will be performed.
        template_path (str): The path to the template image used for matching.
        output_folder (str): Despite its name, this value is handed unchanged to
            crop_and_export as the output path, so callers pass the full path of
            the image file to write. The name is kept for backward compatibility.

    Returns:
        None

    Raises:
        FileNotFoundError: If the template image cannot be read.
    """
    # cv2.imread returns None instead of raising when the file is missing or unreadable.
    template = cv2.imread(template_path)
    if template is None:
        raise FileNotFoundError(f"Could not read template image: {template_path}")

    # Convert the template and image to grayscale
    template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Perform template matching with the shared helper
    max_val, max_loc = perform_template_matching(image_gray, template_gray)

    # 0.4 is the minimum correlation accepted for this fallback.
    if max_val < 0.4:
        print("Template could not be used even in second scenario")
        return

    page_width = image.shape[1]
    half_band = int(page_width * 0.385) // 2
    center_x = max_loc[0] + template_gray.shape[1] // 2

    # Clamp the left edge so a match near the page border does not yield a negative index.
    top_left = (max(center_x - half_band, 0), 0)
    bottom_right = (center_x + half_band, max_loc[1])

    try:
        crop_and_export(image, top_left, bottom_right, output_folder)
    except Exception as e:
        print("Error occurred for second scenario:", str(e))

def third_scenario(image, template_path, output_folder, image_path=None):
    """
    Perform the third scenario for template matching and crop the image if a match is found.
    Third scenario is considered when shipping report (template 1) is a pretty good match but
    charters (template 2) is only a decent match.

    The crop spans the width of the matched template and runs from the top of
    the match down to the bottom of the page.

    Parameters:
        image (numpy array): The input (BGR) image on which template matching will be performed.
        template_path (str): The path to the template image used for matching.
        output_folder (str): The path to the folder where the cropped image will be saved.
        image_path (str, optional): Path of the source image; its base name is used
            as the output file name. When omitted, a module-level variable named
            ``image_path`` is looked up instead (legacy behaviour kept for existing
            callers); new code should always pass it explicitly.

    Returns:
        None

    Raises:
        FileNotFoundError: If the template image cannot be read.
    """
    # cv2.imread returns None instead of raising when the file is missing or unreadable.
    template = cv2.imread(template_path)
    if template is None:
        raise FileNotFoundError(f"Could not read template image: {template_path}")
    template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Perform template matching with the shared helper
    max_val, max_loc = perform_template_matching(image_gray, template_gray)

    # 0.34 is the minimum correlation accepted for this fallback.
    if max_val < 0.34:
        print("Template match not found in third scenario")
        return

    if image_path is None:
        # Legacy fallback: earlier versions silently read a notebook-global `image_path`.
        image_path = globals().get("image_path")
    if image_path is None:
        print("Error occurred for third scenario: image_path was not provided")
        return

    x, y = max_loc
    right = x + template_gray.shape[1]
    bottom = image.shape[0]
    output_path = os.path.join(output_folder, os.path.basename(image_path))

    # Save the cropped portion
    try:
        crop_and_export(image, (x, y), (right, bottom), output_path)
    except Exception as e:
        print("Error occurred for third scenario:", str(e))

def perform_image_processing(image_path, template1_path, template2_path, output_folder):
    """
    The main function to perform image processing and template matching on the input image
    using two template images.

    Decision flow (similarity1 = template 1 / shipping report, similarity2 = template 2 / charters):
        1. Ideal match (similarity1 >= 0.78 and similarity2 >= 0.6): crop the region
           bounded by both matches and save it as '<image base name>.jpg'.
           If that crop fails, fall through to the fallbacks below.
        2. similarity1 >= 0.75 and similarity2 >= 0.62: second_scenario.
        3. similarity1 >= 0.8 and similarity2 >= 0.58: third_scenario.
        4. Otherwise report that no usable match was found.

    Parameters:
        image_path (str): The path to the input image for template matching.
        template1_path (str): The path to the first template image used for matching.
        template2_path (str): The path to the second template image used for matching.
        output_folder (str): The path to the folder where the cropped image will be saved.

    Returns:
        None

    Raises:
        FileNotFoundError: If either template image cannot be read.
    """
    # cv2.imread returns None for missing files and non-image entries; skip those
    # so one stray file does not abort a whole batch run.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping unreadable image: {image_path}")
        return

    template1 = cv2.imread(template1_path)
    template2 = cv2.imread(template2_path)
    for path, template in ((template1_path, template1), (template2_path, template2)):
        if template is None:
            raise FileNotFoundError(f"Could not read template image: {path}")

    # Convert images to grayscale for template matching
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    template1_gray = cv2.cvtColor(template1, cv2.COLOR_BGR2GRAY)
    template2_gray = cv2.cvtColor(template2, cv2.COLOR_BGR2GRAY)

    # Perform template matching
    similarity1, max_loc_1 = perform_template_matching(image_gray, template1_gray)
    similarity2, max_loc_2 = perform_template_matching(image_gray, template2_gray)

    print(f"Similarity1: {similarity1}")
    print(f"Similarity2: {similarity2}")

    # Every scenario writes to the same '<base name>.jpg' target, so build it once.
    output_filename = os.path.splitext(os.path.basename(image_path))[0] + ".jpg"
    output_path = os.path.join(output_folder, output_filename)

    # Ideal match conditions
    ideal_match = similarity1 >= 0.78 and similarity2 >= 0.6
    if ideal_match:
        top = max_loc_1[1]
        bottom = max_loc_2[1]
        left = min(max_loc_1[0], max_loc_2[0])
        right = max(max_loc_1[0] + template1_gray.shape[1], max_loc_2[0] + template2_gray.shape[1])
        try:
            crop_and_export(image, (left, top), (right, bottom), output_path)
            return
        except Exception as e:
            # This is when first condition fulfilled, but the crop could not be exported
            # (typically because template 2 was found above template 1).
            print("Error occurred in the first scenario: Invalid cropping dimensions", f"({e})")

    try:
        # Condition if shipping report is a decent match but charters is a good match
        if similarity1 >= 0.75 and similarity2 >= 0.62:
            print("\nMoving to the second scenario...")
            second_scenario(image, template2_path, output_path)

        # Condition if shipping report is a good match but charters is a decent match
        elif similarity1 >= 0.8 and similarity2 >= 0.58:
            print("\nMoving to the third scenario...")
            third_scenario(image, template1_path, output_folder, image_path=image_path)
        else:
            print("Template matches not found or below similarity threshold.")
    except Exception as e:
        # Errors are only swallowed when recovering from a failed ideal-match crop;
        # otherwise they propagate to the caller as before.
        if not ideal_match:
            raise
        print("Error occurred:", str(e))


# Individual matching
Perform template matching on a single image. This saves the cropped portion in the `output_path` if there is a valid match.

In [None]:
# Page image to analyse.
# NOTE: third_scenario reads a notebook-global named `image_path` when building its
# output file name, so this variable name should not be changed casually.
image_path = "/content/drive/MyDrive/Summer 2023 project/Images/17charter8.jpg"
# Template 1 (shipping report) and template 2 (charters), per the scenario docstrings above.
template1_path = "/content/drive/MyDrive/Summer 2023 project/Images/17template_0.jpg"
template2_path = "/content/drive/MyDrive/Summer 2023 project/Images/17template.jpg"
# Folder that receives the cropped result (passed as the `output_folder` argument).
output_path = "/content/drive/MyDrive/Summer 2023 project/Images/test/"
# Usage: run the full matching pipeline on this single image.
perform_image_processing(image_path, template1_path ,template2_path , output_path)

Similarity1: 0.9372795224189758
Similarity2: 0.6756752729415894
Image saved: /content/drive/MyDrive/Summer 2023 project/Images/test/17charter8.jpg


# Iterating through a folder containing images

Perform template matching in a loop over a folder containing multiple images. All matches are saved in the folder given as `output_folder`.

In [None]:
import os
import time
# Folder path containing images
input_folder = "/content/drive/MyDrive/Summer 2023 project/Images/converted_pdf_images/PE0001108/"
output_folder = "/content/drive/MyDrive/Summer 2023 project/Images/all_shipping"
template1_path = "/content/drive/MyDrive/Summer 2023 project/Images/17template_0.jpg"
template2_path = "/content/drive/MyDrive/Summer 2023 project/Images/17template.jpg"

# cv2.imwrite does not create missing folders and only reports failure through
# its return value, so make sure the destination exists before processing.
os.makedirs(output_folder, exist_ok=True)

# Only entries with these extensions are handed to the pipeline; anything else
# (sub-folders, hidden files, ...) cannot be decoded by cv2.imread.
image_extensions = (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp")

# Start the timer
start_time = time.time()

# Iterate through the image files in the input folder (sorted for a reproducible order).
# The loop variable must stay named `image_path`: third_scenario reads that global
# when it builds its output file name.
for image_file in sorted(os.listdir(input_folder)):
    # Get the full path of the image file
    image_path = os.path.join(input_folder, image_file)
    if not (os.path.isfile(image_path) and image_file.lower().endswith(image_extensions)):
        print(f"Skipping non-image entry: {image_path}")
        continue
    print(image_path)

    # Call the perform_image_processing function
    perform_image_processing(image_path, template1_path, template2_path, output_folder)

# Calculate the total time taken in hours
end_time = time.time()
time_taken = (end_time - start_time) / 3600

print(f"Total time taken: {time_taken} hours")


/content/drive/MyDrive/Summer 2023 project/Images/converted_pdf_images/PE0001108/PE0001108_0014_9.jpg
Similarity1: 0.5159093737602234
Similarity2: 0.4501591920852661
Template matches not found or below similarity threshold.
/content/drive/MyDrive/Summer 2023 project/Images/converted_pdf_images/PE0001108/PE0001108_0014_5.jpg
Similarity1: 0.40202441811561584
Similarity2: 0.39501848816871643
Template matches not found or below similarity threshold.
/content/drive/MyDrive/Summer 2023 project/Images/converted_pdf_images/PE0001108/PE0001108_0005_8.jpg
Similarity1: 0.34662091732025146
Similarity2: 0.361337810754776
Template matches not found or below similarity threshold.
/content/drive/MyDrive/Summer 2023 project/Images/converted_pdf_images/PE0001108/PE0001108_0013_6.jpg
Similarity1: 0.4691278338432312
Similarity2: 0.3009428381919861
Template matches not found or below similarity threshold.
/content/drive/MyDrive/Summer 2023 project/Images/converted_pdf_images/PE0001108/PE0001108_0012_8.jpg
