In [3]:
import os
from bing_image_downloader import downloader
import shutil
from PIL import Image
import cv2

class ImageDownloader:
    """Download document images from Bing and keep only the sharp ones.

    For each category the images are fetched with ``bing_image_downloader``,
    screened with :meth:`quality_check`, and the accepted ones are stored as
    ``financial_data/<category>/image<N>.jpg``.
    """

    def __init__(self):
        pass

    def quality_check(self, image_path, blur_threshold=200):
        """Return True when the image is readable and looks sharp.

        Args:
            image_path: Path of the image file to inspect.
            blur_threshold: Minimum variance of the Laplacian for the image
                to count as sharp (higher values demand sharper images).

        Returns:
            True if the file passes Pillow's integrity check, can be decoded
            by OpenCV and its Laplacian variance exceeds *blur_threshold*;
            False otherwise, including when any error occurs while checking.
        """
        try:
            # Basic integrity check with Pillow. The context manager releases
            # the file handle right away so the caller can move or delete the
            # file afterwards.
            with Image.open(image_path) as img:
                img.verify()

            # Blur detection with OpenCV (variance of the Laplacian).
            img_cv = cv2.imread(image_path)
            if img_cv is None:
                # imread reports an undecodable file by returning None rather
                # than raising, so say so instead of failing inside cvtColor.
                print(f"Error checking image {image_path}: OpenCV could not decode the file")
                return False

            gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
            variance = cv2.Laplacian(gray, cv2.CV_64F).var()
            # Coerce to a plain bool so callers never receive a NumPy scalar.
            return bool(variance > blur_threshold)
        except Exception as e:  # Deliberately broad: any failure means "reject"
            print(f"Error checking image {image_path}: {e}")
            return False

    @staticmethod
    def _next_free_slot(folder, start):
        """Return ``(index, path)`` of the first unused ``image<index>.jpg``.

        Counting begins at *start* and skips every index whose file already
        exists in *folder*.
        """
        index = start
        while True:
            candidate = os.path.join(folder, f"image{index}.jpg")
            if not os.path.exists(candidate):
                return index, candidate
            index += 1

    def fetch_bing_images(self):
        """Download, filter and renumber images for every document category.

        Creates ``financial_data/<category>`` folders, downloads up to 150
        images per search term, moves the ones that pass
        :meth:`quality_check` to ``image<N>.jpg`` in the category folder and
        deletes the rest together with the temporary download folder.
        Images kept by an earlier run are never overwritten; numbering
        continues at the next unused index instead.
        """
        # Search term used for each category folder
        search_terms = {
            "salary_slips": "salary slip document",
            "bank_statements": "bank statement document",
            "cheques": "cheques document",
            "profit_loss_statements": "profit and loss statements",
            "transaction_history": "transaction history document"
        }

        # Create all category folders up front
        for category in search_terms:
            os.makedirs(f"financial_data/{category}", exist_ok=True)

        image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

        for category, term in search_terms.items():
            category_dir = f"financial_data/{category}"
            downloader.download(term, limit=150,
                                output_dir=category_dir,
                                adult_filter_off=True,
                                force_replace=False,
                                timeout=60)

            # The downloader places its files in a sub-folder named after the term
            temp_folder = f"financial_data/{category}/{term}"
            if not os.path.isdir(temp_folder):
                continue

            image_counter = 1
            # Sorted so the numbering does not depend on directory order
            for filename in sorted(os.listdir(temp_folder)):
                if not filename.lower().endswith(image_extensions):
                    continue

                source = os.path.join(temp_folder, filename)
                if self.quality_check(source):  # Quality check before moving
                    # Skip indices already taken; shutil.move would otherwise
                    # replace an image kept by a previous run.
                    # NOTE: every file is named .jpg regardless of its real
                    # format, as before, in case later steps rely on it.
                    image_counter, destination = self._next_free_slot(
                        category_dir, image_counter)
                    shutil.move(source, destination)
                    image_counter += 1
                else:
                    print(f"Skipping blurry image: {source}")
                    os.remove(source)  # Remove the rejected image

            # Drops the temporary folder and any non-image leftovers in it
            shutil.rmtree(temp_folder)

        print("Images downloaded and organized successfully")

# Entry point: runs only when executed as a script or notebook cell, not on import.
if __name__ == "__main__":
    image_downloader = ImageDownloader()
    # Downloads, quality-filters and renumbers the images for every category.
    image_downloader.fetch_bing_images()

[%] Downloading Images to c:\Users\JASNEET ARORA\OneDrive\Desktop\Milestone 1\1.  Web Scraping (Using Bing Image Downloader)\financial_data\salary_slips\salary slip document


[!!]Indexing page: 1

[%] Indexed 35 Images on Page 1.


[%] Downloading Image #1 from https://images.examples.com/wp-content/uploads/2018/05/Fully-Editable-Sample-Salary-Slip-Example1.jpg
[%] File Downloaded !

[%] Downloading Image #2 from https://www.wordtemplates4u.org/wp-content/uploads/2013/04/Salary-Slip-Template.png
[%] File Downloaded !

[%] Downloading Image #3 from https://images.examples.com/wp-content/uploads/2018/05/Editable-Salary-Slip-Example1.jpg
[%] File Downloaded !

[%] Downloading Image #4 from https://howtowiki.net/wp-content/uploads/2019/02/Customizable-Salary-Slip-Example-1-2048x1578.png
[%] File Downloaded !

[%] Downloading Image #5 from https://howtowiki.net/wp-content/uploads/2019/02/Blank-Salary-Slip-Example-1-1131x1536.png
[%] File Downloaded !

[%] Downloading Image #6 from https://