# Image Combing Notebook 
This notebook retrieves images from the internet to build a dataset, because accessible, high-quality food image datasets are scarce.

## Import statements

In [None]:
from googleapiclient.discovery import build
import configparser
import os
import requests
from io import BytesIO
from PIL import Image
import time

## Read configs

In [None]:
# Load API credentials and request settings from the local configs.ini file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that result to fail with a clear
# message instead of a confusing KeyError on the first lookup below.
if not config.read('configs.ini'):
    raise FileNotFoundError("configs.ini was not found or could not be read")
api_key = config['API']['custom_search_api_key']  # Custom Search JSON API key
cx = config['API']['custom_search_cx']  # Custom search engine ID
user_agent = config['UserAgents']['user_agent']  # User-Agent header sent when downloading images
geolocation = config['API']['geolocation']
host_language = config['API']['host_language']

## Set up a Custom Search JSON API client

In [None]:
# Client object for version "v1" of the "customsearch" API, authenticated
# with the developer key read from the config file.
service = build(
    serviceName="customsearch",
    version="v1",
    developerKey=api_key,
)

## Function Declarations

In [None]:
def fetch_images(query, total_num_images, geolocation, host_language, start_num=1, retries=3):
    """Collect image search results for *query* from the Custom Search API.

    Results are requested page by page (at most 10 per request) until
    ``total_num_images`` results have been gathered, the API reports no
    further page, a page comes back empty, or the retry budget is spent.

    Args:
        query: Search string sent to the API.
        total_num_images: Maximum number of results to return.
        geolocation: Value sent as the API's ``gl`` parameter; omitted from
            the request when empty.
        host_language: Value sent as the API's ``hl`` parameter; omitted
            from the request when empty.
        start_num: 1-based index of the first result to request.
        retries: Total number of failed requests tolerated before giving up.

    Returns:
        A list of at most ``total_num_images`` result items as returned by
        the API (each item is indexed by ``'link'`` elsewhere in this file).
    """
    search_results = []
    start_index = start_num  # Start at the specified index of the results

    while len(search_results) < total_num_images:
        request_params = {
            'q': query,  # Query string
            'cx': config['API']['custom_search_cx'],  # Custom search engine ID from the config file
            'searchType': 'image',  # Search for images
            'num': min(10, total_num_images - len(search_results)),  # Results per request (max 10)
            'start': start_index,  # Start index for results
        }
        # Only forward the localisation hints when they are actually set, so
        # an empty config value does not produce an invalid request.
        if geolocation:
            request_params['gl'] = geolocation
        if host_language:
            request_params['hl'] = host_language

        try:
            # Make a request to the API
            res = service.cse().list(**request_params).execute()
        except Exception as e:
            print(f"An error occurred: {e}")
            retries -= 1
            if retries <= 0:
                print("Max retries reached. Exiting.")
                break
            print("Retrying...")
            time.sleep(1)  # Wait for 1 second before retrying
            continue

        # Add the results to our list, and increment the start index
        items = res.get('items', [])
        search_results.extend(items)
        start_index += len(items)

        # The next-page marker lives under the response's 'queries' object,
        # not at the top level. Also stop on an empty page so the loop cannot
        # spin without making progress.
        if not items or 'nextPage' not in res.get('queries', {}):
            break

    return search_results[:total_num_images]

def download_images(image_urls, save_folder, start_num=0):
    """Download each URL in *image_urls* and save it as an image file.

    Files are written to ``save_folder`` (created if missing) and named
    ``image_<n>.<format>``, where ``n`` is ``start_num`` plus the 1-based
    position of the URL in ``image_urls`` and ``<format>`` is the subtype of
    the response's Content-Type (e.g. ``jpeg``, ``png``). Failures are
    printed and skipped so that one bad URL does not abort the whole batch.

    Args:
        image_urls: Iterable of image URLs to download.
        save_folder: Directory in which the images are stored.
        start_num: Offset added to the file numbering, so that a new batch
            does not overwrite files from earlier batches. Defaults to 0,
            which numbers files from 1.
    """
    os.makedirs(save_folder, exist_ok=True)

    for i, img_url in enumerate(image_urls, start=1):
        file_number = start_num + i
        try:
            # A timeout keeps one unresponsive host from hanging the whole run.
            response = requests.get(img_url, headers={'User-Agent': user_agent}, timeout=30)
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Strip parameters such as "; charset=utf-8" so that only the
            # bare MIME type is used for the check and the file extension.
            content_type = response.headers.get('Content-Type', '')
            mime_type = content_type.split(';', 1)[0].strip().lower()
            if mime_type.startswith('image/'):
                image_format = mime_type.split('/')[-1]  # 'jpeg', 'png', etc.
                with Image.open(BytesIO(response.content)) as image:
                    image.save(os.path.join(save_folder, f'image_{file_number}.{image_format}'))
                print(f"Downloaded image {file_number}")
            else:
                print(f"URL {i} does not seem to be an image.")
        except requests.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
        except Exception as e:
            print(f"Other error occurred: {e}")

## Main Program

### Define params

In [None]:
# Dish being collected; used below to name the download folder.
target_food = 'hainanese chicken rice'
# Search phrases that are submitted one after another for this dish.
search_queries = ['Singaporean chicken rice', 'Malaysian chicken rice', 'chinese chicken rice']
# Number of images requested per search query.
no_of_imgs = 190
# Number of images you already have,
# e.g. 30 if 30 images were collected in an earlier run.
# NOTE: in practice the API appears to return at most about 200 image
# results, even though the documentation states a limit of 100.
start_num = 200
# Alternative: name the folder after an individual search query instead.
# download_directory = f'../data/{search_query.replace(" ", "_")}'
download_directory = f'../data/{target_food.replace(" ", "_")}' # Replace directory with desired path


In [None]:
# For every search phrase: fetch the result metadata, pull out the direct
# image URLs and download them into the shared folder.
images = []
for search_query in search_queries:
    # fetch_images takes the query string as its first argument; the search
    # engine ID is not one of its parameters, so it must not be passed here.
    images = fetch_images(search_query, no_of_imgs, geolocation, host_language)
    image_links = [image['link'] for image in images]
    download_images(image_links, download_directory, start_num)
    # Advance the running image count by the number requested per query.
    start_num += no_of_imgs

### Call fetch function

In [None]:
# fetch_images takes the query string as its first argument; the search
# engine ID is not one of its parameters, so it must not be passed here.
images = fetch_images(search_query, no_of_imgs, geolocation, host_language)

### Retrieve image links

In [None]:
# Keep only the 'link' field of every fetched search result.
image_links = [
    result['link']
    for result in images
]

### Download the images

In [None]:
# Download every collected link into the download directory, passing the
# current start_num along as the third argument.
download_images(image_links, download_directory, start_num)