### I. Collect image URLs into text files

In [None]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import urllib.request

import time
from threading import Thread
import os

In [28]:
def is_valid(url):
    """
    Report whether `url` is an absolute URL.

    The URL is accepted only when parsing it yields both a scheme
    (such as ``https``) and a network location (such as ``example.com``).
    Returns a bool.
    """
    parts = urlparse(url)
    # Guard clause: without a scheme the URL cannot be absolute.
    if not parts.scheme:
        return False
    return bool(parts.netloc)


def get_all_images(url):
    """
    Returns all image URLs on a single `url`

    The page is fetched with `urllib.request.urlopen`, parsed with
    BeautifulSoup's "html.parser", and the `src` attribute of every
    <img> tag is collected.

    Each `src` is made absolute relative to `url`, cut at the first "?"
    (so 'x.gif?c=3.2.5' becomes 'x.gif') and kept only if `is_valid`
    accepts it. Duplicates are not removed.

    No exception is caught here: anything raised while opening or parsing
    the page propagates to the caller.
    """
    # Close the HTTP response as soon as the document has been parsed so the
    # connection is released instead of lingering until garbage collection.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, "html.parser")

    urls = []
    for img in soup.find_all("img"):
        img_url = img.attrs.get("src")

        if not img_url:
            # if img does not contain src attribute, just skip
            continue

        # make the URL absolute by joining domain with the URL that is just extracted
        img_url = urljoin(url, img_url)
        # remove URLs like '/hsts-pixel.gif?c=3.2.5'
        img_url = img_url.split("?", 1)[0]

        # finally, if the url is valid
        if is_valid(img_url):
            urls.append(img_url)

    return urls

In [29]:
class GetImagesfromPages():
    """
    Collect image URLs from numbered result pages using worker threads.

    A page URL is built as ``url_page + str(i)``. Page indices run from 0
    to ``npage`` inclusive and are split into contiguous ranges, one per
    worker thread.

    Parameters
    ----------
    nThreads : int
        Requested number of worker threads. Clamped to at least 1 and at
        most the number of pages, so no worker is created without work.
    npage : int
        Highest page index to fetch.
    url_page : str
        URL prefix to which the page index is appended.

    Attributes
    ----------
    result_urls : list
        Image URLs gathered so far (accumulates across calls; duplicates
        are kept; order depends on how the threads interleave).
    errors : list
        ``(page_url, exception)`` pairs for pages that could not be scraped.
    """

    def __init__(self, nThreads, npage, url_page):
        self.nThreads = nThreads
        self.npage = npage
        self.url_page = url_page

        self.result_urls = []
        # Failures are recorded here instead of being silently discarded.
        self.errors = []

    # Func target
    def main(self, start, end):
        """Scrape pages ``start`` .. ``end - 1`` and extend `result_urls`."""
        for i in range(start, end):
            page_url = self.url_page + str(i)
            try:
                self.result_urls.extend(get_all_images(page_url))
            except Exception as exc:
                # Best effort: one bad page must not stop the others, but keep
                # the reason. KeyboardInterrupt/SystemExit are not swallowed.
                self.errors.append((page_url, exc))

    def _page_ranges(self):
        """Split page indices 0..npage into one (start, end) range per worker."""
        total = self.npage + 1
        workers = max(1, min(self.nThreads, total))
        size, extra = divmod(total, workers)

        ranges = []
        first = 0
        for w in range(workers):
            # The first `extra` workers take one additional page each.
            last = first + size + (1 if w < extra else 0)
            ranges.append((first, last))
            first = last
        return ranges

    def __call__(self):
        """
        Run the scrape, print the elapsed time and return `result_urls`.
        """
        # Create Threads
        threads = [Thread(target=self.main, args=bounds)
                   for bounds in self._page_ranges()]

        # Start and join every thread that was created, so no range is skipped.
        t_begin = time.time()
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        t_end = time.time()

        print(f"Time handle pages = {t_end - t_begin:.2f}s", )
        if self.errors:
            print(f"{len(self.errors)} page(s) could not be scraped; see .errors")

        return self.result_urls

In [30]:
# Search keywords, one list per category. Each keyword is inserted verbatim
# into a search URL and into the name of the .txt file holding its results,
# so spelling, capitalisation and underscores are significant.
animal = [
    "Monkey", "Elephant", "cows", "Cat", "Dog",
    "bear", "fox", "Civet", "Pangolins", "Rabbit",
    "Bats", "Whale", "Cock", "Owl", "flamingo",
    "Lizard", "Turtle", "Snake", "Frog", "Fish",
    "shrimp", "Crab", "Snail", "Coral", "Jellyfish",
    "Butterfly", "Flies", "Mosquito", "Ants", "Cockroaches",
    "Spider", "scorpion", "tiger", "bird", "horse",
    "pig", "Alligator", "Alpaca", "Anteater", "donkey",
    "Bee", "Buffalo", "Camel", "Caterpillar", "Cheetah",
    "Chicken", "Dragonfly", "Duck", "panda", "Giraffe",
]

plant = [
    "Bamboo", "Apple", "Apricot", "Banana", "Bean",
    "Wildflower", "Flower", "Mushroom", "Weed", "Fern",
    "Reed", "Shrub", "Moss", "Grass", "Palm_tree",
    "Corn", "Tulip", "Rose", "Clove", "Dogwood",
    "Durian", "Ferns", "Fig", "Flax", "Frangipani",
    "Lantana", "Hibiscus", "Bougainvillea", "Pea", "Orchid_Tree",
    "Rangoon_Creeper", "Jack_fruit", "Cotton_plant", "Cornelian_tree", "Coffee_plant",
    "Coconut", "wheat", "watermelon", "radish", "carrot",
]

furniture = [
    "bed", "cabinet", "chair", "chests", "clock",
    "desks", "table", "Piano", "Bookcase", "Umbrella",
    "Clothes", "cart", "sofa", "ball", "spoon",
    "Bowl", "fridge", "pan", "book",
]

scenery = [
    "Cliff", "Bay", "Coast", "Mountains", "Forests",
    "Waterbodies", "Lake", "desert", "farmland", "river",
    "hedges", "plain", "sky", "cave", "cloud",
    "flower_garden", "glacier", "grassland", "horizon", "lighthouse",
    "plateau", "savannah", "valley", "volcano", "waterfall",
]

In [107]:
# Search-page URL templates keyed by site name. The `{name}` placeholder is
# filled with a keyword via str.format; both templates end where further text
# (the page number, in this notebook) is appended.
urltopic = dict(
    freepik="https://www.freepik.com/search?from_query={name}&query={name}&sort=popular&type=photo&page=",
    freeimages="https://www.freeimages.com/search/{name}/",
)

In [108]:
# Scraper settings: n_threads caps the worker-thread count and n_page is the
# page count handed to the page scraper.
n_threads, n_page = 3, 6

In [109]:
# For each keyword, scrape the freeimages search pages and save the distinct
# image URLs to <category>/urls/<category>_<keyword>.txt, one URL per line.
# The loop variable is named `category` rather than `dir` so the builtin
# dir() stays usable in the rest of the kernel session.
for category, names in zip(["animal", "plant", "furniture", "scenery"], [animal, plant, furniture, scenery]):

    dir_path_urls = f"{category}/urls"
    # exist_ok replaces the separate "exists?" check and its race window.
    os.makedirs(dir_path_urls, exist_ok=True)

    # Trial run: only the first two keywords of the category are processed.
    for name in names[:2]:
        result_of_name = GetImagesfromPages(min(n_threads, n_page//2), n_page,
                            urltopic["freeimages"].format(name = name))()

        # De-duplicate; sorting makes the saved file identical from run to run
        # (a bare set has no stable iteration order for strings).
        result_of_name = sorted(set(result_of_name))

        path_txt = f"{dir_path_urls}/{category}_{name}.txt"
        print(f"{path_txt} have {len(result_of_name)} images \n")
        strResult = '\n'.join(result_of_name)
        with open(path_txt, "w") as f:
            f.write(strResult)

    # Trial run: stop after the first category.
    break

Time handle pages = 1.11s
animal/urls/animal_Monkey.txt have 77 images 

Time handle pages = 1.05s
animal/urls/animal_Elephant.txt have 77 images 



### II. Download images from the URL text files

In [122]:
import urllib.request

In [123]:
from threading import Thread
import time
import requests
import random
import os

class DownloadImagesFromUrls():
    """
    Download a list of image URLs into one folder using worker threads.

    The URL indices ``0 .. len(urls) - 1`` are split into contiguous ranges,
    one per worker thread. Each image is saved as
    ``<destinate_folder>/<random float>.jpg``; the file name comes from
    ``random.random()``, so it is unrelated to the URL and differs between
    runs.

    Parameters
    ----------
    nThreads : int
        Requested number of worker threads. Clamped to at least 1 and at
        most ``len(urls)``.
    urls : list of str
        URLs to download. Surrounding whitespace (for example the newline
        left by ``readlines()``) is stripped and blank entries are skipped.
    destinate_folder : str
        Folder the files are written to. It is not created here.

    Attributes
    ----------
    errors : list
        ``(url, exception)`` pairs for downloads that failed.
    """

    def __init__(self, nThreads, urls, destinate_folder):
        self.nThreads = nThreads
        self.urls = urls
        self.n = len(urls)
        self.destinate_folder = destinate_folder
        # Failed downloads are recorded here so one bad URL does not abort
        # the remaining URLs handled by the same thread.
        self.errors = []

    # Func target
    def download_url(self, start, end):
        """Download ``urls[start]`` .. ``urls[end - 1]``, printing a dot per file."""
        for i in range(start, end):
            url = self.urls[i].strip()
            if not url:
                continue

            a = random.random()
            try:
                urllib.request.urlretrieve(url, f"{self.destinate_folder}/{a}.jpg")
            except Exception as exc:
                self.errors.append((url, exc))
                continue

            print('.', end=" ")

    def _index_ranges(self):
        """Split indices 0..n-1 into one (start, end) range per worker."""
        if self.n <= 0:
            return []

        workers = max(1, min(self.nThreads, self.n))
        size, extra = divmod(self.n, workers)

        ranges = []
        first = 0
        for w in range(workers):
            # The first `extra` workers take one additional URL each; the
            # final range ends at n, never past the end of the list.
            last = first + size + (1 if w < extra else 0)
            ranges.append((first, last))
            first = last
        return ranges

    def __call__(self):
        """Run all downloads, then print the elapsed time. Returns None."""
        threads = [Thread(target=self.download_url, args=bounds)
                   for bounds in self._index_ranges()]

        # Start and join every thread that was created, so no range is skipped.
        t_begin = time.time()
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        t_end = time.time()

        print(f"\nTime handle download urls = {t_end - t_begin:.2f}s\n", )
        if self.errors:
            print(f"{len(self.errors)} download(s) failed; see .errors")


In [124]:
# For each saved URL list, download its images into images/<txt name without
# extension>/. The loop variable is named `category` rather than `dir` so the
# builtin dir() is not shadowed.
for category in ["animal", "plant", "furniture", "scenery"]:

    dir_path_images = "images"
    dir_path_urls = f"{category}/urls"
    os.makedirs(dir_path_images, exist_ok=True)

    txts = [name for name in os.listdir(dir_path_urls) if name.endswith(".txt")]

    # Trial run: only the first two URL lists returned by os.listdir.
    for txt in txts[:2]:
        folder_txt = f"{dir_path_urls}/{txt}"
        with open(folder_txt, "r") as f:
            # One URL per line; drop the trailing newlines and any blank lines.
            content_txt = [line.strip() for line in f if line.strip()]

        if not content_txt:
            # Nothing to download for this keyword.
            continue

        # splitext removes the ".txt" suffix without relying on its length.
        folder_image = f"{dir_path_images}/{os.path.splitext(txt)[0]}"
        os.makedirs(folder_image, exist_ok=True)

        # Use a separate name so the `n_threads` setting of the scraping step
        # is not overwritten; always request at least one thread.
        n_download_threads = max(1, min(10, len(content_txt)//2))
        DownloadImagesFromUrls(n_download_threads, content_txt, folder_image)()

    # Trial run: stop after the first category.
    break

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
Time handle download urls = 0.73s

. . . . . . ..  . . . . . . . . . . . . . . . . . . . ..  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ..  . . . . . . . . . 
Time handle download urls = 0.64s

