In [1]:
import os
from googleapiclient.discovery import build
from dotenv import load_dotenv
import pickle
import random
import collections
from tqdm import tqdm

random.seed(333)

In [2]:
# Credentials are read from the process environment; load_dotenv() is
# expected to populate it from a local .env file first.
load_dotenv()

env = os.environ
api_key = env['GOOGLE_API']   # Custom Search API developer key
cse_id = env['GOOGLE_CSE']    # Custom Search Engine identifier (passed as cx)

In [3]:
# run a single custom-search query
def google_search(search_term, api_key, cse_id, **kwargs):

    """
    Run one query against the Google Custom Search API and return its hits.

    Parameters
    ----------
    search_term : str
        Text sent as the ``q`` parameter.
    api_key : str
        Developer key handed to ``build``.
    cse_id : str
        Custom search engine id, sent as the ``cx`` parameter.
    **kwargs
        Forwarded unchanged to ``service.cse().list(...)``.

    Returns
    -------
    list
        The ``items`` entry of the response, or an empty list when the
        response carries no ``items`` key (the API omits it for queries
        with zero hits).
    """

    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # A zero-hit response has no 'items' key; return [] instead of raising
    # KeyError so callers can iterate over the result unconditionally.
    return res.get('items', [])

In [4]:
# search_results = google_search(palm_species[0], api_key=api_key, cse_id=cse_id)

In [4]:
# Suffixes appended to each species name to build the search queries.
# The empty string produces a query on the species name alone.
queries = ["", "description", "diagnosis", "attributes"]

## PALMS

In [5]:
# Input: the palm species names to search for.
# NOTE: pickle can execute arbitrary code on load; only use files you created.
folder = "../../../data/OpenAI/Species/"

with open(F"{folder}palm_species.pkl", 'rb') as f:
    palm_species = pickle.load(f)
# palm_species_random = random.sample(palm_species, 50)


# Resume from the results of a previous run when they exist.
folder = "../../../data/OpenAI/Urls/"
try:
    with open(F"{folder}google_urls_palms.pkl", 'rb') as f:
        palm_species_google = pickle.load(f)
    with open(F"{folder}urls_palms.pkl", 'rb') as f:
        palm_species_urls = pickle.load(f)
    palms_done = list(palm_species_urls.keys())
except FileNotFoundError:
    # Only a missing cache means "start from scratch". Any other failure
    # (e.g. a corrupt pickle) is raised, so the cache is not silently
    # replaced by empty dicts and then overwritten by a later save.
    palm_species_urls = collections.defaultdict(list)
    palm_species_google = collections.defaultdict(list)
    palms_done = []

In [6]:
# Species that already have saved URLs; the fetch loop below skips these.
palms_done

['Actinokentia divaricata',
 'Aiphanes minima',
 'Archontophoenix alexandrae',
 'Archontophoenix maxima',
 'Archontophoenix myolensis',
 'Archontophoenix purpurea',
 'Archontophoenix tuckeri',
 'Areca catechu',
 'Areca laosensis',
 'Arenga australasica',
 'Astrocaryum acaule',
 'Astrocaryum aculeatum',
 'Astrocaryum chambira',
 'Astrocaryum huaimi',
 'Astrocaryum jauari',
 'Astrocaryum javarense',
 'Astrocaryum murumuru',
 'Astrocaryum paramaca',
 'Astrocaryum sciophilum',
 'Astrocaryum sociale',
 'Astrocaryum standleyanum',
 'Astrocaryum vulgare',
 'Attalea butyracea',
 'Attalea cohune',
 'Attalea dahlgreniana',
 'Attalea geraensis',
 'Attalea maripa',
 'Attalea oleifera',
 'Attalea phalerata',
 'Attalea septuagenata',
 'Attalea speciosa',
 'Attalea tessmannii',
 'Attalea vitrivir',
 'Bactris dianeura',
 'Bactris gastoniana',
 'Bactris gracilior',
 'Bactris grayumii',
 'Bactris hirta',
 'Bactris horridispatha',
 'Bactris kunorum',
 'Bactris longiseta',
 'Bactris macroacantha',
 'Bactr

In [7]:
# Fetch search results for every palm species that has not been fetched yet.
#
# - Results are saved in a `finally` block, so progress survives an error
#   raised mid-loop (e.g. an API quota error); the error still propagates.
# - A species is written to the dictionaries only after ALL of its queries
#   succeeded, so an interrupted species is retried on the next run instead
#   of being treated as done with partial results.
# - Results of all queries are kept for the raw Google results too (they
#   were previously overwritten by each successive query).

# Set for O(1) membership tests; also covers species completed earlier in
# this kernel session, after `palms_done` was computed.
already_fetched = set(palms_done) | set(palm_species_urls)

folder = "../../../data/OpenAI/Urls/"

try:
    for palm in tqdm(palm_species):

        if palm in already_fetched:
            print(palm, "done")
            continue

        # Buffer this species' results until every query has succeeded.
        species_results = []
        for query in queries:
            search_query = F"{palm} {query}"

            # Search results (10 per search)
            species_results.extend(
                google_search(search_query, api_key=api_key, cse_id=cse_id)
            )

        # Google Urls (full result items of all queries)
        palm_species_google[palm] = species_results
        # Just the links
        palm_species_urls[palm] = [result['link'] for result in species_results]
finally:
    with open(F"{folder}urls_palms.pkl", 'wb') as f:
        pickle.dump(palm_species_urls, f)

    with open(F"{folder}google_urls_palms.pkl", 'wb') as f:
        pickle.dump(palm_species_google, f)


  0%|          | 0/333 [00:00<?, ?it/s]

Actinokentia divaricata done
Aiphanes minima done
Archontophoenix alexandrae done
Archontophoenix maxima done
Archontophoenix myolensis done
Archontophoenix purpurea done
Archontophoenix tuckeri done
Areca catechu done
Areca laosensis done
Arenga australasica done
Astrocaryum acaule done
Astrocaryum aculeatum done
Astrocaryum chambira done
Astrocaryum huaimi done
Astrocaryum jauari done
Astrocaryum javarense done
Astrocaryum murumuru done
Astrocaryum paramaca done
Astrocaryum sciophilum done
Astrocaryum sociale done
Astrocaryum standleyanum done
Astrocaryum vulgare done
Attalea butyracea done
Attalea cohune done
Attalea dahlgreniana done
Attalea geraensis done
Attalea maripa done
Attalea oleifera done
Attalea phalerata done
Attalea septuagenata done
Attalea speciosa done
Attalea tessmannii done
Attalea vitrivir done
Bactris dianeura done
Bactris gastoniana done
Bactris gracilior done
Bactris grayumii done
Bactris hirta done
Bactris horridispatha done
Bactris kunorum done
Bactris longis

 80%|████████  | 268/333 [00:38<00:09,  6.96it/s] 


HttpError: <HttpError 429 when requesting https://customsearch.googleapis.com/customsearch/v1?q=Ravenea+hildebrandtii+diagnosis&cx=13a87041f5a6df559&key=[REDACTED]&alt=json returned "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:830720044910'.". Details: "[{'message': "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:830720044910'.", 'domain': 'global', 'reason': 'rateLimitExceeded'}]">

In [9]:
# Write both palm result dictionaries to disk (links first, then the raw
# Google result items).
folder = "../../../data/OpenAI/Urls/"

urls_path = F"{folder}urls_palms.pkl"
google_path = F"{folder}google_urls_palms.pkl"

with open(urls_path, 'wb') as f:
    pickle.dump(palm_species_urls, f)

with open(google_path, 'wb') as f:
    pickle.dump(palm_species_google, f)


## CARIBBEAN

In [None]:
# Input: the Caribbean species names to search for.
# NOTE: pickle can execute arbitrary code on load; only use files you created.
folder = "../../../data/OpenAI/Species/"

with open(F"{folder}caribbean_species.pkl", 'rb') as f:
    caribbean_species = pickle.load(f)

# Resume from the results of a previous run when they exist.
folder = "../../../data/OpenAI/Urls/"
try:
    with open(F"{folder}google_urls_caribbean.pkl", 'rb') as f:
        caribbean_species_google = pickle.load(f)
    with open(F"{folder}urls_caribbean.pkl", 'rb') as f:
        caribbean_species_urls = pickle.load(f)
    caribbean_done = list(caribbean_species_urls.keys())
except FileNotFoundError:
    # Only a missing cache means "start from scratch". Any other failure
    # (e.g. a corrupt pickle) is raised, so the cache is not silently
    # replaced by empty dicts and then overwritten by a later save.
    caribbean_species_urls = collections.defaultdict(list)
    caribbean_species_google = collections.defaultdict(list)
    caribbean_done = []

In [None]:
# Species that already have saved URLs; the fetch loop below skips these.
caribbean_done

In [None]:
# Fetch search results for every Caribbean species not fetched yet.
#
# - Results are saved in a `finally` block, so progress survives an error
#   raised mid-loop (e.g. an API quota error); the error still propagates.
# - A species is written to the dictionaries only after ALL of its queries
#   succeeded, so an interrupted species is retried on the next run instead
#   of being treated as done with partial results.
# - Results of all queries are kept for the raw Google results too (they
#   were previously overwritten by each successive query).

# Set for O(1) membership tests; also covers species completed earlier in
# this kernel session, after `caribbean_done` was computed.
already_fetched = set(caribbean_done) | set(caribbean_species_urls)

folder = "../../../data/OpenAI/Urls/"

try:
    for caribbean in tqdm(caribbean_species):

        if caribbean in already_fetched:
            print(caribbean, "done")
            continue

        # Buffer this species' results until every query has succeeded.
        species_results = []
        for query in queries:
            search_query = F"{caribbean} {query}"

            # Search results (10 per search)
            species_results.extend(
                google_search(search_query, api_key=api_key, cse_id=cse_id)
            )

        # Google Urls (full result items of all queries)
        caribbean_species_google[caribbean] = species_results
        # Just the links
        caribbean_species_urls[caribbean] = [result['link'] for result in species_results]
finally:
    with open(F"{folder}urls_caribbean.pkl", 'wb') as f:
        pickle.dump(caribbean_species_urls, f)

    with open(F"{folder}google_urls_caribbean.pkl", 'wb') as f:
        pickle.dump(caribbean_species_google, f)

In [None]:
# Write both Caribbean result dictionaries to disk (links first, then the
# raw Google result items).
folder = "../../../data/OpenAI/Urls/"

urls_path = F"{folder}urls_caribbean.pkl"
google_path = F"{folder}google_urls_caribbean.pkl"

with open(urls_path, 'wb') as f:
    pickle.dump(caribbean_species_urls, f)

with open(google_path, 'wb') as f:
    pickle.dump(caribbean_species_google, f)

## PlantNet

In [None]:
# Input: the PlantNet species names to search for.
# NOTE: pickle can execute arbitrary code on load; only use files you created.
folder = "../../../data/OpenAI/Species/"

with open(F"{folder}plantnet_species.pkl", 'rb') as f:
    plantnet_species = pickle.load(f)

# Resume from the results of a previous run when they exist.
folder = "../../../data/OpenAI/Urls/"
try:
    with open(F"{folder}google_urls_plantnet.pkl", 'rb') as f:
        plantnet_species_google = pickle.load(f)
    with open(F"{folder}urls_plantnet.pkl", 'rb') as f:
        plantnet_species_urls = pickle.load(f)
    plantnet_done = list(plantnet_species_urls.keys())
except FileNotFoundError:
    # Only a missing cache means "start from scratch". Any other failure
    # (e.g. a corrupt pickle) is raised, so the cache is not silently
    # replaced by empty dicts and then overwritten by a later save.
    plantnet_species_urls = collections.defaultdict(list)
    plantnet_species_google = collections.defaultdict(list)
    plantnet_done = []

In [None]:
# Species that already have saved URLs; the fetch loop below skips these.
plantnet_done

In [None]:
# Fetch search results for every PlantNet species not fetched yet.
#
# - Results are saved in a `finally` block, so progress survives an error
#   raised mid-loop (e.g. an API quota error); the error still propagates.
# - A species is written to the dictionaries only after ALL of its queries
#   succeeded, so an interrupted species is retried on the next run instead
#   of being treated as done with partial results.
# - Results of all queries are kept for the raw Google results too (they
#   were previously overwritten by each successive query).

# Set for O(1) membership tests; also covers species completed earlier in
# this kernel session, after `plantnet_done` was computed.
already_fetched = set(plantnet_done) | set(plantnet_species_urls)

folder = "../../../data/OpenAI/Urls/"

try:
    for plantnet in tqdm(plantnet_species):

        if plantnet in already_fetched:
            print(plantnet, "done")
            continue

        # Buffer this species' results until every query has succeeded.
        species_results = []
        for query in queries:
            search_query = F"{plantnet} {query}"

            # Search results (10 per search)
            species_results.extend(
                google_search(search_query, api_key=api_key, cse_id=cse_id)
            )

        # Google Urls (full result items of all queries)
        plantnet_species_google[plantnet] = species_results
        # Just the links
        plantnet_species_urls[plantnet] = [result['link'] for result in species_results]
finally:
    with open(F"{folder}urls_plantnet.pkl", 'wb') as f:
        pickle.dump(plantnet_species_urls, f)

    with open(F"{folder}google_urls_plantnet.pkl", 'wb') as f:
        pickle.dump(plantnet_species_google, f)

In [None]:
# Write both PlantNet result dictionaries to disk (links first, then the
# raw Google result items).
folder = "../../../data/OpenAI/Urls/"

urls_path = F"{folder}urls_plantnet.pkl"
google_path = F"{folder}google_urls_plantnet.pkl"

with open(urls_path, 'wb') as f:
    pickle.dump(plantnet_species_urls, f)

with open(google_path, 'wb') as f:
    pickle.dump(plantnet_species_google, f)