In [1]:
import os
from googleapiclient.discovery import build
from dotenv import load_dotenv
import pickle
import random
import collections
from tqdm import tqdm

random.seed(333)

In [2]:
# Load environment variables via python-dotenv before reading the credentials
load_dotenv()

# Google API credentials; indexing os.environ raises KeyError if a variable is missing
_env = os.environ
api_key = _env['GOOGLE_API']
cse_id = _env['GOOGLE_CSE']

In [3]:
def google_search(search_term, api_key, cse_id, **kwargs):
    """
    Run one query against a Google Custom Search engine and return its result items.

    Parameters
    ----------
    search_term : str
        Query text, sent as the ``q`` parameter.
    api_key : str
        Developer key handed to the API client.
    cse_id : str
        Custom Search engine id, sent as the ``cx`` parameter.
    **kwargs
        Forwarded unchanged to ``cse().list(...)``.

    Returns
    -------
    list
        The ``items`` entry of the response, or an empty list when the
        response carries no ``items`` key.
    """

    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # A query without hits comes back without an 'items' key; treat that as
    # "no results" instead of letting a KeyError abort the caller's loop
    return res.get('items', [])

In [4]:
# search_results = google_search(palm_species[0], api_key=api_key, cse_id=cse_id)

In [4]:
# Suffixes appended to each species name when searching; the empty string
# yields a search on the bare species name.
queries = ["", "description", "diagnosis", "attributes"]

## PALMS

In [5]:
folder = "Species/"

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(F"{folder}palm_species.pkl", 'rb') as f:
    palm_species = pickle.load(f)
# palm_species_random = random.sample(palm_species, 50)


# Resume from the results of a previous run when both cache files exist.
folder = "Urls/"
try:
    with open(F"{folder}google_urls_palms.pkl", 'rb') as f:
        palm_species_google = pickle.load(f)
    with open(F"{folder}urls_palms.pkl", 'rb') as f:
        palm_species_urls = pickle.load(f)
    palms_done = list(palm_species_urls.keys())
except FileNotFoundError:
    # No cache yet: start empty. Only the missing-file case is handled here so
    # that an unreadable cache (or an interrupt) surfaces instead of being
    # silently replaced by empty dictionaries and overwritten on the next save.
    palm_species_urls = collections.defaultdict(list)
    palm_species_google = collections.defaultdict(list)
    palms_done = []

In [6]:
# Keys of the cached palm URL dictionary (an empty list when no cache was loaded)
palms_done

['Actinokentia divaricata',
 'Aiphanes minima',
 'Archontophoenix alexandrae',
 'Archontophoenix maxima',
 'Archontophoenix myolensis',
 'Archontophoenix purpurea',
 'Archontophoenix tuckeri',
 'Areca catechu',
 'Areca laosensis',
 'Arenga australasica',
 'Astrocaryum acaule',
 'Astrocaryum aculeatum',
 'Astrocaryum chambira',
 'Astrocaryum huaimi',
 'Astrocaryum jauari',
 'Astrocaryum javarense',
 'Astrocaryum murumuru',
 'Astrocaryum paramaca',
 'Astrocaryum sciophilum',
 'Astrocaryum sociale',
 'Astrocaryum standleyanum',
 'Astrocaryum vulgare',
 'Attalea butyracea',
 'Attalea cohune',
 'Attalea dahlgreniana',
 'Attalea geraensis',
 'Attalea maripa',
 'Attalea oleifera',
 'Attalea phalerata',
 'Attalea septuagenata',
 'Attalea speciosa',
 'Attalea tessmannii',
 'Attalea vitrivir',
 'Bactris dianeura',
 'Bactris gastoniana',
 'Bactris gracilior',
 'Bactris grayumii',
 'Bactris hirta',
 'Bactris horridispatha',
 'Bactris kunorum',
 'Bactris longiseta',
 'Bactris macroacantha',
 'Bactr

In [None]:
# Query Google for every palm species that has no stored results yet, then save.
folder = "Urls/"
# Create the output folder up front so a missing directory fails before any
# API quota is spent
os.makedirs(folder, exist_ok=True)

# Set lookup instead of scanning the list for every species; species already
# stored during this session count as done too, so re-running the cell does
# not query them again
already_done = set(palms_done) | set(palm_species_urls)

try:
    for palm in tqdm(palm_species):

        if palm in already_done:
            print(palm, "done")
            continue

        # Buffer per species and commit only once every query succeeded: a
        # species interrupted half-way must not look "done" on the next run
        palm_results = []
        for query in queries:
            search_query = F"{palm} {query}"
            # Search results (10 per search)
            palm_results.extend(
                google_search(search_query, api_key=api_key, cse_id=cse_id)
            )

        # Just the links
        palm_links = [result['link'] for result in palm_results]

        # Google results of ALL queries (a plain assignment inside the query
        # loop kept only the results of the last query)
        palm_species_google[palm] = palm_results
        palm_species_urls[palm] = palm_links
        already_done.add(palm)
finally:
    # Save even when the loop aborts (API error, interrupt) so the species
    # fetched so far are not lost
    with open(F"{folder}urls_palms.pkl", 'wb') as f:
        pickle.dump(palm_species_urls, f)

    with open(F"{folder}google_urls_palms.pkl", 'wb') as f:
        pickle.dump(palm_species_google, f)


In [9]:
# Write both palm result dictionaries into the Urls/ folder.
folder = "Urls/"

for pkl_name, payload in (
    ("urls_palms.pkl", palm_species_urls),
    ("google_urls_palms.pkl", palm_species_google),
):
    with open(F"{folder}{pkl_name}", 'wb') as f:
        pickle.dump(payload, f)


## CARIBBEAN

In [None]:
folder = "Species/"

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(F"{folder}caribbean_species.pkl", 'rb') as f:
    caribbean_species = pickle.load(f)

# Resume from the results of a previous run when both cache files exist.
folder = "Urls/"
try:
    with open(F"{folder}google_urls_caribbean.pkl", 'rb') as f:
        caribbean_species_google = pickle.load(f)
    with open(F"{folder}urls_caribbean.pkl", 'rb') as f:
        caribbean_species_urls = pickle.load(f)
    caribbean_done = list(caribbean_species_urls.keys())
except FileNotFoundError:
    # No cache yet: start empty. Only the missing-file case is handled here so
    # that an unreadable cache (or an interrupt) surfaces instead of being
    # silently replaced by empty dictionaries and overwritten on the next save.
    caribbean_species_urls = collections.defaultdict(list)
    caribbean_species_google = collections.defaultdict(list)
    caribbean_done = []

In [None]:
# Keys of the cached Caribbean URL dictionary (an empty list when no cache was loaded)
caribbean_done

In [None]:
# Query Google for every Caribbean species that has no stored results yet, then save.
folder = "Urls/"
# Create the output folder up front so a missing directory fails before any
# API quota is spent
os.makedirs(folder, exist_ok=True)

# Set lookup instead of scanning the list for every species; species already
# stored during this session count as done too, so re-running the cell does
# not query them again
already_done = set(caribbean_done) | set(caribbean_species_urls)

try:
    for caribbean in tqdm(caribbean_species):

        if caribbean in already_done:
            print(caribbean, "done")
            continue

        # Buffer per species and commit only once every query succeeded: a
        # species interrupted half-way must not look "done" on the next run
        caribbean_results = []
        for query in queries:
            search_query = F"{caribbean} {query}"
            # Search results (10 per search)
            caribbean_results.extend(
                google_search(search_query, api_key=api_key, cse_id=cse_id)
            )

        # Just the links
        caribbean_links = [result['link'] for result in caribbean_results]

        # Google results of ALL queries (a plain assignment inside the query
        # loop kept only the results of the last query)
        caribbean_species_google[caribbean] = caribbean_results
        caribbean_species_urls[caribbean] = caribbean_links
        already_done.add(caribbean)
finally:
    # Save even when the loop aborts (API error, interrupt) so the species
    # fetched so far are not lost
    with open(F"{folder}urls_caribbean.pkl", 'wb') as f:
        pickle.dump(caribbean_species_urls, f)

    with open(F"{folder}google_urls_caribbean.pkl", 'wb') as f:
        pickle.dump(caribbean_species_google, f)

In [None]:
# Write both Caribbean result dictionaries to the shared data folder.
# NOTE(review): this target differs from the "Urls/" folder the Caribbean
# results are loaded from earlier in the notebook - confirm that is intended.
folder = "../../../data/OpenAI/Urls/"

for pkl_name, payload in (
    ("urls_caribbean.pkl", caribbean_species_urls),
    ("google_urls_caribbean.pkl", caribbean_species_google),
):
    with open(F"{folder}{pkl_name}", 'wb') as f:
        pickle.dump(payload, f)

## West Africa

In [None]:
folder = "Species/"

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(F"{folder}west_species.pkl", 'rb') as f:
    west_species = pickle.load(f)

# Resume from the results of a previous run when both cache files exist.
folder = "Urls/"
try:
    with open(F"{folder}google_urls_west.pkl", 'rb') as f:
        west_species_google = pickle.load(f)
    with open(F"{folder}urls_west.pkl", 'rb') as f:
        west_species_urls = pickle.load(f)
    west_done = list(west_species_urls.keys())
except FileNotFoundError:
    # No cache yet: start empty. Only the missing-file case is handled here so
    # that an unreadable cache (or an interrupt) surfaces instead of being
    # silently replaced by empty dictionaries and overwritten on the next save.
    west_species_urls = collections.defaultdict(list)
    west_species_google = collections.defaultdict(list)
    west_done = []

In [None]:
# Keys of the cached West Africa URL dictionary (an empty list when no cache was loaded)
west_done

In [None]:
# Query Google for every West Africa species that has no stored results yet, then save.
folder = "Urls/"
# Create the output folder up front so a missing directory fails before any
# API quota is spent
os.makedirs(folder, exist_ok=True)

# Set lookup instead of scanning the list for every species; species already
# stored during this session count as done too, so re-running the cell does
# not query them again. (The original check read `westt_done`, an undefined
# name that raised NameError on the first iteration.)
already_done = set(west_done) | set(west_species_urls)

try:
    for west in tqdm(west_species):

        if west in already_done:
            print(west, "done")
            continue

        # Buffer per species and commit only once every query succeeded: a
        # species interrupted half-way must not look "done" on the next run
        west_results = []
        for query in queries:
            search_query = F"{west} {query}"
            # Search results (10 per search)
            west_results.extend(
                google_search(search_query, api_key=api_key, cse_id=cse_id)
            )

        # Just the links
        west_links = [result['link'] for result in west_results]

        # Google results of ALL queries (a plain assignment inside the query
        # loop kept only the results of the last query)
        west_species_google[west] = west_results
        west_species_urls[west] = west_links
        already_done.add(west)
finally:
    # Save even when the loop aborts (API error, interrupt) so the species
    # fetched so far are not lost
    with open(F"{folder}urls_west.pkl", 'wb') as f:
        pickle.dump(west_species_urls, f)

    with open(F"{folder}google_urls_west.pkl", 'wb') as f:
        pickle.dump(west_species_google, f)

In [None]:
# Write both West Africa result dictionaries into the Urls/ folder.
folder = "Urls/"

for pkl_name, payload in (
    ("urls_west.pkl", west_species_urls),
    ("google_urls_west.pkl", west_species_google),
):
    with open(F"{folder}{pkl_name}", 'wb') as f:
        pickle.dump(payload, f)