In [122]:
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple

In [123]:
def get_all_search_results(search_term: str) -> List[Tuple[str, str]]:
    """Run a name search on the Swift portal and return the targets it lists.

    Parameters
    ----------
    search_term : str
        Text sent as the ``name`` field of the portal's name-search form.

    Returns
    -------
    List[Tuple[str, str]]
        One ``(target id, target name)`` pair per result row, in page order.
        An empty list is returned when the page has no results table.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or the server answers with an HTTP
        error status.
    """
    search_url = 'https://www.swift.ac.uk/dead_portal/getobject.php'

    # Let requests build the query string so that characters such as '/', '&'
    # or spaces inside the search term are escaped instead of corrupting the URL.
    query = {'name': search_term, 'submit': 'Search Names'}

    # A timeout keeps a stalled server from hanging the notebook forever, and
    # raise_for_status() stops us from parsing an HTTP error page as results.
    response = requests.get(search_url, params=query, timeout=30)
    response.raise_for_status()
    search_soup = BeautifulSoup(response.text, features="lxml")

    # get the main results table; without one there is nothing to report
    results_table = search_soup.find("table", {"class": "chTable"})
    if results_table is None:
        return []

    # ignore the first row with the names of the columns, and the last row with links for all of the data
    table_rows = results_table.find_all("tr")[1:-1]

    targets = []
    for row in table_rows:
        # .contents is a list; each of these cells is expected to hold a single
        # element, so take contents[0]. str() turns it into a plain string that
        # no longer references the parsed document.
        tid = str(row.find("td", {"headers": "row_targ"}).contents[0])
        tname = str(row.find("td", {"headers": "row_name"}).contents[0])
        targets.append((tid, tname))
    return targets

In [124]:
def combine_results(list_1: list, list_2: list, list_3: list) -> List[Tuple[str, str]]:
    """Merge three result lists into one list without duplicate entries.

    Parameters
    ----------
    list_1, list_2, list_3 : list
        Lists of hashable entries (here ``(target id, target name)`` tuples).

    Returns
    -------
    List[Tuple[str, str]]
        Every distinct entry exactly once, ordered by first appearance when
        reading ``list_1``, then ``list_2``, then ``list_3``. The inputs are
        not modified.
    """
    # dict keys are unique and keep insertion order, so this removes duplicates
    # while giving the same ordering on every run (a set would not).
    return list(dict.fromkeys([*list_1, *list_2, *list_3]))

In [125]:
def create_file(finalized_list: list, output_path: str = 'swift_comet_database.txt') -> None:
    """Write the observation listing as a two-column, fixed-width text file.

    Parameters
    ----------
    finalized_list : list
        Entries indexable as ``entry[0]`` (observation id) and ``entry[1]``
        (observation name).
    output_path : str, optional
        File to create or overwrite. Defaults to ``swift_comet_database.txt``
        in the current working directory.

    Returns
    -------
    None
    """
    name_heading = 'Name of observation'
    id_heading = 'Observation ID'
    name_column_width = 85  # the id column starts right after this many characters

    heading_line = name_heading.ljust(name_column_width) + id_heading + '\n'
    # underline each heading with dashes of the heading's own length
    underline = ('-' * len(name_heading)).ljust(name_column_width) + '-' * len(id_heading) + '\n'

    # the with-block closes the file; an explicit encoding keeps the output
    # independent of the platform's default
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(heading_line)
        f.write(underline)
        for entry in finalized_list:
            f.write(f'{entry[1]}'.ljust(name_column_width) + f'{entry[0]}\n')

In [131]:
def update_list_elements(input_list: list):
    """Expand each (tid, tname) target into one (obsid, tname) entry per observation.

    For every target in ``input_list`` the observation ids are looked up with
    ``get_obs_id`` and each one is paired with that target's name. Targets are
    processed in input order and their observation ids keep the order in which
    ``get_obs_id`` returned them.
    """
    # input_list[tuple(tid, tname)] -> list[tuple(obsid, tname)]
    return [
        (obsid, target[1])
        for target in input_list
        for obsid in get_obs_id(target[0])
    ]

In [132]:
def get_obs_id(tid: str):
    """Return the observation ids the Swift archive lists for one target id.

    For any given target id there may be multiple observations in their own
    directories, with the naming scheme {target id}001/, {target id}002/, etc.
    Rather than guessing how many exist, we ask the server for its download
    script and read the observation ids out of the wget commands it contains.

    Parameters
    ----------
    tid : str
        Target id, sent as the ``tid`` query parameter.

    Returns
    -------
    list
        One string per line of the script that contains ``wget``, in script
        order; empty if there are no such lines.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or the server answers with an HTTP
        error status.
    """
    download_script_url = 'https://www.swift.ac.uk/archive/download.sh'

    # Let requests encode the query string so an unusual tid cannot corrupt the URL.
    query = {'reproc': 1, 'tid': tid, 'source': 'obs', 'subdir': 'auxil'}

    # A timeout keeps a stalled server from hanging the notebook forever, and
    # raise_for_status() stops us from treating an HTTP error page as a script.
    wget_response = requests.get(download_script_url, params=query, timeout=30)
    wget_response.raise_for_status()

    # the last whitespace-separated token of each wget command is taken as its URL
    urls = [line.split()[-1] for line in wget_response.text.splitlines() if 'wget' in line]

    # NOTE(review): the observation id is cut out of each URL by fixed offsets.
    # Presumably the first 38 characters are the scheme/host/archive path and the
    # last 7 are '/auxil/' (matching subdir=auxil) -- confirm against the script's
    # real output, since a change of scheme or path would shift these offsets.
    url_prefix_length = 38
    url_suffix_length = 7
    return [url[url_prefix_length:-url_suffix_length] for url in urls]

In [133]:
# the three name patterns that together cover the comet targets we want
search_term_1, search_term_2, search_term_3 = 'C/', 'Comet', 'P/'

# run one portal search per term, in order
all_search_1, all_search_2, all_search_3 = (
    get_all_search_results(term)
    for term in (search_term_1, search_term_2, search_term_3)
)

# merge the searches, expand every target into its observations, write the listing
combined_list = combine_results(all_search_1, all_search_2, all_search_3)
updated_list = update_list_elements(combined_list)
create_file(updated_list)