# Swift Archive Portal Search

The program allows the user to searches dead swift portal found [here](https://www.swift.ac.uk/dead_portal/index.php) and automatically downloads the data to the user's hard drive.  
  
To run the code blocks, go to ```Run``` -> ```Run All Cells```  
To hide code blocks, go to ```View``` -> ```Collapse All Code``` 

### Porgam Requirements
This program requires the installation the following packages: ipywidgets and wget. 
  
>To install ipywidgets run the following command in terminal: ```conda install -c conda-forge ipywidgets```  
>To install wget run the following command in terminal: ```brew install wget```  

***Note:*** *if you do not have Homebrew installed already, check this documentation found [here](https://brew.sh/) to install it first, then come back and install wget*

### User Interface
The user interface allows for a specified download type (```tar``` or ```uncompressed```) and multiple file types (```auxil``` and/or ```bat``` and/or```xrt``` and/or ```uvolt``` and/or ```log```).  
Also the user can specify the download location (it defaults to the current directory) and skip files that are already created by the program (in case of the program timing out when downloading).  
This user interface uses elements from ipywidgets and tkinter.

In [6]:
import pathlib
import requests
import subprocess
import os
import ipywidgets as widgets
from dataclasses import dataclass
from tkinter import filedialog
from bs4 import BeautifulSoup
from typing import List, Tuple

# data class to manage reading and passing the data around to methods
#
# -----------------------------------------------Parameters-----------------------------------------------
#
# search_term: string of striped result that user puts into the search box
# download_type: string result that user selects as their download type, download_type are ['uncompressed', 'tar']
# dtype_list: list of strings that represent all the data being downloaded, dtype_list are ['auxil', 'bat', 'xrt', 'uvot', 'log']
# overwrite: bool representing the to either overwrite a file if it alreadys exist
# dest_dir: path to store all downloaded files
# page_html: raw string literal of the html page
# search_soup: parse of the page_html to easier nagivate and search through it (using the BeautifulSoup class)
# search_type: string value representing if the server found 0 results, 1 result, or 2+ results as each has a different html
# tlist: list of type Tuple (for 2+ results) or type string (for 1 result) that contains all tname(s) and tid(s) to download
# tname(s): string(s) representing the target name found on the search page
# tid(s): string(s) representing the 8 digit long target identification number for each target found in tnames
#
# --------------------------------------------------------------------------------------------------------

@dataclass
class SearchInfo:
    search_term: str
    download_type: str
    dtype_list: list
    overwrite: bool
    dest_dir: str
    page_html: str
    search_soup: str
    search_type: str
    tlist: list
    
def get_swift_wget_commands(tid: str, dtype: str, overwrite: bool) -> List[str]:

    # for any given target id, there may be multiple observations in their own directories,
    # with the naming scheme {target id}001/, {target id}002/, etc.
    # so we let the server give us the appropriate wget commands because it knows how
    # many observations each target id has
    
    if overwrite is False:
        overwrite_option = '-nc'
    else:
        overwrite_option = ''
        
    # this page returns a script with wget commands to download our data
    base_wget_url = f'https://www.swift.ac.uk/archive/download.sh?reproc=1&tid={tid}&source=obs&subdir={dtype}'
    wget_response = requests.get(base_wget_url)
    wget_commands = [line for line in wget_response.text.splitlines() if 'wget' in line]
    urls = [command.split()[-1] for command in wget_commands]
    
    # -nc ==> no clobber: don't replace already downloaded files
    # -q ==> quiet mode, no output
    # -w 2 ==> wait 2 seconds between files
    # -nH ==> don't create a directory based on the host, in this case no folder named www.swift.ac.uk/
    # --cut-dirs=2 ==> remove the /archive/reproc/ folders on the server from being created locally
    # -r ==> recursive: grab everything under this folder on the server
    # --reject ... ==> specify files that we don't want from the server
    adjusted_wget_commands = ['wget ' + overwrite_option + ' -q -w 2 -nH --cut-dirs=2 -r --no-parent --reject index.html*,robots.txt* ' + url for url in urls]
    
    return adjusted_wget_commands

def swift_download_uncompressed(tid: str, dtype: str, dest_dir: pathlib.Path = None, overwrite: bool = False) -> None:
    
    # given a Swift target id and type of data, this function downloads the uncompressed
    # data to the directory dest_dir
    
    # get our download commands from the server
    wget_commands = get_swift_wget_commands(tid=tid, dtype=dtype, overwrite=overwrite)
    if wget_commands is None:
        print("No wget commands to execute, skipping downloads...")
        return
    
    # change folders if we need to
    old_cwd = os.getcwd()
    if dest_dir is not None:
        os.chdir(dest_dir)
    print(f"Downloading {dtype} data of target id {tid} to {os.getcwd()} ...")
    
    # run each command to grab the individual observations for this target id
    for command in wget_commands:
        presult = subprocess.run(command.split())
        if presult.returncode != 0:
            print(f"Non-zero return code {presult.returncode} for {command}!")
    
    # change folders back
    os.chdir(old_cwd)

def swift_download_compressed(tid: str, tname: str, dtype: str, archive_type: str, dest_dir: pathlib.Path, overwrite: bool = False) -> None:

    """
        Downloads an archive of Swift data from swift.ac.uk to dest_dir

        Parameters
        ----------
        tid : string
            The target ID to be downloaded, e.g. '00020405'
        tname: string
            The name of the target, e.g. 'CometC/2031US10(Catalina)'
        dtype: string
            The type of data being downloaded, e.g. 'uvot'
        archive_type: string
            One of 'zip' or 'tar' to download the corresponding type
        dest_dir: pathlib.Path
            Directory to place files
        overwrite: bool
            Whether or not to overwrite the file if it already exists
    """
    
    # change folders if we need to
    old_cwd = os.getcwd()
    if dest_dir is not None:
        os.chdir(dest_dir)
    
    # name the archive with the target id and data type, because the server returns 'download.tar' no matter what
    out_file_stem = pathlib.Path(tid + f"_{dtype}")
    
    # download
    if archive_type == 'zip':
        print(f"Downloading .zip archives is broken server-side so is currently unsupported.")
    if archive_type == 'tar':
        swift_download_compressed_tar(tid=tid, tname=tname, dtype=dtype, out_file_stem=out_file_stem, overwrite=overwrite)

    os.chdir(old_cwd)
    return

def swift_download_compressed_tar(tid: str, tname: str, dtype: str, out_file_stem: pathlib.Path, overwrite: bool) -> None:

    out_file = out_file_stem.with_suffix('.tar')
    if out_file.exists() and overwrite is False:
        print(f"Found {str(out_file)} and overwriting was forbidden, skipping download.")
        return
    
    # build our urls and params to send the server
    swift_referer_base_url = 'https://www.swift.ac.uk/archive/prepdata.php'
    swift_download_portal_base_url = 'https://www.swift.ac.uk/archive/download.tar'

    referer_url = f"{swift_referer_base_url}?tid={tid}&source=obs&name={tname}&referer=portal"
    params = {
        'reproc': '1',
        'tid': tid,
        'source': 'obs',
        'subdir': dtype,
    }

    # lie to the server
    request_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': referer_url,
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Sec-GPC': '1',
    }

    print(f"Attempting to download {tid} of {tname} to {out_file}, please wait ...")
    response = requests.get(swift_download_portal_base_url, params=params, headers=request_header)
    print(f"Requested data from {response.url}, response code {response.status_code} ...")

    # name the output file if it wasn't passed in an argument
    with open(out_file, 'wb') as f:
        f.write(response.content)
    
    print(f"Wrote {str(out_file)}.")

    return

def search_page() -> None:
    global SearchClass
    # construct the search url
    base_search_url = 'https://www.swift.ac.uk/dead_portal/getobject.php'
    search_url = base_search_url + '?name=' + SearchClass.search_term + '&submit=Search+Names'
    
    # download the search page and parse it
    SearchClass.page_html = requests.get(search_url)
    SearchClass.search_soup = BeautifulSoup(SearchClass.page_html.text, features="lxml")
    
def results_type() -> str:
    global SearchClass
    string = ''
    # reads the html to see if the page showed that there is no results by displaying the string 
    # find() will return -1 if that exact string is not found     
    if(SearchClass.search_soup.get_text().find('No entry found in the database with object name matching') != -1):
        print(f'\nUnable to find any search results for \'{SearchClass.search_term}\'. \nPlease try again.\n')
        print(string.ljust(100, '-'))
        return '0 results'
    elif(SearchClass.search_soup.get_text().find('Download archive data') != -1):
        return '1 result'
    else:
        return '2 or more results'
    
def get_single_tlist() -> [str, str]:
    global SearchClass
    string = ''
    
    # searches the saved soup html of the page for the tid and tname
    page_head = str(SearchClass.search_soup.find_all('h1')[0])
    tname = page_head[30:-5]
    page_label = str(SearchClass.search_soup.find_all('label')[1])
    tid = page_label[29:37]
    
    # displays the results to the user
    print(f'\nFound a single data file for the search term \'{SearchClass.search_term}\':\n')
    print(string.ljust(100, '-'))
    print()
    print(f'Name of observation: {tname}'.ljust(67) + 'Total number of observations: 1\n')
    print(string.ljust(100, '-'))
    print(f'Confirm or cancel the download of the single file above to the directory {SearchClass.dest_dir}\n')
    
    # returns the tname and tid as a list
    return [tname, tid]

def get_multi_tlists() -> List[Tuple[str, str]]:
    global SearchClass
    string = ''
    
    # get the main results table
    results_table = SearchClass.search_soup.find("table", {"class": "chTable"})
    
    # ignore the first row with the names of the columns, and the last row with links for all of the data
    table_rows = results_table.find_all("tr")[1:-1]
    
    # .contents is a list, our table has only one element in it, so take contents[0]
    tids = [row.find("td", {"headers": "row_targ"}).contents[0] for row in table_rows]
    tnames = [row.find("td", {"headers": "row_name"}).contents[0] for row in table_rows]
    tobservations = [row.find("td", {"headers": "row_num"}).contents[0] for row in table_rows]
    
    # create a dict to show the user the total ammount of data for each observation in tname
    # iterates through tnames and tobservations to count the total times for each tname, storing these values in print_table
    print_table = {}
    i = 0
    while i < len(table_rows):
        print_table[f'{tnames[i]}'] = print_table.get(f'{tnames[i]}', 0) + int(tobservations[i])
        i += 1
    print(f'\nFound the following data for the search term \'{SearchClass.search_term}\':\n')
    print(string.ljust(100, '-'))
    print()
    
    # prints the table of files found for the user to see what their search results are
    # cast elements in the print_table to be able to index the tname and toal tobservations separately
    j = 0
    while j < len(print_table):
        print(f'Name of observation: {tuple(print_table.items())[j][0]}'.ljust(67) + f'Total number of observations: {tuple(print_table.items())[j][1]}\n')
        j += 1
    print(string.ljust(100, '-'))
    print(f'Confirm or cancel the download of all files above to the directory {SearchClass.dest_dir}\n')
    
    # zips and returns the tids and tnames as a list of type Tuple
    all_targets_zip = zip(tids, tnames)
    return list(all_targets_zip)

def download_single_file(tlist: str, dtype_list: str, dest_dir: pathlib.Path, download_type: str, overwrite=False) -> None:
    print()
    string = ''
    # downloads the file for a single result when searching
    for dtype in dtype_list:
        if download_type == 'uncompressed':
            swift_download_uncompressed(tid=tlist[1], dtype=dtype, dest_dir=dest_dir, overwrite=overwrite)
        if download_type in ['tar', 'zip']:
            swift_download_compressed(tid=tlist[1], tname=tlist[0], dtype=dtype, archive_type=download_type, dest_dir=dest_dir, overwrite=overwrite)
    print(f'Download Completed!\n')
    print(string.ljust(100, '-'))
    
def download_multi_files(tlist: str, dtype_list: str, dest_dir: pathlib.Path, download_type: str, overwrite=False) -> None:
    print()
    string = ''
    # downloads the files for 2+ results when searching
    # iterates over each requested data type and observation collected from get_multi_tlists()
    for dtype in dtype_list:
        for tid, tname in tlist:
            if download_type == 'uncompressed':
                swift_download_uncompressed(tid=tid, dtype=dtype, dest_dir=dest_dir, overwrite=overwrite)
            if download_type in ['tar', 'zip']:
                swift_download_compressed(tid=tid, tname=tname, dtype=dtype, archive_type=download_type, dest_dir=dest_dir, overwrite=overwrite)
    print(f'Download Completed!\n')
    print(string.ljust(100, '-'))
    
def confirm_button_click(button) -> None: 
    global SearchClass
    cancel_button.disabled = True
    confirm_button.disabled = True
    with output:
        string = ''
        # downloads files depending on type
        if (SearchClass.search_type == '1 result'):
            download_single_file(SearchClass.tlist, SearchClass.dtype_list, SearchClass.dest_dir, 
                                 SearchClass.download_type, SearchClass.overwrite)
        else:
            download_multi_files(SearchClass.tlist, SearchClass.dtype_list, SearchClass.dest_dir, 
                                 SearchClass.download_type, SearchClass.overwrite)
    enable_inputs()
    
def cancel_button_click(button) -> None:
    # cancels program run without downloading files
    cancel_button.disabled = True
    confirm_button.disabled = True
    with output:
        string = ''
        print(f'\nNo files downloaded\n')
        print(string.ljust(100, '-'))
        enable_inputs()
        
def get_page_information() -> None:
    global SearchClass
    output = widgets.Output()
    cancel_button.disabled = False
    confirm_button.disabled = False
    string = ''
    print(string.ljust(100, '-'))
    
    # gets all the results from the UI
    SearchClass.download_type = download_box.value
    SearchClass.dtype_list = list(dtype_box.value)
    
    # flips bool value for overwrite to fit common wget formatting
    if (overwrite_box.value == True):
        SearchClass.overwrite = False
    else:
        SearchClass.overwrite = True
    SearchClass.dest_dir = path_box.value
    
    # test to see if the search term is not empty
    if(search_box.value.strip() == ''):
        print(f'\nNo search term entered.\nPlease try again.\n')
        print(string.ljust(100, '-'))
        enable_inputs()
        return
        
    # gets page download and runs the downloads based on how many results come back (1 or 2+)
    else:
        # only gets the new page if the previous non empty search term stored in the data class is different 
        # than the current term in the box
        if (search_box.value.strip() != SearchClass.search_term):
            SearchClass.search_term = search_box.value.strip()
            search_page()
            
        # reads the page to see how many results are present
        SearchClass.search_type = results_type()
        
        # gets tlist from the search_soup and prints the results to the counsel 
        if (SearchClass.search_type == '1 result'):
            SearchClass.tlist = get_single_tlist()
        elif (SearchClass.search_type == '2 or more results'):
            SearchClass.tlist = get_multi_tlists()
        else:
            enable_inputs()
            return
        
        # creates UI elements to confirm/cancel download
        display(confirm_button, output)
        display(cancel_button, output)
        print(string.ljust(100, '-'))
        cancel_button.on_click(cancel_button_click)
        confirm_button.on_click(confirm_button_click)
        
def search_button_click(button) -> None:
    # disables all UI input elements while running to ensure multiple searches do not happen at the same time
    search_box.disabled = True
    dtype_box.disabled = True
    download_box.disabled = True
    path_button.disabled = True
    overwrite_box.disabled = True
    search_button.disabled = True
    with output:
        get_page_information()

def path_button_click(button) -> None:
    # gets the new user path if they select to change from current dir
    path_button.disabled = True
    with output:
        user_path = filedialog.askdirectory()
        output.clear_output()
        # test to see if they actually submitted a valid path
        if user_path == '' or user_path == '/':
            path_button.disabled = False
            return
        # updates path value if it is valid
        path_box.value = user_path
    path_button.disabled = False

def enable_inputs() -> None:
    # re enables all UI input elements to allow for user to attempt another download and clears the output
    search_box.disabled = False
    dtype_box.disabled = False
    download_box.disabled = False
    path_button.disabled = False
    overwrite_box.disabled = False
    search_button.disabled = False
    output.clear_output(wait=True)

# creates an empty new SearchInfo data class
SearchClass = SearchInfo(search_term='', download_type='', dtype_list = [], overwrite=False, 
                         dest_dir='', page_html='', search_soup='', search_type='', tlist = [])

# creates all UI elements using ipywidgets
output = widgets.Output()
search_box = widgets.Text(
    value='',
    placeholder='',
    description='Search Term:',
    disabled=False
)
download_box = widgets.ToggleButtons(
    # zip file download type is not working on the portal
    # on the small chance it does start working again, uncommit the below two lines and commit following two lines
    
    #options=['tar', 'uncompressed', 'zip'],
    #tooltips=['uncompressed', 'tar', 'zip'],
    options=['tar', 'uncompressed'],
    tooltips=['uncompressed', 'tar'],
    description='Download Type:',
    disabled=False
)
dtype_box = widgets.SelectMultiple(
    options=['auxil', 'bat', 'xrt', 'uvot', 'log'],
    value=['uvot'],
    description='File Type(s)',
    disabled=False
)
path_box = widgets.Text(
    value=f'{os.getcwd()}',
    placeholder='',
    style = {'description_width': 'initial'},
    description='Download Location:',
    disabled=True,
    layout={'width': '500px'}
)
path_button = widgets.Button(
    description="Change Download Location",
    disabled=False,
    button_style='',
    tooltip="Change the download path",
    layout={'width': '200px'}
)
overwrite_box = widgets.Checkbox(
    value=True,
    description='Skip already downloaded files',
    disabled=False,
    indent=False
)
search_button = widgets.Button(
    description="Search Archives",
    disabled=False,
    button_style='',
    tooltip="Search the Swift archives"
)
confirm_button = widgets.Button(
    description="Confirm",
    disabled=False,
    button_style='',
    tooltip="Confirm download"
)
cancel_button = widgets.Button(
    description="Cancel",
    disabled=False,
    button_style='',
    tooltip="Cancel download"
)

# displays the UI input elements
display(search_box)
display(download_box)
display(dtype_box)
print()
display(path_box)
display(path_button)
print()
display(overwrite_box)
print()
display(search_button, output)
path_button.on_click(path_button_click)
search_button.on_click(search_button_click)

Text(value='', description='Search Term:', placeholder='')

ToggleButtons(description='Download Type:', options=('tar', 'uncompressed'), tooltips=('uncompressed', 'tar'),…

SelectMultiple(description='File Type(s)', index=(3,), options=('auxil', 'bat', 'xrt', 'uvot', 'log'), value=(…




Text(value='/Users/jduffy0121/Desktop/Comet-Research', description='Download Location:', disabled=True, layout…

Button(description='Change Download Location', layout=Layout(width='200px'), style=ButtonStyle(), tooltip='Cha…




Checkbox(value=True, description='Skip already downloaded files', indent=False)




Button(description='Search Archives', style=ButtonStyle(), tooltip='Search the Swift archives')

Output()