In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
# Reload imported modules automatically before each cell runs, so edits to
# source files on disk take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Silence every Python warning (presumably to keep the rendered dashboard
# free of warning text; note this also hides deprecation notices).
import warnings; warnings.simplefilter('ignore')

# Gaia data visualization using voila and vaex
This small voila dashboard shows the basics of building a dashboard using:
 * <a href="https://gea.esac.esa.int/archive/">Gaia</a> dataset, crossmatched to the <a href="https://panstarrs.stsci.edu/">Pan-STARRS</a> dataset.
 * <a href="https://github.com/QuantStack/voila">voila</a>
 * [ipywidgets](https://ipywidgets.readthedocs.io/)

In [3]:
import ipywidgets as widgets
import matplotlib.pylab as plt
import vaex
import vaex.jupyter
import numpy as np
import getpass


In [4]:
# Pick the data source: the S3-hosted HDF5 file processed on this machine,
# or a remote dataframe served by the STScI Gaia machine.
powerful_machine = True
if not powerful_machine:
    # Not enough local resources: connect to the STScI Gaia machine instead
    # (a remote dataframe), authenticating with the token kept in token.txt.
    with open('token.txt') as token_file:
        token = token_file.read().strip()
    data_url = f'ws://ec2-18-222-183-211.us-east-2.compute.amazonaws.com:9000/gaia_ps1_nochunk?token_trusted={token}'
else:
    # If we are on a powerful machine, we can download and process the data locally.
    # The 'stsci' profile is only selected for one specific user account;
    # everybody else uses the 'default' profile.
    if getpass.getuser() == 'maartenbreddels':
        profile_name = 'stsci'
    else:
        profile_name = 'default'
    data_url = f's3://astrosurveydata/gaia_ps1_nochunk.hdf5?profile_name={profile_name}'

# Append e.g. [:10_000_000] to the call below to work on a slice only.
df = vaex.open(data_url)

In [5]:
# df = df[:20_000_000] # if you want to experiment quickly with a smaller subset

In [6]:
# Report the length of the dataframe before any filtering is applied,
# formatted with thousands separators.
print(f'The full dataset contains {len(df):,} stars')

The full dataset contains 928,000,000 stars


In [7]:
# Every column name in the dataframe; the photometry columns are the ones
# whose name contains "mean_mag". They feed the magnitude dropdowns later on.
columns_all = df.get_column_names()
columns_colors = list(filter(lambda name: "mean_mag" in name, columns_all))

# Print one column per line, each indented with a tab.
photometry_listing = "\n\t".join(columns_colors)
print("We found the following photometry columns in the Gaia dataset:\n\t" + photometry_listing)

We found the following photometry columns in the Gaia dataset:
	phot_bp_mean_mag
	phot_g_mean_mag
	phot_rp_mean_mag


In [8]:
# Distance as the reciprocal of the parallax.
# NOTE(review): assumes the parallax column is in milliarcseconds, which makes
# this distance kiloparsec — confirm against the catalogue documentation.
df['distance'] = 1 / df['parallax']

# Absolute G magnitude via the distance modulus. The constant 10 is what the
# usual "m - 5*log10(d_pc) + 5" becomes when the distance is expressed in kpc.
apparent_g = df['phot_g_mean_mag']
df['M_g'] = apparent_g - 5 * np.log10(df['distance']) - 10

# Filtering the data
We take only a subset of the data, where $\frac{\varpi}{\sigma_\varpi} > 5$, meaning we cut out negative distances and very uncertain distances.

In [9]:
# Keep only rows whose parallax is more than 5 times its error: this drops
# negative parallaxes (hence negative distances) and very uncertain ones.
reliable_parallax = df.parallax_over_error > 5
df = df[reliable_parallax]
print(f'The filtered dataset contains {len(df):,} stars')

The filtered dataset contains 93,458,483 stars


# Interactive Hertzsprung Russell
Below, we show a [Hertzsprung–Russell diagram](https://en.wikipedia.org/wiki/Hertzsprung%E2%80%93Russell_diagram) where you can choose the colors, add an extra expression to filter by, and decide whether to plot the log density or just the density.

In [10]:
# Default filter expression; it is also the initial value of the filter text
# box in the interactive plot further down.
extra_filter = "distance < 10"
# Make it the dataframe's active selection up front, so the first plot
# (which counts with selection=True) can use it without selecting again.
df.select(extra_filter)

In [11]:
# Range of the color (x) axis, used for both the binning and the rendering.
limits_color = [-1, 4]
# The filter expression most recently passed to df.select; plot() compares
# against it to avoid re-selecting when the filter text did not change.
last_filter = extra_filter

@widgets.interact_manual(magnitude1=columns_colors, magnitude2=columns_colors)
def plot(magnitude1=columns_colors[1], magnitude2=columns_colors[0], log=True, extra_filter=extra_filter):
    """Draw the Hertzsprung-Russell diagram for a chosen pair of magnitudes.

    The x axis is the color ``magnitude2 - magnitude1`` and the y axis is
    ``M_g``. Only rows in the current selection (``extra_filter``) are counted.

    Parameters
    ----------
    magnitude1, magnitude2 : str
        Names of the photometry columns that are subtracted to form the color.
    log : bool
        If true the counts are shown through the "log" transform, otherwise
        through "identity".
    extra_filter : str
        Expression used as the selection. If it does not validate, a message
        is printed and nothing is plotted.
    """
    global last_filter

    # Reject expressions the dataframe cannot evaluate before touching any state.
    try:
        df.validate_expression(extra_filter)
    except Exception as error:
        print("oops, invalid expression: " +str(error))
        return

    # A performance trick: only select again when the filter text changed,
    # so that an unchanged selection can be served from vaex's selection cache.
    filter_changed = extra_filter != last_filter
    if filter_changed:
        df.select(extra_filter)
        last_filter = extra_filter

    color = df[magnitude2] - df[magnitude1]

    # Binning shared by the count and the plot, so the grid always matches the
    # axes. The magnitude limits are given high-to-low, i.e. [20, -7]
    # (presumably so that bright stars end up at the top — confirm).
    binning = dict(limits=[limits_color, [20, -7]], shape=128)

    # Instead of letting plot do the counting, count manually so that the
    # number of stars in the plot is known and can be shown in the title.
    counts = df.count(binby=[color, df.M_g], selection=True, **binning)

    if log:
        scale = "log"
    else:
        scale = "identity"
    df.plot(color, df.M_g, f=scale, grid=counts, **binning)

    N = counts.sum()
    plt.title(f'This plot contains {N:,} stars')
    plt.show()

interactive(children=(Dropdown(description='magnitude1', index=1, options=('phot_bp_mean_mag', 'phot_g_mean_ma…

In [12]:
# Manually trigger the first time: interact_manual only draws after its button
# is clicked, so click it programmatically to show an initial plot.
# NOTE(review): children[-2] is presumably the run button (with the output
# area as the last child) — confirm against the installed ipywidgets version.
plot.widget.children[-2].click()

_(Note that there currently is a bug in voila that causes the old plot to stay on the page)_