# Load data from Vizier

#### Index<a name="index"></a>
1. [Import packages](#imports)
2. [Load raw data](#loadData)
3. [Create and save data](#createData)
    1. [True clusters](#createTrue)
    2. [Non clusters - Uniform](#createUniformNonClusters)
    3. [Non clusters - Functional](#createFunctionalNonClusters)

## 1. Import packages<a name="imports"></a>

In [1]:
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from astroquery.vizier import Vizier

#### Aesthetic settings

In [2]:
# Display matplotlib figures inline in the notebook output.
%matplotlib inline

# Seaborn theme for all plots: "ticks" style with fonts scaled by 1.3.
sns.set(font_scale=1.3, style="ticks")

## 2. Load raw data<a name="loadData"></a>

In [3]:
# Raise the class-level row limit before querying.
# NOTE(review): presumably needed so the full table is returned rather than a
# truncated one — confirm against the astroquery default.
Vizier.ROW_LIMIT = 3000000
# Query VizieR for the catalogue identified by 'J/A+A/618/A93/members'.
catalogs = Vizier.get_catalogs('J/A+A/618/A93/members')
print(catalogs)  # astroquery.utils.commons.TableList

TableList with 1 tables:
	'0:J/A+A/618/A93/members' with 16 column(s) and 401448 row(s) 


Since `catalogs` contains only one table, we only need `catalogs[0]`. We convert it to a `pandas` DataFrame for ease of handling.

In [4]:
# Convert the first (and only) table to a pandas DataFrame.
catalog = catalogs[0].to_pandas()

# Decode byte strings to str. Only object-dtype columns can hold bytes, so the
# numeric columns are left untouched instead of being passed cell by cell
# through a Python function (DataFrame.applymap is also deprecated in recent
# pandas releases).
for column in catalog.select_dtypes(include='object').columns:
    catalog[column] = catalog[column].map(
        lambda value: value.decode() if isinstance(value, bytes) else value
    )

catalog.head()

Unnamed: 0,RA_ICRS,DE_ICRS,Source,GLON,GLAT,plx,pmRA,pmDE,o_Gmag,Gmag,BP-RP,PMemb,Cluster,SimbadName,_RA.icrs,_DE.icrs
0,301.229391,-11.186849,4190669036038417152,30.982256,-21.322445,2.1358,1.268,-8.56,348,14.804825,1.412944,0.7,Alessi_10,,301.229385,-11.186812
1,301.014346,-11.419202,4190657903483144960,30.660791,-21.229051,2.2071,1.136,-8.043,331,17.35053,2.605459,1.0,Alessi_10,Gaia DR2 4190657903483144960,301.014341,-11.419167
2,300.801432,-11.1955,4190723255705770240,30.783967,-20.945461,2.0438,1.259,-8.064,329,12.473157,0.865277,0.8,Alessi_10,,300.801426,-11.195466
3,301.006976,-11.037219,4190732120518267776,31.028893,-21.06143,2.3256,1.189,-8.1,301,16.400896,1.93503,1.0,Alessi_10,Gaia DR2 4190732120518267776,301.006971,-11.037184
4,301.678011,-10.865093,4190774206899418880,31.495512,-21.58557,2.2585,1.598,-7.63,322,17.435835,2.370981,1.0,Alessi_10,Gaia DR2 4190774206899418880,301.678004,-10.86506


## 3. Create and save data<a name="createData"></a>

First, choose whether the scatter plots and histograms created below should be displayed in the notebook.

In [5]:
# When True, show the colour-magnitude scatter plot of every true cluster.
plot_scatter = False
# When True, call plt.show() on the 2D-histogram images drawn in the loops below.
plot_hist = False

In [6]:
# Base directory used to build the paths of the .npy files and example plots.
data_folder = '../data'

### 3.1. True clusters<a name="createTrue"></a>

In [7]:
# Build one flattened, density-normalised 2D histogram (BP-RP vs. G) per
# cluster; these are the positive samples.
clusters = np.unique(catalog['Cluster'])
number_clusters = len(clusters)

true_clusters_2d_hist = []
# groupby splits the catalogue in a single pass and, with sort=True, yields the
# groups in sorted key order, i.e. the same order as `clusters`. This avoids
# building a full-length boolean mask for every cluster.
for i, (cluster, data_cluster) in enumerate(catalog.groupby('Cluster', sort=True)):
    # Remove stars for which Gaia didn't measure magnitude or colour
    data_cluster = data_cluster.dropna(subset=['Gmag', 'BP-RP'])

    gmag = data_cluster['Gmag']
    bp_rp = data_cluster['BP-RP']

    if plot_scatter:
        plt.scatter(bp_rp, gmag, marker='.')
        plt.xlabel(r'$B_p - R_p$ (mag)')
        plt.ylabel(r'$G$ (mag)')
        plt.title(f'Cluster {cluster}')
        # Reversed limits: brighter stars (smaller magnitude) at the top.
        plt.ylim([18, 8])
        plt.show()

    # 20x20 bins spanning this cluster's own data range, normalised to a density.
    hist, xedges, yedges = np.histogram2d(bp_rp, gmag, bins=20,
                                          density=True)

    # The image is only needed when it is displayed; rendering it
    # unconditionally would build and discard one figure per cluster.
    if plot_hist:
        plt.imshow(hist, interpolation='nearest')
        plt.title(f'Cluster {cluster}')
        plt.show()
        plt.close()

    # Flatten to 1D without hard-coding the number of bins.
    true_clusters_2d_hist.append(hist.reshape(-1))

true_clusters_2d_hist_file = f'{data_folder}/true_clusters_2d_hist_{number_clusters}.npy'
# Write the file so that the verification cell that follows can load it even on
# a fresh run.
np.save(true_clusters_2d_hist_file, true_clusters_2d_hist)

Confirm that the file was saved correctly by loading it back. If the cell outputs `False`, the saving went wrong.

In [8]:
# Reload the array from disk and compare it, within floating-point tolerance,
# with the histograms still held in memory.
# NOTE(review): allow_pickle=True permits unpickling object arrays; presumably
# unnecessary if the file holds a plain numeric array — confirm before removing.
true_clusters_2d_hist_saved = np.load(true_clusters_2d_hist_file, allow_pickle=True)
np.allclose(true_clusters_2d_hist_saved, true_clusters_2d_hist)

True

In [9]:
# One histogram is appended per cluster, so the list length is the sample count.
print('Number of positive samples: ', len(true_clusters_2d_hist))

Number of positive samples:  1229


### 3.2. Non clusters - Uniform<a name="createUniformNonClusters"></a>

In [10]:
# Accumulator for the flattened histograms of the uniform non-cluster samples.
non_clusters_2d_hist = []

In [11]:
# Generate as many synthetic "non cluster" samples as there are true clusters
# by drawing magnitudes and colours uniformly at random.

# Reset the accumulator here so that re-running this cell does not append a
# second batch of samples to the list.
non_clusters_2d_hist = []

for i in range(number_clusters):
    # Random sample size; int() truncates the uniform draw from [30, 500].
    number_stars = int(random.uniform(30, 500))
    gmag = np.random.uniform(8, 18, size=number_stars)
    bp_rp = np.random.uniform(0, 5, size=number_stars)

    # 20x20 bins spanning this sample's own data range, normalised to a density.
    hist, xedges, yedges = np.histogram2d(bp_rp, gmag, bins=20,
                                          density=True)

    # Only the first 11 samples are ever displayed, and only when requested;
    # otherwise no figure is rendered at all.
    if plot_hist and i <= 10:
        plt.imshow(hist, interpolation='nearest')
        plt.title(f'Non Cluster {i}')
        plt.show()
        plt.close()

    # Flatten to 1D without hard-coding the number of bins.
    non_clusters_2d_hist.append(hist.reshape(-1))

non_clusters_2d_hist_file = f'{data_folder}/unif_non_clusters_2d_hist_{number_clusters}.npy'
# The samples are random, so they have to be written on every run; otherwise
# the file on disk cannot match the data generated above.
np.save(non_clusters_2d_hist_file, non_clusters_2d_hist)

Confirm that the file was saved correctly by loading it back. If the cell outputs `False`, the saving went wrong.

In [None]:
# Reload the array from disk and compare it, within floating-point tolerance,
# with the histograms still held in memory.
# NOTE(review): allow_pickle=True permits unpickling object arrays; presumably
# unnecessary if the file holds a plain numeric array — confirm before removing.
non_clusters_2d_hist_saved = np.load(non_clusters_2d_hist_file, allow_pickle=True)
np.allclose(non_clusters_2d_hist_saved, non_clusters_2d_hist)

### 3.3. Non clusters - Functional<a name="createFunctionalNonClusters"></a>

In [13]:
# Fresh accumulator for the functional non-cluster samples. This rebinds the
# same name that section 3.2 used for the uniform samples.
non_clusters_2d_hist = []

In [14]:
# Generate as many synthetic "non cluster" samples as there are true clusters
# by scattering points around the curve gmag = arcsin(bp_rp) + 1.

# Reset the accumulator here so that re-running this cell does not append a
# second batch of samples to the list.
non_clusters_2d_hist = []

for i in range(number_clusters):
    # Random sample size; int() truncates the uniform draw from [30, 500].
    number_stars = int(random.uniform(30, 500))

    # Evenly spaced colours; magnitudes follow arcsin plus Gaussian noise with
    # mean 1 and standard deviation 0.5. Computed as one vectorised expression,
    # which also avoids reusing the loop index `i` inside a comprehension.
    bp_rp = np.linspace(-1, 1, number_stars)
    gmag = np.arcsin(bp_rp) + np.random.normal(1, 0.5, size=number_stars)

    # 20x20 bins spanning this sample's own data range, normalised to a density.
    hist, xedges, yedges = np.histogram2d(bp_rp, gmag, bins=20,
                                          density=True)

    # Example images are saved for the first 11 samples only, so later samples
    # need no figure at all.
    if i <= 10:
        plt.imshow(hist, interpolation='nearest')
        plt.title(f'Non Cluster {i}')
        plt.savefig(f'{data_folder}/example_plots/non_clusters/func_non_cluster_{i}.jpg',
                    bbox_inches='tight')
        if plot_hist:
            plt.show()
        plt.close()

    # Flatten to 1D without hard-coding the number of bins.
    non_clusters_2d_hist.append(hist.reshape(-1))

non_clusters_2d_hist_file = f'{data_folder}/func_non_clusters_2d_hist_{number_clusters}.npy'
np.save(non_clusters_2d_hist_file, non_clusters_2d_hist)

Confirm that the file was saved correctly by loading it back. If the cell outputs `False`, the saving went wrong.

In [15]:
# Reload the array from disk and compare it, within floating-point tolerance,
# with the histograms still held in memory.
# NOTE(review): allow_pickle=True permits unpickling object arrays; presumably
# unnecessary if the file holds a plain numeric array — confirm before removing.
non_clusters_2d_hist_saved = np.load(non_clusters_2d_hist_file, allow_pickle=True)
np.allclose(non_clusters_2d_hist_saved, non_clusters_2d_hist)

True

Go to [Index](#index)