# Day 6, Part 1 - Data processing
Let's take a step back and look at some larger planet data to see how we might process a large list of data.  In this example, we'll use more Kepler data.

Filtering is only occasionally needed for simulated data, but it is generally necessary for observational data, and whenever we want to compare our simulations to observations.

We'll use the "pandas" package to do this, which is a useful tool to know how to use anyway!

In [1]:
# import our usual stuffs
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# now, import pandas
import pandas as pd

In [5]:
# Read the Kepler confirmed-planets table straight from the course website.
# (Feel free to download the file and point read_csv at your local copy instead.)
planets = pd.read_csv(
    "https://jnaiman.github.io/csci-p-14110/lesson06/data/planets_2019.07.12_17.16.25.csv",
    comment="#",  # anything following a '#' in the file is not parsed as data
    sep=",",  # fields are comma-separated
)

In [6]:
# A bare name as the last line of a cell makes the notebook render it; for a
# DataFrame that is a formatted table (columns that do not fit are elided as "...").
planets
# formatting here is sort of nice

Unnamed: 0,pl_hostname,pl_letter,pl_name,pl_discmethod,pl_controvflag,pl_pnum,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,...,st_sperr,st_splim,st_lum,st_lumerr1,st_lumerr2,st_lumlim,st_age,st_ageerr1,st_ageerr2,st_agelim
0,11 Com,b,11 Com b,Radial Velocity,0,1,326.030000,0.320000,-0.320000,0.0,...,,0.0,2.243,0.071,-0.085,0.0,,,,
1,11 UMi,b,11 UMi b,Radial Velocity,0,1,516.219970,3.200000,-3.200000,0.0,...,,0.0,,,,0.0,,,,0.0
2,14 And,b,14 And b,Radial Velocity,0,1,185.840000,0.230000,-0.230000,0.0,...,,0.0,1.763,,,0.0,,,,
3,14 Her,b,14 Her b,Radial Velocity,0,1,1773.400020,2.500000,-2.500000,0.0,...,,0.0,,,,0.0,,,,0.0
4,16 Cyg B,b,16 Cyg B b,Radial Velocity,0,1,798.500000,1.000000,-1.000000,0.0,...,,0.0,,,,0.0,,,,0.0
5,18 Del,b,18 Del b,Radial Velocity,0,1,993.300000,3.200000,-3.200000,0.0,...,,0.0,1.602,,,0.0,,,,
6,1RXS J160929.1-210524,b,1RXS J160929.1-210524 b,Imaging,0,1,,,,,...,,0.0,-0.370,0.150,-0.150,0.0,0.005,,,
7,24 Boo,b,24 Boo b,Radial Velocity,0,1,30.350600,0.007800,-0.007700,0.0,...,,0.0,1.774,0.047,-0.053,0.0,6.920,4.830,-2.750,0.0
8,24 Sex,b,24 Sex b,Radial Velocity,0,2,452.800000,2.100000,-4.500000,0.0,...,,0.0,1.164,0.003,-0.003,0.0,2.700,0.400,-0.400,
9,24 Sex,c,24 Sex c,Radial Velocity,0,2,883.000000,32.400000,-13.800000,0.0,...,,0.0,1.164,0.003,-0.003,0.0,2.700,0.400,-0.400,


In [7]:
# How many entries are there? The index holds the row labels as an iterable;
# here it is a RangeIndex starting at 0 with step 1, so its `stop` value
# equals the number of rows.
planets.index

RangeIndex(start=0, stop=4016, step=1)

In [8]:
# .loc selects rows by index *label*, and a label slice includes BOTH endpoints,
# so 0:3 returns the four rows labelled 0, 1, 2 and 3.
planets.loc[0:3]  # easy to grab subsets - here by label
# Tip: run `planets.loc?` in a cell to pull up the documentation for .loc

Unnamed: 0,pl_hostname,pl_letter,pl_name,pl_discmethod,pl_controvflag,pl_pnum,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,...,st_sperr,st_splim,st_lum,st_lumerr1,st_lumerr2,st_lumlim,st_age,st_ageerr1,st_ageerr2,st_agelim
0,11 Com,b,11 Com b,Radial Velocity,0,1,326.03,0.32,-0.32,0.0,...,,0.0,2.243,0.071,-0.085,0.0,,,,
1,11 UMi,b,11 UMi b,Radial Velocity,0,1,516.21997,3.2,-3.2,0.0,...,,0.0,,,,0.0,,,,0.0
2,14 And,b,14 And b,Radial Velocity,0,1,185.84,0.23,-0.23,0.0,...,,0.0,1.763,,,0.0,,,,
3,14 Her,b,14 Her b,Radial Velocity,0,1,1773.40002,2.5,-2.5,0.0,...,,0.0,,,,0.0,,,,0.0


In [9]:
# Names of the columns, returned as a pandas Index of label strings.
planets.columns

Index(['pl_hostname', 'pl_letter', 'pl_name', 'pl_discmethod',
       'pl_controvflag', 'pl_pnum', 'pl_orbper', 'pl_orbpererr1',
       'pl_orbpererr2', 'pl_orbperlim', 'pl_orbsmax', 'pl_orbsmaxerr1',
       'pl_orbsmaxerr2', 'pl_orbsmaxlim', 'pl_orbeccen', 'pl_orbeccenerr1',
       'pl_orbeccenerr2', 'pl_orbeccenlim', 'pl_orbincl', 'pl_orbinclerr1',
       'pl_orbinclerr2', 'pl_orbincllim', 'pl_bmassj', 'pl_bmassjerr1',
       'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassprov', 'pl_radj',
       'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim', 'pl_dens', 'pl_denserr1',
       'pl_denserr2', 'pl_denslim', 'ra_str', 'ra', 'dec_str', 'dec',
       'st_dist', 'st_disterr1', 'st_disterr2', 'st_distlim', 'gaia_dist',
       'gaia_disterr1', 'gaia_disterr2', 'gaia_distlim', 'st_optmag',
       'st_optmagerr', 'st_optmaglim', 'st_optband', 'gaia_gmag',
       'gaia_gmagerr', 'gaia_gmaglim', 'st_teff', 'st_tefferr1', 'st_tefferr2',
       'st_tefflim', 'st_mass', 'st_masserr1', 'st_masserr2', 'st_mass