In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
%ls Data

high.csv  highp_finalresults.csv  low.csv  lowp_finalresults.csv


# Data Formatting
Here we format the data so that it is easier to manipulate.

A few things are important to note about the formatting process below:

### Mass parameters were converted to a logarithmic scale, as is common practice in astrophysics, to make the various relations easier to analyse.

### Planetary parameters [a(i), emegas(i), rplanet(i)/radtie] were omitted, as they are neither necessary to calculate the quantities to predict nor initial conditions of the system we want to predict.

### Parameters ['ident', 't', 'gama', 'apert', 'fpert', 'constmigI'] were dropped as well. The identity index is an arbitrary identifier and is not taken into account. The rest of the omitted parameters are the same for all systems and have no predictive power. For these parameters, we calculated the standard deviation and verified that it is zero (see Data_Formatting.ipynb).


----------------------------

----------------------------

In [None]:
# Load the raw table, in which (for now) each row is a different planet.
# Fields equal to 'NaN ' (note the trailing space) are parsed as missing and
# then filled with 0 for simplicity; the original author notes that there is
# only one NaN value in the file.
data = pd.read_csv("./Data/highp_finalresults.csv", sep=',',
                   engine='python', skipinitialspace=True,
                   na_values='NaN ').fillna(0)

# Helper column: every row counts as one planet, so summing it per system
# yields the number of planets in that system.
data["n_planets"] = 1

# Planet-level data needed to produce the per-system totals (total planet
# mass and number of planets). The system identity is kept for grouping.
planet_data = data[['ident', 'emepla(i)/emet', 'n_planets']]

# System-level data: everything except the per-planet columns
# a(i), emegas(i), emepla(i)/emet, rplanet(i)/radtie (per the repository
# README.md) and the helper counter.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# raises a TypeError on pandas >= 2.0.
system_data = data.drop(columns=['a(i)', 'emegas(i)', 'emepla(i)/emet',
                                 'rplanet(i)/radtie', 'n_planets'])

# System values are repeated on every planet row of the same system, so the
# per-system mean collapses them to one row per 'ident'.
system_data = system_data.groupby('ident').mean()

# For the planets only the per-system totals are needed
# (total mass, total number of planets).
planet_data = planet_data.groupby('ident').sum()

# Join both frames on the shared 'ident' index; reset_index() turns 'ident'
# back into the first column. Column order is then:
# ident, <system columns>, emepla(i)/emet, n_planets.
data = pd.concat([system_data, planet_data], axis=1).reset_index()

# Show the positions of the two planet-derived columns.
print(np.where(data.keys() == 'n_planets'), np.where(data.keys() == 'emepla(i)/emet'))

# Drop the columns that are not used, selected by position:
#   0  -> 'ident' (placed first by reset_index above),
#   -1 -> 'n_planets' (last column after the concat above),
#   2, 8, 9, 10, 11 -> described by the original author as parameters with
#   null std(); presumably 't', 'gama', 'apert', 'fpert', 'constmigI' as listed
#   in the notebook's markdown -- the print below shows the actual names.
# NOTE(review): positional selection silently drops the wrong columns if the
# CSV layout changes; consider dropping by name once the names are confirmed.
dropped_columns = data.keys()[[0, 2, 8, 9, 10, 11, -1]]
print(dropped_columns)
data = data.drop(columns=dropped_columns)

# Natural-log scale for the mass-related parameters; the 1e-5 offset avoids
# log(0) = -inf.
for mass_column in ['emepla(i)/emet', 'emestar', 'sigmag_0']:
    data[mass_column] = np.log(data[mass_column] + 1e-5)