# Extract Precisions with tabula

A trial of tabula to extract the RV precisions from our previous paper, 1511.07468v1.pdf.


In [1]:
from tabula import read_pdf, read_pdf_table
import tabula
# tabula needs Java 6 or 7!
# This is a hack that will not work everywhere.
# I included it because my system Java is version 9.
# You will need to point to your own location of Java.
# https://stackoverflow.com/questions/31414041/how-to-prepend-a-path-to-sys-path-in-python
#
# NOTE: sys.path is Python's *module* search path; adding a directory to it
# does not change which `java` executable is found. Executables are looked up
# through the PATH environment variable, so that is what is modified here.
# This is the in-notebook equivalent of doing `export PATH=...` before
# launching jupyter.
import os
import sys

# Directory containing the Java 7 `java` binary (machine specific; edit as needed).
JAVA_BIN = '/opt/java/jre1.7.0_79/bin'

# Put JAVA_BIN first so it wins over the system Java, and drop any earlier
# copy of it so re-running this cell does not keep growing PATH.
_other_entries = [
    entry
    for entry in os.environ.get("PATH", os.defpath).split(os.pathsep)
    if entry != JAVA_BIN
]
os.environ["PATH"] = os.pathsep.join([JAVA_BIN] + _other_entries)
# print(os.environ["PATH"])
import pandas as pd

In [2]:
# Locations of the paper PDF: `paper` is tried first when reading,
# `paper_home` is the alternative location of the same file.
paper, paper_home = (
    "/home/jneal/Phd/Codes/eniric/docs/1511.07468v1.pdf",
    "/home/jneal/Phd/Codes/eniric2017/docs/1511.07468v1.pdf",
)
# Pages of the PDF to read the table from (15, 16 and 17).
pages = list(range(15, 18))


In [3]:
# Read in the table from the pdf.
# Try the primary location first and fall back to the alternative one.
# Only ordinary errors trigger the fallback: a bare `except:` would also
# swallow KeyboardInterrupt/SystemExit, making the cell impossible to
# interrupt cleanly. The reason for the fallback is printed so that a real
# problem is not hidden.
try:
    df = read_pdf(paper, pages=pages, guess=True)
except Exception as err:
    print("Could not read {!r} ({}); trying {!r}".format(paper, err, paper_home))
    df = read_pdf(paper_home, pages=pages, guess=True)


In [4]:
# The first row is an extra line of headings which needs to be removed.
# A couple more of these appear further into the data, from the top of
# each page, because the table spans 3 pages.
df.head(n=5)

Unnamed: 0,Simulation,σRV (Cond. 1),σRV (Cond. 2),σRV (Cond. 3)
0,(SpTp - Band - v.sini - R),[m/s],[m/s],[m/s]
1,M0-Z-1.0-60k,8.9,26.1,9.3
2,M0-Z-1.0-80k,6.0,17.1,6.2
3,M0-Z-1.0-100k,4.5,12.8,4.6
4,M0-Z-5.0-60k,13.6,38.9,14.0


In [5]:
# Remove mistakenly added title rows.
# Easily done because they do not start with "M".
# - na=False: a missing Simulation value counts as "does not start with M"
#   and is dropped, instead of putting NaN into the boolean mask.
# - .copy(): make df an independent frame rather than a slice of the raw
#   table, so assigning columns on it later does not raise
#   SettingWithCopyWarning.
df = df[df.Simulation.str.startswith("M", na=False)].copy()
df.head()

Unnamed: 0,Simulation,σRV (Cond. 1),σRV (Cond. 2),σRV (Cond. 3)
1,M0-Z-1.0-60k,8.9,26.1,9.3
2,M0-Z-1.0-80k,6.0,17.1,6.2
3,M0-Z-1.0-100k,4.5,12.8,4.6
4,M0-Z-5.0-60k,13.6,38.9,14.0
5,M0-Z-5.0-80k,10.6,30.5,10.9


In [6]:
# Format the column names
print(df.columns)
# Spaces become underscores; 'σ', '(', ')' and '.' are removed.
# '(', ')' and '.' are regex metacharacters, and whether pandas'
# `.str.replace` treats such patterns literally depends on the pandas
# version. A plain-Python translation table is always literal and does all
# substitutions in a single pass.
_column_cleanup = str.maketrans({' ': '_', 'σ': None, '(': None, ')': None, '.': None})
df.columns = [name.translate(_column_cleanup) for name in df.columns]
df.columns

Index(['Simulation', 'σRV (Cond. 1)', 'σRV (Cond. 2)', 'σRV (Cond. 3)'], dtype='object')


Index(['Simulation', 'RV_Cond_1', 'RV_Cond_2', 'RV_Cond_3'], dtype='object')

In [7]:
# Turn the RV precision values into floats
print("Before:\n", df.dtypes)

# Cast each of the three precision columns in turn.
for rv_column in ("RV_Cond_1", "RV_Cond_2", "RV_Cond_3"):
    df[rv_column] = df[rv_column].astype(float)

print("\nAfter:\n", df.dtypes)

Before:
 Simulation    object
RV_Cond_1     object
RV_Cond_2     object
RV_Cond_3     object
dtype: object

After:
 Simulation     object
RV_Cond_1     float64
RV_Cond_2     float64
RV_Cond_3     float64
dtype: object


In [8]:
# Build the header used when saving: the first column name is kept as is,
# and "[m/s]" is appended to the names of columns 1, 2 and 3.
hdr = df.columns
new_header = [hdr[0]] + [hdr[position] + "[m/s]" for position in (1, 2, 3)]
new_header

['Simulation', 'RV_Cond_1[m/s]', 'RV_Cond_2[m/s]', 'RV_Cond_3[m/s]']

In [9]:
# Write the table to a tab-separated text file, overwriting any existing
# file, with new_header as the column titles and without the index column.
f = "../data/precision_data_paper2015.txt"

csv_options = dict(
    mode='w',
    sep="\t",
    float_format="%6.2f",
    header=new_header,
    index=False,
)
df.to_csv(f, **csv_options)
