# Notebook to create csv files for TDB

Creates four CSV files (`research.csv`, `researcher.csv`, `biosignature.csv` and `environment.csv`) following the schema below:


![Database schema](db.png "Database schema")

## Imports

In [4]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from biosignature_db import data
from biosignature_db import plots

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create csv files

In [None]:
# Start with one empty DataFrame per table in the schema.
# The generator yields a separate DataFrame object for every name.
biosignature_df, research_df, researcher_df, environment_df = (
    pd.DataFrame() for _ in range(4)
)

### For `research_df`

In [None]:
# Column order of the research table.
column_names = [
    'research_id',
    'researcher_id',
    'methods',
    'environment_id',
    'biosignature_id',
    'category_of_material',
    'subcat_of_material',
    'number_of_samples',
    'min_age_of_samples',
    'max_age_of_samples',
    'url',
]

In [None]:
# Example values for the single row of the research table.
# Every value is wrapped in a one-element list so it can serve as a
# one-row column.

# Identifiers; `environment_id` keeps both environment ids together in a
# single cell (a list nested inside the one-element list).
research_id = [1]
researcher_id = [1]
environment_id = [[1, 2]]
biosignature_id = [1]

# Method and sample description.
methods = ['GC-MS']
category_of_material = ['geological']
subcat_of_material = ['sedimentary']
number_of_samples = [7]
min_age_of_samples = [0]
max_age_of_samples = [9400]

# Link to the publication.
url = ['https://www.semanticscholar.org/paper/Biomolecules-from-Fossilized-Hot-Spring-Sinters%3A-on-Teece-George/9135d76c4e366de27c8d6f2cb09b3299e2ae3d3b']

# Same order as the research `column_names` list.
columns = [
    research_id,
    researcher_id,
    methods,
    environment_id,
    biosignature_id,
    category_of_material,
    subcat_of_material,
    number_of_samples,
    min_age_of_samples,
    max_age_of_samples,
    url,
]

In [None]:
# Build the research table in one step from the parallel `column_names` and
# `columns` lists (each value list has one element, so the table has one row).
# Constructing a fresh DataFrame, instead of adding columns one by one to the
# shared empty frame, means the result does not depend on what `research_df`
# held before and the cell can be re-run safely.
research_df = pd.DataFrame(dict(zip(column_names, columns)))

In [None]:
research_df

### For `researcher_df`

In [None]:
# Column order of the researcher table.
column_names = [
    'researcher_id',
    'name',
    'affiliation',
    'url',
]

In [None]:
# Example values for the single row of the researcher table.
# Every value is wrapped in a one-element list so it can serve as a
# one-row column.
researcher_id = [1]
name = ['B. L. Teece']
# The affiliation is split over adjacent string literals, which Python joins
# into one string. A backslash continuation inside a single literal would
# instead embed the next line's leading indentation in the text.
affiliation = [
    'Australian Centre for Astrobiology (ACA) and PANGEA Research Centre, '
    'School of Biological, Earth and Environmental Sciences, University of '
    'New South Wales Sydney, Sydney, Australia'
]
url = ['https://www.semanticscholar.org/author/B.-Teece/89948855']

# Same order as the researcher `column_names` list.
columns = [researcher_id, name, affiliation, url]

In [None]:
# Build the researcher table in one step from the parallel `column_names` and
# `columns` lists (each value list has one element, so the table has one row).
# Constructing a fresh DataFrame, instead of adding columns one by one to the
# shared empty frame, means the result does not depend on what `researcher_df`
# held before and the cell can be re-run safely.
researcher_df = pd.DataFrame(dict(zip(column_names, columns)))

In [None]:
researcher_df

### For `biosignature_df`


In [None]:
# Column order of the biosignature table.
column_names = [
    'biosignature_id',
    'category',
    'sub_category',
    'biosignatures',
]

In [None]:
# Example values for the single row of the biosignature table.
# Every value is wrapped in a one-element list so it can serve as a
# one-row column.
biosignature_id = [1]
category = ['organics']
sub_category = ['hydrocarbons']
# One cell that holds the whole list of biosignature names.
biosignatures = [
    ['n-alkanes', 'methylalkanes', 'aromatics'],
]

# Same order as the biosignature `column_names` list.
columns = [
    biosignature_id,
    category,
    sub_category,
    biosignatures,
]

In [None]:
# Build the biosignature table in one step from the parallel `column_names`
# and `columns` lists (each value list has one element, so the table has one
# row). Constructing a fresh DataFrame, instead of adding columns one by one to
# the shared empty frame, means the result does not depend on what
# `biosignature_df` held before and the cell can be re-run safely.
biosignature_df = pd.DataFrame(dict(zip(column_names, columns)))

In [None]:
biosignature_df

### For `environment_df`

In [None]:
# Column order of the environment table.
column_names = [
    'environment_id',
    'extreme_conditions',
    'location_name',
    'latitude',
    'longitude',
    'et_counterpart',
]

In [None]:
# Example values for the environment table.

# First site, stored column-wise: every value is a one-element list.
environment_id = [1]
extreme_conditions = ['fossilized hot spring sinter']
location_name = ['El Tatio, Chile']
latitude = [-20.333333]
longitude = [-68.016667]
et_counterpart = ['Columbia Hills, Mars']

# Second site, stored row-wise with its values in the same order as the
# environment `column_names` list.
new_line = [
    2,
    'fossilized hot spring sinter',
    'Taupo Volcanic Zone, New Zealand',
    -38.4,
    176.216667,
    'Columbia Hills, Mars',
]

# Column-wise values of the first site, in `column_names` order.
columns = [
    environment_id,
    extreme_conditions,
    location_name,
    latitude,
    longitude,
    et_counterpart,
]

In [None]:
# Build the table from the first example site (`column_names` / `columns`),
# then append the second site (`new_line`) as a new row at the next position.
# The table has to be given its columns first: assigning a row through `.loc`
# to a DataFrame without any columns raises a ValueError. Rebuilding the frame
# here also keeps the cell idempotent, so re-running it never appends the
# second site twice.
environment_df = pd.DataFrame(dict(zip(column_names, columns)))
environment_df.loc[len(environment_df)] = new_line
environment_df

### Export dataframes to csv

In [None]:
# Write each table to its CSV file under ../raw_data/.
# index=False leaves the positional row index out of the files: every table
# already has its own explicit id column, and a written index would come back
# from pd.read_csv as an extra "Unnamed: 0" column.
research_df.to_csv('../raw_data/research.csv', index=False)
researcher_df.to_csv('../raw_data/researcher.csv', index=False)
biosignature_df.to_csv('../raw_data/biosignature.csv', index=False)
environment_df.to_csv('../raw_data/environment.csv', index=False)

## Export dataframes to JSON

In [2]:
# Reload the biosignature table from its CSV file under ../raw_data/.
# Only this read is active; the three reads below are currently disabled.
#publication_df = pd.read_csv('../raw_data/publication.csv')
#author_df = pd.read_csv('../raw_data/author.csv')
#environment_df = pd.read_csv('../raw_data/environment.csv')
biosignature_df = pd.read_csv('../raw_data/biosignature.csv')

In [3]:
# Write the biosignature table as JSON into the package's data folder, using
# the default to_json settings. Only this export is active; the other three
# exports are currently disabled.
#publication_df.to_json('../biosignature_db/data/publication.json')
#author_df.to_json('../biosignature_db/data/author.json')
biosignature_df.to_json('../biosignature_db/data/biosignature.json')
#environment_df.to_json('../biosignature_db/data/environment.json')

In [6]:
# Read the exported JSON file back through the project helper and display it.
# The output recorded for this cell is a dict that maps each column name to a
# {row label: value} dict.
biosignature_json = data.read_json_data('../biosignature_db/data/biosignature.json')
biosignature_json

{'biosignature_id': {'0': 1, '1': 2, '2': 3, '3': 4},
 'category': {'0': 'geological',
  '1': 'geological',
  '2': 'geological',
  '3': 'geological'},
 'sub-category': {'0': 'hydrocarbons',
  '1': 'hydrocarbons',
  '2': 'hydrocarbons',
  '3': 'hydrocarbons'},
 'name': {'0': 'n-alkanes',
  '1': 'methylalkanes',
  '2': 'n-alkanes',
  '3': 'methylalkanes'},
 'indicative_of': {'0': 'plants',
  '1': 'cyanobacteria',
  '2': 'algae',
  '3': 'cyanobacteria'},
 'detection_methods': {'0': 'gc-ms', '1': 'gc-ms', '2': 'gc-ms', '3': 'gc-ms'},
 'sample_type': {'0': 'geological',
  '1': 'geological',
  '2': 'geological',
  '3': 'geological'},
 'sample_subtype': {'0': 'sedimentary',
  '1': 'sedimentary',
  '2': 'sedimentary',
  '3': 'sedimentary'},
 'number of samples': {'0': 3, '1': 3, '2': 4, '3': 4},
 'min_age': {'0': 0, '1': 0, '2': 0, '3': 0},
 'max_age': {'0': 9400, '1': 9400, '2': 9400, '3': 9400},
 'pub_url': {'0': 'https://www.liebertpub.com/doi/full/10.1089/ast.2018.2018',
  '1': 'https://ww

In [7]:
# Turn the column-oriented dict back into a DataFrame and display it.
# NOTE: the row labels are the JSON keys, i.e. strings ('0', '1', ...), not
# integers.
bio_df = pd.DataFrame(biosignature_json)
bio_df

Unnamed: 0,biosignature_id,category,sub-category,name,indicative_of,detection_methods,sample_type,sample_subtype,number of samples,min_age,max_age,pub_url,env_conditions,location_name,latitude,longitude,et_counterpart
0,1,geological,hydrocarbons,n-alkanes,plants,gc-ms,geological,sedimentary,3,0,9400,https://www.liebertpub.com/doi/full/10.1089/as...,fossilized hot springs,"El Tatio, Chile",-20.333333,-68.016667,"Columbia Hills, Mars"
1,2,geological,hydrocarbons,methylalkanes,cyanobacteria,gc-ms,geological,sedimentary,3,0,9400,https://www.liebertpub.com/doi/full/10.1089/as...,fossilized hot springs,"El Tatio, Chile",-20.333333,-68.016667,"Columbia Hills, Mars"
2,3,geological,hydrocarbons,n-alkanes,algae,gc-ms,geological,sedimentary,4,0,9400,https://www.liebertpub.com/doi/full/10.1089/as...,fossilized hot springs,"Taupo Volcanic Zone, New Zealand",-38.4,176.216667,"Columbia Hills, Mars"
3,4,geological,hydrocarbons,methylalkanes,cyanobacteria,gc-ms,geological,sedimentary,4,0,9400,https://www.liebertpub.com/doi/full/10.1089/as...,fossilized hot springs,"Taupo Volcanic Zone, New Zealand",-38.4,176.216667,"Columbia Hills, Mars"


## Plot coordinates on 3D globe

In [None]:
# Draw the interactive map for bio_df with the 'natural earth' projection.
# NOTE(review): presumably the helper reads the latitude/longitude columns of
# bio_df - confirm in biosignature_db.plots.
plots.plot_interactive_map(bio_df, projection = 'natural earth')

In [8]:
# Colour stops used for the globe surface, written as [position, colour]
# pairs with positions between 0.0 and 1.0. There is no stop at 0.5: the
# stops at 0.4 and 0.6 carry the same colour.
colorscale = [
    [0.0, 'rgb(30, 59, 117)'],
    [0.1, 'rgb(46, 68, 21)'],
    [0.2, 'rgb(74, 96, 28)'],
    [0.3, 'rgb(115,141,90)'],
    [0.4, 'rgb(122, 126, 75)'],
    [0.6, 'rgb(122, 126, 75)'],
    [0.7, 'rgb(141,115,96)'],
    [0.8, 'rgb(223, 197, 170)'],
    [0.9, 'rgb(237,214,183)'],
    [1.0, 'rgb(255, 255, 255)'],
]

In [None]:
from PIL import Image

def sphere(size, texture):
    """Return Cartesian coordinate grids of a sphere sized to fit a texture.

    Parameters
    ----------
    size
        Radius of the sphere; every coordinate is scaled by it.
    texture
        Object with a ``shape`` attribute. Its first dimension sets the number
        of azimuth samples and its second the number of polar-angle samples.

    Returns
    -------
    tuple
        ``(x, y, z)`` arrays, each of shape
        ``(texture.shape[0], texture.shape[1])``.
    """
    n_azimuth = int(texture.shape[0])
    n_polar = int(texture.shape[1])

    # The azimuth sweeps the full circle; the polar angle runs from 0 to pi.
    azimuth = np.linspace(0, 2 * np.pi, n_azimuth)
    polar = np.linspace(0, np.pi, n_polar)
    sin_polar = np.sin(polar)

    # Outer products turn the two 1-D angle vectors into 2-D grids.
    x0 = size * np.outer(np.cos(azimuth), sin_polar)
    y0 = size * np.outer(np.sin(azimuth), sin_polar)
    z0 = size * np.outer(np.ones(n_azimuth), np.cos(polar))

    return x0, y0, z0

# Radius passed to sphere(). No radius was defined anywhere in the notebook,
# so a unit sphere is used as the default - adjust if a different size is wanted.
radius = 1

# Load the texture and transpose it so that its first axis follows the image
# width. The file name has no placeholder, so it is used as-is (the earlier
# `.format(planet_name)` call referenced an undefined name).
# NOTE(review): sphere() and `surfacecolor` need a 2-D array here, i.e. a
# single-channel image - confirm that earth.jpg is greyscale.
texture = np.asarray(Image.open('earth.jpg')).T

# Coordinate grids with the same shape as the transposed texture.
x, y, z = sphere(radius, texture)

# Surface coloured by the texture values through `colorscale`.
surf = go.Surface(
    x=x,
    y=y,
    z=z,
    surfacecolor=texture,
    colorscale=colorscale,
)

# Equal aspect ratio on all three axes so the sphere is not distorted.
layout = go.Layout(scene=dict(aspectratio=dict(x=1, y=1, z=1)))

fig = go.Figure(data=[surf], layout=layout)

fig.show()