# Notebook to create csv files for TDB

Creates four CSV files (`research.csv`, `researcher.csv`, `biosignature.csv`, `environment.csv`) following the schema below:


![db schema](db.png "Database schema")

## Imports

In [24]:
import pandas as pd
import numpy as np

In [25]:
# Start every table as its own empty DataFrame; the sections below fill them in.
biosignature_df, research_df, researcher_df, environment_df = (
    pd.DataFrame() for _ in range(4)
)

### For `research_df`

In [63]:
# Column headers for research_df, in the order the value lists are supplied.
column_names = [
    'research_id',
    'researcher_id',
    'methods',
    'environment_id',
    'biosignature_id',
    'category_of_material',
    'subcat_of_material',
    'number_of_samples',
    'min_age_of_samples',
    'max_age_of_samples',
    'url',
]

In [64]:
# Example research row. Each field is wrapped in a one-element list so it can
# be assigned to research_df as a single-row column.
research_id = [1]
researcher_id = [1]
methods = ['GC-MS']
environment_id = [[1, 2]]  # a single cell holding the list [1, 2]
biosignature_id = [1]
category_of_material = ['geological']
subcat_of_material = ['sedimentary']
number_of_samples = [7]
min_age_of_samples = [0]
max_age_of_samples = [9400]
url = ['https://www.semanticscholar.org/paper/Biomolecules-from-Fossilized-Hot-Spring-Sinters%3A-on-Teece-George/9135d76c4e366de27c8d6f2cb09b3299e2ae3d3b']

# Value lists, in the same order as the entries of column_names.
columns = [
    research_id,
    researcher_id,
    methods,
    environment_id,
    biosignature_id,
    category_of_material,
    subcat_of_material,
    number_of_samples,
    min_age_of_samples,
    max_age_of_samples,
    url,
]

In [65]:
# Populate research_df one column at a time. The header names and the value
# lists are maintained by hand in separate cells, so verify they still line
# up: pairing them by position would otherwise drop a surplus value list
# without any error.
assert len(column_names) == len(columns), 'column_names and columns differ in length'
for column, values in zip(column_names, columns):
    research_df[column] = values

In [66]:
# Display the assembled frame to check the example row.
research_df

Unnamed: 0,research_id,researcher_id,methods,environment_id,biosignature_id,category_of_material,subcat_of_material,number_of_samples,min_age_of_samples,max_age_of_samples,url
0,1,1,GC-MS,"[1, 2]",1,geological,sedimentary,7,0,9400,https://www.semanticscholar.org/paper/Biomolec...


### For `researcher_df`

In [30]:
# Column headers for researcher_df, in the order the value lists are supplied.
column_names = [
    'researcher_id',
    'name',
    'affiliation',
    'url',
]

In [31]:
# Example researcher row. Each field is wrapped in a one-element list so it
# can be assigned to researcher_df as a single-row column.
researcher_id = [1]
name = ['B. L. Teece']
# Built from adjacent string literals (which Python concatenates) rather than
# a backslash continuation inside one literal: a continuation inside the
# quotes keeps the next line's leading indentation as part of the value,
# which leaves long runs of spaces in the stored affiliation.
affiliation = [
    'Australian Centre for Astrobiology (ACA) and PANGEA Research Centre, '
    'School of Biological, Earth and Environmental Sciences, University of '
    'New South Wales Sydney, Sydney, Australia'
]
url = ['https://www.semanticscholar.org/author/B.-Teece/89948855']

# Value lists, in the same order as the entries of column_names.
columns = [researcher_id, name, affiliation, url]

In [32]:
# Populate researcher_df one column at a time. The header names and the value
# lists are maintained by hand in separate cells, so verify they still line
# up: pairing them by position would otherwise drop a surplus value list
# without any error.
assert len(column_names) == len(columns), 'column_names and columns differ in length'
for column, values in zip(column_names, columns):
    researcher_df[column] = values

In [33]:
# Display the assembled frame to check the example row.
researcher_df

Unnamed: 0,researcher_id,name,affiliation,url
0,1,B. L. Teece,Australian Centre for Astrobiology (ACA) and P...,https://www.semanticscholar.org/author/B.-Teec...


### For `biosignature_df`


In [39]:
# Column headers for biosignature_df, in the order the value lists are supplied.
column_names = [
    'biosignature_id',
    'category',
    'sub_category',
    'biosignatures',
]

In [44]:
# Example biosignature row. Each field is wrapped in a one-element list so it
# can be assigned to biosignature_df as a single-row column.
biosignature_id = [1]
category = ['organics']
sub_category = ['hydrocarbons']
# A single cell holding a list of three names.
biosignatures = [[
    'n-alkanes',
    'methylalkanes',
    'aromatics',
]]

# Value lists, in the same order as the entries of column_names.
columns = [
    biosignature_id,
    category,
    sub_category,
    biosignatures,
]

In [46]:
# Populate biosignature_df one column at a time. The header names and the
# value lists are maintained by hand in separate cells, so verify they still
# line up: pairing them by position would otherwise drop a surplus value list
# without any error.
assert len(column_names) == len(columns), 'column_names and columns differ in length'
for column, values in zip(column_names, columns):
    biosignature_df[column] = values

In [47]:
# Display the assembled frame to check the example row.
biosignature_df

Unnamed: 0,biosignature_id,category,sub_category,biosignatures
0,1,organics,hydrocarbons,"[n-alkanes, methylalkanes, aromatics]"


### For `environment_df`

In [49]:
# Column headers for environment_df, in the order the row values are supplied.
column_names = [
    'environment_id',
    'extreme_conditions',
    'location_name',
    'latitude',
    'longitude',
    'et_counterpart',
]

In [57]:
# First example environment row, one single-element list per field.
environment_id = [1]
extreme_conditions = ['fossilized hot spring sinter']
location_name = ['El Tatio, Chile']
latitude = [-20.333333]
longitude = [-68.016667]
et_counterpart = ['Columbia Hills, Mars']

# Value lists for the first row, in the same order as column_names.
columns = [
    environment_id,
    extreme_conditions,
    location_name,
    latitude,
    longitude,
    et_counterpart,
]

# Second example row, written out as one flat record in column order.
new_line = [
    2,
    'fossilized hot spring sinter',
    'Taupo Volcanic Zone, New Zealand',
    -38.4,
    176.216667,
    'Columbia Hills, Mars',
]

In [60]:
# Build environment_df from both example rows in one step.
# No other cell in this notebook fills environment_df, so appending with
# .loc onto the still-empty, column-less frame only worked with leftover
# kernel state. Rebuilding the frame here runs on a fresh kernel and does not
# add a duplicate row each time the cell is re-executed.
first_row = [values[0] for values in columns]  # unwrap the one-element lists
environment_df = pd.DataFrame([first_row, new_line], columns=column_names)
environment_df

Unnamed: 0,environment_id,extreme_conditions,location_name,latitude,longitude,et_counterpart
0,1,fossilized hot spring sinter,"El Tatio, Chile",-20.333333,-68.016667,"Columbia Hills, Mars"
1,2,fossilized hot spring sinter,"Taupo Volcanic Zone, New Zealand",-38.4,176.216667,"Columbia Hills, Mars"


### Export dataframes to csv

In [67]:
# Write every table to its CSV file under ../raw_data/, in a fixed order.
# NOTE(review): to_csv is called with its defaults, so each file also gets the
# DataFrame index as an unnamed first column — confirm the loader expects it.
exports = {
    '../raw_data/research.csv': research_df,
    '../raw_data/researcher.csv': researcher_df,
    '../raw_data/biosignature.csv': biosignature_df,
    '../raw_data/environment.csv': environment_df,
}
for destination, frame in exports.items():
    frame.to_csv(destination)