# EDI to jrn_metabase

Below are some tests for populating JRN Metabase from EDI using Python. The basic idea is to pull metadata for JRN packages from EDI using the PASTA+ API, arrange that metadata into a pandas DataFrame, and then insert it into JRN Metabase using psycopg2. This works best if the DataFrame's columns match the JRN Metabase table being copied to.

In [1]:
import sys
sys.path.append('/home/greg/GitHub/')
import pyEDIutils.search as edi
import pandas as pd
import numpy as np

In [4]:
# Search EDI for every data package in the jrn scope and collect the
# requested metadata fields into a dataframe

# Uncomment to reload pyEDIutils.search after editing it locally
#import importlib
#importlib.reload(edi)
search_fields = ['packageid', 'title', 'pubdate', 'keyword', 'author',
                 'begindate', 'enddate', 'doi']
df = edi.request_search(fields=search_fields, rows=1000)

https://pasta.lternet.edu/package/search/eml?defType=edismax&q=%2A&fq=scope%3Aknb-lter-jrn&fl=packageid%2Ctitle%2Cpubdate%2Ckeyword%2Cauthor%2Cbegindate%2Cenddate%2Cdoi&sort=packageid%2Casc&rows=1000


In [5]:
df.head()

Unnamed: 0,packageid,title,pubdate,keywords,authors,begindate,enddate,doi
0,knb-lter-jrn.100.3,High resolution shrub cover raster maps of the...,2020,canopy cover;land surface properties;plant cov...,"Ji, Wenjie;Hanan, Niall P.",2011-01-01,2011-12-31,doi:10.6073/pasta/313fec8669bc7b4d8debf7393dd2...
1,knb-lter-jrn.210001001.62,Plant cover on 2 x 2 meter rainfall runoff plo...,2020,termites;runoff;plant cover;deserts;grasslands...,"Ward, Tim;Bolton, Susan;Schlesinger, William",1989-10-27,1990-10-19,doi:10.6073/pasta/d320f815d3183ea09fa2f18e0030...
2,knb-lter-jrn.210001002.76,Rainfall runoff and sediment deposition from 2...,2019,land cover;runoff;plant communities;deserts;gr...,"Ward, Tim",1982-09-09,1994-10-15,doi:10.6073/pasta/a6b6175f8a064fe2bd35a62e2de0...
3,knb-lter-jrn.210001003.79,Rainfall runoff water chemistry from 2 x 2 met...,2020,termites;runoff;chemical properties;deserts;gr...,"Ward, Tim;Bolton, Susan;Schlesinger, William",1988-06-27,1990-09-11,doi:10.6073/pasta/db814c25fb4bf8f6bdd7addbbcbd...
4,knb-lter-jrn.210002001.127,Graduated rain gauge (GRG) precipitation obser...,2019,precipitation;climate;rain;hydrological proces...,"Huenneke, Laura;Anderson, John",1989-01-03,2019-04-16,doi:10.6073/pasta/39920e433f27d19f0c196d431a4a...


In [129]:
# packageid looks like "scope.datasetid.revision"; break it into its
# three parts and store each part in its own column
id_parts = df['packageid'].str.split('.', expand=True)
df[['scope', 'datasetid', 'revision']] = id_parts

In [130]:
# Add the remaining columns of the jrn_metabase DataSet table, each filled
# with a single constant value for every row
dataset_defaults = {
    'abstract': 'abstract.docx',
    'updatefrequency': 'notPlanned',
    'maintenancedesc': '',
    'abstracttype': 'file',
    'boilerplate': 'default',
    'shortname': '',
}
for column, value in dataset_defaults.items():
    df[column] = value

In [158]:
# Build a boolean mask that excludes the met packages John/Geovany use
# (dataset ids containing 210437 or 210548), then count the rows kept
import numpy as np
datasetids = df['datasetid']
nonmet1 = ~datasetids.str.contains('210437')
nonmet2 = ~datasetids.str.contains('210548')
nonmet = nonmet1 & nonmet2
nonmet.sum()

116

In [159]:
# Keep only the non-met rows and arrange the columns in the order used for
# the DataSet table load.
# .copy() makes df_in an independent dataframe, so assigning to its columns
# later does not raise pandas' SettingWithCopyWarning.
dataset_columns = ['datasetid', 'revision', 'title', 'pubdate', 'abstract',
                   'shortname', 'updatefrequency', 'maintenancedesc',
                   'abstracttype', 'boilerplate']
df_in = df.loc[nonmet, dataset_columns].copy()

In [160]:
# pubdate holds only a year string; append '-12-31' to make it a full date.
# Run this cell only once: each run appends the suffix again.
df_in['pubdate'] = df_in['pubdate'] + '-12-31'
df_in.head()

Unnamed: 0,datasetid,revision,title,pubdate,abstract,shortname,updatefrequency,maintenancedesc,abstracttype,boilerplate
0,210011002,106,Seasonal non-destructive vegetation measuremen...,2019-12-31,abstract.docx,,notPlanned,,file,default
1,210011003,104,Annual mean estimates of aboveground net prima...,2020-12-31,abstract.docx,,notPlanned,,file,default
2,210011004,82,Seasonal reference harvest measurements of veg...,2019-12-31,abstract.docx,,notPlanned,,file,default
3,210011005,105,Annual ground-based photographs taken at 15 ne...,2019-12-31,abstract.docx,,notPlanned,,file,default
4,210012001,139,Raw neutron counts from a soil water content h...,2019-12-31,abstract.docx,,notPlanned,,file,default


In [161]:
df_in.shape

(116, 10)

In [9]:
# Import the database driver and describe the connection.
# NOTE(review): the values below look like placeholders - replace them
# with real connection details before connecting.
import psycopg2 as pg
import os
conninfo = dict(
    host="hostname",
    database="jrn_metabase_dev",
    user="username",
    password="password",
)

# For more info...
# see here: https://naysan.ca/2020/06/21/pandas-to-postgresql-using-psycopg2-copy_from/
# and here: https://www.postgresqltutorial.com/postgresql-python/insert/
# and here: https://gist.github.com/jakebrinkmann/de7fd185efe9a1f459946cf72def057e

In [2]:
# Establish database connection: the entries of conninfo are passed to
# psycopg2's connect() as keyword arguments
conn = pg.connect(**conninfo)

In [5]:
# Sanity check: read every row of the DataSet table into a dataframe
sql = 'select * from lter_metabase."DataSet";'
dat = pd.read_sql_query(sql=sql, con=conn)

In [7]:
dat.head()

Unnamed: 0,DataSetID,Revision,Title,PubDate,Abstract,ShortName,UpdateFrequency,MaintenanceDescription,AbstractType,BoilerplateSetting
0,99013,21,SBC LTER: TEST: Water temperature at the bottom,2019-08-07,abstract.99013.docx,Reef bottom water temperature,annually,Ongoing time series.,file,default
1,99021,11,SBC LTER: TEST: NPP dataset with 3 tables,2016-09-08,abstract.99021.docx,Beach wrack IV 2005-06,notPlanned,Completed timeseries. No future data updates a...,file,default
2,99024,17,SBC LTER: TEST: kelp CHN,2019-01-15,abstract.99024.docx,Kelp - algal weights and CHN,annually,Ongoing time series. Data updates may be delay...,file,default
3,99054,4,SBC LTER: TESTOLEUM: Giant kelp canopy biomass...,2014-01-14,abstract.99054.docx,Satellite kelp canopy biomass,notPlanned,Completed timeseries. No future data updates a...,file,default
4,210011002,106,Seasonal non-destructive vegetation measuremen...,2019-12-31,abstract.docx,,notPlanned,,file,default


In [163]:
# This is a function to use the "copy_from" command to add a
# csv to a database table
def copy_from_file(conn, df, table):
    """
    Bulk-load a dataframe into a database table with COPY.

    The dataframe is written to a temporary ';'-delimited file (no header,
    no index) and that file is streamed to the database with a
    COPY ... FROM STDIN statement. The transaction is committed on success
    and rolled back on failure. The temporary file is always removed.

    Parameters
    ----------
    conn : database connection
        Open psycopg2 connection; cursor(), commit() and rollback() are used.
    df : pandas.DataFrame
        Data to load. No column list is sent, so the columns must be in the
        same order as the columns of the target table.
    table : str
        Target table name. It is inserted verbatim into the COPY statement
        so that schema-qualified / quoted names such as
        'lter_metabase."DataSet"' work. It must come from trusted code,
        never from user input.

    Returns
    -------
    None on success, 1 if the copy failed (the error is printed).
    """
    tmp_df = "./tmp_dataframe.csv"
    # copy_expert() is used instead of copy_from() because psycopg2 >= 2.9
    # quotes the table name given to copy_from(), which breaks
    # schema-qualified names. Text format with ';' as delimiter matches
    # what copy_from(f, table, sep=";") sends.
    # NOTE(review): text format does not understand CSV quoting, so a value
    # containing ';' (which to_csv wraps in quotes) would not load
    # correctly - confirm the data is free of the delimiter.
    copy_sql = "COPY {} FROM STDIN WITH (FORMAT text, DELIMITER ';')".format(table)
    try:
        # Save the dataframe to disk
        df.to_csv(tmp_df, index=False, sep=';', header=False, encoding='utf-8')
        cursor = conn.cursor()
        try:
            # The with-block closes the file before the cleanup below tries
            # to delete it (deleting an open file fails on Windows).
            with open(tmp_df, 'r', encoding='utf-8') as f:
                cursor.copy_expert(copy_sql, f)
            conn.commit()
        except Exception as error:
            print("Error: %s" % error)
            conn.rollback()
            return 1
        finally:
            cursor.close()
    finally:
        # Runs on every path, including failures while writing the file
        if os.path.exists(tmp_df):
            os.remove(tmp_df)
    print("copy_from_file() done")

In [164]:
# Load df_in (built from EDI) into the DataSet table in jrn_metabase.
# If the copy operation can't be completed, the error is printed here.
copy_from_file(conn=conn, df=df_in, table='lter_metabase."DataSet"')

copy_from_file() done


In [8]:
# Close the database connection; conn cannot be used for further queries
# after this
conn.close()