# Datasets - from EDI to jrn_metabase

Below are some tests for populating JRN Metabase from EDI using Python. The basic idea is to pull metadata for JRN packages from EDI using the PASTA+ API, arrange it into a pandas DataFrame, and then insert it into JRN Metabase using psycopg2. This works best when the DataFrame's columns match those of the JRN Metabase table being copied to.

In [35]:
import sys
# Hard-coded local path added to the module search path; presumably the
# directory that contains the pyEDIutils checkout -- adjust per machine.
sys.path.append('/home/greg/GitHub/')
import pyEDIutils.search as edi
import pandas as pd
import numpy as np

## Create a DataSets table

In [36]:
# Search EDI for the data packages in the jrn scope and keep the
# metadata fields listed below (at most 1000 rows are requested).

# Uncomment to pick up local edits to pyEDIutils.search without restarting:
#import importlib
#importlib.reload(edi)
search_fields = ['packageid', 'title', 'pubdate', 'keyword', 'author',
                 'begindate', 'enddate', 'doi']
df = edi.request_search(fields=search_fields, rows=1000)

https://pasta.lternet.edu/package/search/eml?defType=edismax&q=%2A&fq=scope%3Aknb-lter-jrn&fl=packageid%2Ctitle%2Cpubdate%2Ckeyword%2Cauthor%2Cbegindate%2Cenddate%2Cdoi&sort=packageid%2Casc&rows=1000


In [37]:
# Preview the first rows of the search results
df.head()

Unnamed: 0,packageid,title,pubdate,keywords,authors,begindate,enddate,doi
0,knb-lter-jrn.100.3,High resolution shrub cover raster maps of the...,2020,canopy cover;land surface properties;plant cov...,"Ji, Wenjie;Hanan, Niall P.",2011-01-01,2011-12-31,doi:10.6073/pasta/313fec8669bc7b4d8debf7393dd2...
1,knb-lter-jrn.210001001.62,Plant cover on 2 x 2 meter rainfall runoff plo...,2020,termites;runoff;plant cover;deserts;grasslands...,"Ward, Tim;Bolton, Susan;Schlesinger, William",1989-10-27,1990-10-19,doi:10.6073/pasta/d320f815d3183ea09fa2f18e0030...
2,knb-lter-jrn.210001002.76,Rainfall runoff and sediment deposition from 2...,2019,land cover;runoff;plant communities;deserts;gr...,"Ward, Tim",1982-09-09,1994-10-15,doi:10.6073/pasta/a6b6175f8a064fe2bd35a62e2de0...
3,knb-lter-jrn.210001003.79,Rainfall runoff water chemistry from 2 x 2 met...,2020,termites;runoff;chemical properties;deserts;gr...,"Ward, Tim;Bolton, Susan;Schlesinger, William",1988-06-27,1990-09-11,doi:10.6073/pasta/db814c25fb4bf8f6bdd7addbbcbd...
4,knb-lter-jrn.210002001.127,Graduated rain gauge (GRG) precipitation obser...,2019,precipitation;climate;rain;hydrological proces...,"Huenneke, Laura;Anderson, John",1989-01-03,2019-04-16,doi:10.6073/pasta/39920e433f27d19f0c196d431a4a...


In [38]:
# Break packageid (scope.datasetid.revision) into its three parts
# and store each part in its own column.
id_parts = df['packageid'].str.split('.', expand=True)
df[['scope', 'datasetid', 'revision']] = id_parts

In [39]:
# Add constant-valued columns so the dataframe looks like the
# DataSet table in jrn_metabase. Every row gets the same value.
dataset_defaults = {
    'abstract': 'abstract.docx',
    'updatefrequency': 'notPlanned',
    'maintenancedesc': '',
    'abstracttype': 'file',
    'boilerplate': 'default',
    'shortname': '',
}
for column_name, default_value in dataset_defaults.items():
    df[column_name] = default_value

In [40]:
# Build a boolean mask that excludes the met packages John/Geovany use
# (dataset ids containing 210437 or 210548).
import numpy as np

dataset_ids = df['datasetid']
nonmet1 = ~dataset_ids.str.contains('210437')
nonmet2 = ~dataset_ids.str.contains('210548')
nonmet = np.logical_and(nonmet1, nonmet2)
# Number of packages that remain after the filter
nonmet.sum()

117

In [41]:
# Keep only the non-met rows and put the columns in the order used
# for the DataSet table.
dataset_columns = [
    'datasetid', 'revision', 'title', 'pubdate', 'abstract',
    'shortname', 'updatefrequency', 'maintenancedesc',
    'abstracttype', 'boilerplate',
]
ds_in = df.loc[nonmet, dataset_columns]

In [42]:
# pubdate holds only a year string; append '-12-31' to turn it into a
# full date string.
# NOTE: re-running this cell appends the suffix again.
ds_in['pubdate'] = ds_in['pubdate'] + '-12-31'
ds_in.head()

Unnamed: 0,datasetid,revision,title,pubdate,abstract,shortname,updatefrequency,maintenancedesc,abstracttype,boilerplate
0,100,3,High resolution shrub cover raster maps of the...,2020-12-31,abstract.docx,,notPlanned,,file,default
1,210001001,62,Plant cover on 2 x 2 meter rainfall runoff plo...,2020-12-31,abstract.docx,,notPlanned,,file,default
2,210001002,76,Rainfall runoff and sediment deposition from 2...,2019-12-31,abstract.docx,,notPlanned,,file,default
3,210001003,79,Rainfall runoff water chemistry from 2 x 2 met...,2020-12-31,abstract.docx,,notPlanned,,file,default
4,210002001,127,Graduated rain gauge (GRG) precipitation obser...,2019-12-31,abstract.docx,,notPlanned,,file,default


In [44]:
# Only the last expression of a cell is displayed, so the shape on the
# first line is evaluated but not shown.
ds_in.shape
# Look up the row for dataset 210351004
ds_in.loc[ds_in['datasetid'] == '210351004']

Unnamed: 0,datasetid,revision,title,pubdate,abstract,shortname,updatefrequency,maintenancedesc,abstracttype,boilerplate
73,210351004,2,Quadrat-based monitoring of desert grassland v...,2021-12-31,abstract.docx,,notPlanned,,file,default


## Create a data entity table

In [10]:
# direct api calls
import pyEDIutils.pkginfo as edi2

In [11]:
# Fetch the entity table for one package (scope, dataset id, revision)
# as a quick check of edi2.entity_table.
test_package = ('knb-lter-jrn', '210011002', '106')
ent = edi2.entity_table(*test_package)
ent

https://pasta.lternet.edu/package/name/eml/knb-lter-jrn/210011002/106
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/210011002/106/d6dca3385347d23e7b94d59a0ae15ffa
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/210011002/106/ffdbf923e4b116d81156c18af0301ef2


Unnamed: 0,entityid,entityname,packageid,datasetid,entityorder,filename,entitytype,filetype
0,d6dca3385347d23e7b94d59a0ae15ffa,"Plant dimensions (cover, height) collected by ...",knb-lter-jrn.210011002.106,210011002,1,JRN_011002_npp_quadrat_meas.csv,dataTable,csv_D
1,ffdbf923e4b116d81156c18af0301ef2,Field protocol for non-destructive quadrat mea...,knb-lter-jrn.210011002.106,210011002,2,nppfield_protocol_master_for-distribution.pdf,otherEntity,


In [12]:
# Show the last rows of ds_in
ds_in.tail()

Unnamed: 0,datasetid,revision,title,pubdate,abstract,shortname,updatefrequency,maintenancedesc,abstracttype,boilerplate
112,210425001,75,Gap-filled daily precipitation at the 15 long-...,2020-12-31,abstract.docx,,notPlanned,,file,default
233,210461001,17,Perennial grass recovery following livestock o...,2019-12-31,abstract.docx,,notPlanned,,file,default
234,210472001,1,Vegetation cover and Soil Organic Carbon along...,2018-12-31,abstract.docx,,notPlanned,,file,default
235,210493001,1,Biotic and abiotic composition of biological s...,2020-12-31,abstract.docx,,notPlanned,,file,default
236,210520001,1,"Vascular Plant Species of the Jornada Basin, 1...",2018-12-31,abstract.docx,,notPlanned,,file,default


In [27]:
# importlib must be imported in this cell: no earlier cell imports it,
# so a top-to-bottom run would otherwise fail with NameError.
import importlib

# Pick up any edits made to pyEDIutils.pkginfo since it was imported.
importlib.reload(edi2)

# Request one entity table per (datasetid, revision) row of ds_in, then
# concatenate them in a single call. This avoids re-copying the growing
# result on every iteration and needs no special case for the first row.
entity_tables = [
    edi2.entity_table('knb-lter-jrn', dataset_id, revision)
    for dataset_id, revision in zip(ds_in['datasetid'], ds_in['revision'])
]
# Each table keeps its own index labels, so labels repeat in ent_out.
ent_out = pd.concat(entity_tables)

https://pasta.lternet.edu/package/name/eml/knb-lter-jrn/100/3
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/100/3/2ef8e20914ce4ca8d5a9135df1fec521
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/100/3/340a62188ebb0f784b8060b61463eb4e
https://pasta.lternet.edu/package/name/eml/knb-lter-jrn/210001001/62
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/210001001/62/64ed05cec3b6bff3c09a5233408d76de
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/210001001/62/200c2e217b60d7d438421819d1dc9907
https://pasta.lternet.edu/package/name/eml/knb-lter-jrn/210001002/76
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/210001002/76/bb0dd59713db3a3455e5435d4e9fd1f1
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/210001002/76/a5e94f2400174bd7842f64217d560184
https://pasta.lternet.edu/package/name/eml/knb-lter-jrn/210001003/79
https://pasta.lternet.edu/package/data/rmd/eml/knb-lter-jrn/210001003/79/67907967840dd83a28c45f497c51180f
htt

In [28]:
# Display the combined entity table
ent_out

Unnamed: 0,entityid,entityname,packageid,datasetid,entityorder,filename,entitytype,filetype
0,2ef8e20914ce4ca8d5a9135df1fec521,JER_CDRRC_shrubcover_2011_1ha.tif,knb-lter-jrn.100.3,100,1,JER_CDRRC_shrubcover_2011_1ha.tif,otherEntity,
1,340a62188ebb0f784b8060b61463eb4e,JER_CDRRC_shrubcover_2011.tif,knb-lter-jrn.100.3,100,2,JER_CDRRC_shrubcover_2011.tif,otherEntity,
0,64ed05cec3b6bff3c09a5233408d76de,Plant cover csv,knb-lter-jrn.210001001.62,210001001,1,JRN_001001_runoff_vegetation_data.csv,dataTable,csv_D
1,200c2e217b60d7d438421819d1dc9907,Detailed procedures text file,knb-lter-jrn.210001001.62,210001001,2,Hydrology_prog.txt,otherEntity,
0,bb0dd59713db3a3455e5435d4e9fd1f1,Rainfall runoff and sediment deposition data,knb-lter-jrn.210001002.76,210001002,1,JRN_001002_Hydrology_Runoff.csv,dataTable,csv_D
...,...,...,...,...,...,...,...,...
1,7e4e883b01c49ecd1a5296df5865f66d,Cattle_soil_carbon_figure.jpg,knb-lter-jrn.210472001.1,210472001,2,Cattle_soil_carbon_figure.jpg,otherEntity,
0,0c561cd3e7366da62c7d5842c6f2b15a,"Soil crust microfauna counts, 2017-2018",knb-lter-jrn.210493001.1,210493001,1,JRN_493001_soilcrust_microfauna_count_data.csv,dataTable,csv_D
1,b732cd7b8667f135d6bc9e16dc8a574e,"Soil crust PLFA data, 2017-2018",knb-lter-jrn.210493001.1,210493001,2,JRN_493001_soilcrust_microflora_plfa_data.csv,dataTable,csv_D
2,667daf6ce2edae2a192f7b90b71d246f,"Soil properties data, 2017-2018",knb-lter-jrn.210493001.1,210493001,3,JRN_493001_soilcrust_soilprop_data.csv,dataTable,csv_D


In [29]:
# Give every entity a placeholder description, then select the columns
# to load, in the order they will be copied to the database.
ent_out['entitydesc'] = 'Add description'
ent_in_columns = [
    'datasetid', 'entityorder', 'entityname', 'entitytype',
    'entitydesc', 'filetype', 'filename',
]
ent_in = ent_out.loc[:, ent_in_columns]
ent_in.head()

Unnamed: 0,datasetid,entityorder,entityname,entitytype,entitydesc,filetype,filename
0,100,1,JER_CDRRC_shrubcover_2011_1ha.tif,otherEntity,Add description,,JER_CDRRC_shrubcover_2011_1ha.tif
1,100,2,JER_CDRRC_shrubcover_2011.tif,otherEntity,Add description,,JER_CDRRC_shrubcover_2011.tif
0,210001001,1,Plant cover csv,dataTable,Add description,csv_D,JRN_001001_runoff_vegetation_data.csv
1,210001001,2,Detailed procedures text file,otherEntity,Add description,,Hydrology_prog.txt
0,210001002,1,Rainfall runoff and sediment deposition data,dataTable,Add description,csv_D,JRN_001002_Hydrology_Runoff.csv


In [30]:
# Inspect one entity name by row position
ent_in.entityname.iloc[164]

'Soil Particle Size Analysis and Sand Fractionation data at Permanent Quadrat locations, Jornada Experimental Range, 2001-2020'

In [31]:
# Some too-long names
ent_in.entityname.iloc[114] = 'Rodent capture data across grass-shrub ecotones at 3 sites, 2004-ongoing'
ent_in.entityname.iloc[115] = 'Rodent abundance and biomass data across grass-shrub ecotones at 3 sites, 2004-ongoing'
ent_in.entityname.iloc[119] = 'Plant phenology observations at 15 net primary production (NPP) study sites, 1992-ongoing'
ent_in.entityname.iloc[150] = 'Aboveground Net Primary Production (g/m2) by plant functional group under precip treatments'
ent_in.entityname.iloc[160] = 'Basal Cover of Perennial Grasses and Canopy Cover of Shrubs at JER Permanent Quadrats, 1915-ongoing'
ent_in.entityname.iloc[162] = 'Plant Density of Perennial Forbs and Subshrubs at JER Permanent Quadrats, 1915-ongoing'
ent_in.entityname.iloc[164] = 'Soil Particle Size Analysis and Sand Fractionation data at JER Permanent Quadrats, 2001-2020'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## Now insert the tables

In [50]:
# Open the database connection.
# The parent directory is added to the module search path first,
# presumably so that py2pg can be imported from there.
sys.path.append('../')
import py2pg.connect as connect
conn_file = '../jrn_metabase_dev.conn.json'
conn = connect.connect(conn_file)

In [19]:
# Test query: read the whole DataSet table into a dataframe
sql = 'select * from lter_metabase."DataSet";'
dat = pd.read_sql_query(sql=sql, con=conn)

In [20]:
# Preview the first rows returned by the test query
dat.head()

Unnamed: 0,DataSetID,Revision,Title,PubDate,Abstract,ShortName,UpdateFrequency,MaintenanceDescription,AbstractType,BoilerplateSetting
0,99013,21,SBC LTER: TEST: Water temperature at the bottom,2019-08-07,abstract.99013.docx,Reef bottom water temperature,annually,Ongoing time series.,file,default
1,99021,11,SBC LTER: TEST: NPP dataset with 3 tables,2016-09-08,abstract.99021.docx,Beach wrack IV 2005-06,notPlanned,Completed timeseries. No future data updates a...,file,default
2,99024,17,SBC LTER: TEST: kelp CHN,2019-01-15,abstract.99024.docx,Kelp - algal weights and CHN,annually,Ongoing time series. Data updates may be delay...,file,default
3,99054,4,SBC LTER: TESTOLEUM: Giant kelp canopy biomass...,2014-01-14,abstract.99054.docx,Satellite kelp canopy biomass,notPlanned,Completed timeseries. No future data updates a...,file,default
4,210011002,106,Seasonal non-destructive vegetation measuremen...,2019-12-31,abstract.docx,,notPlanned,,file,default


In [21]:
import py2pg.populate as pop

In [47]:
# Load a single dataset row of ds_in (from EDI) into the DataSet
# table in jrn_metabase. Errors will be printed here if the
# copy operation can't be completed.
one_dataset = ds_in[ds_in.datasetid == '210351004']
pop.copy_from_file(conn, one_dataset, 'lter_metabase."DataSet"')  # copy the dataframe to SQL
# Close the database connection
#conn.close()

> /home/greg/GitHub/lter_metabase_utils/py2pg/populate.py(26)copy_from_file()
     24     df.to_csv(tmp_df, index=False, sep=';', header=False)
     25     pdb.set_trace()
---> 26     f = open(tmp_df, 'r')
     27     cursor = conn.cursor()
     28     try:


ipdb>  c


copy_from_file() done


In [51]:
import importlib

# Reload py2pg.populate so recent edits to it take effect
importlib.reload(pop)

# Target columns in DataSetEntities, in the same order as ent_in's columns.
# The table's primary key covers ("DataSetID", "EntityName"), so copying
# rows that are already present fails with a duplicate-key error.
db_entity_columns = (
    '"DataSetID"', '"EntitySortOrder"', '"EntityName"', '"EntityType"',
    '"EntityDescription"', '"FileType"', '"FileName"',
)
pop.copy_from_file(conn, ent_in, 'lter_metabase."DataSetEntities"',
                   columns=db_entity_columns)  # copy the dataframe to SQL
# Close the database connection
conn.close()

Error: duplicate key value violates unique constraint "PK_DataSetEntities"
DETAIL:  Key ("DataSetID", "EntityName")=(100, JER_CDRRC_shrubcover_2011_1ha.tif) already exists.
CONTEXT:  COPY DataSetEntities, line 1

