In [1]:
# import libraries
import requests
import pandas as pd
#import urllib.request
#import time
from bs4 import BeautifulSoup

## Scrape the Jornada Data catalog

This catalog (https://jornada.nmsu.edu/data-catalogs/jornada) seems to be the most comprehensive data catalog on the Jornada websites. It had 172 total packages at last count. Some are duplicates or test packages, which I think explains why its count differs from that of the LTER data catalog (https://jornada.nmsu.edu/lter/data), which had only 152 at last count.

In [2]:
# Fetch the full Jornada data catalog page.
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook here on an HTTP
# error instead of letting later cells scrape an error page.
url = 'https://jornada.nmsu.edu/data-catalogs/jornada'
response = requests.get(url, timeout=30)
response.raise_for_status()

In [3]:
# Echo the response object so its HTTP status code shows in the cell output
response

<Response [200]>

In [4]:
# Parse the downloaded HTML with Python's built-in html.parser backend
soup = BeautifulSoup(response.text, features='html.parser')

In [5]:
# Collect every <div class="views-row"> (the loop below treats each one as
# a single catalog entry) and show how many were found
datasets = soup.find_all('div', class_='views-row')
len(datasets)

172

In [6]:
# One list per output column; every list receives exactly one entry per
# element of `datasets`, so all of them stay the same length.
links = []
ids = []
titles = []
creators = []
files = []
packages = []

# A missing tag makes .find() return None, so the follow-up lookup raises
# AttributeError/TypeError; a tag without the attribute raises KeyError.
# Only these mean "field absent" - anything else (e.g. KeyboardInterrupt)
# should propagate rather than silently becoming 'NA'.
missing_field_errors = (AttributeError, TypeError, KeyError)

for ds in datasets:
    # dataset IDs - the first 12 characters of the field text are dropped
    # (presumably a fixed label prefix - TODO confirm against the page)
    id_div = ds.find('div', attrs={'class':'views-field views-field-field-data-set-id'})
    ids.append(id_div.find('div', attrs={'class':'field-content'}).string[12:])
    # titles
    title_div = ds.find('div', attrs={'class':'views-field views-field-title'})
    titles.append(title_div.find('h2').string)
    # links - the href is appended to the site root to form the full URL
    links.append('https://jornada.nmsu.edu' + title_div.find('a')['href'])
    # creators - optional; the first 23 characters of the field text are
    # dropped (presumably a fixed label prefix - TODO confirm)
    creator_div = ds.find('div', attrs={'class':'views-field views-field-field-person-creator'})
    try:
        creators.append(creator_div.find('div', attrs={'class':'field-content'}).string[23:])
    except missing_field_errors:
        creators.append('NA')
    # data files - optional; href of the first link in the field
    file_div = ds.find('div', attrs={'class':'views-field views-field-field-data-source-file'})
    try:
        files.append(file_div.find('div', attrs={'class':'field-content'}).find('a')['href'])
    except missing_field_errors:
        files.append('NA')
    # data package - optional; href of the first link in the field
    package_div = ds.find('div', attrs={'class':'views-field views-field-field-related-links'})
    try:
        packages.append(package_div.find('div', attrs={'class':'field-content'}).find('a')['href'])
    except missing_field_errors:
        packages.append('NA')

In [7]:
# Assemble the scraped lists into one table (one column per list) and
# save it as CSV

catalog_columns = {
    'link': links,
    'ID': ids,
    'title': titles,
    'creator': creators,
    'files': files,
    'packages': packages,
}
jornada_out = pd.DataFrame(catalog_columns)

jornada_out.to_csv('/Volumes/DataProducts/LTER_IM/Website_data_rescue/Jornada_all_catalog.csv')

### Check that the website is the same as the archived copy

In [8]:
# Parse the locally archived copy of the catalog page. The context manager
# closes the file as soon as BeautifulSoup has built the tree, instead of
# leaving the handle open for the life of the kernel.
with open('/Users/gmaurer/Desktop/Jornada Data | Jornada.html') as handle:
    soup2 = BeautifulSoup(handle, 'html.parser')

In [9]:
# Count the title fields in the archived copy, for comparison with the
# number of rows found on the live page
titles2 = soup2.find_all('div', class_='views-field views-field-title')
print(len(titles2))

172


## Scrape the Jornada "Long-Term" data catalog

https://jornada.nmsu.edu/data-catalogs/long-term - At last count (9-April-2020) this has 65 datasets

In [10]:
# Fetch the "Long-Term" data catalog page.
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook here on an HTTP
# error instead of letting later cells scrape an error page.
url = 'https://jornada.nmsu.edu/data-catalogs/long-term'
response = requests.get(url, timeout=30)
response.raise_for_status()

In [11]:
# Echo the response object so its HTTP status code shows in the cell output
response

<Response [200]>

In [12]:
# Parse the downloaded HTML with Python's built-in html.parser backend
soup = BeautifulSoup(response.text, features='html.parser')

In [13]:
# Grab every <tr> on the page; 7 of these are header rows
rows = soup.find_all('tr')
len(rows)

72

In [14]:
# One list per output column; every list receives exactly one entry per
# element of `rows`, so all of them stay the same length.
links = []
titles = []
resp_pi = []
packages = []

# A missing tag makes .find() return None, so the follow-up lookup raises
# AttributeError/TypeError; a tag without the attribute raises KeyError.
# Only these mean "field absent" - anything else (e.g. KeyboardInterrupt)
# should propagate rather than silently becoming 'NA'.
missing_field_errors = (AttributeError, TypeError, KeyError)

for r in rows:
    # titles & link
    title_div = r.find('td', attrs={'class':'views-field views-field-title views-align-left views-table-width-30'})
    # Resolve both values before appending: appending the title first and
    # then failing on the link would add two entries to `titles` for one
    # row and leave the column lists with different lengths.
    try:
        title_anchor = title_div.find('a')
        title = title_anchor.string
        link = 'https://jornada.nmsu.edu' + title_anchor['href']
    except missing_field_errors:
        title = 'NA'
        link = 'NA'
    titles.append(title)
    links.append(link)
    # creators - text of the first link in the name cell
    resp_div = r.find('td', attrs={'class': 'views-field views-field-field-name views-table-width-15'})
    try:
        resp_pi.append(resp_div.find('a').string)
    except missing_field_errors:
        resp_pi.append('NA')
    # data package - href of the first link in the related-links cell
    package_div = r.find('td', attrs={'class':'views-field views-field-field-related-links views-table-width-10'})
    try:
        packages.append(package_div.find('a')['href'])
    except missing_field_errors:
        packages.append('NA')

In [15]:
# Assemble the scraped lists into one table (one column per list) and
# save it as CSV

catalog_columns = {
    'link': links,
    'title': titles,
    'responsible_pi': resp_pi,
    'packages': packages,
}
jornada_out = pd.DataFrame(catalog_columns)

jornada_out.to_csv('/Volumes/DataProducts/LTER_IM/Website_data_rescue/Jornada_long-term_catalog.csv')

## Scrape the Jornada "Climate" data catalog

https://jornada.nmsu.edu/data-catalogs/climate - At last count (9-April-2020) this has 18 datasets

In [16]:
# Fetch the "Climate" data catalog page.
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook here on an HTTP
# error instead of letting later cells scrape an error page.
url = 'https://jornada.nmsu.edu/data-catalogs/climate'
response = requests.get(url, timeout=30)
response.raise_for_status()

In [17]:
# Echo the response object so its HTTP status code shows in the cell output
response

<Response [200]>

In [18]:
# Parse this page with the lxml backend (the other catalogs in this
# notebook are parsed with html.parser)
soup = BeautifulSoup(response.text, features="lxml")

In [19]:
# soup.find('tbody') returns only the FIRST <tbody> on the page, so rows
# in any later table are silently dropped. Collect the rows of every
# <tbody> instead; header rows live outside <tbody> and stay excluded.
# NOTE(review): this presumably accounts for the 3 rows that were missing
# (15 found vs. 18 expected) - confirm the count after re-running.
table_bodies = soup.find_all('tbody')
rows = [row for table_body in table_bodies for row in table_body.find_all('tr')]
len(rows)

15

In [20]:
# One list per output column; every list receives exactly one entry per
# element of `rows`, so all of them stay the same length.
links = []
titles = []
resp_pi = []
packages = []

# A missing tag makes .find() return None, so the follow-up lookup raises
# AttributeError/TypeError; a tag without the attribute raises KeyError.
# Only these mean "field absent" - anything else (e.g. KeyboardInterrupt)
# should propagate rather than silently becoming 'NA'.
missing_field_errors = (AttributeError, TypeError, KeyError)

for r in rows:
    # titles & link
    title_div = r.find('td', attrs={'class':'views-field views-field-title'})
    # Resolve both values before appending: appending the title first and
    # then failing on the link would add two entries to `titles` for one
    # row and leave the column lists with different lengths.
    try:
        title_anchor = title_div.find('a')
        title = title_anchor.string
        link = 'https://jornada.nmsu.edu' + title_anchor['href']
    except missing_field_errors:
        title = 'NA'
        link = 'NA'
    titles.append(title)
    links.append(link)
    # creators - text of the first link in the name cell
    resp_div = r.find('td', attrs={'class': 'views-field views-field-field-name'})
    try:
        resp_pi.append(resp_div.find('a').string)
    except missing_field_errors:
        resp_pi.append('NA')
    # data package - href of the first link in the related-links cell
    package_div = r.find('td', attrs={'class':'views-field views-field-field-related-links'})
    try:
        packages.append(package_div.find('a')['href'])
    except missing_field_errors:
        packages.append('NA')

In [21]:
# Inspect the scraped package URLs ('NA' marks a row with no package link)
packages

['http://jornada.nmsu.edu/files/dataset_packages/JornadaStudy_121_biodiversity_precipitation_tipping_bucket_raingauge_event_data.zip',
 'http://jornada.nmsu.edu/files/dataset_packages/JornadaStudy_121_biodiversity_precipitation_tipping_bucket_raingauge_daily_data.zip',
 'http://jornada.nmsu.edu/files/dataset_packages/JornadaStudy_169_npp_atmospheric_deposition_adc_dust_collection_dryfall_data.zip',
 'http://jornada.nmsu.edu/files/dataset_packages/Jornada_425001_npp_estimated_daily_precipitation_data.zip',
 'http://jornada.nmsu.edu/files/dataset_packages/Jornada_127_evaporation_pan_data.zip',
 'http://jornada.nmsu.edu/files/dataset_packages/JornadaStudy_126_lter_weather_station_climate_daily_data.zip',
 'https://jornada.nmsu.edu/files/dataset_packages/Jornada_126002_lter_weather_station_hourly_data.zip',
 'http://jornada.nmsu.edu/files/dataset_packages/JornadaStudy_002_npp_precipitation_graduated_raingauge_monthly_data.zip',
 'NA',
 'http://jornada.nmsu.edu/files/dataset_packages/Jornad

In [22]:
# Assemble the scraped lists into one table (one column per list) and
# save it as CSV

catalog_columns = {
    'link': links,
    'title': titles,
    'responsible_pi': resp_pi,
    'packages': packages,
}
jornada_out = pd.DataFrame(catalog_columns)

jornada_out.to_csv('/Volumes/DataProducts/LTER_IM/Website_data_rescue/Jornada_climate_catalog.csv')