In [1]:
import os
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Working with downloaded dataset statistics tables from https://www.earthdata.nasa.gov/

In [2]:
# Relative path of the folder that holds the CSV files loaded by this notebook.
folder_path = 'data'

In [3]:
# File names of the CSV tables to load, in the order they are concatenated.
# The "space" table is split into five numbered parts.
csv_files = [
    'air.csv',
    'land.csv',
    'other.csv',
    *(f'space_{part}.csv' for part in range(1, 6)),
    'water.csv',
]

In [4]:
# Read every listed CSV file from the data folder, then stack the resulting
# frames row-wise into one table. The index is not reset, so each file's own
# row labels are carried over into the combined frame.
df_list = list(map(pd.read_csv, (os.path.join(folder_path, name) for name in csv_files)))
combined_df = pd.concat(df_list)

In [5]:
# Display the combined table as this cell's output.
combined_df

Unnamed: 0,Data Provider,Short Name,Version,Entry Title,Processing Level,Platform,Start Time,End Time
0,USGS_LTA,USGSPHOTOS,Not provided,U.S. Geological Survey Aerial Photography,Not provided,Aircraft,1937-04-01T00:00:00.000Z,
1,SCIOPS,PSCNH_ARCHIVED_DATA,2,Plymouth State University Weather Center's Arc...,Not Provided,"METEOROLOGICAL STATIONS, FIXED OBSERVATION STA...",1953-06-01T00:00:00.000Z,2024-06-21T00:00:00.000Z
2,USGS_LTA,NASAPHOTOS,Not provided,NASA Aerial Photography,Not provided,NASA ER-2,1969-07-16T00:00:00.000Z,
3,USGS_LTA,EARTH_LAND_USGS_AMES_AIR_PHOTOS,Not provided,Aerial Photographs (from AMES Pilot Land Data ...,Not provided,AIRCRAFT,1970-01-01T00:00:00.000Z,
4,USGS_LTA,airmoss_chamela_mexico,Not provided,"USGS AirMOSS - Chamela, Mexico",Not provided,UAV,1970-01-01T00:00:00.000Z,
...,...,...,...,...,...,...,...,...
115,POCLOUD,SMODE_L2_LAGRANGIAN_FLOATS_V1,1,S-MODE Lagrangian Float Observations Version 1,2,BUOYS,2022-10-01T00:00:00.000Z,2023-05-31T00:00:00.000Z
116,POCLOUD,SMODE_L2_SHIPBOARD_BIO_V1,1,S-MODE Shipboard Bio-optical Measurements Vers...,2,Ships,2022-10-09T00:00:00.000Z,2022-11-02T00:00:00.000Z
117,ISRO,E06_OCM_GAC_STGO00GND,1.0,EOS-06 OCM Global Area Coverage (GAC) - 1080m ...,1C,OCEAN PLATFORMS,2023-04-01T00:00:00.000Z,
118,ISRO,E06_OCM_LAC_STGO00GND,1.0,EOS-06 OCM Local Area Coverage (LAC) - 366m Re...,1C,OCEAN PLATFORMS,2023-04-01T00:00:00.000Z,


In [6]:
# Count how many rows each data provider contributes, order the providers
# from the largest count to the smallest, and index the result by provider.
providers_df = (
    combined_df
    .groupby('Data Provider')
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count', ascending=False)
    .set_index('Data Provider')
)

In [7]:
# Display the per-provider row counts, largest first.
providers_df

Unnamed: 0_level_0,Count
Data Provider,Unnamed: 1_level_1
ORNL_CLOUD,2181
GES_DISC,1394
LARC_ASDC,1150
POCLOUD,686
GHRC_DAAC,620
NSIDC_ECS,589
LARC,365
LPCLOUD,326
OB_DAAC,311
SEDAC,310


## Website scraping

In [8]:
# Scrape the CMR collection directory page of the ORNL_CLOUD provider and keep
# the collection title, short name and granule-link text of every entry.
url = "https://cmr.earthdata.nasa.gov/search/site/collections/directory/ORNL_CLOUD/gov.nasa.eosdis"

# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(url, timeout=60)

# Fail loudly on anything other than HTTP 200: the cells below need
# `ornl_clouds_df`, so continuing silently would only postpone the failure
# to an unrelated NameError further down.
if response.status_code != 200:
    raise RuntimeError(f"Request to {url} failed with HTTP status {response.status_code}")

soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# Passing literal HTML text to read_html is deprecated in pandas; wrapping the
# markup in a file-like object is the supported form.
df = pd.read_html(StringIO(str(table)))[0]
print(df.head())

# Verify the page still has the expected layout before selecting columns.
required_columns = ['Collection', 'Short Name', 'Directory']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Scraped table is missing expected columns: {missing_columns}")

# Copy so that later edits do not touch the raw scraped frame `df`.
ornl_clouds_df = df[required_columns].copy()
print(ornl_clouds_df.head())

                                          Collection  \
0            15 Minute Stream Flow Data: USGS (FIFE)   
1                     30 Minute Rainfall Data (FIFE)   
2  A Compilation of Global Soil Microbial Biomass...   
3  A Concise Experiment Plan for the Arctic-Borea...   
4  A Global Data Set of Leaf Photosynthetic Rates...   

                            Short Name Version            Directory  
0            fife_hydrology_strm_15m_1       1   Browse 39 Granules  
1              fife_sur_met_rain_30m_2       1  Browse 117 Granules  
2  Global_Microbial_Biomass_C_N_P_1264       1    Browse 3 Granules  
3   ABoVE_Concise_Experiment_Plan_1617     1.1     Browse 1 Granule  
4      Leaf_Photosynthesis_Traits_1224       1     Browse 1 Granule  
                                          Collection  \
0            15 Minute Stream Flow Data: USGS (FIFE)   
1                     30 Minute Rainfall Data (FIFE)   
2  A Compilation of Global Soil Microbial Biomass...   
3  A Concise Experi

In [9]:
# Display the scraped ORNL_CLOUD table as this cell's output.
ornl_clouds_df

Unnamed: 0,Collection,Short Name,Directory
0,15 Minute Stream Flow Data: USGS (FIFE),fife_hydrology_strm_15m_1,Browse 39 Granules
1,30 Minute Rainfall Data (FIFE),fife_sur_met_rain_30m_2,Browse 117 Granules
2,A Compilation of Global Soil Microbial Biomass...,Global_Microbial_Biomass_C_N_P_1264,Browse 3 Granules
3,A Concise Experiment Plan for the Arctic-Borea...,ABoVE_Concise_Experiment_Plan_1617,Browse 1 Granule
4,A Global Data Set of Leaf Photosynthetic Rates...,Leaf_Photosynthesis_Traits_1224,Browse 1 Granule
...,...,...,...
1853,"Water Quality and Spectral Reflectance, Peace-...",PAD_2011_1133,Browse 3 Granules
1854,Wind Profile Data: LIDAR - NOAA (FIFE),fife_atmos_wind_lid_138,Browse 13 Granules
1855,Wind Profile Data: Radiosonde (FIFE),fife_atmos_wind_son_139,Browse 422 Granules
1856,"Woody Biomass for Eastern U.S. Forests, 1983-1996",woody_biomass_657,Browse 1 Granule


In [10]:
# Rename the 'Directory' column to 'Granules'.
ornl_clouds_df = ornl_clouds_df.rename(columns={'Directory': 'Granules'})

In [11]:
# Keep only the first run of digits from text such as "Browse 39 Granules".
# The pattern is written as a raw string so that `\d` reaches the regex engine
# unchanged; inside a plain string literal `\d` is an invalid escape sequence,
# which newer Python versions report with a warning.
ornl_clouds_df['Granules'] = ornl_clouds_df['Granules'].str.extract(r'(\d+)')

In [12]:
# Display the table again now that 'Granules' holds only the extracted digits.
ornl_clouds_df

Unnamed: 0,Collection,Short Name,Granules
0,15 Minute Stream Flow Data: USGS (FIFE),fife_hydrology_strm_15m_1,39
1,30 Minute Rainfall Data (FIFE),fife_sur_met_rain_30m_2,117
2,A Compilation of Global Soil Microbial Biomass...,Global_Microbial_Biomass_C_N_P_1264,3
3,A Concise Experiment Plan for the Arctic-Borea...,ABoVE_Concise_Experiment_Plan_1617,1
4,A Global Data Set of Leaf Photosynthetic Rates...,Leaf_Photosynthesis_Traits_1224,1
...,...,...,...
1853,"Water Quality and Spectral Reflectance, Peace-...",PAD_2011_1133,3
1854,Wind Profile Data: LIDAR - NOAA (FIFE),fife_atmos_wind_lid_138,13
1855,Wind Profile Data: Radiosonde (FIFE),fife_atmos_wind_son_139,422
1856,"Woody Biomass for Eastern U.S. Forests, 1983-1996",woody_biomass_657,1


## Average number of rows (granules) per dataset (ORNL_CLOUD provider):

In [13]:
# Convert the extracted digit strings to numbers (values that cannot be parsed
# become NaN), then report the mean of the 'Granules' column.
ornl_clouds_df = ornl_clouds_df.assign(
    Granules=lambda frame: pd.to_numeric(frame['Granules'], errors='coerce')
)
average_granules = ornl_clouds_df['Granules'].mean()
print(f"Average value of rows in datasets: {average_granules}")

Average value of rows in datasets: 1048.1178686759956


## The largest dataset of the ORNL_CLOUD provider (by number of granules):

In [14]:
# Order the datasets by their 'Granules' value, highest first.
sorted_ornl_clouds_df = ornl_clouds_df.sort_values(by='Granules', ascending=False)

In [15]:
# Show the first row of the sorted table, i.e. the entry with the highest
# 'Granules' value.
sorted_ornl_clouds_df.head(1)

Unnamed: 0,Collection,Short Name,Granules
1337,"MODIS-based GPP, PAR, fC4, and SANIRv estimate...",SLOPE_GPP_CONUS_1786,612416


In [16]:
# Print that same first row as a Series, one field per line.
print(sorted_ornl_clouds_df.iloc[0])

Collection    MODIS-based GPP, PAR, fC4, and SANIRv estimate...
Short Name                                 SLOPE_GPP_CONUS_1786
Granules                                                 612416
Name: 1337, dtype: object
