In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copernicusmarine as cm
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cm.login(username="", password="")

File /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials already exists, overwrite it ? [y/N]:

INFO - 2024-08-06T13:32:41Z - Credentials file stored in /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials.


True

**The Pisces file sizes are very big. Depending on the size of the region of interest, might be better to get in parts and then merge, like from 1997-09 to 1999-12 and then from 2000-01 to 2004-12 and so on** 

Dataset: Global Ocean Biochemistry Hindcast (https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_BGC_001_029/description)
- dataset_id: "cmems_mod_glo_bgc_my_0.25deg_P1M-m",
- variables: ["fe","no3","o2","po4","si"],
- Parameter definitions:
    - fe [mmol/m3]: Mole concentration of dissolved iron in sea water
    - no3 [mmol/m3]: Mole concentration of nitrate in sea water
    - o2 [mmol/m3]: Mole concentration of dissolved molecular oxygen in sea water
    - po4 [mmol/m3]: Mole concentration of phosphate in sea water
    - si [mmol/m3] :Mole concentration of silicate in sea water
- Spatial resolution: 0.25deg x 0.25deg
- It used PISCES biogeochemical model to perform simulation 

In [None]:
filename = "pisces_1997_2021"

In [109]:
# Set parameters
data_request = {
    "dataset_id" : "cmems_mod_glo_bgc_my_0.25deg_P1M-m",
    "variables" : ["fe","no3","o2","po4","si"],
    "longitude" : [-60, -10], 
    "latitude" : [60, 40],
    "time" : ["2017-01-01T00:00:00", "2023-01-03T00:00:00"],
    "depth": [None, None] 
}

In [110]:
# get PISCES data
df = cm.read_dataframe(
    dataset_id=data_request["dataset_id"],
    variables=data_request["variables"],
    minimum_longitude=data_request["longitude"][0],
    maximum_longitude=data_request["longitude"][1],
    minimum_latitude=data_request["latitude"][0],
    maximum_latitude=data_request["latitude"][1],
    minimum_depth=data_request["depth"][0],
    maximum_depth=data_request["depth"][1],
    start_datetime=data_request["time"][0],
    end_datetime=data_request["time"][1]
)

INFO - 2024-08-06T16:53:56Z - Dataset version was not specified, the latest one was selected: "202406"
INFO - 2024-08-06T16:53:56Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-08-06T16:54:08Z - Service was not specified, the default one was selected: "arco-time-series"
  return Timestamp(date).to_pydatetime()


In [111]:
# check that the month extracted are correct
df.reset_index().time.unique()

<DatetimeArray>
['2017-01-31 23:59:59.186763776', '2017-02-28 23:59:59.645777920',
 '2017-04-01 00:00:09.971040256', '2017-04-30 23:59:31.194568704',
 '2017-05-31 23:59:41.519831040', '2017-06-30 23:59:02.743359488',
 '2017-07-31 23:59:13.068621824', '2017-08-31 23:59:23.393884160',
 '2017-09-30 23:58:44.617412608', '2017-10-31 23:58:54.942674944',
 '2017-12-01 00:00:33.605156864', '2018-01-01 00:00:43.930419200',
 '2018-01-31 23:58:36.816728064', '2018-02-28 23:58:37.275742208',
 '2018-03-31 23:58:47.601004544', '2018-05-01 00:00:26.263486464',
 '2018-06-01 00:00:36.588748800', '2018-06-30 23:59:57.812277248',
 '2018-08-01 00:00:08.137539584', '2018-09-01 00:00:18.462801920',
 '2018-09-30 23:59:39.686330368', '2018-10-31 23:59:50.011592704',
 '2018-11-30 23:59:11.235121152', '2018-12-31 23:59:21.560383488',
 '2019-01-31 23:59:31.885645824', '2019-02-28 23:59:32.344659968',
 '2019-03-31 23:59:42.669922304', '2019-04-30 23:59:03.893450752',
 '2019-05-31 23:59:14.218713088', '2019-06-30 

In [112]:
print(df.shape)
df.head()

(1070325, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,fe,no3,o2,po4,si
depth,latitude,longitude,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.50576,60.0,-60.0,2017-01-31 23:59:59.186763776,0.000665,11.55445,299.009827,0.79528,6.085373
0.50576,60.0,-60.0,2017-02-28 23:59:59.645777920,0.000694,11.557551,308.744781,0.79704,6.294991
0.50576,60.0,-60.0,2017-04-01 00:00:09.971040256,0.0006,10.800571,328.6521,0.751645,6.134493
0.50576,60.0,-60.0,2017-04-30 23:59:31.194568704,0.000406,7.980321,353.349823,0.586321,4.23151
0.50576,60.0,-60.0,2017-05-31 23:59:41.519831040,0.000167,4.461554,342.173676,0.376985,1.549171


In [113]:
# drop na rows
df = df.dropna()

print(df.shape)
df.head()

(743228, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,fe,no3,o2,po4,si
depth,latitude,longitude,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.50576,60.0,-60.0,2017-01-31 23:59:59.186763776,0.000665,11.55445,299.009827,0.79528,6.085373
0.50576,60.0,-60.0,2017-02-28 23:59:59.645777920,0.000694,11.557551,308.744781,0.79704,6.294991
0.50576,60.0,-60.0,2017-04-01 00:00:09.971040256,0.0006,10.800571,328.6521,0.751645,6.134493
0.50576,60.0,-60.0,2017-04-30 23:59:31.194568704,0.000406,7.980321,353.349823,0.586321,4.23151
0.50576,60.0,-60.0,2017-05-31 23:59:41.519831040,0.000167,4.461554,342.173676,0.376985,1.549171


In [114]:
# remove depth as index
df = df.reset_index(level=["depth"])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,depth,fe,no3,o2,po4,si
latitude,longitude,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
60.0,-60.0,2017-01-31 23:59:59.186763776,0.50576,0.000665,11.55445,299.009827,0.79528,6.085373
60.0,-60.0,2017-02-28 23:59:59.645777920,0.50576,0.000694,11.557551,308.744781,0.79704,6.294991
60.0,-60.0,2017-04-01 00:00:09.971040256,0.50576,0.0006,10.800571,328.6521,0.751645,6.134493
60.0,-60.0,2017-04-30 23:59:31.194568704,0.50576,0.000406,7.980321,353.349823,0.586321,4.23151
60.0,-60.0,2017-05-31 23:59:41.519831040,0.50576,0.000167,4.461554,342.173676,0.376985,1.549171


In [115]:
# average across the depth for each (time, latitude, longitude)
df_cg = df.reset_index().groupby(["time","latitude","longitude"]).mean()
df_cg = df_cg.drop(columns=["depth"])

print(df_cg.shape)
df_cg.head()

In [117]:
# remove time and just have month and year
df_cg = df_cg.reset_index()

# from https://stackoverflow.com/questions/53509168/extract-year-month-and-day-from-datetime64ns-utc-python
datetimes = pd.to_datetime(df_cg['time'])
df_cg['day'] = datetimes.dt.day
df_cg['month'] = datetimes.dt.month
df_cg['year'] = datetimes.dt.year
df_cg.head()

Unnamed: 0,time,latitude,longitude,fe,no3,o2,po4,si,day,month,year
0,2017-01-31 23:59:59.186763776,60.0,-60.0,0.00071,12.233848,287.338837,0.839621,6.822407,31,1,2017
1,2017-01-31 23:59:59.186763776,60.0,-59.75,0.00069,12.462684,285.335907,0.851425,6.925385,31,1,2017
2,2017-01-31 23:59:59.186763776,60.0,-59.5,0.000676,12.673633,285.718079,0.864672,7.124105,31,1,2017
3,2017-01-31 23:59:59.186763776,60.0,-59.25,0.000666,12.84024,286.595062,0.874653,7.257099,31,1,2017
4,2017-01-31 23:59:59.186763776,60.0,-59.0,0.000662,12.970829,287.187683,0.882612,7.399156,31,1,2017


In [118]:
# reomve the time column
df_cg = df_cg.drop(columns=["time"])
df_cg = df_cg.set_index(["year","month","day","latitude","longitude"])

df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,fe,no3,o2,po4,si
year,month,day,latitude,longitude,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017,1,31,60.0,-60.0,0.00071,12.233848,287.338837,0.839621,6.822407
2017,1,31,60.0,-59.75,0.00069,12.462684,285.335907,0.851425,6.925385
2017,1,31,60.0,-59.5,0.000676,12.673633,285.718079,0.864672,7.124105
2017,1,31,60.0,-59.25,0.000666,12.84024,286.595062,0.874653,7.257099
2017,1,31,60.0,-59.0,0.000662,12.970829,287.187683,0.882612,7.399156


Checking number of months in each year and number of days in each month

In [None]:
df_time = df_cg.reset_index()
df_time = df_time[["year","month","day"]]
df_time = df_time.drop_duplicates()
df_time.head()

Unnamed: 0,year,month,day
0,1997,9,1
110,1997,10,1
206,1997,11,1
214,1998,1,31
290,1998,2,28


In [None]:
# unique day values
df_time.day.unique()

array([ 1, 31, 28, 30, 29])

In [None]:
df_days_in_month = df_time.groupby(["year","month"])["day"].count()
df_days_in_month = df_days_in_month.reset_index()
df_days_in_month.head()

Unnamed: 0,year,month,day
0,1997,9,1
1,1997,10,1
2,1997,11,1
3,1998,1,1
4,1998,2,1


In [None]:
df_days_in_month.loc[df_days_in_month["day"]>1]

Unnamed: 0,year,month,day
41,2001,9,2
60,2003,7,2
68,2004,5,2
87,2006,3,2
107,2008,2,2
195,2016,3,2


In [None]:
df_time.loc[(df_time["year"]==2001) & (df_time["month"]==9)]

Unnamed: 0,year,month,day
5948,2001,9,1
6143,2001,9,30


There are some months where data is given for 1st and last day of the month, even though the data is supposed to be monthly. 

Averaging out these cases, to get only one value per month

In [None]:
# the observation in this dataset are daily
# to be consistent across datasets, averaging out to make it monthly
df_cg = df_cg.groupby(["year","month","latitude","longitude"]).mean()
df_cg = df_cg.drop(columns = ["day"])
df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CHL
year,month,latitude,longitude,Unnamed: 4_level_1
1997,9,60.0,-60.0,0.398709
1997,9,60.0,-59.75,0.360659
1997,9,60.0,-59.5,0.344952
1997,9,60.0,-59.25,0.328261
1997,9,60.0,-59.0,0.321946


In [119]:
df_cg.to_csv(filename+".csv")