In [1]:
import pandas as pd
import numpy as np
import copernicusmarine as cm

  from .autonotebook import tqdm as notebook_tqdm


Checking number of months in each year and number of days in each month

In [2]:
cm.login(username="", password="")

File /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials already exists, overwrite it ? [y/N]:

INFO - 2024-08-06T17:09:16Z - Credentials file stored in /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials.


True

Dataset 1: Global Ocean Color (https://data.marine.copernicus.eu/product/OCEANCOLOUR_GLO_BGC_L4_MY_009_104/description)
- dataset_id: "cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M"
- variables: ["CHL"]
- Parameter definitions:
    - CHL [mg/m3]: Mass concentration of chlorophyll a in sea water
- Spatial resolution: 4km x 4km

In [3]:
# filename for saving the data as csv
filename = 'chl_1997_2021'

In [4]:
# Set parameters
data_request = {
    "dataset_id" : "cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M",
    "variables" : ["CHL"],
    "longitude" : [-60, -10], 
    "latitude" : [60, 40],
    "time" : ["1997-09-01T00:00:00", "2022-01-03T00:00:00"],
    "depth": [None, None] # for bathymetry set it to [0.49402499198913574, 5727.9169921875]
}

In [5]:
df = cm.read_dataframe(
    dataset_id=data_request["dataset_id"],
    variables=data_request["variables"],
    minimum_longitude=data_request["longitude"][0],
    maximum_longitude=data_request["longitude"][1],
    minimum_latitude=data_request["latitude"][0],
    maximum_latitude=data_request["latitude"][1],
    minimum_depth=data_request["depth"][0],
    maximum_depth=data_request["depth"][1],
    start_datetime=data_request["time"][0],
    end_datetime=data_request["time"][1]
)

INFO - 2024-08-06T17:09:39Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-08-06T17:09:39Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-08-06T17:10:00Z - Service was not specified, the default one was selected: "arco-time-series"
  return Timestamp(date).to_pydatetime()


In [6]:
print(df.shape)
df.head()

(366000, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CHL
time,latitude,longitude,Unnamed: 3_level_1
1997-09-01 00:00:35.028205568,60.020832,-59.979164,
1997-09-01 00:00:35.028205568,60.020832,-59.937496,
1997-09-01 00:00:35.028205568,60.020832,-59.895828,0.408068
1997-09-01 00:00:35.028205568,60.020832,-59.854164,0.401471
1997-09-01 00:00:35.028205568,60.020832,-59.812496,0.395363


In [7]:
# remove the index columns corresponding to time, latitude and longitude
df = df.reset_index()
# drop na rows
df = df.dropna()

df.head()

Unnamed: 0,time,latitude,longitude,CHL
2,1997-09-01 00:00:35.028205568,60.020832,-59.895828,0.408068
3,1997-09-01 00:00:35.028205568,60.020832,-59.854164,0.401471
4,1997-09-01 00:00:35.028205568,60.020832,-59.812496,0.395363
5,1997-09-01 00:00:35.028205568,60.020832,-59.770828,0.389933
6,1997-09-01 00:00:35.028205568,60.020832,-59.729164,0.370605


In [8]:
# function to coarse grain the data and make resolution same as Pisces data
def coarse_grain(df, features):
  """
    Parameters
    ----------
    df: pandas dataframe containing the data accessed from copernicus mariner
    features: name of the features in the dataframe

    Output
    ------
    a pandas dataframe with feature values for 0.25 deg x 0.25 deg resolution 

    """
  
  df["0_1"] = df["latitude"].to_numpy() - np.floor(df["latitude"])
  conditions = [df["0_1"] < 0.25,
                (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
                (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
                (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  outputs = [0, 0.25, 0.5, 0.75]
  df['latitude'] = np.floor(df["latitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  df["0_1"] = df["longitude"].to_numpy() - np.floor(df["longitude"])
  # if not redefined then conditions is based on latitude
  conditions = [df["0_1"] < 0.25,
              (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
              (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
              (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  df['longitude'] = np.floor(df["longitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  return df.groupby(["time","latitude","longitude"])[features].mean() 

In [9]:
# Need coarse grain the values because all these datasets have finer resolution than Pisces data
# Pisces data has resolution of 0.25 deg x 0.25 deg
df_cg = coarse_grain(df, df.columns[3:].tolist())
print(df_cg.shape)
df_cg.head()

(47470, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CHL
time,latitude,longitude,Unnamed: 3_level_1
1997-09-01 00:00:35.028205568,60.0,-60.0,0.398709
1997-09-01 00:00:35.028205568,60.0,-59.75,0.360659
1997-09-01 00:00:35.028205568,60.0,-59.5,0.344952
1997-09-01 00:00:35.028205568,60.0,-59.25,0.328261
1997-09-01 00:00:35.028205568,60.0,-59.0,0.321946


In [10]:
# remove time and just have month and year
df_cg = df_cg.reset_index()

# from https://stackoverflow.com/questions/53509168/extract-year-month-and-day-from-datetime64ns-utc-python
datetimes = pd.to_datetime(df_cg['time'])
df_cg['day'] = datetimes.dt.day
df_cg['month'] = datetimes.dt.month
df_cg['year'] = datetimes.dt.year
df_cg.head()

Unnamed: 0,time,latitude,longitude,CHL,day,month,year
0,1997-09-01 00:00:35.028205568,60.0,-60.0,0.398709,1,9,1997
1,1997-09-01 00:00:35.028205568,60.0,-59.75,0.360659,1,9,1997
2,1997-09-01 00:00:35.028205568,60.0,-59.5,0.344952,1,9,1997
3,1997-09-01 00:00:35.028205568,60.0,-59.25,0.328261,1,9,1997
4,1997-09-01 00:00:35.028205568,60.0,-59.0,0.321946,1,9,1997


In [11]:
# reomve the time column
df_cg = df_cg.drop(columns=["time"])
df_cg = df_cg.set_index(["year","month","day","latitude","longitude"])

df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,CHL
year,month,day,latitude,longitude,Unnamed: 5_level_1
1997,9,1,60.0,-60.0,0.398709
1997,9,1,60.0,-59.75,0.360659
1997,9,1,60.0,-59.5,0.344952
1997,9,1,60.0,-59.25,0.328261
1997,9,1,60.0,-59.0,0.321946


Checking number of months in each year and number of days in each month

In [12]:
df_time = df_cg.reset_index()
df_time = df_time[["year","month","day"]]
df_time = df_time.drop_duplicates()
df_time.head()

Unnamed: 0,year,month,day
0,1997,9,1
110,1997,10,1
206,1997,11,1
214,1998,1,31
290,1998,2,28


In [13]:
# unique day values
df_time.day.unique()

array([ 1, 31, 28, 30, 29])

Are there any months with more than 1 days?

In [14]:
df_days_in_month = df_time.groupby(["year","month"])["day"].count()
df_days_in_month = df_days_in_month.reset_index()
df_days_in_month.head()

Unnamed: 0,year,month,day
0,1997,9,1
1,1997,10,1
2,1997,11,1
3,1998,1,1
4,1998,2,1


In [15]:
df_days_in_month.loc[df_days_in_month["day"]>1]

Unnamed: 0,year,month,day
41,2001,9,2
60,2003,7,2
68,2004,5,2
87,2006,3,2
107,2008,2,2
195,2016,3,2


In [16]:
df_time.loc[(df_time["year"]==2001) & (df_time["month"]==9)]

Unnamed: 0,year,month,day
5948,2001,9,1
6143,2001,9,30


There are some months where data is given for 1st and last day of the month, even though the data is supposed to be monthly. 

Averaging out these cases, to get only one value per month

In [17]:
# the observation in this dataset are daily
# to be consistent across datasets, averaging out to make it monthly
df_cg = df_cg.groupby(["year","month","latitude","longitude"]).mean()
df_cg = df_cg.drop(columns = ["day"])
df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CHL
year,month,latitude,longitude,Unnamed: 4_level_1
1997,9,60.0,-60.0,0.398709
1997,9,60.0,-59.75,0.360659
1997,9,60.0,-59.5,0.344952
1997,9,60.0,-59.25,0.328261
1997,9,60.0,-59.0,0.321946


In [18]:
df_cg.to_csv(filename+".csv")