In [1]:
import pandas as pd
import numpy as np
import copernicusmarine as cm

  from .autonotebook import tqdm as notebook_tqdm


Checking number of months in each year and number of days in each month

In [3]:
cm.login(username="", password="")

File /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials already exists, overwrite it ? [y/N]:

INFO - 2024-08-06T17:25:22Z - Credentials file stored in /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials.


True

Dataset 4: Global Ocean Surface Carbon (https://data.marine.copernicus.eu/product/MULTIOBS_GLO_BIO_CARBON_SURFACE_REP_015_008/description)
- dataset_id: "dataset-carbon-rep-monthly"
- variables: ["fgco2","omega_ar","omega_ca","ph","spco2","talk","tco2"]
- Parameter definitions:
    - fgco2 [molC/m2/yr]: Surface downward mass flux of carbon dioxide expressed as carbon
    - omega_ar: Aragonite saturation state in sea water
    - omega_ca: Calcite saturation state in sea water
    - ph: Sea water ph reported on total scale
    - spco2 [micro atm]: Surface partial pressure of carbon dioxide in sea water
    - talk [micro/mol kg]: Total alkalinity in sea water
    - tco2 [micro/mol kg]: dissolved inorganic carbon in sea water
- Spatial resolution: 0.25deg x 0.25deg
- NOTE: though the resolution is same as that for Pisces, the coordinates at which the values are provided are different from pisces. so in this case coarse_grain() would just make adjustments to align the coordinates with pisces

In [4]:
# filename for saving the data as csv
filename = 'carbon_1997_2021'

In [5]:
# Set parameters
data_request = {
    "dataset_id" : "dataset-carbon-rep-monthly",
    "variables" : ["fgco2","omega_ar","omega_ca","ph","spco2","talk","tco2"],
    "longitude" : [-60, -10], 
    "latitude" : [60, 40],
    "time" : ["1997-09-01T00:00:00", "2022-01-03T00:00:00"],
    "depth": [None, None] 
}

In [6]:
df = cm.read_dataframe(
    dataset_id=data_request["dataset_id"],
    variables=data_request["variables"],
    minimum_longitude=data_request["longitude"][0],
    maximum_longitude=data_request["longitude"][1],
    minimum_latitude=data_request["latitude"][0],
    maximum_latitude=data_request["latitude"][1],
    minimum_depth=data_request["depth"][0],
    maximum_depth=data_request["depth"][1],
    start_datetime=data_request["time"][0],
    end_datetime=data_request["time"][1]
)

INFO - 2024-08-06T17:25:33Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-08-06T17:25:33Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-08-06T17:25:47Z - Service was not specified, the default one was selected: "arco-time-series"
  return Timestamp(date).to_pydatetime()


In [7]:
print(df.shape)
df.head()

(60600, 7)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fgco2,omega_ar,omega_ca,ph,spco2,talk,tco2
time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1997-09-30 23:59:12.119922688,60.125,-59.875,3.892427,1.923069,3.057008,8.148241,298.651947,2241.756104,2064.801025
1997-09-30 23:59:12.119922688,60.125,-59.625,3.776092,1.917293,3.047476,8.145193,301.191803,2242.538574,2066.303955
1997-09-30 23:59:12.119922688,60.125,-59.375,3.588515,1.931662,3.068809,8.14204,304.237915,2244.609619,2067.055176
1997-09-30 23:59:12.119922688,60.125,-59.125,3.499163,1.944287,3.087775,8.140802,305.719208,2247.923584,2069.145752
1997-09-30 23:59:12.119922688,60.125,-58.875,3.433449,1.949239,3.095119,8.139954,306.594818,2249.101562,2069.851562


In [8]:
# remove the index columns corresponding to time, latitude and longitude
df = df.reset_index()
# drop na rows
df = df.dropna()

df.head()

Unnamed: 0,time,latitude,longitude,fgco2,omega_ar,omega_ca,ph,spco2,talk,tco2
0,1997-09-30 23:59:12.119922688,60.125,-59.875,3.892427,1.923069,3.057008,8.148241,298.651947,2241.756104,2064.801025
1,1997-09-30 23:59:12.119922688,60.125,-59.625,3.776092,1.917293,3.047476,8.145193,301.191803,2242.538574,2066.303955
2,1997-09-30 23:59:12.119922688,60.125,-59.375,3.588515,1.931662,3.068809,8.14204,304.237915,2244.609619,2067.055176
3,1997-09-30 23:59:12.119922688,60.125,-59.125,3.499163,1.944287,3.087775,8.140802,305.719208,2247.923584,2069.145752
4,1997-09-30 23:59:12.119922688,60.125,-58.875,3.433449,1.949239,3.095119,8.139954,306.594818,2249.101562,2069.851562


In [9]:
# function to coarse grain the data and make resolution same as Pisces data
def coarse_grain(df, features):
  """
    Parameters
    ----------
    df: pandas dataframe containing the data accessed from copernicus mariner
    features: name of the features in the dataframe

    Output
    ------
    a pandas dataframe with feature values for 0.25 deg x 0.25 deg resolution 

    """
  
  df["0_1"] = df["latitude"].to_numpy() - np.floor(df["latitude"])
  conditions = [df["0_1"] < 0.25,
                (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
                (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
                (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  outputs = [0, 0.25, 0.5, 0.75]
  df['latitude'] = np.floor(df["latitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  df["0_1"] = df["longitude"].to_numpy() - np.floor(df["longitude"])
  # if not redefined then conditions is based on latitude
  conditions = [df["0_1"] < 0.25,
              (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
              (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
              (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  df['longitude'] = np.floor(df["longitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  return df.groupby(["time","latitude","longitude"])[features].mean() 

In [10]:
# Need coarse grain the values because all these datasets have finer resolution than Pisces data
# Pisces data has resolution of 0.25 deg x 0.25 deg
df_cg = coarse_grain(df, df.columns[3:].tolist())
print(df_cg.shape)
df_cg.head()

(57267, 7)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fgco2,omega_ar,omega_ca,ph,spco2,talk,tco2
time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1997-09-30 23:59:12.119922688,60.0,-60.0,3.892427,1.923069,3.057008,8.148241,298.651947,2241.756104,2064.801025
1997-09-30 23:59:12.119922688,60.0,-59.75,3.776092,1.917293,3.047476,8.145193,301.191803,2242.538574,2066.303955
1997-09-30 23:59:12.119922688,60.0,-59.5,3.588515,1.931662,3.068809,8.14204,304.237915,2244.609619,2067.055176
1997-09-30 23:59:12.119922688,60.0,-59.25,3.499163,1.944287,3.087775,8.140802,305.719208,2247.923584,2069.145752
1997-09-30 23:59:12.119922688,60.0,-59.0,3.433449,1.949239,3.095119,8.139954,306.594818,2249.101562,2069.851562


In [11]:
# remove time and just have month and year
df_cg = df_cg.reset_index()

# from https://stackoverflow.com/questions/53509168/extract-year-month-and-day-from-datetime64ns-utc-python
datetimes = pd.to_datetime(df_cg['time'])
df_cg['day'] = datetimes.dt.day
df_cg['month'] = datetimes.dt.month
df_cg['year'] = datetimes.dt.year
df_cg.head()

Unnamed: 0,time,latitude,longitude,fgco2,omega_ar,omega_ca,ph,spco2,talk,tco2,day,month,year
0,1997-09-30 23:59:12.119922688,60.0,-60.0,3.892427,1.923069,3.057008,8.148241,298.651947,2241.756104,2064.801025,30,9,1997
1,1997-09-30 23:59:12.119922688,60.0,-59.75,3.776092,1.917293,3.047476,8.145193,301.191803,2242.538574,2066.303955,30,9,1997
2,1997-09-30 23:59:12.119922688,60.0,-59.5,3.588515,1.931662,3.068809,8.14204,304.237915,2244.609619,2067.055176,30,9,1997
3,1997-09-30 23:59:12.119922688,60.0,-59.25,3.499163,1.944287,3.087775,8.140802,305.719208,2247.923584,2069.145752,30,9,1997
4,1997-09-30 23:59:12.119922688,60.0,-59.0,3.433449,1.949239,3.095119,8.139954,306.594818,2249.101562,2069.851562,30,9,1997


In [12]:
# reomve the time column
df_cg = df_cg.drop(columns=["time"])
df_cg = df_cg.set_index(["year","month","day","latitude","longitude"])

df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,fgco2,omega_ar,omega_ca,ph,spco2,talk,tco2
year,month,day,latitude,longitude,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1997,9,30,60.0,-60.0,3.892427,1.923069,3.057008,8.148241,298.651947,2241.756104,2064.801025
1997,9,30,60.0,-59.75,3.776092,1.917293,3.047476,8.145193,301.191803,2242.538574,2066.303955
1997,9,30,60.0,-59.5,3.588515,1.931662,3.068809,8.14204,304.237915,2244.609619,2067.055176
1997,9,30,60.0,-59.25,3.499163,1.944287,3.087775,8.140802,305.719208,2247.923584,2069.145752
1997,9,30,60.0,-59.0,3.433449,1.949239,3.095119,8.139954,306.594818,2249.101562,2069.851562


Checking number of months in each year and number of days in each month

In [3]:
df_time = df_cg.reset_index()
df_time = df_time[["year","month","day"]]
df_time = df_time.drop_duplicates()
df_time.head()

Unnamed: 0,year,month,day
0,1997,9,30
189,1997,10,31
378,1997,11,30
567,1997,12,31
756,1998,1,31


In [4]:
# unique day values
df_time.day.unique()

array([30, 31, 28,  1, 29])

Are there any months with more than 1 day?

In [8]:
df_days_in_month = df_time.groupby(["year","month"])["day"].count()
df_days_in_month = df_days_in_month.reset_index()
df_days_in_month.head()

Unnamed: 0,year,month,day
0,1997,9,1
1,1997,10,1
2,1997,11,1
3,1997,12,1
4,1998,1,1


In [9]:
df_days_in_month.loc[df_days_in_month["day"]>1]

Unnamed: 0,year,month,day
13,1998,11,2
17,1999,4,2
25,2000,1,2
30,2000,7,2
38,2001,4,2
39,2001,6,2
49,2002,5,2
54,2002,11,2
58,2003,4,2
67,2004,2,2


In [12]:
df_time.loc[(df_time["year"]==2018) & (df_time["month"]==1)]

Unnamed: 0,year,month,day
45927,2018,1,1
46116,2018,1,31


There are some months where data is given for 1st and last day of the month, even though the data is supposed to be monthly. 

Averaging out these cases, to get only one value per month

In [13]:
# the observation in this dataset are daily
# to be consistent across datasets, averaging out to make it monthly
df_cg = df_cg.groupby(["year","month","latitude","longitude"]).mean()
df_cg = df_cg.drop(columns = ["day"])
df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,fgco2,omega_ar,omega_ca,ph,spco2,talk,tco2
year,month,latitude,longitude,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1997,9,60.0,-60.0,3.892427,1.923069,3.057008,8.148241,298.65195,2241.756,2064.801
1997,9,60.0,-59.75,3.776092,1.917293,3.047476,8.145193,301.1918,2242.5386,2066.304
1997,9,60.0,-59.5,3.588515,1.931662,3.068809,8.14204,304.2379,2244.6096,2067.0552
1997,9,60.0,-59.25,3.499163,1.944287,3.087775,8.140802,305.7192,2247.9236,2069.1458
1997,9,60.0,-59.0,3.433449,1.949239,3.095119,8.139954,306.59482,2249.1016,2069.8516


In [14]:
df_cg.to_csv(filename+".csv")