In [25]:
import pandas as pd
import numpy as np
import copernicusmarine as cm

In [2]:
cm.login(username="", password="")

File /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials already exists, overwrite it ? [y/N]:

INFO - 2024-08-06T17:27:23Z - Credentials file stored in /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials.


True

Dataset 3: Multi Observation Global Ocean Sea Surface Salinity and Sea Surface Density (https://data.marine.copernicus.eu/product/MULTIOBS_GLO_PHY_S_SURFACE_MYNRT_015_013/description)
- dataset_id: "cmems_obs-mob_glo_phy-sss_my_multi_P1M"
- variables: ["sos","dos"]
- Parameter definitions: 
    - sos: Sea surface salinity
    - dos [kg/m3]: sea surface density
- Spatial resolution: 0.125deg x 0.125deg

In [3]:
# filename for saving the data as csv
filename = 'sssd_1997_2022'

In [4]:
# Set parameters
data_request = {
    "dataset_id" : "cmems_obs-mob_glo_phy-sss_my_multi_P1M",
    "variables" : ["sos","dos"],
    "longitude" : [-60, -10], 
    "latitude" : [60, 40],
    "time" : ["1997-09-01T00:00:00", "2023-01-03T00:00:00"],
    "depth": [None, None] # for bathymetry set it to [0.49402499198913574, 5727.9169921875]
}

In [13]:
df = cm.read_dataframe(
    dataset_id=data_request["dataset_id"],
    variables=data_request["variables"],
    minimum_longitude=data_request["longitude"][0],
    maximum_longitude=data_request["longitude"][1],
    minimum_latitude=data_request["latitude"][0],
    maximum_latitude=data_request["latitude"][1],
    minimum_depth=data_request["depth"][0],
    maximum_depth=data_request["depth"][1],
    start_datetime=data_request["time"][0],
    end_datetime=data_request["time"][1]
)

INFO - 2024-08-06T17:29:09Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-08-06T17:29:09Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-08-06T17:29:11Z - Service was not specified, the default one was selected: "arco-time-series"


In [14]:
print(df.shape)
df.head()

(116800, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sos,dos
time,depth,latitude,longitude,Unnamed: 4_level_1,Unnamed: 5_level_1
1997-09-01,0.0,60.0625,-59.9375,33.793015,1026.517822
1997-09-01,0.0,60.0625,-59.8125,33.823151,1026.531494
1997-09-01,0.0,60.0625,-59.6875,33.834396,1026.536743
1997-09-01,0.0,60.0625,-59.5625,33.844009,1026.541992
1997-09-01,0.0,60.0625,-59.4375,33.855732,1026.547974


In [18]:
# remove the index columns corresponding to time, latitude and longitude
df = df.reset_index()
# drop na rows
df = df.dropna()
# drop the depth column
df = df.drop(columns=["depth"])

df.head()

Unnamed: 0,time,latitude,longitude,sos,dos
0,1997-09-01,60.0625,-59.9375,33.793015,1026.517822
1,1997-09-01,60.0625,-59.8125,33.823151,1026.531494
2,1997-09-01,60.0625,-59.6875,33.834396,1026.536743
3,1997-09-01,60.0625,-59.5625,33.844009,1026.541992
4,1997-09-01,60.0625,-59.4375,33.855732,1026.547974


In [19]:
# function to coarse grain the data and make resolution same as Pisces data
def coarse_grain(df, features):
  """
    Parameters
    ----------
    df: pandas dataframe containing the data accessed from copernicus mariner
    features: name of the features in the dataframe

    Output
    ------
    a pandas dataframe with feature values for 0.25 deg x 0.25 deg resolution 

    """
  
  df["0_1"] = df["latitude"].to_numpy() - np.floor(df["latitude"])
  conditions = [df["0_1"] < 0.25,
                (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
                (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
                (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  outputs = [0, 0.25, 0.5, 0.75]
  df['latitude'] = np.floor(df["latitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  df["0_1"] = df["longitude"].to_numpy() - np.floor(df["longitude"])
  # if not redefined then conditions is based on latitude
  conditions = [df["0_1"] < 0.25,
              (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
              (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
              (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  df['longitude'] = np.floor(df["longitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  return df.groupby(["time","latitude","longitude"])[features].mean() 

In [20]:
# Need coarse grain the values because all these datasets have finer resolution than Pisces data
# Pisces data has resolution of 0.25 deg x 0.25 deg
df_cg = coarse_grain(df, df.columns[3:].tolist())
print(df_cg.shape)
df_cg.head()

(56259, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sos,dos
time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
1997-09-01,60.0,-60.0,33.808083,1026.524658
1997-09-01,60.0,-59.75,33.839203,1026.539307
1997-09-01,60.0,-59.5,33.868324,1026.554565
1997-09-01,60.0,-59.25,33.938263,1026.588135
1997-09-01,60.0,-59.0,34.003319,1026.617676


In [21]:
df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sos,dos
time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
1997-09-01,60.0,-60.0,33.808083,1026.524658
1997-09-01,60.0,-59.75,33.839203,1026.539307
1997-09-01,60.0,-59.5,33.868324,1026.554565
1997-09-01,60.0,-59.25,33.938263,1026.588135
1997-09-01,60.0,-59.0,34.003319,1026.617676


In [22]:
# remove time and just have month and year
df_cg = df_cg.reset_index()

# from https://stackoverflow.com/questions/53509168/extract-year-month-and-day-from-datetime64ns-utc-python
datetimes = pd.to_datetime(df_cg['time'])
df_cg['day'] = datetimes.dt.day
df_cg['month'] = datetimes.dt.month
df_cg['year'] = datetimes.dt.year
df_cg.head()

Unnamed: 0,time,latitude,longitude,sos,dos,day,month,year
0,1997-09-01,60.0,-60.0,33.808083,1026.524658,1,9,1997
1,1997-09-01,60.0,-59.75,33.839203,1026.539307,1,9,1997
2,1997-09-01,60.0,-59.5,33.868324,1026.554565,1,9,1997
3,1997-09-01,60.0,-59.25,33.938263,1026.588135,1,9,1997
4,1997-09-01,60.0,-59.0,34.003319,1026.617676,1,9,1997


In [23]:
# reomve the time column
df_cg = df_cg.drop(columns=["time"])
df_cg = df_cg.set_index(["year","month","day","latitude","longitude"])

df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,sos,dos
year,month,day,latitude,longitude,Unnamed: 5_level_1,Unnamed: 6_level_1
1997,9,1,60.0,-60.0,33.808083,1026.524658
1997,9,1,60.0,-59.75,33.839203,1026.539307
1997,9,1,60.0,-59.5,33.868324,1026.554565
1997,9,1,60.0,-59.25,33.938263,1026.588135
1997,9,1,60.0,-59.0,34.003319,1026.617676


In [24]:
df_cg.to_csv(filename+".csv")

Checking number of months in each year and number of days in each month

In [26]:
df_cg = pd.read_csv("1997_2022.csv")
df_cg.head()

Unnamed: 0,year,month,day,latitude,longitude,sos,dos
0,1997,9,1,60.0,-60.0,33.808083,1026.5247
1,1997,9,1,60.0,-59.75,33.839203,1026.5393
2,1997,9,1,60.0,-59.5,33.868324,1026.5546
3,1997,9,1,60.0,-59.25,33.938263,1026.5881
4,1997,9,1,60.0,-59.0,34.00332,1026.6177


In [27]:
df_time = df_cg.reset_index()
df_time = df_time[["year","month","day"]]
df_time = df_time.drop_duplicates()
df_time.head()

Unnamed: 0,year,month,day
0,1997,9,1
194,1997,10,1
388,1997,11,1
582,1997,12,1
776,1998,1,1


In [28]:
df_time.day.unique()

array([1])

only day=1 exists in this, so can drop the day column

In [29]:
df_time = df_time.drop(columns=["day"])
df_time.year.value_counts()

year
2009    12
2010    12
2020    12
2019    12
2018    12
2017    12
2016    12
2015    12
2014    12
2013    12
2012    12
2011    12
2021    12
1998    12
2008    12
2007    12
2006    12
2005    12
2004    12
2003    12
2002    12
2001    12
2000    12
1999    12
1997     4
Name: count, dtype: int64

In [30]:
# drop day as column and resave
df_cg = df_cg.drop(columns=["day"])
df_cg.to_csv("1997_2021.csv")