In [1]:
import pandas as pd
import numpy as np
import copernicusmarine as cm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cm.login(username="", password="")

File /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials already exists, overwrite it ? [y/N]:

INFO - 2024-08-06T17:34:23Z - Credentials file stored in /home/kshitiz/.copernicusmarine/.copernicusmarine-credentials.


True

**Since this is daily data, the SST file sizes can be very big. Depending on the size of the region of interest, might be better to get in parts and then merge, like from 1997-09 to 1999-12 and then from 2000-01 to 2004-12 and so on** 

Dataset 2: Global Ocean OSTIA Sea Surface Temperature (https://data.marine.copernicus.eu/product/SST_GLO_SST_L4_REP_OBSERVATIONS_010_011/description)
- dataset_id: "METOFFICE-GLO-SST-L4-REP-OBS-SST"
- variables: ["analysed_sst"]
- Parameter definitions:
    - analysed_sst [K]: Sea surface temperature
- Spatial resolution: 0.05deg x 0.05deg

In [3]:
# filename for saving the data as csv
filename = 'sst_1997_2021'

In [4]:
# Set parameters
data_request = {
    "dataset_id" : "METOFFICE-GLO-SST-L4-REP-OBS-SST",
    "variables" : ["analysed_sst"],
    "longitude" : [-60, -10], 
    "latitude" : [60, 40],
    "time" : ["1997-09-01T00:00:00", "2022-01-03T00:00:00"],
    "depth": [None, None] # for bathymetry set it to [0.49402499198913574, 5727.9169921875]
}

In [5]:
df = cm.read_dataframe(
    dataset_id=data_request["dataset_id"],
    variables=data_request["variables"],
    minimum_longitude=data_request["longitude"][0],
    maximum_longitude=data_request["longitude"][1],
    minimum_latitude=data_request["latitude"][0],
    maximum_latitude=data_request["latitude"][1],
    minimum_depth=data_request["depth"][0],
    maximum_depth=data_request["depth"][1],
    start_datetime=data_request["time"][0],
    end_datetime=data_request["time"][1]
)

INFO - 2024-08-06T17:34:34Z - Dataset version was not specified, the latest one was selected: "202003"
INFO - 2024-08-06T17:34:34Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-08-06T17:34:54Z - Service was not specified, the default one was selected: "arco-time-series"


In [6]:
print(df.shape)
df.head()

(9039000, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,analysed_sst
time,latitude,longitude,Unnamed: 3_level_1
1997-09-01,60.025002,-59.974998,280.369994
1997-09-01,60.025002,-59.924999,280.429994
1997-09-01,60.025002,-59.875,280.489994
1997-09-01,60.025002,-59.825001,280.529994
1997-09-01,60.025002,-59.775002,280.559994


In [7]:
# remove the index columns corresponding to time, latitude and longitude
df = df.reset_index()
# drop na rows
df = df.dropna()

df.head()

Unnamed: 0,time,latitude,longitude,analysed_sst
0,1997-09-01,60.025002,-59.974998,280.369994
1,1997-09-01,60.025002,-59.924999,280.429994
2,1997-09-01,60.025002,-59.875,280.489994
3,1997-09-01,60.025002,-59.825001,280.529994
4,1997-09-01,60.025002,-59.775002,280.559994


In [8]:
# function to coarse grain the data and make resolution same as Pisces data
def coarse_grain(df, features):
  """
    Parameters
    ----------
    df: pandas dataframe containing the data accessed from copernicus mariner
    features: name of the features in the dataframe

    Output
    ------
    a pandas dataframe with feature values for 0.25 deg x 0.25 deg resolution 

    """
  
  df["0_1"] = df["latitude"].to_numpy() - np.floor(df["latitude"])
  conditions = [df["0_1"] < 0.25,
                (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
                (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
                (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  outputs = [0, 0.25, 0.5, 0.75]
  df['latitude'] = np.floor(df["latitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  df["0_1"] = df["longitude"].to_numpy() - np.floor(df["longitude"])
  # if not redefined then conditions is based on latitude
  conditions = [df["0_1"] < 0.25,
              (df["0_1"] >= 0.25)  & (df["0_1"] < 0.5),
              (df["0_1"] >= 0.5)  & (df["0_1"] < 0.75),
              (df["0_1"] >= 0.75)  & (df["0_1"] < 1)]
  df['longitude'] = np.floor(df["longitude"]) + np.select(conditions, outputs)
  df = df.drop(columns=["0_1"])

  return df.groupby(["time","latitude","longitude"])[features].mean() 

In [9]:
# Need coarse grain the values because all these datasets have finer resolution than Pisces data
# Pisces data has resolution of 0.25 deg x 0.25 deg
df_cg = coarse_grain(df, df.columns[3:].tolist())
print(df_cg.shape)
df_cg.head()

(1807800, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,analysed_sst
time,latitude,longitude,Unnamed: 3_level_1
1997-09-01,60.0,-60.0,280.475994
1997-09-01,60.0,-59.75,280.599994
1997-09-01,60.0,-59.5,280.899994
1997-09-01,60.0,-59.25,281.243994
1997-09-01,60.0,-59.0,281.315994


In [10]:
# remove time and just have month and year
df_cg = df_cg.reset_index()

# from https://stackoverflow.com/questions/53509168/extract-year-month-and-day-from-datetime64ns-utc-python
datetimes = pd.to_datetime(df_cg['time'])
df_cg['day'] = datetimes.dt.day
df_cg['month'] = datetimes.dt.month
df_cg['year'] = datetimes.dt.year
df_cg.head()

Unnamed: 0,time,latitude,longitude,analysed_sst,day,month,year
0,1997-09-01,60.0,-60.0,280.475994,1,9,1997
1,1997-09-01,60.0,-59.75,280.599994,1,9,1997
2,1997-09-01,60.0,-59.5,280.899994,1,9,1997
3,1997-09-01,60.0,-59.25,281.243994,1,9,1997
4,1997-09-01,60.0,-59.0,281.315994,1,9,1997


In [11]:
# remove the time column
df_cg = df_cg.drop(columns=["time"])
df_cg = df_cg.set_index(["year","month","day","latitude","longitude"])

df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,analysed_sst
year,month,day,latitude,longitude,Unnamed: 5_level_1
1997,9,1,60.0,-60.0,280.475994
1997,9,1,60.0,-59.75,280.599994
1997,9,1,60.0,-59.5,280.899994
1997,9,1,60.0,-59.25,281.243994
1997,9,1,60.0,-59.0,281.315994


In [20]:
# the observation in this dataset are daily
# to be consistent across datasets, averaging out to make it monthly
df_cg = df_cg.reset_index().groupby(["year","month","latitude","longitude"]).mean()
df_cg = df_cg.drop(columns = ["day"])
df_cg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,analysed_sst
year,month,latitude,longitude,Unnamed: 4_level_1
1997,9,60.0,-60.0,279.375594
1997,9,60.0,-59.75,279.483794
1997,9,60.0,-59.5,279.58706
1997,9,60.0,-59.25,279.888394
1997,9,60.0,-59.0,280.220194


In [23]:
df_cg.to_csv(filename+".csv")

Checking number of months in each year and number of days in each month

In [21]:
df_time = df_cg.reset_index()
df_time = df_time[["year","month"]]
df_time = df_time.drop_duplicates()
df_time.head()

Unnamed: 0,year,month
0,1997,9
200,1997,10
400,1997,11
600,1997,12
800,1998,1


In [22]:
df_time.year.value_counts()

year
2010    12
2021    12
2019    12
2018    12
2017    12
2016    12
2015    12
2014    12
2013    12
2012    12
2011    12
1998    12
2009    12
2008    12
2007    12
2006    12
2005    12
2004    12
2003    12
2002    12
2001    12
2000    12
1999    12
2020    12
2022     5
1997     4
Name: count, dtype: int64