# ***Search*** for a BGC-Argo float of your interest

- lon0: the western edge of the box (range: -180 to 180 deg)
- lon1: the eastern edge of the box
- lat0: the southern edge of the box (range: -90 to 90 deg)
- lat1: the northern edge of the box
- date0: the first date (format: yyyymmdd)
- date1: the last date
- mindays: the minimum number of days of data required (e.g., 365 days = there are at least 365 days between the first and last profiles)
- minfreq: the minimum sampling frequency, given as the longest allowed interval (in days) between consecutive profiles (e.g., 14 = every gap between consecutive profiles must be shorter than 14 days)
- maxdrift: the maximum drifting speed of the float in m/s. Useful for 1-d modelling, in which we want floats that are drifting slowly. For drifting speed, refer to Katsumata and Yoshinari (2010), https://doi.org/10.1007/s10872-010-0046-4
- medseamask: Set to `True` if you want to exclude the Mediterranean Sea from search. Otherwise set to `False`.
- full_sensors: list of BGC variables that every profile must contain (CHLA, BBP700, NITRATE, DOWNWELLING_PAR, DOXY, PH_IN_SITU_TOTAL, CDOM)

### <span style="color:red">--- User input begins ---</span>

In [None]:
# Search box. Profiles are kept when lon0 <= longitude <= lon1 and lat0 <= latitude <= lat1.
lon0 = -180  # western edge (degrees, -180 to 180)
lon1 = 180  # eastern edge
lat0 = -90  # southern edge (degrees, -90 to 90)
lat1 = 90  # northern edge
# Search period as yyyymmdd integers; both end dates are included.
date0 = 20210101
date1 = 20240131
# A float is shown only if the time between its first and last profile exceeds this many days.
mindays = 365*1
# A float is shown only if its longest gap between consecutive profiles is shorter than this many days.
minfreq = 30
# A float is shown only if its overall mean drift speed is below this value (m/s).
maxdrift = 0.05
# True: hide floats that have any profile inside the Mediterranean Sea box.
medseamask = True
# Every listed name must appear in a profile's parameter list for the profile to be kept.
full_sensors = ['CHLA','BBP700','NITRATE','DOWNWELLING_PAR','DOXY','PH_IN_SITU_TOTAL']

### <span style="color:red">--- User input ends ---</span>

In [None]:
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cft 
from datetime import datetime, timedelta
%matplotlib inline

from math import radians, sin, cos, sqrt, atan2

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in km on a sphere of radius 6371 km, measured the short
        way around (the longitude difference is wrapped across the 180°
        meridian).
    """
    R = 6371  # Radius of Earth in km

    dlat = radians(lat2 - lat1)

    # Handle longitude difference with wrapping around the 180° meridian
    dlon = radians((lon2 - lon1 + 360) % 360)  # Normalize to [0, 360)
    if dlon > radians(180):
        dlon -= radians(360)  # Shorten distance across the 180° meridian

    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make sqrt(1 - a) raise "math domain error". Clamp it to 1.
    a = min(1.0, a)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

def calculate_area(min_lat, min_lon, max_lat, max_lon):
    """Approximate the area of a lon/lat box as width x height, in km².

    Args:
        min_lat, max_lat: Southern and northern edges of the box, in degrees.
        min_lon, max_lon: Western and eastern edges of the box, in degrees.

    Returns:
        Product of the two great-circle distances described below (km²).

    Note:
        Both distances come from haversine_distance, which wraps the
        longitude difference the short way around the globe.
    """
    # North-south extent: distance between the two latitudes along the min_lon meridian
    north_south = haversine_distance(min_lat, min_lon, max_lat, min_lon)

    # East-west extent: distance between the two longitudes at the central latitude
    central_lat = (min_lat + max_lat) / 2
    east_west = haversine_distance(central_lat, min_lon, central_lat, max_lon)

    return east_west * north_south  # Area in km²

def calculate_float_speed(lon,lat,date):
    """Return the overall mean drift speed of a float in metres per second.

    The speed is the total great-circle path length divided by the total
    elapsed time, not the mean of the cycle-to-cycle speeds. Summing first
    avoids dividing by zero when several profiles share the same day
    (common right after deployment).

    Args:
        lon: Sequence of profile longitudes in degrees.
        lat: Sequence of profile latitudes in degrees.
        date: Sequence of profile datetimes, in the same order as lon/lat.

    Returns:
        Path length (m) / elapsed whole days / 86400.
    """
    earth_radius_m = 6371000

    lam = np.radians(np.array(lon))
    phi = np.radians(np.array(lat))

    # Step-to-step differences between consecutive profiles
    dlam = np.diff(lam)
    dphi = np.diff(phi)

    # Haversine formula, evaluated for every consecutive pair at once
    hav = np.sin(dphi / 2)**2 + np.cos(phi[:-1]) * np.cos(phi[1:]) * np.sin(dlam / 2)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    step_lengths = earth_radius_m * central_angle  # metres

    # Elapsed time per step, truncated to whole days
    stamps = np.array(date)
    step_days = (stamps[1:] - stamps[:-1]).astype('timedelta64[D]').astype(int)

    # metres / days -> metres / second
    return np.sum(step_lengths) / np.sum(step_days) / 86400.

def contains_points_in_MedSea(lon, lat):
    """Check whether any position falls inside the Mediterranean Sea mask box.

    The mask is the rectangle -5 <= lon <= 37, 30 <= lat <= 45 (degrees),
    edges included.

    Args:
        lon: Longitudes in degrees (sequence or array).
        lat: Latitudes in degrees, paired element-wise with lon.

    Returns:
        bool: True if any point is inside the box, False otherwise
    """
    lon_arr = np.array(lon)
    lat_arr = np.array(lat)

    inside_box = (
        (lon_arr >= -5) & (lon_arr <= 37)
        & (lat_arr >= 30) & (lat_arr <= 45)
    )
    return np.any(inside_box)

<a id="jump_bgc-argo"></a>
### Retrieve the latest Argo index from the website / 最新のアルゴデータインデックス(カタログ)を読み込む
- This index can also be viewed from your browser / カタログはブラウザからも閲覧できる: https://data-argo.ifremer.fr/argo_synthetic-profile_index.txt
- Note this process can take some time (for a minute or so) as the index is relatively large (MBs) and the server can be slow / カタログの容量が大きいこととサーバーが遅い時があるため、カタログを読み込むのに１分ほど時間を要することがある。
- `skiprows = 8`: Ignore the first 8 rows which are irrelevant / カタログの最初の８行は必要ないので無視する

In [None]:
# Download the synthetic-profile index into a DataFrame; the first 8 lines of the file are skipped.
data = pd.read_csv('https://data-argo.ifremer.fr/argo_synthetic-profile_index.txt',skiprows=8)
data  # display the table as the cell output

### Filter the profile list based on your requirements / 検索条件に応じたリストを作成する
Personal note:
- Run this cell even if you want the whole region: the filtering also removes profiles whose date is NaN.
- Some of the inputs above (i.e., mindays, minfreq, maxdrift, medseamask) will not affect the result (but they will affect the map and time series further below).

In [None]:
# Keep only the profiles inside the lon/lat box that have a date.
# A NaN date fails `date >= 0`, so undated profiles are dropped here.
# The 'date' column holds date and time of day in one number, so the
# period filter is applied further below, after the two are split.
in_box = ((data['longitude'] >= lon0) & (data['longitude'] <= lon1) &
          (data['latitude'] >= lat0) & (data['latitude'] <= lat1) &
          (data['date'] >= 0))
datasub = data[in_box]

# Keep only the profiles whose parameter list mentions every requested sensor.
# regex=False: the sensor name is matched as a plain substring, so a longer
#              parameter name that contains it also counts as a match.
# na=False:    a profile with no parameter list is a non-match rather than
#              an error in the boolean mask.
for sensor in full_sensors:
    datasub = datasub[datasub['parameters'].str.contains(sensor, regex=False, na=False)]

# Work on an explicit copy so the column assignments below do not write
# into a slice of `data` (avoids pandas' SettingWithCopyWarning).
datasub = datasub.copy()

# Split the combined stamp: characters 0-7 are the date (yyyymmdd),
# characters 8-13 are stored as the new integer column 'time'.
stamp = datasub['date'].astype('int64').astype(str)
datasub['time'] = stamp.str[8:14].astype(int)
datasub['date'] = stamp.str[0:8].astype(int)

#refine the dataset based on the selected period (both end dates included)
datasub = datasub[(datasub['date'] >= date0) & (datasub['date'] <= date1)]
datasub

## Visualize the search results as a map and time series.

In [None]:

# Shell command: create the output folder `search`; -p means no error if it already exists.
!mkdir -p search

In [None]:
# Two figures: `fig`/`ax1` is the map, `fig2`/`ax2` is the time series.
fig = plt.figure(figsize=(8,6))
fig2 = plt.figure(figsize=(8,6))
ax2 = fig2.add_subplot(1,1,1)
cmap20 = plt.colormaps['tab20']
tab20_colors = [cmap20(i) for i in range(20)]  # List of 20 RGBA tuples

ax1 = fig.add_subplot(1,1,1,projection= ccrs.PlateCarree(central_longitude=270)) #InterruptedGoodeHomolosine
ax1.add_feature(cft.LAND)
#ax1.coastlines(resolution='50m')
ax1.set_extent([lon0, lon1, lat0, lat1], crs=ccrs.PlateCarree())
gl = ax1.gridlines(draw_labels=True, dms=True, x_inline=False, y_inline=False,color = "None")
gl.top_labels=False
gl.right_labels=False

#bookkeeping: one entry per float (= run of consecutive profiles with the same id)
floatid_cur = None
floatnum = []   # float id
floatsta = []   # date of the first profile
floatend = []   # date of the last profile
#positions and dates of the float currently being accumulated
floatlon = []
floatlat = []
floatdate = []
#tick labels of the time series, one per plotted float: 'id(label)'
daimei = []
#list for saving the float information (one comma-separated line per plotted float)
lines = []

#number of floats plotted so far; also the row of the next float in the time series
abci = 0


def _float_is_selected(lons, lats, dates):
    """Return True if a float passes the frequency, drift, duration and Mediterranean criteria."""
    if len(dates) <= 1:  # ignore if there is only one profile
        return False
    valid_freq = np.max(np.diff(dates)).days < minfreq
    valid_drift = calculate_float_speed(lons, lats, dates) < maxdrift
    valid_dur = max(dates) - min(dates) > timedelta(days=mindays)
    if not (valid_freq and valid_drift and valid_dur):
        return False
    return not (medseamask and contains_points_in_MedSea(lons, lats))


def _plot_and_record_float(wmo, lons, lats, dates, index):
    """Draw one float on the map and the time series, and log its label and summary line."""
    color = tab20_colors[index % len(tab20_colors)]
    label = str(index + 1)
    # map: the track, then a black dot and the label at the last position
    ax1.scatter(lons, lats, color=color, transform=ccrs.PlateCarree(), zorder=3, s=0.1)
    ax1.scatter(lons[-1], lats[-1], color='k', marker='.', transform=ccrs.PlateCarree(), zorder=4, s=6)
    ax1.text(lons[-1], lats[-1], label, color='k', transform=ccrs.PlateCarree(), zorder=5, fontsize=5)
    # time series: all profiles of the float in a single call, on row `index`
    ax2.scatter(dates, [index] * len(dates), color=color, marker='|', linewidths=0.1)
    daimei.append(wmo + '(' + label + ')')
    # summary: id, centre lon, centre lat, first date, last date, number of profiles
    lines.append(','.join([
        str(wmo),
        str(round(0.5 * (min(lons) + max(lons)), 2)),
        str(round(0.5 * (min(lats) + max(lats)), 2)),
        dates[0].strftime("%Y%m%d"),
        dates[-1].strftime("%Y%m%d"),
        str(len(dates)),
    ]))


profile_files = datasub['file'].values
profile_lons = datasub['longitude'].values
profile_lats = datasub['latitude'].values
profile_dates = datasub['date'].values

#loop over each profile; profiles of one float are expected to be consecutive rows
for i in range(datasub.shape[0]):
    #float id: obtain from the file path split by slash (/) and grab the second index (after the first and second slashes)
    floatid = profile_files[i].split('/')[1]
    if floatid != floatid_cur:
        # a new float starts here: close the previous one first (if any)
        if floatnum:
            floatend.append(profile_dates[i-1])
            if _float_is_selected(floatlon, floatlat, floatdate):
                _plot_and_record_float(floatnum[-1], floatlon, floatlat, floatdate, abci)
                abci += 1
        floatnum.append(floatid)
        floatsta.append(profile_dates[i])
        floatlon, floatlat, floatdate = [], [], []
        floatid_cur = floatid
    # accumulate position and date of the current float
    floatlon.append(profile_lons[i])
    floatlat.append(profile_lats[i])
    floatdate.append(datetime.strptime(str(int(profile_dates[i])),'%Y%m%d'))
# end of the loop over the profiles

# close the last float, handled exactly like every other float
if floatnum:
    floatend.append(profile_dates[-1])
    if _float_is_selected(floatlon, floatlat, floatdate):
        _plot_and_record_float(floatnum[-1], floatlon, floatlat, floatdate, abci)
        abci += 1

#save the current time for creating a unique id for the outputs.
now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# saving the map (explicit figure: pyplot's "current figure" is fig2, not fig)
fig.tight_layout()
fig.savefig('search/map_search_'+now,dpi=300,bbox_inches='tight')

# saving the time series
ax2.set_yticks(range(np.size(daimei)), daimei)
ax2.set_xlim(datetime.strptime(str(date0),'%Y%m%d'),datetime.strptime(str(date1),'%Y%m%d'))
ax2.set_ylabel('WMO ID (label)')
ax2.yaxis.label.set_horizontalalignment('left')
ax2.grid(axis='y')
fig2.tight_layout()
fig2.savefig('search/ts_search_'+now,dpi=300,bbox_inches='tight')

# create a copy of this notebook for reference
# Shell command: IPython replaces {now} with the value of the Python variable `now`.
# NOTE(review): assumes this notebook is saved as search.ipynb in the working directory — confirm.
!cp search.ipynb search/search_{now}.ipynb