In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import json
import os
from glob import glob

import plotly.express as px
# Prefer a token supplied through the MAPBOX_ACCESS_TOKEN environment variable
# so the credential does not have to live in the notebook. The embedded token is
# kept only as a fallback so existing runs keep working.
# TODO(security): rotate the embedded token and drop this fallback.
px.set_mapbox_access_token(
    os.environ.get(
        'MAPBOX_ACCESS_TOKEN',
        'pk.eyJ1IjoiamgxNzM2IiwiYSI6ImNpaG8wZWNnYjBwcGh0dGx6ZG1mMGl0czAifQ.mhmvIGx34x2fw0s3p9pnaw',
    )
)

# https://polar.ncep.noaa.gov/waves/viewer.shtml?-multi_1-US_eastcoast-

## Potential data sources
- wave info: https://polar.ncep.noaa.gov/waves/viewer.shtml?-multi_1-US_eastcoast-
- https://nowcoast.noaa.gov/
- http://paocweb.mit.edu/research-group/mitgcm
- https://earth.nullschool.net/#current/ocean/surface/currents/orthographic=-67.07,36.11,870/loc=-60.255,46.201

- The min/max longitude range is pretty large in the test dataset. We need to ensure we normalize/convert to account for this.
- what are the expected outputs for the competition?

- TODO: check for outliers (drifter on land? drifter traveling too fast? maybe throw out severe weather?)
- Can we inject knowledge?
  - oceanographic basins, Gulf Stream, land interaction
  - nearby drifters may behave the same?


In [18]:
def load_data(search_path='data/challenge_*day*.json'):
    """Load drifter track and wave records from the daily challenge JSON files.

    Every file matching ``search_path`` is read in sorted path order. Each file
    must contain an ``'all_data'`` list whose entries hold a ``'data'`` dict
    with ``'spotterId'``, ``'track'`` and ``'waves'`` keys. The ``'track'`` and
    ``'waves'`` values are turned into DataFrames and tagged with the spotter
    id, the zero-based position of the file in the sorted list (``day``) and
    the 8-character stamp that follows ``'sofar_'`` in the file path (``date``).

    Parameters
    ----------
    search_path : str
        Glob pattern selecting the JSON files to load. Every matching path must
        contain the substring ``'sofar'``.

    Returns
    -------
    (track_df, wave_df) : tuple of pandas.DataFrame
        Both frames carry ``spotterId``, ``day``, ``date`` and ``ts`` (a copy
        of ``timestamp``). ``track_df`` additionally has ``sample_num`` (running
        row count within each spotter, across all files) and
        ``scaled_sample_num`` (``sample_num`` divided by its global maximum).
        The row index of each per-spot frame is preserved, so index labels are
        not unique in the result.

    Raises
    ------
    FileNotFoundError
        If no file matches ``search_path``.
    ValueError
        If the matched files contain no spots at all, or a path lacks
        ``'sofar'``.
    """
    # load sorted dates
    dates = sorted(glob(search_path))
    if not dates:
        raise FileNotFoundError('no files match search path: %s' % search_path)

    # Collect the per-spot frames and concatenate once at the end; growing a
    # DataFrame inside the loop is quadratic and DataFrame.append no longer
    # exists in pandas >= 2.0.
    track_frames = []
    wave_frames = []
    for cnt, date in enumerate(dates):
        print('loading date: %s'% date)
        with open(date) as fh:  # close the handle deterministically
            day_data = json.load(fh)['all_data']
        # get date from filename (same for every spot in the file)
        st = date.index('sofar') + len('sofar_')
        date_stamp = date[st:st + 8]
        for entry in day_data:
            spot_day_data = entry['data']
            for key, frames in (('track', track_frames), ('waves', wave_frames)):
                frame = pd.DataFrame(spot_day_data[key])
                frame['spotterId'] = spot_day_data['spotterId']
                frame['day'] = cnt
                frame['date'] = date_stamp
                frames.append(frame)

    if not track_frames:
        raise ValueError('no spot data found in files matching: %s' % search_path)

    track_df = pd.concat(track_frames)
    wave_df = pd.concat(wave_frames)
    track_df['ts'] = track_df['timestamp']
    wave_df['ts'] = wave_df['timestamp']
    # Running sample counter per spotter, in row order. to_numpy() makes the
    # assignment positional because the concatenated index is not unique.
    track_df['sample_num'] = (
        track_df.groupby('spotterId', sort=False).cumcount().to_numpy()
    )
    # NOTE: if every spotter has a single sample the maximum is 0 and this
    # column becomes NaN (0 / 0), exactly as before.
    track_df['scaled_sample_num'] = track_df['sample_num'] / track_df['sample_num'].max()
    return track_df, wave_df

In [19]:
track_df, wave_df = load_data()

loading date: data/challenge_1-day_sofar_20211102_day1JSON.json
loading date: data/challenge_1-day_sofar_20211103_day2JSON.json


In [4]:
def plot_spot_tracks(track_df, savedir='spot_plots'):
    """Save one longitude/latitude scatter plot per spotter as a PNG file.

    For every distinct value in ``track_df['spotterId']`` the matching rows are
    drawn with seaborn (``hue='spotterId'``, ``size='date'``) and written to
    ``<savedir>/<spotterId>.png``.

    Parameters
    ----------
    track_df : pandas.DataFrame
        Must provide the columns ``spotterId``, ``longitude``, ``latitude`` and
        ``date``.
    savedir : str
        Output directory. It is created (including parents) if missing, even
        when ``track_df`` holds no spotters.
    """
    # Create the output directory once, before the loop; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(savedir, exist_ok=True)
    for spot in track_df['spotterId'].unique():
        spot_track_df = track_df[track_df['spotterId'] == spot]
        fig, ax = plt.subplots()
        sns.scatterplot(x="longitude", y="latitude",
                        hue="spotterId", size='date',
                        data=spot_track_df, ax=ax)
        fig.savefig(os.path.join(savedir, '%s.png'%spot))
        # Close this specific figure so figures do not pile up in memory.
        plt.close(fig)

In [24]:
# Interactive map of all drifter tracks: one colour per spotter, with marker
# size driven by 'scaled_sample_num' (capped at size_max).
fig = px.scatter_mapbox(
    track_df,
    lat="latitude",
    lon="longitude",
    color="spotterId",
    size='scaled_sample_num',
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=7,
    zoom=2,
)
# Render inline in the notebook.
fig.show()
# Also export a standalone HTML copy of the map.
fig.write_html("drifter_paths.html")