# Script to download and merge the needed data
- First, the AIS data are downloaded for the whole year 2020.
- Second, the CMEMS wave and physics data corresponding to the data points obtained from the AIS data are downloaded.
- Third, the parts of the AIS, CMEMS wave and CMEMS physics data we are interested in are merged into a single dataframe / CSV file.

- Some of the chunks use code produced by 52North (https://github.com/52North/MariGeoRoute/blob/b3018578ec7f6a128b63c6a44d4514e8fa767cae/AIS/preprocessing_AIS.ipynb).



---
### Download + Subset of AIS data 


In [None]:
!pip install wget # to be able to download via ftp 

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9675 sha256=afe0b8e9c11cd5986ba41f94115b781dd07975b0555690e38cfd3bd1fa159adc
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
# Import of all needed libraries 
import wget
import os
import pandas as pd 
from os import walk
from glob import glob
import numpy as np
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import zipfile
from datetime import datetime, timedelta
from pathlib import Path

In [None]:
def check_dir(year):
    """
    Function to list the CSV files that are already present for a year.

    The directory named after the year is walked recursively and, for every
    file ending in '.csv', the part of the file name before its first '.' is
    collected. The result is used to decide which archives can be skipped
    when a download is resumed.

    Parameters
    ----------
    year : int
        describes the year of the data; str(year) is the directory that is searched

    Returns
    -------
    list
        file names (text before the first '.') of all CSV files found
    """
    return [
        filename.split('.')[0]
        for _root, _subdirs, filenames in os.walk(str(year))
        for filename in filenames
        if filename.endswith('.csv')
    ]

In [None]:
def download_AIS(year, zones):
    """
    Function to download the AIS data from https://coast.noaa.gov/.

    The directory listing of the given year is scraped for zip archives. The
    archive names are split on '_' and the parts are classified heuristically
    (by their number of distinct values) as month, day or UTM zone. Every
    archive whose CSV is not present yet is downloaded into the working
    directory, extracted to <year>/<year>_<zone or month>/ and then deleted.

    Parameters
    ----------
    year : int
        year of the AIS data to download
    zones : list
        describes the UTM zones for which data gets downloaded
        (only used when the archive names of that year contain a zone)

    Raises
    ------
    requests.HTTPError
        if the directory listing cannot be retrieved
    ValueError
        if the listing does not contain any zip archive
    """
    Path(str(year)).mkdir(parents=True, exist_ok=True)
    # a set gives constant-time membership tests in the download loop
    resume_download = set(check_dir(str(year)))
    url = "https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{0}/".format(year)
    response = requests.get(url, timeout=60)
    # fail early on 404/5xx instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    files = []
    for a in soup.find_all('a', href=True):
        if a.text and a.text.endswith('zip'):
            # strip only the extension, so additional dots in the href do not break the unpacking
            name = a['href'].rsplit('.', 1)[0]
            l = name.split('_')
            l.append(a.text)
            files.append(l)

    if not files:
        raise ValueError('No zip archives found at %s' % url)

    df = pd.DataFrame(files)
    # the last column holds the link text, i.e. the archive file name
    df.columns = [*df.columns[:-1], 'Files']

    # classify the name parts by their number of distinct values
    for c in df.columns:
        if c == 'Files': continue
        unique_col = len(df[c].unique())
        if unique_col == 12 or unique_col == 6: # some years provide data only for 6 months
            df['Month'] = df[c]
        elif unique_col == 31:
            df['Days'] = df[c]
        elif 20 >= unique_col >= 18 or 'zone' in df[c][0].lower():
            if 'zone' in df[c][0].lower():
                df['Zone'] = pd.to_numeric([z[z.lower().find('zone')+4:]for z in df[c]])
            else:
                df['Zone'] = pd.to_numeric(df[c])
        del df[c]

    #  download
    if 'Zone' in df.columns:
        dl = df[df['Zone'].isin(zones)][['Files','Zone']]
    else:
        dl = df[['Files','Month']]
    for file, zone in tqdm(dl.values.tolist()):
        output = '%s_%s' % (year, str(zone))
        target_dir = Path(str(year)).joinpath(output)
        target_dir.mkdir(parents=True, exist_ok=True)
        if file.split('.')[0] in resume_download: continue
        print(file)
        # url already ends with '/'; os.path.join would insert a backslash on Windows
        local_zip = wget.download(url + file)
        try:
            with zipfile.ZipFile(local_zip, 'r') as zip_ref:
                zip_ref.extractall(str(target_dir))
        finally:
            # never leave a (possibly broken) archive behind, so a resumed run starts clean
            os.remove(local_zip)

In [None]:
def subset_AIS_to_CSV(year):
    """
    Function to filter the AIS data on the data we need.
    Therefore the columns MMSI, VesselName, CallSign, Cargo and TranscieverClass are dropped
    (only the columns listed below are kept) and rows with missing values are removed.
    In addition, the data is filtered to ships where cargo is being transported and those ships are underway.
    On top of that, the data is roughly filtered to ships that are Panamax 2 size, meaning they fit through the Panama Canal extension.

    The result is written to <year>/<year>_AIS.csv (including the index column).

    Parameters
    ----------
    year : int
        year of the data; str(year) is the directory that is searched for CSV files

    Returns
    -------
    pandas DataFrame
        the filtered AIS rows of all CSV files found
    """
    columns = ['BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading',  'IMO', 'VesselType', 'Status', 'Length','Width', 'Draft']
    year_dir = str(year)
    output_path = os.path.join(year_dir, '%s_AIS.csv' % year_dir)
    # files generated by this notebook live in the walked directory as well;
    # they must not be read back as raw AIS input when the function is re-run
    generated = {'%s_AIS.csv' % year_dir, '%s_merged.csv' % year_dir}

    frames = []
    for path, dirs, files in os.walk(year_dir):
        for file in files:
            if not file.endswith('.csv') or file in generated:
                continue

            df = pd.read_csv(os.path.join(path, file))
            # select by name so that column order or additional columns in the raw file do not matter
            df = df[columns].dropna()

            status = df['Status']
            vessel_type = df['VesselType']
            width = df['Width']
            # boolean masks instead of DataFrame.query: comparing a numeric Status column
            # with a string is well defined here and raises no elementwise-comparison warning
            keep = (
                ((status == "under way using engine") | (status == 8))
                & (vessel_type.isin([1016, 1003, 1004]) | ((vessel_type >= 70) & (vessel_type <= 79)))
                & (df['SOG'] > 7)
                & (df['Length'] < 367)
                & (width > 30) & (width < 50)
                & (df['Draft'] < 16)
            )
            # copy, so that the assignment below does not write into a slice of the raw frame
            df = df[keep].copy()
            df['BaseDateTime'] = pd.to_datetime(df['BaseDateTime'], format='%Y-%m-%dT%H:%M:%S')
            if not df.empty:
                frames.append(df)

    if frames:
        df = pd.concat(frames, ignore_index=True)
        # NOTE(review): rows before 2016-03-01 03:00 are discarded; the reason is not visible here
        df = df[df['BaseDateTime'] >= datetime(2016,3,1,3)]
    else:
        df = pd.DataFrame(columns=columns)
    df.to_csv(output_path)
    return df

In [None]:
def show_plotly(dataframe):
    """
    Function to plot the downloaded data points on a map, coloured by SOG.

    Parameters
    ----------
    dataframe : pandas DataFrame
        dataframe with the point data (LAT, LON) and needed SOG
    """
    import plotly.express as px

    figure = px.scatter_mapbox(
        dataframe,
        lat=dataframe.LAT,
        lon=dataframe.LON,
        color='SOG',
        mapbox_style="stamen-toner",
    )

    # restrict the axes to the extent of the data
    # NOTE(review): update_geos addresses geo subplots - confirm that it has an effect on a mapbox figure
    latitude_extent = [dataframe.LAT.min(), dataframe.LAT.max()]
    longitude_extent = [dataframe.LON.min(), dataframe.LON.max()]
    figure.update_geos(lataxis_range=latitude_extent, lonaxis_range=longitude_extent)

    figure.show()

In [None]:
# Here the AIS data gets downloaded for the year 2020. The zones argument is list(range(1,25)), i.e. the UTM zones 1 - 24
# (https://upload.wikimedia.org/wikipedia/commons/e/ed/Utm-zones.jpg).
download_AIS(2020, zones=list(range(1,25)))

  0%|          | 0/366 [00:00<?, ?it/s]

AIS_2020_01_01.zip


  0%|          | 1/366 [00:20<2:05:31, 20.64s/it]

AIS_2020_01_02.zip


  1%|          | 2/366 [00:41<2:06:21, 20.83s/it]

AIS_2020_01_03.zip


  1%|          | 3/366 [01:06<2:13:28, 22.06s/it]

AIS_2020_01_04.zip


  1%|          | 4/366 [01:27<2:10:34, 21.64s/it]

AIS_2020_01_05.zip


  1%|▏         | 5/366 [01:55<2:21:32, 23.53s/it]

AIS_2020_01_06.zip


  2%|▏         | 6/366 [02:17<2:18:43, 23.12s/it]

AIS_2020_01_07.zip


  2%|▏         | 7/366 [02:39<2:15:32, 22.65s/it]

AIS_2020_01_08.zip


  2%|▏         | 8/366 [02:59<2:10:52, 21.93s/it]

AIS_2020_01_09.zip


  2%|▏         | 9/366 [03:19<2:06:32, 21.27s/it]

AIS_2020_01_10.zip


  3%|▎         | 10/366 [03:39<2:04:49, 21.04s/it]

AIS_2020_01_11.zip


  3%|▎         | 11/366 [04:03<2:08:39, 21.74s/it]

AIS_2020_01_12.zip


  3%|▎         | 12/366 [04:28<2:14:16, 22.76s/it]

AIS_2020_01_13.zip


  4%|▎         | 13/366 [04:51<2:15:30, 23.03s/it]

AIS_2020_01_14.zip


  4%|▍         | 14/366 [05:16<2:18:16, 23.57s/it]

AIS_2020_01_15.zip


  4%|▍         | 15/366 [05:40<2:18:08, 23.61s/it]

AIS_2020_01_16.zip


  4%|▍         | 16/366 [06:06<2:22:40, 24.46s/it]

AIS_2020_01_17.zip


  5%|▍         | 17/366 [06:29<2:19:08, 23.92s/it]

AIS_2020_01_18.zip


  5%|▍         | 18/366 [06:54<2:20:51, 24.29s/it]

AIS_2020_01_19.zip


  5%|▌         | 19/366 [07:17<2:17:15, 23.73s/it]

AIS_2020_01_20.zip


  5%|▌         | 20/366 [07:35<2:07:18, 22.08s/it]

AIS_2020_01_21.zip


  6%|▌         | 21/366 [07:54<2:01:45, 21.18s/it]

AIS_2020_01_22.zip


  6%|▌         | 22/366 [08:14<2:00:01, 20.93s/it]

AIS_2020_01_23.zip


  6%|▋         | 23/366 [08:34<1:57:35, 20.57s/it]

AIS_2020_01_24.zip


  7%|▋         | 24/366 [08:55<1:58:04, 20.72s/it]

AIS_2020_01_25.zip


  7%|▋         | 25/366 [09:15<1:56:57, 20.58s/it]

AIS_2020_01_26.zip


  7%|▋         | 26/366 [09:35<1:55:02, 20.30s/it]

AIS_2020_01_27.zip


  7%|▋         | 27/366 [09:55<1:54:53, 20.34s/it]

AIS_2020_01_28.zip


  8%|▊         | 28/366 [10:17<1:56:53, 20.75s/it]

AIS_2020_01_29.zip


  8%|▊         | 29/366 [10:37<1:55:16, 20.52s/it]

AIS_2020_01_30.zip


  8%|▊         | 30/366 [10:57<1:53:57, 20.35s/it]

AIS_2020_01_31.zip


  8%|▊         | 31/366 [11:18<1:54:41, 20.54s/it]

AIS_2020_02_01.zip


  9%|▊         | 32/366 [11:38<1:53:03, 20.31s/it]

AIS_2020_02_02.zip


  9%|▉         | 33/366 [11:57<1:51:16, 20.05s/it]

AIS_2020_02_03.zip


  9%|▉         | 34/366 [12:20<1:54:55, 20.77s/it]

AIS_2020_02_04.zip


 10%|▉         | 35/366 [12:39<1:52:00, 20.30s/it]

AIS_2020_02_05.zip


 10%|▉         | 36/366 [12:58<1:50:26, 20.08s/it]

AIS_2020_02_06.zip


 10%|█         | 37/366 [13:17<1:47:47, 19.66s/it]

AIS_2020_02_07.zip


 10%|█         | 38/366 [13:37<1:47:33, 19.68s/it]

AIS_2020_02_08.zip


 11%|█         | 39/366 [13:57<1:48:53, 19.98s/it]

AIS_2020_02_09.zip


 11%|█         | 40/366 [14:18<1:49:10, 20.09s/it]

AIS_2020_02_10.zip


 11%|█         | 41/366 [14:39<1:51:03, 20.50s/it]

AIS_2020_02_11.zip


 11%|█▏        | 42/366 [15:02<1:53:34, 21.03s/it]

AIS_2020_02_12.zip


 12%|█▏        | 43/366 [15:23<1:53:13, 21.03s/it]

AIS_2020_02_13.zip


 12%|█▏        | 44/366 [15:43<1:52:12, 20.91s/it]

AIS_2020_02_14.zip


 12%|█▏        | 45/366 [16:07<1:55:58, 21.68s/it]

AIS_2020_02_15.zip


 13%|█▎        | 46/366 [18:33<5:15:27, 59.15s/it]

AIS_2020_02_16.zip


 13%|█▎        | 47/366 [18:54<4:13:51, 47.75s/it]

AIS_2020_02_17.zip


 13%|█▎        | 48/366 [19:17<3:33:25, 40.27s/it]

AIS_2020_02_18.zip


 13%|█▎        | 49/366 [19:39<3:03:54, 34.81s/it]

AIS_2020_02_19.zip


 14%|█▎        | 50/366 [20:08<2:54:05, 33.06s/it]

AIS_2020_02_20.zip


 14%|█▍        | 51/366 [20:35<2:43:26, 31.13s/it]

AIS_2020_02_21.zip


 14%|█▍        | 52/366 [21:04<2:39:59, 30.57s/it]

AIS_2020_02_22.zip


 14%|█▍        | 53/366 [21:25<2:24:01, 27.61s/it]

AIS_2020_02_23.zip


 15%|█▍        | 54/366 [21:45<2:11:59, 25.38s/it]

AIS_2020_02_24.zip


 15%|█▌        | 55/366 [22:06<2:03:57, 23.92s/it]

AIS_2020_02_25.zip


 15%|█▌        | 56/366 [22:27<1:59:07, 23.06s/it]

AIS_2020_02_26.zip


 16%|█▌        | 57/366 [22:47<1:54:39, 22.26s/it]

AIS_2020_02_27.zip


 16%|█▌        | 58/366 [23:07<1:51:18, 21.68s/it]

AIS_2020_02_28.zip


 16%|█▌        | 59/366 [23:28<1:49:15, 21.35s/it]

AIS_2020_02_29.zip


 16%|█▋        | 60/366 [23:49<1:47:55, 21.16s/it]

AIS_2020_03_01.zip


 17%|█▋        | 61/366 [24:08<1:45:21, 20.73s/it]

AIS_2020_03_02.zip


 17%|█▋        | 62/366 [24:29<1:44:35, 20.64s/it]

AIS_2020_03_03.zip


 17%|█▋        | 63/366 [24:52<1:48:42, 21.53s/it]

AIS_2020_03_04.zip


 17%|█▋        | 64/366 [25:13<1:46:52, 21.23s/it]

AIS_2020_03_05.zip


 18%|█▊        | 65/366 [25:33<1:45:14, 20.98s/it]

AIS_2020_03_06.zip


 18%|█▊        | 66/366 [25:58<1:51:05, 22.22s/it]

AIS_2020_03_07.zip


 18%|█▊        | 67/366 [26:19<1:48:13, 21.72s/it]

AIS_2020_03_08.zip


 19%|█▊        | 68/366 [26:39<1:46:02, 21.35s/it]

AIS_2020_03_09.zip


 19%|█▉        | 69/366 [27:01<1:46:13, 21.46s/it]

AIS_2020_03_10.zip


 19%|█▉        | 70/366 [27:23<1:46:20, 21.56s/it]

AIS_2020_03_11.zip


 19%|█▉        | 71/366 [27:44<1:45:29, 21.46s/it]

AIS_2020_03_12.zip


 20%|█▉        | 72/366 [28:06<1:45:50, 21.60s/it]

AIS_2020_03_13.zip


 20%|█▉        | 73/366 [28:28<1:45:36, 21.63s/it]

AIS_2020_03_14.zip


 20%|██        | 74/366 [28:51<1:47:33, 22.10s/it]

AIS_2020_03_15.zip


 20%|██        | 75/366 [29:14<1:48:45, 22.42s/it]

AIS_2020_03_16.zip


 21%|██        | 76/366 [29:36<1:47:43, 22.29s/it]

AIS_2020_03_17.zip


 21%|██        | 77/366 [29:58<1:46:19, 22.08s/it]

AIS_2020_03_18.zip


 21%|██▏       | 78/366 [30:19<1:44:32, 21.78s/it]

AIS_2020_03_19.zip


 22%|██▏       | 79/366 [30:41<1:44:26, 21.83s/it]

AIS_2020_03_20.zip


 22%|██▏       | 80/366 [31:02<1:43:23, 21.69s/it]

AIS_2020_03_21.zip


 22%|██▏       | 81/366 [31:23<1:42:02, 21.48s/it]

AIS_2020_03_22.zip


 22%|██▏       | 82/366 [31:45<1:41:56, 21.54s/it]

AIS_2020_03_23.zip


 23%|██▎       | 83/366 [32:07<1:41:52, 21.60s/it]

AIS_2020_03_24.zip


 23%|██▎       | 84/366 [32:29<1:42:01, 21.71s/it]

AIS_2020_03_25.zip


 23%|██▎       | 85/366 [32:50<1:41:48, 21.74s/it]

AIS_2020_03_26.zip


 23%|██▎       | 86/366 [33:12<1:40:58, 21.64s/it]

AIS_2020_03_27.zip


 24%|██▍       | 87/366 [33:33<1:40:14, 21.56s/it]

AIS_2020_03_28.zip


 24%|██▍       | 88/366 [33:55<1:40:00, 21.59s/it]

AIS_2020_03_29.zip


 24%|██▍       | 89/366 [34:16<1:39:26, 21.54s/it]

AIS_2020_03_30.zip


 25%|██▍       | 90/366 [34:38<1:39:20, 21.60s/it]

AIS_2020_03_31.zip


 25%|██▍       | 91/366 [35:00<1:39:46, 21.77s/it]

AIS_2020_04_01.zip


 25%|██▌       | 92/366 [35:21<1:38:40, 21.61s/it]

AIS_2020_04_02.zip


 25%|██▌       | 93/366 [35:44<1:39:31, 21.87s/it]

AIS_2020_04_03.zip


 26%|██▌       | 94/366 [36:07<1:40:24, 22.15s/it]

AIS_2020_04_04.zip


 26%|██▌       | 95/366 [36:27<1:38:06, 21.72s/it]

AIS_2020_04_05.zip


 26%|██▌       | 96/366 [36:51<1:40:41, 22.38s/it]

AIS_2020_04_06.zip


 27%|██▋       | 97/366 [37:15<1:42:23, 22.84s/it]

AIS_2020_04_07.zip


 27%|██▋       | 98/366 [37:37<1:40:37, 22.53s/it]

AIS_2020_04_08.zip


 27%|██▋       | 99/366 [37:59<1:40:15, 22.53s/it]

AIS_2020_04_09.zip


 27%|██▋       | 100/366 [38:22<1:39:42, 22.49s/it]

AIS_2020_04_10.zip


 28%|██▊       | 101/366 [38:43<1:37:23, 22.05s/it]

AIS_2020_04_11.zip


OSError: ignored

In [None]:
# Here the downloaded AIS data of the year 2020 is subsetted with the filters defined in subset_AIS_to_CSV
subset_AIS_to_CSV(year='2020')

2020/2020_01/AIS_2020_01_25.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_19.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_20.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_29.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_05.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_14.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_30.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_01.csv
2020/2020_01/AIS_2020_01_24.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_13.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_15.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_09.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_27.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_11.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_04.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_31.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_17.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_02.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_08.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_07.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_26.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_28.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_21.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_23.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_03.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_18.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_22.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



2020/2020_01/AIS_2020_01_10.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_12.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_16.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



2020/2020_01/AIS_2020_01_06.csv



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



Unnamed: 0,BaseDateTime,LAT,LON,SOG,COG,Heading,IMO,VesselType,Status,Length,Width,Draft
0,2020-01-25 19:00:00,30.68622,-88.03597,5.3,182.9,182.0,IMO9176606,70.0,8.0,228.0,32.0,10.0
1,2020-01-25 20:00:00,30.52765,-88.02257,12.0,175.0,174.0,IMO9176606,70.0,8.0,228.0,32.0,10.0
2,2020-01-25 21:00:00,30.32924,-88.02809,12.5,187.0,187.0,IMO9176606,70.0,8.0,228.0,32.0,10.0
3,2020-01-25 22:00:00,30.12015,-88.05660,13.6,136.4,136.0,IMO9176606,70.0,8.0,228.0,32.0,10.0
4,2020-01-25 23:00:00,29.88354,-87.83000,17.0,144.4,145.0,IMO9176606,70.0,8.0,228.0,32.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...
181,2020-01-22 20:00:00,28.10118,-90.09590,11.5,40.9,44.0,IMO9484261,70.0,8.0,261.0,32.0,11.0
182,2020-01-22 21:00:00,28.25850,-89.93973,11.9,37.8,39.0,IMO9484261,70.0,8.0,261.0,32.0,11.0
183,2020-01-22 22:00:00,28.39628,-89.81469,11.8,38.9,40.0,IMO9484261,70.0,8.0,261.0,32.0,11.0
184,2020-01-22 23:00:00,28.46866,-89.74957,11.9,38.5,40.0,IMO9484261,70.0,8.0,261.0,32.0,11.0


In [None]:
# Here all subsetted data points are read back from <year>/<year>_AIS.csv and shown by the show_plotly function
year = 2020
df = pd.read_csv(Path(str(year),'%s_AIS.csv' % year))
show_plotly(df)



---
### Download of CMEMS physics and wave data + Merge with AIS data 


In [None]:
# Import of all needed libraries 
!pip install motuclient
import pandas as pd
import requests
import numpy as np
from motu_utils.utils_cas import authenticate_CAS_for_URL
from motu_utils.utils_http import open_url
import xarray as xr
from datetime import datetime, timezone, timedelta 
import time
from pathlib import Path
from bs4 import BeautifulSoup
import sys

Defaulting to user installation because normal site-packages is not writeable


In [None]:
# utils to convert dates
def str_to_date(x):
    """Parse a 'YYYY-mm-dd HH:MM:SS' string into a datetime object."""
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


def date_to_str(x):
    """Format a datetime object as a 'YYYY-mm-ddTHH:MM:SSZ' string."""
    return x.strftime('%Y-%m-%dT%H:%M:%SZ')


def str_to_date2(x):
    """Parse a 'YYYY-mm-ddTHH:MM:SSZ' string into a datetime object."""
    return datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')

In [None]:
# input CMEMS credentials
from getpass import getpass

UN_CMEMS = input('Please input your CMEMS username: ')
# getpass does not echo the typed text, so the password does not end up in the terminal or the saved cell output
PW_CMEMS = getpass('Please input your CMEMS password: ')

Please input your CMEMS username: jj
Please input your CMEMS password: jjj


In [None]:
def create_request_url(date, lat, lon, datatype):
    """
    Function builds, for a single point and timestamp, the url to request the cmems data and
    authenticates it with the credentials defined above.
    Either for the cmems physics or the wave data.

    For the physics data the timestamp is set to 12:00:00 of the requested day and a depth
    range is added to the request. For the wave data the timestamp is rounded down to a
    multiple of 180 minutes counted from midnight.

    Parameters
    ----------
    date : str
        datetime as a string as the format YYYY-mm-ddTHH:MM:SSZ
    lat : str, float
        latitude
    lon : str, float
        longitude
    datatype : str
        describes the type of downloaded data: 'physics' selects the cmems physics data,
        any other value the wave data.

    Returns
    -------
    the value returned by authenticate_CAS_for_URL for the built url
    """
    physics_requested = datatype == 'physics'
    if physics_requested:
        base_url = 'http://nrt.cmems-du.eu/motu-web/Motu?action=productdownload&service=GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS&product=global-analysis-forecast-phy-001-024'
    else:
        base_url = 'http://nrt.cmems-du.eu/motu-web/Motu?action=productdownload&service=GLOBAL_ANALYSIS_FORECAST_WAV_001_027-TDS&product=global-analysis-forecast-wav-001-027'

    # lower and upper bound are identical: the bounding box is the requested point itself
    y_lo = y_hi = float(lat)
    x_lo = x_hi = float(lon)

    requested = str_to_date2(date)

    if physics_requested:
        # depth
        z_lo, z_hi = 0.49, 0.5

        t_lo = t_hi = date_to_str(requested.replace(hour=12, minute=0, second=0))

        query = '&x_lo={0}&x_hi={1}&y_lo={2}&y_hi={3}&t_lo={4}&t_hi={5}&z_lo={6}&z_hi={7}&mode=console'.format(x_lo, x_hi,y_lo,y_hi,t_lo,t_hi,z_lo,z_hi)
    else:
        dataset_temporal_resolution = 180
        minutes_since_midnight = (requested.hour * 60) + requested.minute
        overshoot = minutes_since_midnight % dataset_temporal_resolution
        snapped = requested - timedelta(minutes=overshoot)

        t_lo = t_hi = date_to_str(snapped.replace(second=0))

        query = '&x_lo={0}&x_hi={1}&y_lo={2}&y_hi={3}&t_lo={4}&t_hi={5}&mode=console'.format(x_lo, x_hi, y_lo, y_hi, t_lo, t_hi)

    return authenticate_CAS_for_URL(base_url + query, UN_CMEMS, PW_CMEMS)

In [None]:
def retrive_and_interpolate_data(date, lat, lon, dataset_temporal_resolution, datatype):
    """
    Function to retrive all variables from dataset for a specific timestamp, latitude, longitude concidering the temporal resolution of the dataset.
    To get a higher number of data faster, only the next lower data point is requested, not also the higher one.
    Accordingly no temporal interpolation is performed: the values of the next lower data point are returned as they are.
    (An interpolation would additionally request date + (dataset_temporal_resolution - rest) hours and
    weight both results with alpha = rest / dataset_temporal_resolution.)

    Parameters
    ----------
    date : datetime object
        datetime as a date object
    lat : str, float
        latitude
    lon : str, float
        longitude
    dataset_temporal_resolution: int
        the temporal resolution of the dateset (in hours)
    datatype : str
        describes the type of downloaded data either cmems physics or wave data.

    Returns
    -------
    numpy array
        flattened values of all variables of the downloaded dataset

    Raises
    ------
    RuntimeError
        if the response of the server cannot be opened as a dataset
    """
    # step back to the next lower timestamp of the dataset; rest is 0 if date already is one
    rest = date.hour % dataset_temporal_resolution
    nearest_lower = date - timedelta(hours=rest)

    url_lower = create_request_url(date_to_str(nearest_lower), lat, lon, datatype)
    bytes_lower = open_url(url_lower).read()

    try:
        data_lower = xr.open_dataset(bytes_lower)
    except Exception as err:
        # presumably the server answered with an html error page; report its error tag
        # instead of continuing with an undefined dataset
        error_tag = BeautifulSoup(bytes_lower, 'html.parser').find('p', {"class": "error"})
        raise RuntimeError(
            'Request for %s data at %s (lat=%s, lon=%s) did not return a readable dataset: %s'
            % (datatype, date_to_str(nearest_lower), lat, lon, error_tag)
        ) from err

    return np.ravel(data_lower.to_dataframe().reset_index(drop=True).values)

In [None]:
def append_enviorment_data(year):
    """
    Function which merges the downloaded data into one dataframe.

    For every row of <year>/<year>_AIS.csv the wave and the physics data are requested for
    the date and position of the row and appended to the AIS columns. The merged table is
    written to <year>/<year>_merged.csv after every row, so that an interrupted run can be
    resumed: rows that are already contained in that file are not requested again.

    Parameters
    ----------
    year : int
        year of the data; selects the directory and the names of the input and output file

    Returns
    -------
    pandas DataFrame
        the merged rows (rows of a previous run plus the rows added by this run)
    """
    src_csv_path = Path(str(year),'%s_AIS.csv' % year)
    output_csv_path = Path(str(year),'%s_merged.csv' % year)

    # get extracted AIS data and remove index column
    df = pd.read_csv(src_csv_path)
    # parsed explicitly, because the date_parser argument of read_csv is deprecated in newer pandas versions
    df['BaseDateTime'] = pd.to_datetime(df['BaseDateTime'], format='%Y-%m-%d %H:%M:%S')
    df.drop(['Unnamed: 0'], axis=1, errors='ignore', inplace=True)

    # define new columns for the output dataframe: AIS columns, wave variables, physics variables
    cols = list(df.columns) + ['VHM0_WW', 'VMDR_SW2', 'VMDR_SW1','VMDR', 'VTM10', 'VTPK', 'VPED', 'VTM02', 'VMDR_WW', 'VTM01_SW2', 'VHM0_SW1', 'VTM01_SW1', 'VSDX', 'VSDY', 'VHM0', 'VTM01_WW', 'VHM0_SW2'] + ['thetao', 'so', 'uo', 'vo', 'zos', 'mlotst', 'bottomT', 'siconc', 'sithick', 'usi', 'vsi']

    # check if already appended data to resume in case of disconnection or other errors
    data_list = []
    if Path(output_csv_path).exists():
        data_list = list(pd.read_csv(output_csv_path).drop(['Unnamed: 0'], axis=1, errors='ignore').values)
        print('Resuming download from row %s ' % len(data_list))

    # loop over the AIS data starting from the last index, where it has stopped
    last_index = len(data_list)
    for x in df.values[last_index:]:
        # the first three AIS columns are BaseDateTime, LAT and LON
        date, lat, lon = x[:3]

        env_variables = retrive_and_interpolate_data(date, lat, lon, 3, 'wave')

        phy_variables = retrive_and_interpolate_data(date, lat, lon, 3, 'physics')
        data_list.append(np.concatenate([x, env_variables, phy_variables]))

        # the whole table is rewritten after every row, which keeps the file resumable at any time
        pd.DataFrame(data_list, columns=cols).to_csv(output_csv_path)
        last_index+=1
        sys.stdout.write("\rEntry row index: %s/%s" % (last_index, len(df)))
        sys.stdout.flush()

    return pd.DataFrame(data_list, columns=cols)

In [None]:
# Here the download of the wave and physics data and the merge with the AIS data is ultimately performed for the year 2020
result = append_enviorment_data(2020)

Resuming download from row 75 
http://nrt.cmems-du.eu/motu-web/Motu?action=productdownload&service=GLOBAL_ANALYSIS_FORECAST_WAV_001_027-TDS&product=global-analysis-forecast-wav-001-027&x_lo=-122.44421000000001&x_hi=-122.44421000000001&y_lo=47.71676&y_hi=47.71676&t_lo=2020-11-14T21:00:00Z&t_hi=2020-11-14T21:00:00Z&mode=console
http://nrt.cmems-du.eu/motu-web/Motu?action=productdownload&service=GLOBAL_ANALYSIS_FORECAST_WAV_001_027-TDS&product=global-analysis-forecast-wav-001-027&x_lo=-122.44421000000001&x_hi=-122.44421000000001&y_lo=47.71676&y_hi=47.71676&t_lo=2020-11-15T00:00:00Z&t_hi=2020-11-15T00:00:00Z&mode=console
http://nrt.cmems-du.eu/motu-web/Motu?action=productdownload&service=GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS&product=global-analysis-forecast-phy-001-024&x_lo=-122.44421000000001&x_hi=-122.44421000000001&y_lo=47.71676&y_hi=47.71676&t_lo=2020-11-14T12:00:00Z&t_hi=2020-11-14T12:00:00Z&z_lo=0.49&z_hi=0.5&mode=console
http://nrt.cmems-du.eu/motu-web/Motu?action=productdownload

This completes the download and merging of the data and the data can now be used to build the model. 

In [None]:
# This is an optional script for merging the data if you produce data on two different instances.
a = pd.read_csv("input1")
b = pd.read_csv("input2")
#b = b.dropna(axis=1)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of both frames in the same way
merged = pd.concat([a, b])
merged.to_csv("output", index=False)
# last expression of the cell, so that the merged table is displayed in the notebook
merged