# AIS Vessel Data Download and Aggregating Tool

In [20]:
# These libraries are built-in to Python
import zipfile
import os

# These are packages you need to install
import requests
from bs4 import BeautifulSoup
import pandas as pd

 **Downloads all the daily files for the selected year and month the AIS website**

In [21]:
def download_zip_files(year: str, month: str):
    url = f"https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{year}/index.html"
    page = os.path.dirname(url)
    # Directory to save the downloaded files
    download_dir = month + '-' + year + '-zip'
    os.makedirs(download_dir, exist_ok=True)
    # Fetch the page content
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all links on the page
    links = soup.find_all('a', href=True)
    # Filter and download files that match the regex pattern
    for link in links:
        file_url = link['href']
        filename = os.path.basename(file_url)
        # Check if the filename matches the regex pattern
        try:
            link_month = filename.split('_')[2]
        except IndexError:
            continue
        if link_month == month:
            print(f"Downloading {filename}...")
            download_link = os.path.join(page, filename)
            file_response = requests.get(download_link)
            # Save the file to the download directory
            with open(os.path.join(download_dir, filename), 'wb') as f:
                f.write(file_response.content)
    print("Download complete.")


**Unzips all your downloaded files**

In [22]:
def unzip_files(year: str, month: str):
    # Directory containing the zip files
    zip_dir = month + '-' + year + '-zip'
    # Directory to extract the contents to
    extract_dir = month + '-' + year
    os.makedirs(extract_dir, exist_ok=True)

    # Iterate over all files in the directory
    for filename in os.listdir(zip_dir):
        if filename.endswith(".zip"):
            zip_path = os.path.join(zip_dir, filename)
            print(f"Unzipping {filename}...")

            # Open the zip file
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Extract all the contents to the specified directory
                zip_ref.extractall(extract_dir)

    print("Unzipping complete.")

In [23]:
def shrink_dataset(df):
    df['BaseDateTime'] = pd.to_datetime(df['BaseDateTime'], format='%Y-%m-%dT%H:%M:%S')
    filtered = df[df['BaseDateTime'].dt.hour.isin([0, 6, 12, 18])] # Select 0, 6, 12, 18 hours 
    filtered = filtered[filtered['BaseDateTime'].dt.minute.between(0, 1)] # Select 0 - 1 minutes
    filtered = filtered[filtered['BaseDateTime'].dt.second.between(0, 59)] # Select 0 - 59 seconds
    return filtered

In [24]:
def downloadData(selectedYear):
    year = selectedYear # Year
    month = '09' # Month: September

    # download_zip_files(year, month)
    # unzip_files(year, month)

    extract_dir = month + '-' + year

    month_day_dfs = []

    # Iterate over all files in the directory
    for filename in os.listdir(extract_dir):
        if filename.endswith(".csv"):
            file_path = os.path.join(extract_dir, filename)
            print(f"Reading and processing {filename}...")

            df = pd.read_csv(file_path, on_bad_lines='skip')
            # Filter for vessels with length greater than 200
            df = df[df['Length'] > 200]

            # Filter for specific status codes (70-89, 1003, 1004, 1016, 1017, 1024)
            valid_status_codes = list(range(70, 90)) + [1003, 1004, 1016, 1017, 1024]
            df = df[df['VesselType'].isin(valid_status_codes)]

            df = shrink_dataset(df)

            # Filter out vessels that are not off the U.S. East Coast
            east_coast_vessels = df[(df['LAT'] >= 25) & (df['LAT'] <= 45) &
                                        (df['LON'] >= -80) & (df['LON'] <= -45)]

            # Append the filtered DataFrame to the list
            month_day_dfs.append(east_coast_vessels)

    sept_df = pd.concat(month_day_dfs)
    return sept_df

In [25]:
sept_2024_df = downloadData("2020")

Reading and processing AIS_2020_09_16.csv...
Reading and processing AIS_2020_09_02.csv...
Reading and processing AIS_2020_09_03.csv...
Reading and processing AIS_2020_09_17.csv...
Reading and processing AIS_2020_09_29.csv...
Reading and processing AIS_2020_09_01.csv...
Reading and processing AIS_2020_09_15.csv...
Reading and processing AIS_2020_09_14.csv...
Reading and processing AIS_2020_09_28.csv...
Reading and processing AIS_2020_09_04.csv...
Reading and processing AIS_2020_09_10.csv...
Reading and processing AIS_2020_09_11.csv...
Reading and processing AIS_2020_09_05.csv...
Reading and processing AIS_2020_09_13.csv...
Reading and processing AIS_2020_09_07.csv...
Reading and processing AIS_2020_09_06.csv...
Reading and processing AIS_2020_09_12.csv...
Reading and processing AIS_2020_09_23.csv...
Reading and processing AIS_2020_09_22.csv...
Reading and processing AIS_2020_09_08.csv...
Reading and processing AIS_2020_09_20.csv...
Reading and processing AIS_2020_09_21.csv...
Reading an

In [26]:
sept_2024_df_copy = sept_2024_df.copy()

In [27]:
# Step 1: Round the timestamp to the nearest 6-hour mark
# This uses rounding in pandas: "H" means to round to the nearest hour, and 6H rounds to the nearest 6 hours
sept_2024_df_copy['RoundedTime'] = sept_2024_df['BaseDateTime'].dt.round('6H')

# Step 2: Drop duplicates based on MMSI and the rounded time
# This keeps the first occurrence of each MMSI at the rounded 6-hour interval
deduplicated_sept_2024_df = sept_2024_df_copy.drop_duplicates(subset=['MMSI', 'RoundedTime'])

deduplicated_sept_2024_df['BaseDateTime'] = deduplicated_sept_2024_df['RoundedTime']
deduplicated_sept_2024_df.drop(columns=['RoundedTime'], inplace=True)

  sept_2024_df_copy['RoundedTime'] = sept_2024_df['BaseDateTime'].dt.round('6H')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deduplicated_sept_2024_df['BaseDateTime'] = deduplicated_sept_2024_df['RoundedTime']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deduplicated_sept_2024_df.drop(columns=['RoundedTime'], inplace=True)


In [28]:
sept_df = pd.read_csv('sept_east_coast_vessels.csv')

In [29]:
# Combine August and September datasets
df = pd.concat([deduplicated_sept_2024_df, sept_df], ignore_index=True)

In [30]:
df.head(10)

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,477293200.0,2020-09-16 00:00:00,32.99666,-78.00064,9.6,251.6,264.0,OOCL SINGAPORE,IMO9628001,VRMX7,70.0,0.0,366.0,48.0,12.0,70.0,A
1,215640000.0,2020-09-16 00:00:00,32.90088,-79.9604,0.0,47.0,41.0,CMA CGM CHATEAU DIF,IMO9335202,9HA5221,71.0,0.0,294.0,32.0,-12.4,71.0,A
2,367821000.0,2020-09-16 00:00:00,36.97286,-76.43363,0.0,0.0,511.0,YANO,IMO7825409,NAQH,70.0,0.0,276.0,32.0,13.0,70.0,A
3,356093000.0,2020-09-16 00:00:00,40.51192,-72.42767,10.1,270.1,269.0,MSC JUDITH,IMO9299549,3EGR5,70.0,0.0,324.0,42.0,14.5,71.0,A
4,255806362.0,2020-09-16 00:00:00,40.68322,-74.14994,0.0,355.2,309.0,MSC BUSAN,IMO9289087,CQEE7,74.0,5.0,325.0,43.0,11.4,74.0,A
5,356330000.0,2020-09-16 00:00:00,37.72419,-74.95381,11.7,209.4,209.0,MSC KRYSTAL,IMO9372470,3EPZ4,70.0,0.0,277.0,40.0,14.5,,A
6,477920300.0,2020-09-16 00:00:00,40.55932,-70.02637,17.9,266.0,266.0,OOCL ATLANTA,IMO9285005,VRAR6,70.0,0.0,323.0,42.0,14.5,,A
7,369390000.0,2020-09-16 00:00:00,27.2275,-79.842,12.1,183.0,182.0,SAFMARINE MAFADI,IMO9314210,KRIJ,70.0,0.0,292.0,32.0,11.0,70.0,A
8,255806154.0,2020-09-16 00:00:00,43.86142,-67.28072,0.2,38.7,319.0,NORDTULIP,IMO9521447,CQAE8,89.0,0.0,229.0,42.0,-11.0,89.0,A
9,373120000.0,2020-09-16 00:00:00,40.49092,-73.91987,13.7,117.6,120.0,MSC VERONIQUE,IMO8618293,3EYX,70.0,0.0,294.0,32.0,13.5,,A


In [35]:
# Convert 'BaseDateTime' to datetime format
df['BaseDateTime'] = pd.to_datetime(df['BaseDateTime'])

# Sort by 'BaseDateTime' and Reset index
df.sort_values('BaseDateTime', inplace=True)
df.reset_index(drop=True, inplace=True)

In [36]:
df.head(10)

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,353274000.0,2020-09-01,31.16356,-79.87955,12.6,267.9,262.0,NYK DAEDALUS,IMO9337614,3EMS,70.0,0.0,294.0,32.0,13.5,70.0,A
1,538008722.0,2020-09-01,40.45438,-69.223,12.3,90.0,94.0,FULMAR,IMO9402471,V7A2691,80.0,0.0,250.0,44.0,-12.5,80.0,A
2,368660000.0,2020-09-01,36.79857,-75.74077,14.7,117.7,116.0,MAERSK IDAHO,IMO9193264,WKPM,70.0,0.0,292.0,32.0,13.5,70.0,A
3,309046000.0,2020-09-01,30.3488,-79.34123,12.5,329.3,328.0,MOL CREATION,IMO9321237,C6WD9,70.0,0.0,316.0,45.0,14.5,71.0,A
4,218158000.0,2020-09-01,28.20606,-79.7833,12.0,181.0,181.0,BREMEN EXPRESS,IMO9343728,DGZL,70.0,0.0,335.0,42.0,14.5,,A
5,566780000.0,2020-09-01,38.35979,-74.56793,15.4,321.7,321.0,MAERSK IZMIR,IMO9348168,9V2004,70.0,0.0,232.0,32.0,11.8,71.0,A
6,316013980.0,2020-09-01,43.7762,-76.99618,11.7,68.6,70.0,RADCLIFFE R LATIMER,IMO7711725,VCPK,70.0,0.0,226.0,23.0,10.2,,A
7,308371000.0,2020-09-01,35.93562,-74.45473,15.0,18.0,13.0,LARVIK,IMO9307346,C6VC2,80.0,0.0,213.0,32.0,12.3,89.0,A
8,374727000.0,2020-09-01,37.29049,-76.09993,0.2,94.1,156.0,ELSA S,IMO9736353,3FGE4,70.0,1.0,229.0,,,,A
9,636017112.0,2020-09-01,39.4275,-75.52683,14.7,318.0,319.0,POMERENIA SKY,IMO9339583,A8MG6,70.0,0.0,208.0,29.0,11.6,90.0,A


In [37]:
df.shape

(42236, 17)

In [38]:
unique_vessels = df.nunique()['MMSI']
unique_vessels

1955

In [39]:
df["BaseDateTime"]

0       2020-09-01 00:00:00
1       2020-09-01 00:00:00
2       2020-09-01 00:00:00
3       2020-09-01 00:00:00
4       2020-09-01 00:00:00
                ...        
42231   2023-09-30 18:00:00
42232   2023-09-30 18:00:00
42233   2023-09-30 18:00:00
42234   2023-09-30 18:00:00
42235   2023-09-30 18:00:00
Name: BaseDateTime, Length: 42236, dtype: datetime64[ns]

In [40]:
# Export csv and upload to github
df.to_csv('sept_east_coast_vessels_2020_2023.csv', index=False)