DATA 780 Project Jena K. Vaughn

Import packages

In [128]:
import numpy as np
import pandas as pd
import folium
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

We will be using these functions

In [144]:
def timerange_check(dfname):
    """
    Print the earliest and latest observation time found in a dataframe.
    dfname: Dataframe with a 'timestamp' column holding epoch seconds.
    RETURNS:
    None. The two values are printed as "Min: ..." and "Max: ..."; the
    dataframe passed in is not modified.
    """
    #Convert once; unit='s' interprets the raw values as seconds since the epoch
    observed_at = pd.to_datetime(dfname['timestamp'], unit='s')

    for label, moment in (("Min:", observed_at.min()), ("Max:", observed_at.max())):
        print(label, moment)

def get_coords(num):
  """
  Takes vessel # and returns all lat & lon coords for that vessel.

  Vessels are numbered 1..N in ascending order of their 'mmsi' value in the
  global `final_df` (the same order groupby('mmsi') iterates in), so
  vessel #1 is the smallest MMSI.

  num: Any integer between 1-354. (Total # of ships)
  RETURNS:
  coords: List of [lat, lon] pairs, in the row order of `final_df`.
  RAISES:
  KeyError: If num is not a valid vessel number.
  """
  #Sorted unique MMSIs give the vessel numbering directly, without building a
  #re-indexed copy of every vessel's rows on each call
  mmsi_values = np.sort(final_df['mmsi'].dropna().unique())

  #Explicit range check: an index of 0 or below would otherwise wrap around
  if not 1 <= num <= len(mmsi_values):
    raise KeyError(f"Vessel #{num} not found; expected a number between 1 and {len(mmsi_values)}")

  ship_rows = final_df.loc[final_df['mmsi'] == mmsi_values[num - 1], ['lat', 'lon']]

  return ship_rows.values.tolist()

def map_ship_movements(num):
  """
  Takes vessel # and returns visualization of ship movements.
  num: Any number between 1-354. (Total # of ships)
  RETURNS:
  folium.Map centred on the vessel's first position with its whole track
  drawn as one polyline, or None (after printing a notice) when the vessel
  has no coordinates.
  """
  coordinates = get_coords(num)

  #Same guard as map_ship_movements_chunk: indexing coordinates[0] on an
  #empty track would raise IndexError
  if len(coordinates) == 0:
    print(f"No coordinates found for Ship #{num}")
    return None

  m = folium.Map(location=coordinates[0], zoom_start=5)

  #Label the track with the vessel number, as map_ship_movements_chunk does
  folium.PolyLine(coordinates, tooltip=f'Ship #{num}').add_to(m)
  return m

def map_ship_movements_chunk(num, n_coords=10):
    """
    Draw the leading part of one vessel's track on a folium map.
    num: Ship number between 1-354.
    n_coords: How many of that ship's first coordinates to plot.
    RETURNS:
    folium.Map centred on the first plotted point, or None (after printing a
    notice) when there is nothing to plot.
    """
    #Only the first n_coords positions are drawn
    track = get_coords(num)[:n_coords]

    if len(track) < 1:
        print(f"No coordinates found for Ship #{num}")
        return None

    ship_map = folium.Map(location=track[0], zoom_start=8)
    folium.PolyLine(track, tooltip=f'Ship #{num}').add_to(ship_map)
    return ship_map

def sample_mmsi_by_year(df, sample_size=10, year=2016):
  """
  Sample ships (by MMSI) within each vessel_type for a single year.
  df: Dataframe (use final_df); needs 'timestamp', 'vessel_type' and 'mmsi'
  sample_size: Max number of ships sampled from each vessel_type
  year: Year to sample
  RETURNS:
  Dataframe holding every row of the sampled ships for that year, with an
  added 'datetime' column and a fresh 0..n-1 index. An empty dataframe is
  returned when the year has no complete rows.
  """
  #Work on a copy so the caller's frame does not gain the helper column
  stamped = df.copy()
  stamped['datetime'] = pd.to_datetime(stamped['timestamp'], unit='s')

  #Rows of the requested year, minus any row containing NaN
  in_year = stamped[stamped['datetime'].dt.year == year].dropna()
  if in_year.empty:
    return pd.DataFrame()

  def rows_of_sampled_ships(type_code):
    #All rows of the ships drawn for one vessel_type
    same_type = in_year[in_year['vessel_type'] == type_code]
    ship_ids = same_type['mmsi']
    n_pick = min(sample_size, ship_ids.nunique())
    #Fixed seed keeps the draw repeatable between runs
    chosen = ship_ids.drop_duplicates().sample(n_pick, random_state=217)
    return same_type[ship_ids.isin(chosen)]

  pieces = [rows_of_sampled_ships(code) for code in in_year['vessel_type'].unique()]

  #Concat into df
  return pd.concat(pieces, ignore_index=True)

def prep_data_for_cnn_non_overlap(df, sequence_length=10, feature_columns=['speed', 'distance_from_port', 'course', 'lat', 'lon'], label_column='vessel_type', group_column=None):
    """
    Prepare data for CNN model. NON-OVERLAPPING SEQUENCES

    Rows are cut, in dataframe order, into consecutive windows of
    `sequence_length` rows. Leftover rows that do not fill a whole window are
    dropped. Each window is labeled with the label of its last row.

    df: Input dataframe (not modified)
    sequence_length: Choose a sequence length for timesteps
    feature_columns: List of features to use for prediction. 'lat' / 'lon',
        when listed, are passed through unscaled and placed first (in that
        order); every other column is min-max scaled over all rows of df and
        follows in the order listed.
    label_column: Target column to predict
    group_column: Optional column name (e.g. 'mmsi'). When given, windows are
        built separately inside each group, so no window mixes rows from two
        groups; rows whose key is missing are skipped. The default (None)
        treats the whole dataframe as one continuous sequence, which lets a
        window straddle two vessels.

    RETURNS:
    feature_sequences: Array of sequences of features; Shape = (batch_size, sequence_length, len(feature_columns))
    y_labels: Array of target labels (batch_size, )
    Both are empty arrays of shape (0,) when no complete window exists.

    RAISES:
    ValueError: If sequence_length < 1 or feature_columns is empty.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    if len(df) == 0:
        return np.array([]), np.array([])

    latlon_columns = [col for col in ('lat', 'lon') if col in feature_columns]
    other_columns = [col for col in feature_columns if col not in ('lat', 'lon')]

    #Combine: lat/lon untouched, others normalized
    blocks = []
    if latlon_columns:
        blocks.append(df[latlon_columns].values)
    if other_columns:
        #The scaler rejects a zero-column input, so it only runs when needed
        blocks.append(MinMaxScaler().fit_transform(df[other_columns].values))
    if not blocks:
        raise ValueError("feature_columns must name at least one column")
    feature_data = np.concatenate(blocks, axis=1)

    #Row positions of each independent run of data
    if group_column is None:
        segments = [np.arange(len(df))]
    else:
        codes, _ = pd.factorize(df[group_column])
        #Stable sort keeps the original row order inside every group
        order = np.argsort(codes, kind='stable')
        order = order[codes[order] >= 0]  #factorize marks missing keys with -1
        segments = np.split(order, np.flatnonzero(np.diff(codes[order])) + 1)

    #Non-overlapping sequences: window k of a segment covers its rows
    #[k*sequence_length, (k+1)*sequence_length)
    offsets = np.arange(sequence_length)
    window_rows = []
    for rows in segments:
        n_windows = len(rows) // sequence_length
        if n_windows == 0:
            continue
        starts = np.arange(n_windows) * sequence_length
        window_rows.append(rows[starts[:, None] + offsets])

    if not window_rows:
        return np.array([]), np.array([])

    window_rows = np.concatenate(window_rows, axis=0)

    feature_sequences = feature_data[window_rows]
    #Label of a window = label of its final row
    y_labels = df[label_column].to_numpy()[window_rows[:, -1]]

    return feature_sequences, y_labels


def prep_data_for_cnn_overlap(df, sequence_length=10, feature_columns=['speed', 'distance_from_port', 'course', 'lat', 'lon'], label_column='is_fishing', group_column=None):
    """
    Prepare data for CNN model. OVERLAPPING SEQUENCES

    A window of `sequence_length` rows is started at every row position
    (stride 1), in dataframe order, so consecutive windows share
    sequence_length - 1 rows. Each window is labeled with the label of its
    last row.

    df: Input dataframe (not modified)
    sequence_length: Choose a sequence length for timesteps
    feature_columns: List of features to use for prediction. 'lat' / 'lon',
        when listed, are passed through unscaled and placed first (in that
        order); every other column is min-max scaled over all remaining rows
        and follows in the order listed.
    label_column: Target column to predict. When it is 'is_fishing', rows
        whose 'is_fishing' is -1 are removed first and the remaining values
        are rounded to integers.
    group_column: Optional column name (e.g. 'mmsi'). When given, windows are
        built separately inside each group, so no window mixes rows from two
        groups; rows whose key is missing are skipped. The default (None)
        treats the whole dataframe as one continuous sequence, which lets a
        window straddle two vessels.

    RETURNS:
    feature_sequences: Array of sequences of features; Shape = (batch_size, sequence_length, len(feature_columns))
    y_labels: Array of target labels (batch_size, )
    Both are empty arrays of shape (0,) when no complete window exists.

    RAISES:
    ValueError: If sequence_length < 1 or feature_columns is empty.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    if label_column == 'is_fishing':
        df = df[df['is_fishing'] != -1]
        #assign() builds a new frame, so neither the caller's data nor a
        #filtered slice is written to
        df = df.assign(is_fishing=df['is_fishing'].round().astype(int))

    if len(df) == 0:
        return np.array([]), np.array([])

    latlon_columns = [col for col in ('lat', 'lon') if col in feature_columns]
    other_columns = [col for col in feature_columns if col not in ('lat', 'lon')]

    #Combine: lat/lon untouched, others normalized
    blocks = []
    if latlon_columns:
        blocks.append(df[latlon_columns].values)
    if other_columns:
        #The scaler rejects a zero-column input, so it only runs when needed
        blocks.append(MinMaxScaler().fit_transform(df[other_columns].values))
    if not blocks:
        raise ValueError("feature_columns must name at least one column")
    feature_data = np.concatenate(blocks, axis=1)

    #Row positions of each independent run of data
    if group_column is None:
        segments = [np.arange(len(df))]
    else:
        codes, _ = pd.factorize(df[group_column])
        #Stable sort keeps the original row order inside every group
        order = np.argsort(codes, kind='stable')
        order = order[codes[order] >= 0]  #factorize marks missing keys with -1
        segments = np.split(order, np.flatnonzero(np.diff(codes[order])) + 1)

    #Overlapping sequences: one window starting at every row that still has
    #sequence_length rows available after it
    offsets = np.arange(sequence_length)
    window_rows = []
    for rows in segments:
        n_windows = len(rows) - sequence_length + 1
        if n_windows <= 0:
            continue
        starts = np.arange(n_windows)
        window_rows.append(rows[starts[:, None] + offsets])

    if not window_rows:
        return np.array([]), np.array([])

    window_rows = np.concatenate(window_rows, axis=0)

    feature_sequences = feature_data[window_rows]
    #Label of a window = label of its final row
    y_labels = df[label_column].to_numpy()[window_rows[:, -1]]

    return feature_sequences, y_labels

Read in CSVs

In [130]:
#Folder holding the six per-gear-type CSV files; change it here only
DATA_DIR = "/content/drive/MyDrive/DATA780Project_Vaughn"

#One dataframe per gear-type file
df_drifting_longlines = pd.read_csv(f"{DATA_DIR}/drifting_longlines.csv")
df_fixed_gear = pd.read_csv(f"{DATA_DIR}/fixed_gear.csv")
df_pole_and_line = pd.read_csv(f"{DATA_DIR}/pole_and_line.csv")
df_purse_seines = pd.read_csv(f"{DATA_DIR}/purse_seines.csv")
df_trawlers = pd.read_csv(f"{DATA_DIR}/trawlers.csv")
df_trollers = pd.read_csv(f"{DATA_DIR}/trollers.csv")

In [131]:
#Add column based on df
#Every row gets an integer code for the gear-type file it was read from
df_drifting_longlines['vessel_type'] = 0
df_fixed_gear['vessel_type'] = 1
df_pole_and_line['vessel_type'] = 2
df_purse_seines['vessel_type'] = 3
df_trawlers['vessel_type'] = 4
df_trollers['vessel_type'] = 5

#Named so that the builtin `list` stays usable in the rest of the notebook
vessel_frames = [df_drifting_longlines, df_fixed_gear, df_pole_and_line, df_purse_seines, df_trawlers, df_trollers]
final_df = pd.concat(vessel_frames, ignore_index=True).dropna() #Concat into big df; rows containing NaN are dropped

final_df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,vessel_type
0,12639560000000.0,1327137000.0,232994.28125,311748.65625,8.2,230.5,14.865583,-26.853662,-1.0,dalhousie_longliner,0
1,12639560000000.0,1327137000.0,233994.265625,312410.34375,7.3,238.399994,14.86387,-26.8568,-1.0,dalhousie_longliner,0
2,12639560000000.0,1327137000.0,233994.265625,312410.34375,6.8,238.899994,14.861551,-26.860649,-1.0,dalhousie_longliner,0
3,12639560000000.0,1327143000.0,233994.265625,315417.375,6.9,251.800003,14.822686,-26.865898,-1.0,dalhousie_longliner,0
4,12639560000000.0,1327143000.0,233996.390625,316172.5625,6.1,231.100006,14.821825,-26.867579,-1.0,dalhousie_longliner,0


In [132]:
#'is_fishing' = -1 means no data. How much does our data shrink once we remove those rows?

count_minusone = final_df['is_fishing'].eq(-1).sum()
print(f"Number of -1 values in 'is_fishing' & proportion: {count_minusone} {count_minusone/len(final_df)*100:.2f}%. With {len(final_df)-count_minusone} left")

Number of -1 values in 'is_fishing' & proportion: 21295728 97.82%. With 473935 left


In [133]:
#Keep only labeled rows: drop rows where 'is_fishing' is -1, then drop rows with NaN values
#We will still retain ~500k samples
has_label = final_df['is_fishing'].ne(-1)
df_cleaned = final_df.loc[has_label].dropna()

print(df_cleaned.shape)

(473935, 11)


In [134]:
#View movements
#Full track of vessel #123; the call is the cell's last expression, so its return value is what the cell displays
map_ship_movements(123)

#Uncomment the line below to view a smaller chunk instead
#map_ship_movements_chunk(123, 100) #First 100 movements

Model 1: 'vessel_type' classification for vessels in 2016


In [145]:
#Prep data for sequence input; use non-overlapping sequences
#Sample ships per vessel type with the function defaults (10 ships per type, year 2016)
test = sample_mmsi_by_year(final_df)

feature_seq, y = prep_data_for_cnn_non_overlap(test)

#Hold out 20% of the sequences; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    feature_seq,
    y,
    test_size=0.20,
    random_state=621,
)

In [146]:
#Check the shape of the resulting splits
shape_report = [
    f"Stratified Random Sample of vessels Shape: {test.shape}",
    f"Train X shape: {X_train.shape}, Test X shape: {X_test.shape}",
    f"Train Y shape: {y_train.shape}, Test Y shape: {y_test.shape}",
]
print("\n".join(shape_report))

Stratified Random Sample of vessels Shape: (1336249, 12)
Train X shape: (106899, 10, 5), Test X shape: (26725, 10, 5)
Train Y shape: (106899,), Test Y shape: (26725,)


In [147]:
#1D CNN over windows of 10 timesteps x 5 features, built one layer at a time
conv = tf.keras.Sequential()
conv.add(tf.keras.layers.Input(shape=(10, 5)))
conv.add(tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu'))
conv.add(tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu'))
conv.add(tf.keras.layers.MaxPooling1D(pool_size=2))
conv.add(tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu'))
conv.add(tf.keras.layers.GlobalMaxPooling1D())
conv.add(tf.keras.layers.Dense(64, activation='relu'))
conv.add(tf.keras.layers.Dense(6, activation='softmax')) #6 classes

#Compile model; the sparse loss takes integer class labels directly
conv.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

#Fit; the held-out split is also passed as validation data
conv.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=25,
    batch_size=32,
)

#Get accuracy
loss, accuracy = conv.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Epoch 1/25
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.7465 - loss: 0.7111 - val_accuracy: 0.8337 - val_loss: 0.4303
Epoch 2/25
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8402 - loss: 0.4018 - val_accuracy: 0.8563 - val_loss: 0.3608
Epoch 3/25
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8582 - loss: 0.3432 - val_accuracy: 0.8462 - val_loss: 0.3503
Epoch 4/25
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8707 - loss: 0.3117 - val_accuracy: 0.8752 - val_loss: 0.3017
Epoch 5/25
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8854 - loss: 0.2800 - val_accuracy: 0.9029 - val_loss: 0.2770
Epoch 6/25
[1m3341/3341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8942 - loss: 0.2566 - val_accuracy: 0.9018 - val_loss: 0.2460
Epoch 7/25
[1m

Model 2: 'is_fishing' classification

In [138]:
#Preprocess data; Important Note: Use overlap for this
#The default label_column is 'is_fishing', so rows with is_fishing == -1 are removed inside the function
feature_seq2, y2 = prep_data_for_cnn_overlap(final_df)

#Hold out 20% of the sequences; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    feature_seq2,
    y2,
    test_size=0.20,
    random_state=444,
)

shape_report = [
    f"Train X shape: {X_train.shape}, Test X shape: {X_test.shape}",
    f"Train Y shape: {y_train.shape}, Test Y shape: {y_test.shape}",
]
print("\n".join(shape_report))

Train X shape: (379140, 10, 5), Test X shape: (94786, 10, 5)
Train Y shape: (379140,), Test Y shape: (94786,)


In [139]:
#Same layer stack as the first model, ending in a single sigmoid unit
conv2 = tf.keras.Sequential()
conv2.add(tf.keras.layers.Input(shape=(10, 5))) #Shape (seq_length, features)
conv2.add(tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu'))
conv2.add(tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu'))
conv2.add(tf.keras.layers.MaxPooling1D(pool_size=2))
conv2.add(tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu'))
conv2.add(tf.keras.layers.GlobalMaxPooling1D())
conv2.add(tf.keras.layers.Dense(64, activation='relu'))
conv2.add(tf.keras.layers.Dense(1, activation='sigmoid')) #Same as previous model, just use sigmoid for binary classification

#Compile
conv2.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

#Fit; the held-out split is also passed as validation data
conv2.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=25,
    batch_size=64,
)

#Print accuracy
loss2, accuracy2 = conv2.evaluate(X_test, y_test)
print(f"Test Loss: {loss2}, Test Accuracy: {accuracy2}")

Epoch 1/25
[1m5925/5925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.6759 - loss: 0.6216 - val_accuracy: 0.7027 - val_loss: 0.5451
Epoch 2/25
[1m5925/5925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.7459 - loss: 0.4960 - val_accuracy: 0.7854 - val_loss: 0.4310
Epoch 3/25
[1m5925/5925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.7848 - loss: 0.4342 - val_accuracy: 0.7970 - val_loss: 0.4131
Epoch 4/25
[1m5925/5925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.8069 - loss: 0.3997 - val_accuracy: 0.8227 - val_loss: 0.3722
Epoch 5/25
[1m5925/5925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.8216 - loss: 0.3759 - val_accuracy: 0.8232 - val_loss: 0.3693
Epoch 6/25
[1m5925/5925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step - accuracy: 0.8318 - loss: 0.3582 - val_accuracy: 0.8259 - val_loss: 0.3824
Epoch 7/25

In [140]:
#Encode combined column for 'is_fishing' & 'vessel_type' combinations
df_cleanedv2 = df_cleaned.copy()
df_cleanedv2['is_fishing'] = df_cleanedv2['is_fishing'].round().astype(int)


#(is_fishing, vessel_type) -> class id: 0-5 are the six vessel types when not fishing, 6-11 the same types when fishing
mapping = {
    (0, 0): 0,
    (0, 1): 1,
    (0, 2): 2,
    (0, 3): 3,
    (0, 4): 4,
    (0, 5): 5,
    (1, 0): 6,
    (1, 1): 7,
    (1, 2): 8,
    (1, 3): 9,
    (1, 4): 10,
    (1, 5): 11,
}


#Look each pair up directly; a row-wise DataFrame.apply would construct a Series object for every row
df_cleanedv2['combined'] = [
    mapping.get(pair)
    for pair in zip(df_cleanedv2['is_fishing'], df_cleanedv2['vessel_type'])
]

Model 3: 'is_fishing' & 'vessel_type' classification

In [141]:
#Prep data with overlapping sequences, labeled by the 12-class 'combined' column
X, y = prep_data_for_cnn_overlap(df_cleanedv2, label_column='combined')

#Hold out 20% of the sequences; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=999,
)

shape_report = [
    f"Train X shape: {X_train.shape}, Test X shape: {X_test.shape}",
    f"Train Y shape: {y_train.shape}, Test Y shape: {y_test.shape}",
]
print("\n".join(shape_report))

Train X shape: (379140, 10, 5), Test X shape: (94786, 10, 5)
Train Y shape: (379140,), Test Y shape: (94786,)


In [142]:
#Deeper stack than the first two models; padding='same' keeps the sequence length through every convolution
conv3 = tf.keras.Sequential()
conv3.add(tf.keras.layers.Input(shape=(10, 5)))
conv3.add(tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu', padding='same'))
conv3.add(tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu', padding='same'))
conv3.add(tf.keras.layers.MaxPooling1D(pool_size=3))
conv3.add(tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu', padding='same'))
conv3.add(tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu', padding='same'))
conv3.add(tf.keras.layers.MaxPooling1D(pool_size=2))
conv3.add(tf.keras.layers.Conv1D(256, kernel_size=3, activation='relu', padding='same'))
conv3.add(tf.keras.layers.GlobalMaxPooling1D())
conv3.add(tf.keras.layers.Dense(64, activation='relu'))
conv3.add(tf.keras.layers.Dense(12, activation='softmax')) #'softmax' for 12 classes; similar structure to previous models with added 1Dconv layers

#Compile
conv3.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

#Fit; the held-out split is also passed as validation data
conv3.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=128,
)

#Print accuracy
loss3, accuracy3 = conv3.evaluate(X_test, y_test)
print(f"Test Loss: {loss3}, Test Accuracy: {accuracy3}")

Epoch 1/100
[1m2963/2963[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.5292 - loss: 1.1840 - val_accuracy: 0.5998 - val_loss: 0.9526
Epoch 2/100
[1m2963/2963[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.6280 - loss: 0.8974 - val_accuracy: 0.6504 - val_loss: 0.8178
Epoch 3/100
[1m2963/2963[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6804 - loss: 0.7677 - val_accuracy: 0.6875 - val_loss: 0.7300
Epoch 4/100
[1m2963/2963[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.7191 - loss: 0.6805 - val_accuracy: 0.7284 - val_loss: 0.6588
Epoch 5/100
[1m2963/2963[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.7416 - loss: 0.6332 - val_accuracy: 0.7417 - val_loss: 0.6196
Epoch 6/100
[1m2963/2963[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.7569 - loss: 0.6009 - val_accuracy: 0.7705 - val_loss: 0.5739
Epoch 

In [148]:
#Compare models & their accuracy
conv.summary()
conv2.summary()
conv3.summary()

#One line per model, each labeled with its own model number
print(f'''Accuracy for Model 1:{accuracy}''')
print(f'''Accuracy for Model 2:{accuracy2}''')
print(f'''Accuracy for Model 3:{accuracy3}''')

Accruacy for Model 1:0.9610102772712708
Accruacy for Model 1:0.8899943232536316
Accruacy for Model 1:0.9080032706260681
