In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from math import radians, cos, sin, acos
import matplotlib.pyplot as plt





In [None]:
# Read the voyage event log from disk into a DataFrame
voyages_csv = '../data/voyages.csv'
data = pd.read_csv(voyages_csv)

# Preview the leading rows as a quick confirmation that the load worked
print("Data loaded successfully:")
preview = data.head()
display(preview)



In [None]:
# Convert dateStamp and timeStamp to a single datetime column.
# dateStamp is added as a number of days to the 1899-12-30 epoch and timeStamp is
# scaled by 24 and added as hours, i.e. it is handled as a fraction of a day.
# NOTE(review): this looks like spreadsheet serial dates — confirm against the data source.
data['event_time'] = (
    pd.to_datetime('1899-12-30')
    + pd.to_timedelta(data['dateStamp'], unit='D')
    # Lowercase 'h': the uppercase 'H' unit alias is deprecated in recent pandas releases.
    + pd.to_timedelta(data['timeStamp'] * 24, unit='h')
)
print("Event times calculated:")
display(data[['id', 'event', 'event_time']].head())

# Calculate previous event time and coordinates.
# shift(1) pairs every row with the row directly above it, so the first row gets
# NaT/NaN and the result is only meaningful if rows are already in chronological order.
# NOTE(review): if the file interleaves several vessels or voyages, the shift should
# be done per group instead — verify against the data.
data['prev_event_time'] = data['event_time'].shift(1)
data['prev_lat'] = data['lat'].shift(1)
data['prev_lon'] = data['lon'].shift(1)
print("Previous event times and coordinates calculated:")
display(data[['id', 'event', 'prev_event_time', 'prev_lat', 'prev_lon']].head())

# Calculate time difference in hours between each event and the one before it
data['time_diff_hours'] = (data['event_time'] - data['prev_event_time']).dt.total_seconds() / 3600.0
print("Time differences calculated:")
display(data[['id', 'event', 'time_diff_hours']].head())

# Function to calculate the great-circle distance between two coordinates
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in miles between two points.

    Despite the name, the computation uses the spherical law of cosines
    (``acos`` of the cosine of the central angle), not the haversine formula.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in miles as a float. NaN inputs propagate to a NaN result.
    """
    R = 3959  # Earth radius in miles
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    cos_angle = cos(lat1) * cos(lat2) * cos(lon2 - lon1) + sin(lat1) * sin(lat2)
    # Floating-point rounding can push the value a hair outside [-1, 1] for
    # identical or antipodal points, which would make acos raise
    # "math domain error". Explicit comparisons (rather than min/max) are used
    # so that a NaN value is left untouched and still yields NaN.
    if cos_angle > 1.0:
        cos_angle = 1.0
    elif cos_angle < -1.0:
        cos_angle = -1.0
    return R * acos(cos_angle)

# Calculate distance travelled since the previous event.
# Rows without a previous latitude (the first row after the shift) get 0.
data['distance_travelled'] = data.apply(
    lambda row: haversine(row['lat'], row['lon'], row['prev_lat'], row['prev_lon']) if pd.notnull(row['prev_lat']) else 0,
    axis=1
)
print("Distances travelled calculated:")
display(data[['id', 'event', 'distance_travelled']].head())

# Calculate sailing time and port stay duration.
# time_diff_hours on a row is the time elapsed since the PREVIOUS event, so:
#   - on an EOSP row it covers the leg that has just ended  -> sailing time
#   - on a SOSP row it covers the wait before departure     -> port stay duration
# The two event codes were previously swapped, which labelled port stays as
# sailing time and vice versa.
# NOTE(review): SOSP/EOSP are read here as Start/End Of Sea Passage and rows are
# assumed to be in chronological order — confirm against the data dictionary.
data['sailing_time'] = np.where(data['event'] == 'EOSP', data['time_diff_hours'], np.nan)
data['port_stay_duration'] = np.where(data['event'] == 'SOSP', data['time_diff_hours'], np.nan)
print("Sailing times and port stay durations calculated:")
display(data[['id', 'event', 'sailing_time', 'port_stay_duration']].head())



In [None]:
# Draw sailing and port-stay durations as two bar series on one time axis
fig, ax = plt.subplots(figsize=(12, 6))

# Each series keeps only the rows where its own duration column is populated
df_sailing = data[data['sailing_time'].notna()]
df_port = data[data['port_stay_duration'].notna()]

# Sailing-time bars
ax.bar(
    df_sailing['event_time'],
    df_sailing['sailing_time'],
    width=0.1,
    label='Sailing Time (hours)',
)

# Port-stay bars, in orange to set them apart from the sailing bars
ax.bar(
    df_port['event_time'],
    df_port['port_stay_duration'],
    width=0.1,
    label='Port Stay Duration (hours)',
    color='orange',
)

# Axis labels, title and legend so the figure stands on its own
ax.set_xlabel('Event Time')
ax.set_ylabel('Duration (hours)')
ax.set_title('Voyage Timeline')
ax.legend()
plt.show()