In [1]:
import math
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from pymongo import MongoClient, ASCENDING


ModuleNotFoundError: No module named 'pymongo'

# Uploading to MongoDB

Small comments:
* We used a local MongoDB instance, because the online one (free plan) has usage and memory restrictions.
* The chunk size and the other parameters were chosen for a database running on the local machine; with these settings the computer did not crash.

In [None]:
# Constants
CSV_FILE_PATH = "/home/jovyan/csv-files/my_data.csv"  # CSV that is read chunk by chunk
CHUNK_SIZE = 10_000  # number of rows per chunk
MONGO_URI = "mongodb://localhost:27017"
DB_NAME = "ieva_dbs"
COLLECTION_NAME = "testas"

# Shared upload state: maps every MMSI seen so far to its unique index,
# and holds the index that the next unseen MMSI will receive
mmsi_index_dict = dict()
current_index = 1

# Function to read CSV in chunks
def read_csv_in_chunks(file_path, chunk_size):
    """Open a CSV file for chunked reading.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.
        chunk_size: Value passed as ``chunksize``.

    Returns:
        Whatever ``pd.read_csv`` returns when ``chunksize`` is given: an
        iterable that yields the file as consecutive DataFrames.
    """
    chunk_reader = pd.read_csv(file_path, chunksize=chunk_size)
    return chunk_reader

# Serialises access to the shared MMSI -> index mapping and to `current_index`.
# Chunks are uploaded from several threads, and both the "is this MMSI new?"
# check and the counter increment are read-modify-write sequences.
_MMSI_INDEX_LOCK = threading.Lock()


# Function to upload chunk to MongoDB
def upload_chunk_to_mongodb(chunk, uri, db_name, collection_name, mmsi_index_dict):
    """Clean one CSV chunk, tag every row with its MMSI index and insert it.

    Each row is converted to a dict with its null-valued fields removed. Rows
    without a truthy ``MMSI`` are dropped. Every remaining row receives a
    ``unique_index`` field holding the index assigned to its MMSI.

    Args:
        chunk: DataFrame with the rows of one chunk.
        uri: MongoDB connection string.
        db_name: Name of the database to write to.
        collection_name: Name of the collection to write to.
        mmsi_index_dict: Shared ``{MMSI: index}`` mapping. It is updated in
            place whenever a previously unseen MMSI is encountered; the module
            level ``current_index`` counter is advanced accordingly.

    Returns:
        None. Nothing is written (and no client is created) when no row
        survives the filtering.

    Raises:
        Whatever ``insert_many`` raises; the client is closed in that case too.
    """
    global current_index

    # Strip null fields and drop rows without an MMSI. This only touches data
    # local to this call, so it is done outside the lock.
    filtered_records = []
    for record in chunk.to_dict('records'):
        record = {k: v for k, v in record.items() if pd.notna(v)}
        if record.get('MMSI'):
            filtered_records.append(record)

    if not filtered_records:
        return

    # Assign indices under the lock so that two threads can neither hand out
    # the same index to different MMSIs nor two indices to the same MMSI.
    with _MMSI_INDEX_LOCK:
        for record in filtered_records:
            mmsi = record['MMSI']
            if mmsi not in mmsi_index_dict:
                mmsi_index_dict[mmsi] = current_index
                current_index += 1
            record['unique_index'] = mmsi_index_dict[mmsi]

    # Create a new MongoClient instance for this call and always release it,
    # even when the insert fails.
    client = MongoClient(uri)
    try:
        client[db_name][collection_name].insert_many(filtered_records)
    finally:
        client.close()

# Main function to read CSV and upload in parallel
def main():
    """Read the configured CSV chunk by chunk and upload the chunks on worker threads.

    One upload task is submitted per chunk. The function then waits for the
    tasks in submission order; ``result()`` re-raises the exception of a task
    that failed.
    """
    chunk_reader = read_csv_in_chunks(CSV_FILE_PATH, CHUNK_SIZE)

    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(
                upload_chunk_to_mongodb,
                chunk,
                MONGO_URI,
                DB_NAME,
                COLLECTION_NAME,
                mmsi_index_dict,
            )
            for chunk in chunk_reader
        ]

        for task in pending:
            task.result()

# Start the upload only when this code runs as the main module
# (the call is skipped when the code is imported from elsewhere).
if __name__ == "__main__":
    main()


# Processing data from Mongo DB collection

Small comments:
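* We used vessel speed as an additional filter: a position is discarded when reaching it from the previously accepted position would require a speed above the 40-knot threshold.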

In [None]:
# Connect to the upload collection and add an ascending index on MMSI,
# the field that the later per-vessel queries filter on
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]
collection.create_index(
    [("MMSI", ASCENDING)],
)


In [None]:
# MongoDB handles for the processing step: uploaded rows are read from
# `source_collection`, rows that pass the filters are written to `target_collection`
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
target_collection = db["processed"]
source_collection = db[COLLECTION_NAME]

# Helper functions
def is_valid_coordinate(lat, lon):
    """Tell whether ``lat``/``lon`` form a position inside the valid ranges.

    Args:
        lat: Latitude in degrees; valid when within [-90, 90].
        lon: Longitude in degrees; valid when within [-180, 180].

    Returns:
        True when both values are in range, False otherwise. Values that
        cannot be compared with numbers (for example ``None``, which is what
        ``doc.get(...)`` yields for a document without that field) are
        reported as invalid instead of raising ``TypeError``.
    """
    try:
        return bool(-90 <= lat <= 90 and -180 <= lon <= 180)
    except TypeError:
        # Missing or non-numeric coordinate: treat as an invalid position
        return False

def parse_timestamp(timestamp):
    """Parse a ``DD/MM/YYYY HH:MM:SS`` string into a ``datetime``.

    Args:
        timestamp: Timestamp text in the format ``%d/%m/%Y %H:%M:%S``.

    Returns:
        The parsed ``datetime``, or ``None`` when the value does not match the
        format or is not a string at all (e.g. ``None`` for a missing field).
    """
    try:
        return datetime.strptime(timestamp, "%d/%m/%Y %H:%M:%S")
    except (ValueError, TypeError):
        # ValueError: text in a different format; TypeError: non-string input
        return None

# This function intends to calculate vessel speed to remove obvious GPS jamming (location jumps)
def calculate_speed(lat1, lon1, lat2, lon2, time_diff):
    """Average speed in knots needed to travel between two positions.

    The distance is the great-circle distance from the haversine formula on a
    sphere with a radius of 3440.065 nautical miles.

    Args:
        lat1, lon1: Start position in degrees.
        lat2, lon2: End position in degrees.
        time_diff: ``timedelta`` elapsed between the two positions.

    Returns:
        Speed in knots (nautical miles per hour).

    Raises:
        ZeroDivisionError: If ``time_diff`` is zero.
    """
    R = 3440.065  # Radius of the Earth in nautical miles
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) fail with a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c  # Distance in nautical miles
    hours = time_diff.total_seconds() / 3600
    return distance / hours  # Speed in knots

def process_documents(documents, max_speed_knots=40):
    """Filter one vessel's documents down to a plausible, time-ordered track.

    Documents are visited in ascending timestamp order. A document is dropped
    when:

    * its ``# Timestamp`` is missing, not a string, or not parseable;
    * its ``Latitude``/``Longitude`` is missing or out of range;
    * its timestamp equals that of a document already visited with valid
      coordinates (including ones later rejected by the speed filter);
    * reaching it from the previously accepted document would need a speed
      above ``max_speed_knots``.

    Every accepted document except the first gets a ``TimeDiff`` field with
    the number of seconds since the previously accepted document.

    Args:
        documents: List of document dicts. The list itself is not reordered;
            accepted documents are modified in place (``TimeDiff`` is added).
        max_speed_knots: Speed threshold in knots; defaults to 40.

    Returns:
        The accepted documents, ordered by timestamp.
    """
    # Parse each timestamp once and set aside documents that cannot be
    # ordered; sorting on a missing/unparseable timestamp would fail.
    timed_docs = []
    for doc in documents:
        raw_timestamp = doc.get("# Timestamp")
        if not isinstance(raw_timestamp, str):
            continue
        date_time = parse_timestamp(raw_timestamp)
        if date_time is not None:
            timed_docs.append((date_time, doc))

    # Sort on the parsed timestamp only; the sort is stable, so documents with
    # equal timestamps keep their original relative order.
    timed_docs.sort(key=lambda pair: pair[0])

    valid_documents = []
    seen_timestamps = set()
    previous = None  # (timestamp, document) of the last accepted document

    for date_time, doc in timed_docs:
        lat = doc.get("Latitude")
        lon = doc.get("Longitude")

        if lat is None or lon is None or not is_valid_coordinate(lat, lon):
            continue

        if date_time in seen_timestamps:
            continue
        seen_timestamps.add(date_time)

        if previous is not None:
            previous_time, previous_doc = previous
            # Strictly positive: timestamps are sorted and duplicates were
            # skipped above, so no zero-division guard is needed here.
            time_diff = date_time - previous_time
            speed = calculate_speed(previous_doc["Latitude"], previous_doc["Longitude"], lat, lon, time_diff)

            if speed > max_speed_knots:
                continue

            doc["TimeDiff"] = time_diff.total_seconds()

        valid_documents.append(doc)
        previous = (date_time, doc)

    return valid_documents

def worker(mmsi):
    """Filter the documents of one MMSI and store the result.

    All documents with the given MMSI are loaded from ``source_collection``.
    A vessel with fewer than 100 documents is skipped with a message;
    otherwise the documents go through ``process_documents`` and any surviving
    ones are inserted into ``target_collection``.

    NOTE(review): a later cell defines another function that is also called
    ``worker``; after that cell has run, this definition is shadowed.
    """
    documents = [doc for doc in source_collection.find({"MMSI": mmsi})]

    if len(documents) >= 100:
        # Process and insert documents
        processed_docs = process_documents(documents)
        if processed_docs:
            target_collection.insert_many(processed_docs)
    else:
        print(f"Skipping MMSI {mmsi} - fewer than 100 documents.")

# One task per distinct MMSI found in the source collection
unique_mmsis = source_collection.distinct("MMSI")

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(worker, vessel_mmsi)
        for vessel_mmsi in unique_mmsis
    ]

    # Handle tasks as they finish; a failing task is reported and does not
    # stop the remaining ones
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(f"Error processing: {e}")

print("Processing complete.")


In [None]:
# Connect to the "processed" collection and add an ascending index on MMSI
# for the analysis that follows
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db["processed"]
collection.create_index(
    [("MMSI", ASCENDING)],
)

# Histograms

In [None]:
# MongoDB handle for the histogram section: the data comes from the
# "processed" collection
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
target_collection = db["processed"]

def extract_data(documents):
    """Reduce documents to the three fields needed for the plots.

    Args:
        documents: Iterable of document dicts.

    Returns:
        A list with one dict per document that has both ``MMSI`` and
        ``# Timestamp``. Each dict holds those two values plus ``TimeDiff``,
        which is ``None`` when the document has no such field. Documents
        lacking ``MMSI`` or ``# Timestamp`` are left out.
    """
    return [
        {
            "MMSI": doc["MMSI"],
            "# Timestamp": doc["# Timestamp"],
            "TimeDiff": doc.get("TimeDiff"),
        }
        for doc in documents
        if "MMSI" in doc and "# Timestamp" in doc
    ]

def worker(mmsi):
    """Load all documents of one MMSI from ``target_collection`` and return
    the result of ``extract_data`` for them."""
    matching = target_collection.find({"MMSI": mmsi})
    return extract_data([doc for doc in matching])

# Collect the plot fields of every vessel in the processed collection,
# one task per distinct MMSI
unique_mmsis = target_collection.distinct("MMSI")
all_data = []

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(worker, vessel_mmsi)
        for vessel_mmsi in unique_mmsis
    ]

    # Results are appended in completion order; a failing task is only reported
    for future in as_completed(futures):
        try:
            data = future.result()
            all_data.extend(data)
        except Exception as e:
            print(f"Error processing: {e}")


In [None]:
# Preparing data for histograms: keep only rows that have a TimeDiff and
# add the timestamp parsed as a datetime column
df = (
    pd.DataFrame(all_data)
    .dropna(subset=["TimeDiff"])
    .assign(
        ParsedTimestamp=lambda frame: pd.to_datetime(
            frame["# Timestamp"], format="%d/%m/%Y %H:%M:%S"
        )
    )
)

In [None]:
# Number of rows per distinct time difference, ordered by the time difference
time_diff_counts = (
    df["TimeDiff"]
    .value_counts()
    .sort_index()
)
time_diff_counts

In [None]:
# Number of rows per distinct time difference, ordered by the time difference
time_diff_counts = df["TimeDiff"].value_counts().sort_index()

# Turn the counts into a two-column frame for easier handling
time_diff_df = time_diff_counts.reset_index()
time_diff_df.columns = ["TimeDiff", "Count"]

# Histogram of the time differences, each value weighted by how often it occurs
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    time_diff_df["TimeDiff"],
    bins=3000,
    weights=time_diff_df["Count"],
    edgecolor="black",
)
ax.set(
    title="Histogram of Time Differences",
    xlabel="Time Difference (seconds)",
    ylabel="Occurrences",
    xlim=(0, 500),  # limit chosen because nothing is visible beyond this point
)
plt.show()


In [None]:
# Keep only the time differences that occur more than 10 times and show the
# column-wise maximum of what is left
time_diff_df = time_diff_df[
    time_diff_df["Count"] > 10
]
time_diff_df.max()

In [None]:
# Number of rows per distinct timestamp, in chronological order
timestamp_counts = df["ParsedTimestamp"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(timestamp_counts.index, timestamp_counts.values, width=0.04)
ax.set(
    xlabel="Timestamp",
    ylabel="Occurrences",
    title="Bar plot of Timestamps",
)
plt.show()

Last comments:

* From the histogram it can be concluded that the vessel location is mostly updated every 1–20 seconds, which is very common practice in location tracking. However, there are quite a few time differences (delta t) of up to 200 seconds; it is common practice to track less frequently when a ship is idling.
* From the timestamp bar plot it can be seen that the number of signals is similar throughout the day, except for one peak at 3 pm. This is probably caused by some error, or by the tracking interval being set to 1 second at that time.