In [1]:
import pyarrow.csv as pc
import pyarrow as pa
import pandas as pd

# Define the path to your CSV file
csv_file = "./data/cleaned/1.csv"  # Change this to your actual file path

# Read the CSV file into a PyArrow Table
try:
    table = pc.read_csv(csv_file)
    print("CSV file successfully read into a PyArrow Table.")
except Exception as e:
    print(f"Error reading CSV file: {e}")
    exit()

# Print column names and their types
schema = table.schema
print("Columns and their types:")
for field in schema:
    print(f"{field.name}: {field.type}")

# Convert to a pandas DataFrame
df = table.to_pandas()
print("Converted to pandas DataFrame.")

# Display first few rows
print(df.head())


CSV file successfully read into a PyArrow Table.
Columns and their types:
Timestamp: timestamp[ns]
Type of mobile: string
MMSI: int64
Latitude: double
Longitude: double
Navigational status: string
ROT: double
SOG: double
COG: double
Heading: int64
IMO: int64
Callsign: string
Name: string
Ship type: string
Cargo type: string
Width: int64
Length: int64
Type of position fixing device: string
Draught: double
Destination: string
ETA: timestamp[s]
Data source type: string
A: int64
B: int64
C: int64
D: int64
__index_level_0__: int64
__index_level_0__: int64
Converted to pandas DataFrame.
   Timestamp Type of mobile       MMSI   Latitude  Longitude  \
0 2025-01-26        Class A  245241000  55.398643  14.711777   
1 2025-01-26        Class A  245241000  55.398643  14.711777   
2 2025-01-26        Class A  256883000  54.743528  16.130240   
3 2025-01-26        Class A  636020259  55.053283  14.032850   
4 2025-01-26        Class A  215662000  55.781642  15.883830   

      Navigational status  

In [2]:
### Clean MOORED/ANCHORED/ETC.
# Remove rows where nav status is "Moored" or "Anchored"
print("Before moored/anchored: ", df.shape)
df = df[~df['Navigational status'].isin(['Moored', 'Anchored', 'At anchor', 'Reserved for future use', 'Not under command', 'Unknown value', 'Restricted maneuverability'])]
print("After moored/anchored: ", df.shape)


Before moored/anchored:  (9189296, 28)
After moored/anchored:  (9189296, 28)


In [3]:
## 

In [4]:
import pyarrow as pa
import pyarrow.csv as pc  # CSV module in PyArrow

# Convert the DataFrame to an Arrow Table.
table = pa.Table.from_pandas(df)

# Write the Arrow Table to CSV.
pc.write_csv(table, './data/cleaned/1.csv')


ValueError: Duplicate column names found: ['Timestamp', 'Type of mobile', 'MMSI', 'Latitude', 'Longitude', 'Navigational status', 'ROT', 'SOG', 'COG', 'Heading', 'IMO', 'Callsign', 'Name', 'Ship type', 'Cargo type', 'Width', 'Length', 'Type of position fixing device', 'Draught', 'Destination', 'ETA', 'Data source type', 'A', 'B', 'C', 'D', '__index_level_0__', '__index_level_0__']

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def funcz(df):
    # For nicer plots
    sns.set(style='whitegrid', context='talk')

    # ---------------------------
    # Basic DataFrame Information
    # ---------------------------
    print("===== First 5 Rows =====")
    print(df.head())
    
    print("\n===== DataFrame Info =====")
    print(df.info())
    
    print("\n===== Summary Statistics (Numeric) =====")
    print(df.describe())
    
    print("\n===== Summary Statistics (All Columns) =====")
    print(df.describe(include='all'))
    
    # ---------------------------
    # Check for Missing Values
    # ---------------------------
    print("\n===== Missing Values =====")
    print(df.isnull().sum())
    
    # ---------------------------
    # Distribution of Ship Types
    # ---------------------------
    print("\n===== Distribution of Ship Types =====")
    ship_type_counts = df['Ship type'].value_counts()
    print(ship_type_counts)

    # Select the top 6 most common ship types
    top_ship_types = ship_type_counts.nlargest(6)

    # Plot the distribution of ship types
    plt.figure(figsize=(8, 8))
    sns.barplot(x=top_ship_types.index, y=top_ship_types.values, palette="viridis")

    plt.ticklabel_format(style='plain', axis='y')
    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))

    plt.title('Ship Type Distribution')
    plt.xlabel('Ship Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    plt.savefig("ship_type_distribution.png", dpi=300)
    plt.close()
    
    # ---------------------------
    # Analysis of Navigational Status
    # ---------------------------
    print("\n===== Distribution of Navigational Status =====")
    nav_status_counts = df['Navigational status'].value_counts()
    print(nav_status_counts)

    # Select the top 6 most common navigational statuses
    top_nav_status = nav_status_counts.nlargest(6)

    plt.figure(figsize=(8, 8))
    sns.barplot(x=top_nav_status.index, y=top_nav_status.values, palette="rocket")

    plt.ticklabel_format(style='plain', axis='y')
    plt.gca().yaxis.set_major_formatter(plt.matplotlib.ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))

    plt.title('Navigational Status Distribution')
    plt.xlabel('Navigational Status')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    plt.savefig("navigational_status_distribution.png", dpi=300)
    plt.close()
    
    # ---------------------------
    # Analysis of Speed Over Ground (SOG)
    # ---------------------------
    print("\n===== SOG Statistics =====")
    print(df['SOG'].describe())

    plt.figure(figsize=(8, 8))
    sns.histplot(df['SOG'].dropna(), kde=True, bins=30, color='steelblue')
    plt.title('Histogram of Speed Over Ground (SOG)')
    plt.xlabel('SOG')
    plt.ylabel('Frequency')
    plt.ticklabel_format(style='plain', axis='y')
    plt.tight_layout()
    
    plt.savefig("histogram_SOG.png", dpi=300)
    plt.close()
    
    # ---------------------------
    # Analysis of Rate of Turn (ROT)
    # ---------------------------
    print("\n===== ROT Statistics =====")
    print(df['ROT'].describe())

    plt.figure(figsize=(8, 8))
    sns.histplot(df['ROT'].dropna(), kde=True, bins=30, color='indianred')
    plt.title('Histogram of Rate of Turn (ROT)')
    plt.xlabel('ROT')
    plt.ylabel('Frequency')
    plt.ticklabel_format(style='plain', axis='y')
    plt.tight_layout()
    
    plt.savefig("histogram_ROT.png", dpi=300)
    plt.close()
    
    # ---------------------------
    # Analysis of Ship Dimensions: Length vs Width
    # ---------------------------
    plt.figure(figsize=(8, 8))
    sns.scatterplot(x='Length', y='Width', data=df, alpha=0.5, s=20, color='darkgreen')
    plt.title('Ship Length vs Width')
    plt.xlabel('Length')
    plt.ylabel('Width')
    plt.tight_layout()
    
    plt.savefig("ship_length_vs_width.png", dpi=300)
    plt.close()
    
    # ---------------------------
    # Correlation Heatmap of Numeric Columns
    # ---------------------------
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    corr_matrix = df[numeric_cols].corr()

    plt.figure(figsize=(8, 8))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True)
    plt.title('Correlation Heatmap of Numeric Features')
    plt.tight_layout()
    
    plt.savefig("correlation_heatmap.png", dpi=300)
    plt.close()

    print("All plots have been saved as PNG files.")



In [11]:
funcz(df)

===== First 5 Rows =====
   Timestamp Type of mobile       MMSI   Latitude  Longitude  \
0 2025-01-26        Class A  245241000  55.398643  14.711777   
1 2025-01-26        Class A  245241000  55.398643  14.711777   
2 2025-01-26        Class A  256883000  54.743528  16.130240   
3 2025-01-26        Class A  636020259  55.053283  14.032850   
4 2025-01-26        Class A  215662000  55.781642  15.883830   

      Navigational status  ROT   SOG    COG  Heading  ...  Draught  \
0  Under way using engine  0.0   9.6   57.7     60.0  ...      4.3   
1  Under way using engine  0.0   9.6   57.7     60.0  ...      4.3   
2  Under way using engine  0.0  13.4   82.7     84.0  ...      7.3   
3  Under way using engine  0.0  10.6  255.0    255.0  ...      9.6   
4  Under way using engine  0.0  12.8   60.7     60.0  ...      6.9   

  Destination                 ETA Data source type      A      B     C     D  \
0  SODERTALJE 2025-01-27 04:00:00              AIS   85.0   10.0   3.0  11.0   
1  SODERT


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_ship_types.index, y=top_ship_types.values, palette="viridis")



===== Distribution of Navigational Status =====
Navigational status
Under way using engine                                   8823531
Engaged in fishing                                        226835
Under way sailing                                          48035
Constrained by her draught                                 39141
Power-driven vessel pushing ahead or towing alongside      33299
Reserved for future amendment [HSC]                        12515
Power-driven vessel towing astern                           5933
Aground                                                        7
Name: count, dtype: int64



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_nav_status.index, y=top_nav_status.values, palette="rocket")



===== SOG Statistics =====
count    9.188099e+06
mean     1.182806e+01
std      5.104274e+00
min      1.000000e-01
25%      9.500000e+00
50%      1.140000e+01
75%      1.400000e+01
max      1.022000e+02
Name: SOG, dtype: float64

===== ROT Statistics =====
count    8.221441e+06
mean     5.776214e-03
std      4.041687e+00
min     -7.087000e+02
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      7.087000e+02
Name: ROT, dtype: float64
All plots have been saved as PNG files.
