In [2]:
import pandas as pd

# Load the tracking CSV into a DataFrame.
df = pd.read_csv('sample_data/finestapi_tracking_db.csv')

# Column reference
# ----------------
# Row bookkeeping
#   id          : database table row identifier
#   updated     : update time of the row in question
# Vessel
#   ship        : name of the ship
#   imo         : IMO number of the ship
# Position and motion
#   lat         : latitude (dd.dddd)
#   long        : longitude (ddd.dddd)
#   sog         : Speed Over Ground
#   cog         : Course Over Ground
#   hdg         : Heading
# Departure
#   depPort     : UNLOCODE of the port of departure: EETLL or FIHEL
#   etdSchedule : scheduled departure time given by the shipping company
#   etd         : departure time estimated by FinEstAPI
#   atd         : actual departure time of the ship
# Arrival
#   arrPort     : UNLOCODE of the port of arrival: EETLL or FIHEL
#   etaSchedule : scheduled arrival time
#   eta         : estimated arrival time
#   ata         : actual arrival time

# Print the schema: the dtype pandas assigned to each column on load.
schema = df.dtypes
print(schema)

id               int64
updated         object
ship            object
imo              int64
lat            float64
long           float64
sog            float64
cog              int64
hdg              int64
depPort         object
etdSchedule     object
etd             object
atd             object
arrPort         object
etaSchedule     object
eta             object
ata             object
dtype: object


1. Data cleaning:
+ Ensure consistent ship names (some names are capitalized)
+ Drop rows with missing fields.
+ Ensure consistent date format across all columns.

In [3]:
# Ship names appear with inconsistent capitalisation; fold them to lower case
# so the same vessel always carries the same label.
df['ship'] = df['ship'].str.lower()

# Keep only fully populated rows: a row with a missing value in any column
# is discarded.
df = df.dropna()

# Scheduled / estimated / actual timestamps for departure and arrival.
date_columns = ['etdSchedule', 'etd', 'atd', 'etaSchedule', 'eta', 'ata']
# Day-first pattern with seconds. Defined here but not referenced by the
# conversion below.
date_format = '%d/%m/%Y %H:%M:%S'

# Parse every timestamp column to datetime. format='mixed' lets pandas infer
# the layout of each value individually, and dayfirst=True tells the parser to
# read the day before the month.
df[date_columns] = df[date_columns].apply(
    pd.to_datetime, format='mixed', dayfirst=True
)

# Show the cleaned frame, then the number of rows that survived cleaning.
print(df)
num_rows = len(df)
print(num_rows)

            id           updated       ship      imo      lat     long  sog  \
238       4374  05/04/2018 21:24   megastar  9773064  59.4452  24.7717  3.9   
243       4379  05/04/2018 21:27       star  9364722  60.1477  24.9149  3.2   
495       4631  06/04/2018 06:27   megastar  9773064  60.1478  24.9146  0.5   
497       4633  06/04/2018 06:28       star  9364722  59.4444  24.7705  0.7   
722       4859  06/04/2018 09:27   megastar  9773064  59.4451  24.7713  2.9   
...        ...               ...        ...      ...      ...      ...  ...   
823216  911728  14/03/2019 22:26   megastar  9773064  59.4453  24.7717  3.3   
823220  911732  14/03/2019 22:28       star  9364722  60.1481  24.9150  0.9   
823458  911970  15/03/2019 06:08  finlandia  9214379  60.1487  24.9112  1.3   
823652  912164  15/03/2019 07:26       star  9364722  59.4451  24.7715  2.5   
823659  912171  15/03/2019 07:29   megastar  9773064  60.1478  24.9146  1.1   

        cog  hdg depPort         etdSchedule       