# Import Libraries and Data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime
from sklearn import preprocessing
#import os
%timeit
%matplotlib inline

# Load the AIS extract, reading only the columns this notebook uses.
# The literal string 'None' is treated as missing and PERIOD is parsed as a datetime.
# Three source columns get shorter names: Latitude -> LAT, Longitude -> LONG,
# SHIP_AND_CARGO_TYPE -> SHIP_TYPE.
df = pd.read_csv(
    '/Users/jasongangel/Downloads/AIS/data/WorkOrder7538249_SAIS_2016-01copy.csv',
    usecols=['MMSI','PERIOD', 'Latitude', 'Longitude','SPEED_KNOTS', 'COG_DEG', 'SHIP_AND_CARGO_TYPE','DRAUGHT'],
    na_values='None',
    low_memory=False,
    parse_dates=['PERIOD'],
).rename(columns={'Latitude': 'LAT', 'Longitude': 'LONG', 'SHIP_AND_CARGO_TYPE': 'SHIP_TYPE'})

# Data Characteristics

In [3]:
# Preview the first five rows of the raw frame
df.head()

Unnamed: 0,MMSI,PERIOD,LAT,LONG,SPEED_KNOTS,COG_DEG,SHIP_TYPE,DRAUGHT
0,229454000,2016-01-08 11:50:00,40.374271,-73.103733,15.9,92.0,79,10.0
1,229564000,2016-01-08 11:50:00,34.008233,-56.555884,11.9,73.0,70,13.6
2,229655000,2016-01-08 11:50:00,40.164743,-58.553108,19.1,59.9,70,12.9
3,249904000,2016-01-08 11:50:00,39.218795,-64.369931,18.0,72.1,70,9.9
4,209648000,2016-01-08 11:50:00,42.783833,-52.453734,11.7,290.5,70,16.4


In [4]:
# Summarise the raw frame: index range, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7147784 entries, 0 to 7147783
Data columns (total 8 columns):
MMSI           int64
PERIOD         datetime64[ns]
LAT            float64
LONG           float64
SPEED_KNOTS    float64
COG_DEG        float64
SHIP_TYPE      object
DRAUGHT        object
dtypes: datetime64[ns](1), float64(4), int64(1), object(2)
memory usage: 436.3+ MB


In [5]:
# Missing values expressed as a percentage of the row count.
# NOTE: this counts null *cells*, not rows -- a row with nulls in several
# columns contributes once per null, so this is an upper bound on the share
# of rows that have any missing value.
df.isna().sum().sum() / len(df) * 100

0.8127414034895291

In [6]:
# Count of null entries in each column
df.isna().sum()

MMSI               0
PERIOD             0
LAT                0
LONG               0
SPEED_KNOTS     5610
COG_DEG        21467
SHIP_TYPE          0
DRAUGHT        31016
dtype: int64

In [7]:
# How many distinct MMSI values (vessel identifiers) appear in the raw data
df.MMSI.nunique()

27162

# Data Cleaning

In [24]:
# Take an independent (deep) copy of the raw frame to clean, leaving `df` untouched
df1 = df.copy(deep=True)

MMSI

In [25]:
# Keep only reports whose MMSI is above 200000000, ordered by vessel and then by report time
df1 = df.loc[df['MMSI'] > 200000000].sort_values(by=['MMSI', 'PERIOD'])

# (disabled) Only include MMSIs with over 50 AIS Positions
#counts = df1['MMSI'].value_counts()
#counts = counts[counts > 50]
#df1 = df1[df1['MMSI'].isin(counts[counts > 3].index)]

# Store MMSI as text and take its first three characters as the COUNTRY code
df1['MMSI'] = df1['MMSI'].astype(str)
df1['COUNTRY'] = df1['MMSI'].str[:3]

# Number of distinct vessels left after the filter
df1['MMSI'].nunique()

27126

SPEED_KNOTS

In [26]:
# Discard every report that has no SPEED_KNOTS value
df1 = df1.dropna(subset=['SPEED_KNOTS'])

In [None]:
#Create TIME_DELTA Column = Time since last AIS Report
#df1['TIME_DELTA'] = df1.sort_values(['MMSI','PERIOD']).groupby('MMSI')['PERIOD'].diff()
#df1.head(5)
#Create Distance traveled Since last report

COG_DEG

In [27]:
# Replace missing COG_DEG with 0.
# NOTE(review): 0 is also a valid course, so filled values cannot be told
# apart from real 0-degree reports afterwards -- confirm this is acceptable.
df1['COG_DEG'] = df1['COG_DEG'].fillna(0)

# Round COG_DEG to the nearest whole degree, then store as int.
# A bare astype('int') truncates toward zero (e.g. 72.9 -> 72), which is not rounding.
df1['COG_DEG'] = df1['COG_DEG'].round().astype('int')

SHIP_TYPE and DRAUGHT

In [28]:
# Cast SHIP_TYPE and DRAUGHT to numeric first; incompatible values become NaN
df1[['SHIP_TYPE', 'DRAUGHT']] = df1[['SHIP_TYPE', 'DRAUGHT']].apply(pd.to_numeric, errors='coerce', axis=0)

# Fill DRAUGHT nulls with 0 only AFTER the numeric cast, so values that failed
# to parse are filled as well; otherwise they would survive as NaN and make
# the integer cast below raise.
df1['DRAUGHT'] = df1['DRAUGHT'].fillna(0)

# Keep only rows with 70 <= SHIP_TYPE <= 89 (cargo and tanker vessels); rows
# whose SHIP_TYPE is NaN fail the test and are dropped. The explicit copy makes
# df1 an independent frame so the column assignments below do not write into a
# slice of the previous one.
df1 = df1[df1['SHIP_TYPE'].between(70, 89)].copy()

# Keep only the first digit of SHIP_TYPE (the second digit has no significance here)
df1['SHIP_TYPE'] = (df1['SHIP_TYPE'] // 10).astype('int')

# Round DRAUGHT to the nearest integer; astype('int') alone would truncate (9.9 -> 9)
df1['DRAUGHT'] = df1['DRAUGHT'].round().astype('int')

# Display count of rows per vessel type
df1['SHIP_TYPE'].value_counts()

7    4995895
8    1620692
Name: SHIP_TYPE, dtype: int64

Downcast

In [29]:
# Shrink numeric columns to the smallest dtype pandas will allow:
# the measurement columns via the 'float' downcast, SHIP_TYPE via 'signed'.
for cols, kind in (
    (['SPEED_KNOTS','COG_DEG', 'DRAUGHT','LAT','LONG'], 'float'),
    (['SHIP_TYPE'], 'signed'),
):
    df1[cols] = df1[cols].apply(pd.to_numeric, downcast=kind, errors='coerce', axis=0)

# MMSI is no longer needed once COUNTRY has been derived from it
df1 = df1.drop(columns=['MMSI'])

# Data Analysis

In [None]:
# Preview the first five rows of the cleaned frame
df1.head(5)

In [None]:
# Summarise the cleaned frame: index range, column dtypes and memory usage
df1.info()

In [None]:
# Number of distinct COUNTRY codes in the cleaned data
df1.COUNTRY.nunique()

In [None]:
# Remaining null entries per column after cleaning
df1.isna().sum()

#SPEED_KNOTS Distribution
plt.figure(figsize=(10,4))
sns.distplot(df1['SPEED_KNOTS'],kde=False,bins=30)

#SPEED_KNOTS by SHIP_TYPE
# (the source column SHIP_AND_CARGO_TYPE is renamed to SHIP_TYPE when the data
# is loaded, so the old name is not a column of df1)
plt.figure(figsize=(15,7))
sns.boxplot(x='SHIP_TYPE', y='SPEED_KNOTS',data=df1)

#DRAUGHT Distribution
plt.figure(figsize=(10,4))
df1['DRAUGHT'].hist(bins=50)

#(disabled) Row counts per SHIP_TYPE
#df1['SHIP_TYPE'].value_counts()
#plt.figure(figsize=(15,7))
#sns.countplot(x='SHIP_TYPE',data=df1)

#Number of AIS reports per COUNTRY
plt.figure(figsize=(100,10))
sns.countplot(x='COUNTRY',data=df1)

In [None]:
#Show Stats for TIME_DELTA
#df1['TIME_DELTA'].describe()

# Pre-Processing

In [None]:
# Start pre-processing from a copy of the cleaned frame so df1 stays unchanged.
# Without this assignment df2 is only defined by the final cell of the
# notebook, and this cell raises NameError on a fresh top-to-bottom run.
df2 = df1.copy()
df2.info()

In [None]:
from numpy import array
from numpy import argmax
from keras.utils import to_categorical
# Ship-type codes as a NumPy array, ready for one-hot encoding.
# The column is called SHIP_TYPE in this notebook: SHIP_AND_CARGO_TYPE is
# renamed when the data is loaded, so indexing by the old name raises KeyError.
data = array(df2['SHIP_TYPE'])
data

In [None]:
# One-hot encode the ship-type codes and show the resulting matrix
encoded = to_categorical(data)
print(encoded)
# Invert the encoding for the first row: the index of its largest entry
inverted = encoded[0].argmax()
print(inverted)

In [None]:
import scipy
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the numeric feature columns in place.
# Each column is listed once (DIM_STERN used to appear twice), and the list is
# limited to columns that exist in df2: the DIM_* columns are not among the
# columns read when the raw data is loaded, and selecting a missing column
# raises KeyError.
scale_cols = ['SPEED_KNOTS', 'DRAUGHT', 'DIM_BOW', 'DIM_STERN', 'DIM_STARBOARD', 'DIM_PORT', 'COG_DEG']
scale_cols = [col for col in scale_cols if col in df2.columns]
scaler = MinMaxScaler()
df2[scale_cols] = scaler.fit_transform(df2[scale_cols])

In [None]:
# One-hot encode the categorical columns. The columns are named explicitly
# because a list-valued `prefix` must have one entry per encoded column, and
# the default selection (object/category columns only) does not pick up the
# numeric SHIP_TYPE column.
# NOTE(review): assumes 'ST_' is meant for SHIP_TYPE and 'CT_' for COUNTRY -- confirm.
df2 = pd.get_dummies(df2, columns=['SHIP_TYPE', 'COUNTRY'], prefix=['ST_', 'CT_'])

In [None]:
#Downcast float columns
# Each column is listed once (DIM_STERN and COG_DEG used to be repeated), and
# the list is limited to columns that exist in df2 so that a column missing
# from the frame does not raise KeyError.
cols = ['SPEED_KNOTS', 'COG_DEG', 'DRAUGHT', 'DIM_BOW', 'DIM_STERN', 'DIM_STARBOARD', 'DIM_PORT']
cols = [col for col in cols if col in df2.columns]
df2[cols] = df2[cols].apply(pd.to_numeric, downcast='float', errors='coerce', axis=0)
df2.info()

# Export

In [None]:
import csv
# Write the cleaned frame to the Clean/ folder without the index column.
# pandas uses the path literally (it does not treat '*' as a placeholder), so
# the file name carries no '*'; a 'Clean-*.csv' glob still matches it.
df1.to_csv('/Users/jasongangel/Downloads/AIS/data/Clean/Clean-01_16.csv', index=False, encoding='utf-8')
#quoting=csv.QUOTE_NONE

# Datashader

#DATASHADER
import datashader as ds
import datashader.transfer_functions as tf
import dask.dataframe as dd
dd = dd.read_csv('/Users/jasongangel/Downloads/AIS/data/Clean/-*.csv').persist()

canvas = ds.Canvas(plot_width=500, plot_height=300, 
                   x_range=(-8,8), y_range=(-8,8), 
                   x_axis_type='linear', y_axis_type='linear')

%time tf.shade(ds.Canvas().points(dd,'LONG','LAT'))

import glob

# pd.read_csv opens exactly the path it is given and does not expand '*', so
# the wildcard is resolved here and the matching files are concatenated.
clean_files = sorted(glob.glob('/Users/jasongangel/Downloads/AIS/data/Clean/Clean-*.csv'))
if not clean_files:
    raise FileNotFoundError('No files match /Users/jasongangel/Downloads/AIS/data/Clean/Clean-*.csv')
df2 = pd.concat([pd.read_csv(path) for path in clean_files], ignore_index=True)
df2.info()