In [1]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%%time
raw_data = pd.read_csv('C:/Users/vijaya/Desktop/AIS_2017_01_Zone10.csv')

data = raw_data.head(1000000)

data.head()

Wall time: 56.8 s


Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo
0,367114690,2017-01-01T00:00:06,48.51094,-122.60705,0.0,-49.6,511.0,,,,,under way using engine,,,,
1,367479990,2017-01-01T00:00:03,48.15891,-122.67268,0.1,10.1,353.0,WSF KENNEWICK,IMO9618331,WDF6991,1012.0,moored,83.39,19.5,3.2,
2,368319000,2017-01-01T00:00:08,43.34576,-124.32142,0.0,32.8,173.0,,,,,engaged in fishing,,,,
3,367154100,2017-01-01T00:00:15,46.74264,-124.93125,6.8,6.0,352.0,,,,,undefined,,,,
4,367446870,2017-01-01T00:00:59,48.5132,-122.60718,0.0,23.2,511.0,,,,,,,,,


In [3]:
#one MMSI represents one single vessel
pd.set_option('display.max_rows', 1763)
data['MMSI'].value_counts()

477874000    1494
316031266    1493
366765940    1491
367153430    1490
367008020    1483
367033060    1476
367441530    1474
367162190    1473
367327250    1471
367529030    1470
367047170    1469
367419960    1469
367605150    1466
367653630    1466
366435740    1466
367098550    1465
316003666    1464
367331730    1462
367513230    1461
367530080    1460
367060330    1459
367608420    1457
367088940    1456
367384780    1455
367530040    1454
367516730    1453
367763660    1453
367450580    1451
367566980    1450
367170860    1449
367476050    1447
366772990    1446
366709770    1445
367380280    1445
367531260    1444
367434360    1444
369703000    1443
367305920    1442
259005000    1442
367154120    1440
367155110    1438
367513160    1437
366929710    1436
367133160    1435
366773060    1433
366772760    1432
367611420    1431
367301450    1430
367649320    1429
367185050    1429
366772750    1428
366759130    1427
366991520    1427
366987710    1426
366773070    1422
316017162 

In [4]:
#changing dtype of BaseDateTime columns from Object to datetime
data['BaseDateTime'] = pd.to_datetime(data['BaseDateTime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
# find which column contain missing values
data.isnull().any()

MMSI            False
BaseDateTime    False
LAT             False
LON             False
SOG             False
COG             False
Heading         False
VesselName       True
IMO              True
CallSign         True
VesselType       True
Status           True
Length           True
Width            True
Draft            True
Cargo            True
dtype: bool

In [6]:
# put all columns in a list that contain missing values
missing_values_columns = data.columns[data.isnull().any()].tolist()
missing_values_columns

['VesselName',
 'IMO',
 'CallSign',
 'VesselType',
 'Status',
 'Length',
 'Width',
 'Draft',
 'Cargo']

In [7]:
#a dataframe that gives a record about how much missing values exist in any column

count_missing_values = []
for i in missing_values_columns:
    count = data[i].isnull().sum()
    count_missing_values.append(count)
    
missing_values_record = pd.DataFrame({'Column name': missing_values_columns,
                                     'Missing Value Count': count_missing_values})

missing_values_record


Unnamed: 0,Column name,Missing Value Count
0,VesselName,44306
1,IMO,522415
2,CallSign,193714
3,VesselType,111413
4,Status,306071
5,Length,145443
6,Width,295900
7,Draft,584314
8,Cargo,637563


In [8]:
#fill missing values by "linear interpolation".
#data = data.interpolate(method ='linear', limit_direction ='forward') 

data = data.interpolate(method='linear', axis=0).ffill().bfill()

KeyboardInterrupt: 

In [None]:
#fix random seed for reproducibility
import numpy as np
np.random.seed(7)

In [None]:
year = pd.DatetimeIndex(data['BaseDateTime']).year.tolist()

month = pd.DatetimeIndex(data['BaseDateTime']).month.tolist()

day = pd.DatetimeIndex(data['BaseDateTime']).day.tolist()

hour = pd.DatetimeIndex(data['BaseDateTime']).hour.tolist()

minute = pd.DatetimeIndex(data['BaseDateTime']).minute.tolist()

data['year'] = year
data['month'] = month
data['day'] = day
data['hour'] = hour
data['minute'] = minute

In [None]:
data.columns

In [None]:
#Now to BaseDateTime column is no longer for use. Therefore, we can drop that column
data.drop(['BaseDateTime'], axis=1, inplace=True)

In [None]:
#Drop non-numeric columns
data.drop(['VesselName'], axis=1, inplace=True)
data.drop(['IMO'], axis=1, inplace=True)
data.drop(['CallSign'], axis=1, inplace=True)
data.drop(['Status'], axis=1, inplace=True)

In [None]:
#create a column called target with random 1's and 0's as labels for anomolies and non-anomolies 
import numpy as np

data['target'] = np.random.randint(2, size=len(data))

data

In [None]:
features = list(data.columns.values)
features.remove('target')
#print(features)
X = data[features]
y = data['target']

In [None]:
print(X.head())

In [None]:
#divide the data into test and train sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(16,)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

In [None]:
#for a binary classification problem 
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, batch_size=1)

In [None]:
model.summary()

In [None]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)