In [1]:
import numpy as np
import pandas as pd
import glob
import geopy
 import geopy.distance
import datetime
import sys
import time

# Classifier training experiment

Loading the csv file containing the preprocessed data. The bool represents the route identification (bool=0 is south, bool=1 is North)

In [32]:
path = r"train_ship_data_final_classifier.csv"
df=pd.read_csv(path,sep=',',low_memory=False)
df

Unnamed: 0,Length,Breadth,Draught,Longitude,Latitude,SOG,bool,ETA
0,152,24,6.8,5.838777,53.63944,14.3,0,12.685507
1,152,24,6.8,5.845800,53.64076,14.3,0,12.667177
2,152,24,6.8,5.852149,53.64198,14.3,0,12.650496
3,152,24,6.8,5.858515,53.64320,14.3,0,12.633836
4,152,24,6.8,5.864876,53.64441,14.3,0,12.617179
...,...,...,...,...,...,...,...,...
186520,229,38,13.3,9.806900,53.55355,7.4,1,0.083519
186521,229,38,13.3,9.810224,53.55293,7.3,1,0.066774
186522,229,38,13.3,9.813442,53.55230,7.2,1,0.049974
186523,229,38,13.3,9.816650,53.55172,7.1,1,0.033443


In [33]:
# from google.colab import drive
# drive.mount('/content/drive')

df

This section is the loading of the relevant machine learning libraries

In [6]:
#Training part of the data
from sklearn import metrics

from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from hyperopt import fmin, tpe, hp,STATUS_OK, Trials
import matplotlib.pyplot as plt
#!pip install tpot
from tpot import TPOTClassifier




## Data preperation (Train-test split)
The data set is now split into the training and test set. The training set will have 75% of the original data set,while the test data has 25% of the data set. The features of the data set are: 
<ol>
<li>Length</li>
<li>Breadth</li>
<li>Draught</li>
<li>Longitude</li>
<li>Latitude</li>
<li>SOG</li>
</ol>
The labels is the route taken i.e bool

In [9]:
#Features
features = df.drop('bool', axis=1)
features= np.array(features)

# Labels
labels= np.array(df['bool'])

feature_list = list(df.columns)

#Splitting the data to train and test

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.25, random_state = 42)

print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:',  y_test.shape)

For the training, logistic regression was selected and in this section for classification and their default values(no tuning) is used to train and show the accuracy results

In [None]:
# Training part with logistic regression without tuning(Default values)

lr = LogisticRegression()

start_time = time.time()

lr.fit(X_train, y_train)

sv_notune= time.time() - start_time

y_pred= lr.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


## Tuning Section
This section now deals with tuining the Logistic regression 

In [11]:
# LRC object to be used for tuning 

lrc = LogisticRegression()



## Tuning Section
The following HPO algorithims will be used for tuning
1.Random search
2.Bayesian Optimization
3.Genetic Algorithims

For the logistic regression algorithm. The following hyperparameters are used for tuning
- C : regalurization strength
- Maximum iterations (max_iter) : Maximum number of iterations taken for the solvers to converge
- Solver : Algorithm to use in the optimization problem.
- Penalty :Used to specify the norm used in the penalization



## Random Search
With the hyperparameters mentioned above , a 'search space' is used to sample from and find the best possible hyperparameters for logistic regresssion.

In [18]:
# Tuning LRC with the following algorithims
        # 1) Random
        # 2) Bayesian
        #  3) Genetic algorithims
        
# Random searcch set parameters
C=np.random.uniform(0.01,200,30)
max_iter = np.random.randint(50,2000,30)
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty= ['none', 'l1', 'l2', 'elasticnet']
random_grid = {'C': C, 'solver':solver,
               'penalty': penalty,'max_iter':max_iter
               }
random_grid 

#Random search tuning
lr_random = RandomizedSearchCV(estimator = lrc, param_distributions = random_grid, n_iter = 1000, cv = 3, verbose=10, random_state=42, n_jobs =-1)
start_time = time.time()
lr_random.fit(X_train, y_train)

random_time =time.time() - start_time

#Parameters selected for random search
lr_random.get_params()


In [19]:
#Testing on the test features, predictions for random search
y_pred1=lr_random.predict(X_test)

print("%s seconds it took to train with Random search tuning" % (random_time))
print(classification_report(y_test,y_pred1))

43537.440590143204 seconds it took to train with Random search tuning
              precision    recall  f1-score   support

           0       0.83      0.96      0.89     35954
           1       0.73      0.36      0.48     10678

    accuracy                           0.82     46632
   macro avg       0.78      0.66      0.69     46632
weighted avg       0.81      0.82      0.80     46632



## Bayesian Optimizaition
The same search space is used here is the same as that of Random search. The number of trials for the bayesian optimization will be 50 trials.

In [20]:
#Tuning with Bayesian optimization

search_space= {'C': hp.uniform ('C', 0.01, 200),
        'max_iter': hp.uniform ('max_iter', 50, 2000)
         'solver' : hp.choice('solver',['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) 
        'penalty' : hp.choice('penalty',['none', 'l1', 'l2', 'elasticnet']) 
    }
def objective(search_space):
    bayes = LogisticRegression(C = search_space['C'],max_iter = search_space['max_iter'],solver=search_space['solver'],penalty=search_space['penalty'])
    accuracy = cross_val_score(bayes, X_train, y_train, cv = 4).mean()
    return {'loss': accuracy, 'status': STATUS_OK }

trials = Trials()


# Training from bayes tuning


start_time = time.time()


best = fmin(fn= objective,
            space= search_space,
            algo= tpe.suggest,
            max_evals = 50,
            trials= trials)
best


slv = {0: 'newton-cg', 1: 'lbfgs', 2: 'liblinear', 3: 'sag', 4:'saga' }
pen = {0: 'none', 1: 'l1', 2: 'l2','elasticnet'}
trainedlrc  = LogisticRegression(C = best['C'],max_iter = best['max_iter'],solver= slv[best['solver']],penalty= pen[best['penalty']]).fit(X_train, y_train)

bayesian_time =time.time() - start_time

y_pred2 = trainedlrc.predict(X_test)

 26%|██▌       | 13/50 [00:30<01:20,  2.18s/trial, best loss: 0.7902253892725529]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 44%|████▍     | 22/50 [00:51<01:05,  2.33s/trial, best loss: 0.7902253892725529]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver o

 52%|█████▏    | 26/50 [00:59<00:53,  2.22s/trial, best loss: 0.7902253892725529]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 60%|██████    | 30/50 [01:07<00:40,  2.03s/trial, best loss: 0.7902253892725529]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver o

 62%|██████▏   | 31/50 [01:09<00:35,  1.85s/trial, best loss: 0.7902253892725529]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 92%|█████████▏| 46/50 [01:43<00:09,  2.44s/trial, best loss: 0.7902253890681619]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



100%|██████████| 50/50 [01:51<00:00,  2.24s/trial, best loss: 0.7902253890681619]


## Genetic Algrorithim

5 Generations are used for testing and there is an early stoppage time of 5 minutes.

In [21]:
# The genetic algorithim part

lrc_tpot = TPOTClassifier(verbosity=3, 
 scoring='accuracy', 
 random_state=32, 
 n_jobs=-1, 
 generations=5, 
 population_size=20,
 early_stop=5)

start_time = time.time()

lrc_tpot.fit(X_train, y_train) 

genetic_time = time.time() - start_time

32 operators have been imported by TPOT.


HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…

Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.
Skipped pipeline #32 due to time out. Continuing to the next pipeline.

Generation 1 - Current Pareto front scores:
-1	0.9843380234899428	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=gini, ExtraTreesClassifier__max_features=1.0, ExtraTreesClassifier__min_samples_leaf=9, ExtraTreesClassifier__min_samples_split=3, ExtraTreesClassifier__n_estimators=100)
-2	0.9846454073523903	GaussianNB(RandomForestClassifier(input_matrix, RandomForestClassifier__bootstrap=True, RandomForestClassifier__criterion=gini, RandomForestClassifier__max_features=0.9000000000000001, RandomForestClassifier__min_samples_leaf=12, RandomForestClassifier__min_samples_split=13, RandomForestClassifier__n_estimators=100))
Generation 2 - Current Pareto front scores:
-1	0.9843380234899428	ExtraTreesClassifier(input_matrix, ExtraTree

In [22]:
predictions3 = lrc_tpot.predict(X_test)

## Results

The results for LRC with the time it took to train. The results are displayed as a classification report with precision,recall,accuracy and f1 score recorded.

In [23]:
# Results of training (time and classification report)

#1 Result of LRC without tuning
print("%s seconds it took to train without tuning" % (sv_notune))    
print(classification_report(y_test,y_pred))

#2 Result of LRC with Random search tuning
print("%s seconds it took to train with Random search tuning" % (random_time))
print(classification_report(y_test,y_pred1))

#3 Result of LRC with Bayesian search tuning 
print("%s seconds it took to train with Bayesian search tuning " % (bayesian_time))
print(classification_report(y_test,y_pred2))

#4 Result of Classification with Genetic Algorithim
print("%s seconds it took to train genetic algorithim" % (genetic_time))
print(classification_report(y_test,predictions3))


0.5604255199432373 seconds it took to train without tuning
              precision    recall  f1-score   support

           0       0.79      0.98      0.88     35954
           1       0.66      0.14      0.23     10678

    accuracy                           0.79     46632
   macro avg       0.73      0.56      0.55     46632
weighted avg       0.76      0.79      0.73     46632

43537.440590143204 seconds it took to train with Random search tuning
              precision    recall  f1-score   support

           0       0.83      0.96      0.89     35954
           1       0.73      0.36      0.48     10678

    accuracy                           0.82     46632
   macro avg       0.78      0.66      0.69     46632
weighted avg       0.81      0.82      0.80     46632

112.39598250389099 seconds it took to train with Bayesian search tuning 
              precision    recall  f1-score   support

           0       0.79      0.98      0.88     35954
           1       0.66      0.14  