In [None]:
 # import all the necessary libraries
import os
import re
import pandas as pd
import numpy as np

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, timedelta

import cartopy.crs as ccrs

from sklearn import datasets
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import discriminant_analysis
from sklearn import svm

# options
# expanduser('~') resolves the home directory without assuming $HOME is set
# (os.getenv("HOME") returns None when it is not, and None + str raises TypeError)
data_dir = os.path.join(os.path.expanduser('~'), 'data', 'NOTAM-classifier')
pd.set_option('display.max_columns', 500)

# aesthetics
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names; use whichever spelling is installed
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc('pdf', fonttype=42)
plt.rc('figure', figsize=(10, 8))
sns.set_context('notebook', font_scale=1.8, rc={'lines.linewidth': 2.5})

# File contents

In [None]:
# Load the NOTAM table and index its rows by the 'id' column
NOTAMs_df = (
    pd.read_csv(data_dir + '/NOTAMS.csv')  # comma is read_csv's default separator
    .set_index('id')
)

In [None]:
# Number of rows in the loaded table
print(NOTAMs_df.shape[0])

## Header

In [None]:
# Preview the first ten rows
NOTAMs_df.head(n=10)

In [5]:
# Names of the NOTAMs_df columns fed to the models; the list order fixes the
# column order of the design matrix built from it.
features = [
    'scope',
    'FIR_12',
    'high_min_alt',
    'low_max_alt',
    'diurnal_duration',
    'long_text',
    'small_radius',
    'trafficind',
    'code_23',
    'n_locations',
    'code_45',
]

In [6]:
# Build the numeric design matrix: every feature column is cast to str and
# label-encoded into integer codes, one matrix column per feature.
# (`preprocessing` is already imported in the first cell.)
n_samples, n_features = len(NOTAMs_df), len(features)
X = np.zeros((n_samples, n_features))

le = preprocessing.LabelEncoder()
for column, name in enumerate(features):
    print('Encoding {}'.format(name))
    X[:, column] = le.fit_transform(NOTAMs_df[name].astype(str))

# Target labels are read from the 'supress' column
y = NOTAMs_df['supress']

X

Encoding scope
Encoding FIR_12
Encoding high_min_alt
Encoding low_max_alt
Encoding diurnal_duration
Encoding long_text
Encoding small_radius
Encoding trafficind
Encoding code_23
Encoding n_locations
Encoding code_45


array([[  5.,  84.,   0., ..., 132.,   0.,  10.],
       [  5.,  76.,   1., ..., 148.,   1.,  38.],
       [  5.,  76.,   1., ..., 148.,   1.,  38.],
       ...,
       [  4.,  63.,   0., ...,  56.,   4.,  25.],
       [  4.,   8.,   0., ...,  56.,   0.,  25.],
       [  0.,  57.,   0., ...,  95.,   0.,  23.]])

In [7]:
def classify(X, y, classifier, prob=None, random_seed=20091982):
    """Fit `classifier` on a train/test split and print test-set metrics.

    The data are split with ``model_selection.train_test_split`` using
    ``test_size=0.20`` and ``random_state=random_seed``; the classifier is
    fitted on the training part and evaluated on the held-out part.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and ``classifier.fit``
    y : labels; the printed TP/TN/FP/FN counts treat 1 as the positive
        label and 0 as the negative label
    classifier : estimator exposing ``fit`` and ``predict`` (and
        ``predict_proba`` when `prob` is given)
    prob : float or None
        When None, ``classifier.predict`` is used. Otherwise a test sample is
        labelled 1 when the first column of ``predict_proba`` is below `prob`
        (NOTE(review): column 0 is presumably the class-0 probability for
        0/1 labels — confirm against ``classifier.classes_``), else 0.
    random_seed : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    The fitted `classifier`.

    Prints the confusion counts, precision/recall/accuracy for label 1, and
    ``metrics.classification_report``. Precision or recall is reported as
    ``nan`` when its denominator is zero.
    """

    def _ratio(numerator, denominator):
        # 0/0 on numpy integers yields nan plus a RuntimeWarning; report nan quietly
        return numerator / denominator if denominator else float('nan')

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.20, random_state=random_seed)

    classifier.fit(X_train, y_train)

    if prob is not None:
        # Cast the boolean threshold mask to 0/1 so predictions use the same
        # label values as y instead of mixing bool and int
        y_pred = (classifier.predict_proba(X_test)[:, 0] < prob).astype(int)
    else:
        y_pred = np.asarray(classifier.predict(X_test))

    # Plain lists compare as a single scalar (`[1, 0] == 1` is False), which
    # silently zeroed the counts; arrays compare element-wise.
    y_true = np.asarray(y_test)

    N = len(y_true)
    agree = y_pred == y_true
    TP = np.sum(agree & (y_true == 1))
    TN = np.sum(agree & (y_true == 0))
    FP = np.sum(~agree & (y_pred == 1))
    FN = np.sum(~agree & (y_pred == 0))

    accuracy = _ratio(TP + TN, N)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)

    result_string = 'N={0}, TP={1}, TN={2}, FP={3}, FN={4}\n'.format(N, TP, TN, FP, FN)
    result_string += \
        'Precision: {0:.4f}, recall: {1:.4f}, accuracy: {2:.4f}'\
        .format(precision, recall, accuracy)

    print(result_string)

    report = metrics.classification_report(y_true, y_pred)
    print(report)

    return classifier

def get_feature_importances(cols, importances, top_n=None):
    """Print feature names with their importance as a percentage, largest first.

    Parameters
    ----------
    cols : sequence of feature names, indexed in parallel with `importances`
    importances : sequence of importance scores; each is multiplied by 100
        and printed with two decimals
    top_n : int or None
        When given, only the first `top_n` entries of the ranking are
        printed; None (the default) prints every feature.

    Returns
    -------
    None. One line ``'<name>: <percent>%'`` is printed per feature.
    """
    ranking = np.argsort(importances)[::-1]  # indices by descending importance
    for idx in ranking[:top_n]:
        print('{1}: {0:.2f}%'.format(
            importances[idx]*100.0, cols[idx]))

In [8]:
# Random forest on the label-encoded matrix at three decision thresholds.
# Author's notes: prob=0.1   -> "recall = 98%, reduce screened NOTAMs by 90%"
#                 prob=0.001 -> "recall = 99%, reduce screened NOTAMs by 80%"
# NOTE(review): those recall figures match the class-0 rows of the recorded
# reports, not the "recall" printed on the summary line — confirm.
# `classifier` ends up bound to the model from the last threshold.
for threshold in (0.5, 0.1, 0.001):
    classifier = classify(
        X, y, RandomForestClassifier(random_state=20091982), prob=threshold)

N=19710, TP=8083, TN=9734, FP=931, FN=962
Precision: 0.8967, recall: 0.8936, accuracy: 0.9040
             precision    recall  f1-score   support

          0       0.91      0.91      0.91     10665
          1       0.90      0.89      0.90      9045

avg / total       0.90      0.90      0.90     19710

N=19710, TP=6615, TN=10495, FP=170, FN=2430
Precision: 0.9749, recall: 0.7313, accuracy: 0.8681
             precision    recall  f1-score   support

          0       0.81      0.98      0.89     10665
          1       0.97      0.73      0.84      9045

avg / total       0.89      0.87      0.86     19710

N=19710, TP=4927, TN=10611, FP=54, FN=4118
Precision: 0.9892, recall: 0.5447, accuracy: 0.7883
             precision    recall  f1-score   support

          0       0.72      0.99      0.84     10665
          1       0.99      0.54      0.70      9045

avg / total       0.84      0.79      0.77     19710



## Feature importances

In [9]:
# Rank the input features by the importances of the most recently fitted classifier
get_feature_importances(
    cols=features,
    importances=classifier.feature_importances_)

code_23: 21.59%
FIR_12: 21.23%
scope: 19.75%
code_45: 16.95%
low_max_alt: 13.25%
trafficind: 2.08%
long_text: 1.84%
diurnal_duration: 1.25%
high_min_alt: 0.85%
n_locations: 0.67%
small_radius: 0.53%


## One-hot encoders

In [11]:
# One-hot encode the label-encoded matrix; the fitted encoder stays in `enc`
enc = preprocessing.OneHotEncoder()
enc.fit(X)
X_one_hot = enc.transform(X)

# Shape of the expanded matrix
X_one_hot.shape

(98547, 421)

In [12]:
# Logistic regression on the one-hot matrix, thresholded at prob=0.1.
# Author's note: "essentially recall = 100%, reduce screened NOTAMs by 60%".
# NOTE(review): the recorded report for this run shows recall 0.98 (class 0)
# and 0.58 (class 1) — confirm which run the note refers to.
classifier = classify(
    X_one_hot, y,
    LogisticRegression(),
    prob=0.1)

N=19710, TP=5223, TN=10482, FP=183, FN=3822
Precision: 0.9661, recall: 0.5774, accuracy: 0.7968
             precision    recall  f1-score   support

          0       0.73      0.98      0.84     10665
          1       0.97      0.58      0.72      9045

avg / total       0.84      0.80      0.79     19710



In [13]:
# Further test: the same logistic regression with a lower threshold (prob=0.01)
classifier = classify(
    X_one_hot, y,
    LogisticRegression(),
    prob=0.01)

N=19710, TP=1692, TN=10646, FP=19, FN=7353
Precision: 0.9889, recall: 0.1871, accuracy: 0.6260
             precision    recall  f1-score   support

          0       0.59      1.00      0.74     10665
          1       0.99      0.19      0.31      9045

avg / total       0.77      0.63      0.55     19710



In [21]:
# Multi-layer perceptron with default settings on the one-hot matrix
classifier = classify(
    X_one_hot, y,
    MLPClassifier(),
    prob=0.5)

N=19710, TP=7894, TN=9994, FP=671, FN=1151
Precision: 0.9217, recall: 0.8727, accuracy: 0.9076
             precision    recall  f1-score   support

          0       0.90      0.94      0.92     10665
          1       0.92      0.87      0.90      9045

avg / total       0.91      0.91      0.91     19710



In [20]:
# Same random forest on the label-encoded matrix, then on the one-hot matrix;
# `classifier` ends up bound to the one-hot model.
for design_matrix in (X, X_one_hot):
    classifier = classify(
        design_matrix, y,
        RandomForestClassifier(random_state=20091982),
        prob=0.5)

N=19710, TP=8083, TN=9734, FP=931, FN=962
Precision: 0.8967, recall: 0.8936, accuracy: 0.9040
             precision    recall  f1-score   support

          0       0.91      0.91      0.91     10665
          1       0.90      0.89      0.90      9045

avg / total       0.90      0.90      0.90     19710

N=19710, TP=8097, TN=9766, FP=899, FN=948
Precision: 0.9001, recall: 0.8952, accuracy: 0.9063
             precision    recall  f1-score   support

          0       0.91      0.92      0.91     10665
          1       0.90      0.90      0.90      9045

avg / total       0.91      0.91      0.91     19710



In [16]:
# Bernoulli naive Bayes on the one-hot matrix, thresholded at prob=0.01
classifier = classify(
    X_one_hot, y,
    BernoulliNB(),
    prob=0.01)

N=19710, TP=5217, TN=9892, FP=773, FN=3828
Precision: 0.8710, recall: 0.5768, accuracy: 0.7666
             precision    recall  f1-score   support

          0       0.72      0.93      0.81     10665
          1       0.87      0.58      0.69      9045

avg / total       0.79      0.77      0.76     19710



In [18]:
# Disabled alternative, kept for reference:
# classifier = classify(X, y, GradientBoostingClassifier(), prob=0.01)

# Support-vector classifier on the label-encoded matrix; prob=None makes
# classify() use predict(), so probability estimates are left switched off.
classifier = classify(
    X, y,
    svm.SVC(probability=False),
    prob=None)

N=19710, TP=7844, TN=9805, FP=860, FN=1201
Precision: 0.9012, recall: 0.8672, accuracy: 0.8954
             precision    recall  f1-score   support

          0       0.89      0.92      0.90     10665
          1       0.90      0.87      0.88      9045

avg / total       0.90      0.90      0.90     19710



In [None]:
# Takes time to train
# classifier = classify(X, y, svm.SVC())


In [30]:
# MLP with hidden_layer_sizes=(200, 10) and learning_rate_init=0.01 on the
# one-hot matrix; verbose=True produces the per-iteration loss log below.
classifier = classify(
    X_one_hot,
    y,
    MLPClassifier(hidden_layer_sizes=(200, 10), learning_rate_init=0.01, verbose=True),
    prob=0.5,
)

Iteration 1, loss = 0.24659881
Iteration 2, loss = 0.20226820
Iteration 3, loss = 0.19163974
Iteration 4, loss = 0.18474149
Iteration 5, loss = 0.18029736
Iteration 6, loss = 0.17660244
Iteration 7, loss = 0.17491304
Iteration 8, loss = 0.17161727
Iteration 9, loss = 0.17039928
Iteration 10, loss = 0.16904896
Iteration 11, loss = 0.16736087
Iteration 12, loss = 0.16621319
Iteration 13, loss = 0.16498497
Iteration 14, loss = 0.16434805
Iteration 15, loss = 0.16301117
Iteration 16, loss = 0.16245633
Iteration 17, loss = 0.16281371
Iteration 18, loss = 0.16201198
Iteration 19, loss = 0.16135211
Iteration 20, loss = 0.16055302
Iteration 21, loss = 0.16028648
Iteration 22, loss = 0.16077473
Iteration 23, loss = 0.15980274
Iteration 24, loss = 0.15932235
Iteration 25, loss = 0.15917991
Iteration 26, loss = 0.15836582
Iteration 27, loss = 0.15835138
Iteration 28, loss = 0.15831309
Iteration 29, loss = 0.15892939
Training loss did not improve more than tol=0.000100 for two consecutive epochs. 