# Load Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# location of original images
subdirectory = 'C:/Users/Julian Bautista/Documents/School Stuff/Semesters/5 Spring 2017/Machine Learning/All_images/'

# Read in Images and Labels

In [2]:
# read labels for aircraft images
labels = pd.read_csv('aircraft.csv')
print(labels.head())
print(labels.shape)

                  imageName       ylabel  plane helicopter  other aircraft
0  2016-10-23T10+40+20_430Z  No aircraft  False      False  False    False
1  2016-11-26T20+15+55_250Z  No aircraft  False      False  False    False
2  2016-12-15T15+26+33_480Z  No aircraft  False      False  False    False
3  2016-10-23T14+01+49_396Z  No aircraft  False      False  False    False
4  2016-08-14T05+43+34_690Z  No aircraft  False      False  False    False
(2410, 6)


In [3]:
from skimage import io, color

# iterate through the aircraft.csv file to get image names and labels
# read the images, convert to grayscale, and store image ndarrays and labels in lists
grayscale_images_list = []
y_list = []

for index, row in labels.iterrows():
    toRead = subdirectory + row['imageName']
    grayscale_images_list.append(color.rgb2gray(io.imread(toRead)))
    y_list.append(row['aircraft'])

In [4]:
# convert the lists to ndarrays
grayscale_images = np.asarray(grayscale_images_list)
Y = np.asarray(y_list)
print(grayscale_images.shape)

(2410, 360, 640)


In [5]:
# flatten the images ndarray to one row per image
imgs_flat = grayscale_images.reshape((grayscale_images.shape[0], -1))
print(imgs_flat.shape)
print(Y.shape)

(2410, 230400)
(2410,)


# Ready Data for Modeling and Evaluation

In [6]:
#Aaron Hill's Binary Classification Evaluator
class BinaryClassificationPerformance():
    '''Performance measures to evaluate the fit of a binary classification model'''
    
    def __init__(self, predictions, labels, desc, probabilities=None):
        '''Initialize attributes: predictions-vector of predicted values for Y, labels-vector of labels for Y'''
        '''probabilities-optional, probability that Y is equal to True'''
        self.probabilities = probabilities
        self.performance_df = pd.concat([pd.DataFrame(predictions), pd.DataFrame(labels)], axis=1)
        self.performance_df.columns = ['preds', 'labls']
        self.desc = desc
        self.performance_measures = {}
  
    def compute_measures(self):
        '''Compute performance measures defined by Flach p. 57'''
        self.performance_measures['Pos'] = self.performance_df['preds'].sum()
        self.performance_measures['Neg'] = self.performance_df.shape[0] - self.performance_df['preds'].sum()
        self.performance_measures['TP'] = ((self.performance_df['preds'] == True) & (self.performance_df['labls'] == True)).sum()
        self.performance_measures['TN'] = ((self.performance_df['preds'] == False) & (self.performance_df['labls'] == False)).sum()
        self.performance_measures['FP'] = ((self.performance_df['preds'] == True) & (self.performance_df['labls'] == False)).sum()
        self.performance_measures['FN'] = ((self.performance_df['preds'] == False) & (self.performance_df['labls'] == True)).sum()
        self.performance_measures['Accuracy'] = (self.performance_measures['TP'] + self.performance_measures['TN']) / (self.performance_measures['Pos'] + self.performance_measures['Neg'])

In [7]:
#Separate out train and test sets
from sklearn.cross_validation import train_test_split

data_train, data_test, y_train, y_test = train_test_split(imgs_flat, Y,
                                                          test_size = 0.5,
                                                          random_state = 56)



# Train Model

In [8]:
#allow pickle creation
from sklearn.externals import joblib

In [9]:
#Try SVM
from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(data_train, y_train)
joblib.dump(svm, 'svm.pkl') # pickle

svm_performance = BinaryClassificationPerformance(svm.predict(data_train), y_train, 'svm')
svm_performance.compute_measures()
print(svm_performance.performance_measures)

{'Accuracy': 0.97759336099585059, 'Neg': 1205, 'FN': 27, 'Pos': 0, 'TN': 1178, 'FP': 0, 'TP': 0}


In [10]:
#Try logistic model
from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
lgs.fit(data_train, y_train)
joblib.dump(lgs, 'lgs.pkl') # pickle

lgs_performance = BinaryClassificationPerformance(lgs.predict(data_train), y_train, 'lgs')
lgs_performance.compute_measures()
print(lgs_performance.performance_measures)

{'Accuracy': 0.98838174273858925, 'Neg': 1192, 'FN': 14, 'Pos': 13, 'TN': 1178, 'FP': 0, 'TP': 13}


In [11]:
#Try Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(data_train, y_train)
joblib.dump(nbs, 'nbs.pkl') # pickle

nbs_performance = BinaryClassificationPerformance(nbs.predict(data_train), y_train, 'nbs')
nbs_performance.compute_measures()
print(nbs_performance.performance_measures)

{'Accuracy': 0.69792531120331947, 'Neg': 842, 'FN': 14, 'Pos': 363, 'TN': 828, 'FP': 350, 'TP': 13}


In [12]:
#Try Ridge Regression Classifier
from sklearn import linear_model
rdg = linear_model.RidgeClassifier()
rdg.fit(data_train, y_train)
joblib.dump(rdg, 'rdg.pkl') # pickle

rdg_performance = BinaryClassificationPerformance(rdg.predict(data_train), y_train, 'rdg')
rdg_performance.compute_measures()
print(rdg_performance.performance_measures)

{'Accuracy': 1.0, 'Neg': 1178, 'FN': 0, 'Pos': 27, 'TN': 1178, 'FP': 0, 'TP': 27}


In [13]:
# Try Perceptron
from sklearn import linear_model
prc = linear_model.SGDClassifier(loss='perceptron')
prc.fit(data_train, y_train)
joblib.dump(prc, 'prc.pkl') # pickle

prc_performance = BinaryClassificationPerformance(prc.predict(data_train), y_train, 'prc')
prc_performance.compute_measures()
print(prc_performance.performance_measures)

{'Accuracy': 0.92199170124481333, 'Neg': 1108, 'FN': 12, 'Pos': 97, 'TN': 1096, 'FP': 82, 'TP': 15}


# Check Model Performance of Training

In [16]:
fits[1].performance_measures['TP'] / fit.performance_measures['Pos']

0.13402061855670103

In [17]:
fits = [svm_performance, lgs_performance, nbs_performance, rdg_performance, prc_performance]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'ro')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, 1, 0, 1])
plt.title('ROC plot: training set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

ValueError: posx and posy should be finite values

<matplotlib.figure.Figure at 0x1602fd888d0>

# Test Models

In [23]:
#Try SVM
svm = joblib.load('svm.pkl')

svm_performance = BinaryClassificationPerformance(svm.predict(data_train), y_test, 'svm')
svm_performance.compute_measures()
print(svm_performance.performance_measures)

{'Accuracy': 0.98340248962655596, 'Neg': 1205, 'FN': 20, 'Pos': 0, 'TN': 1185, 'FP': 0, 'TP': 0}


In [24]:
#Try logistic model
lgs = joblib.load('lgs.pkl')

lgs_performance = BinaryClassificationPerformance(lgs.predict(data_train), y_test, 'lgs')
lgs_performance.compute_measures()
print(lgs_performance.performance_measures)

{'Accuracy': 0.97261410788381741, 'Neg': 1192, 'FN': 20, 'Pos': 13, 'TN': 1172, 'FP': 13, 'TP': 0}


In [25]:
#Try Naive Bayes
nbs = joblib.load('nbs.pkl')

nbs_performance = BinaryClassificationPerformance(nbs.predict(data_train), y_test, 'nbs')
nbs_performance.compute_measures()
print(nbs_performance.performance_measures)

{'Accuracy': 0.68879668049792531, 'Neg': 842, 'FN': 16, 'Pos': 363, 'TN': 826, 'FP': 359, 'TP': 4}


In [26]:
#Try Ridge Regression Classifier
rdg = joblib.load('rdg.pkl')

rdg_performance = BinaryClassificationPerformance(rdg.predict(data_train), y_test, 'rdg')
rdg_performance.compute_measures()
print(rdg_performance.performance_measures)

{'Accuracy': 0.96265560165975106, 'Neg': 1178, 'FN': 19, 'Pos': 27, 'TN': 1159, 'FP': 26, 'TP': 1}


In [27]:
# Try Perceptron
prc = joblib.load('prc.pkl')

prc_performance = BinaryClassificationPerformance(prc.predict(data_train), y_test, 'prc')
prc_performance.compute_measures()
print(prc_performance.performance_measures)

{'Accuracy': 0.90788381742738589, 'Neg': 1108, 'FN': 17, 'Pos': 97, 'TN': 1091, 'FP': 94, 'TP': 3}


# Check Model Performance of Test

In [29]:
fits = [svm_performance, lgs_performance, nbs_performance, rdg_performance, prc_performance]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'ro')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
#plt.axis([0, 1, 0, 1])
plt.title('ROC plot: training set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

ValueError: posx and posy should be finite values

<matplotlib.figure.Figure at 0x1602dbecbe0>

In [None]:
#cropping, y1_pixel:y2_pixel, x1_pixel:x2_pixel
#grayscale, contrast, resolution, contour finding, date