#### Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import json
import sys
from datetime import datetime

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 

In [2]:
def WriteJSON(obj,filename):
    """Serialize ``obj`` as pretty-printed JSON and write it to ``filename``.

    The object is serialized *before* the file is opened. Opening a file for
    writing truncates it, so serializing first guarantees that a failed
    serialization leaves any existing file at ``filename`` untouched.

    Parameters
    ----------
    obj : object passed to ``json.dumps``. Values the encoder cannot handle
        natively are converted with ``str`` (``default=str``).
    filename : path of the file to create or overwrite.

    Serialization and write errors are reported (exception text on stderr,
    'File not written.' on stdout) instead of being raised. Errors raised
    while opening the file propagate to the caller.
    """
    try:
        obj_json = json.dumps(obj, sort_keys=True, indent=4, default=str)
    except Exception as e:
        print(e, file=sys.stderr)
        print('File not written.')
        return

    with open(filename, 'w') as outfile:
        try:
            outfile.write(obj_json)
        except Exception as e:
            print(e, file=sys.stderr)
            print('File not written.')

In [3]:
def ReadJSON(filename):
    """Load and return the JSON content of ``filename``.

    Returns an empty list when the file does not exist or cannot be read or
    parsed; the error is reported rather than raised. A missing file and any
    other failure (for example invalid JSON) print different messages so the
    two situations can be told apart.

    Parameters
    ----------
    filename : path of the JSON file to read.

    Returns
    -------
    The decoded JSON value, or ``[]`` on any failure.
    """
    obj = []
    try:
        with open(filename, 'r') as infile:
            obj = json.load(infile)
    except FileNotFoundError as e:
        print(e, file=sys.stderr)
        print('File not found.')
    except Exception as e:
        # The file exists but could not be opened or decoded.
        print(e, file=sys.stderr)
        print('File could not be read.')

    return obj

In [4]:
def FitAndScoreCLA(features,labels,classifiers,testSize=0.20):
    """Fit each classifier on a train split, report on the test split, and
    append the run to the JSON results file.

    Parameters
    ----------
    features : array-like of samples, passed to ``train_test_split``.
    labels : array-like of class labels. The confusion-matrix display is
        written for the two classes 0 and 1.
    classifiers : iterable of dicts with keys ``'Name'`` (label used in the
        printed report and in the results record) and ``'Method'`` (an
        estimator exposing ``fit`` and ``predict``). Estimators are fitted
        in place.
    testSize : fraction of the data held out for testing (default 0.20).

    Notes
    -----
    Besides its arguments, this function reads the notebook-level names
    ``results_file``, ``description``, ``sample_size``, ``img_size`` and
    ``df`` when building the results record, so the cells defining them must
    have been run first.
    """
    X_train, X_test, y_train, y_test = train_test_split(features,labels,test_size = testSize, random_state=42)

    clfs = []
    for classifier in classifiers:
        clf = classifier['Method']
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        # Get report and matrix for display
        print('Classification report for  -',classifier['Name'])
        print('-----------------------------------------------------------------------------------------------')
        print(" %s:\n%s\n"% (clf, classification_report(y_test, y_pred)))

        # labels=[0, 1] pins the matrix to 2x2 even when the test split or the
        # predictions contain a single class, so the 4-way unpack cannot fail.
        cnm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cnm.ravel()
        print(classifier['Name'],'Confusion Matrix')
        print('   P0 \t P1 ')
        print('A0',tn,'\t',fp)
        print('A1',fn,'\t',tp)
        print('\n')

        # Get report and matrix for file
        clr = classification_report(y_test, y_pred,output_dict=True)

        # .tolist() gives nested lists of plain ints. list() of the array would
        # keep numpy rows, which WriteJSON (default=str) stores as strings.
        clfs.append({classifier['Name']: {'Report':clr,
                                          'Matrix':cnm.tolist()}})

    # Open results file, append new result, write to file
    resultsObj = ReadJSON(results_file)

    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S")

    # .get(..., 0) tolerates an absent class; int() makes the counts plain
    # ints so they are written as JSON numbers rather than strings.
    class_counts = df.Catenary.value_counts()

    currResults = {'Description':description,
                   'classifiers':clfs,
                   'Run Time':date_time,
                   'Sample Size':sample_size,
                   'Image Resolution':img_size,
                   'Counts':{'0':int(class_counts.get(0, 0)),'1':int(class_counts.get(1, 0))},
              }

    resultsObj.append(currResults)
    WriteJSON(resultsObj,results_file)

### Start Program

In [6]:
# Parameters
# ----------
# Set for each test.
#
# img_folder   : root folder of the image collection
# results_file : JSON file that receives results and metadata
# description  : free-text label / notes stored with the run
# sample_size  : fraction (0-1) of rows sampled from each csv
# img_size     : size images are reduced to; native resolution is 1280x1280

img_folder = '../data/output_images/'
results_file = '../data/results/' + 'results.json'
description = 'Baseline with all 10 image sets.'
sample_size = 0.50
img_size = (640, 640)

In [7]:
'''
Loads csv only, no images.
'''

# Name of folder
names = [
    'Australia',
    'China',
    'Germany',
    'NewarkLR',
    'Switzerland',
    'Amtrak',
    'BostonMTBA',
    'DenverRTD',
    'LosAngelesMR',
    'SeattleLLR',
    'Netherlands'
]

# Name of csv
abbr = [
    'AUS',
    'CHN',
    'GRM',
    'NEW',
    'SWZ',
    'AMT',
    'BOS',
    'DEN',
    'LAA',
    'SEA',
    'NET'
]
locations = dict(zip(names,abbr))

# Collect each csv into one df adding railway name
frames = []
for key,value in locations.items():
    try:
        filename = img_folder+key+'/'+value+'.csv'
        tmp = pd.read_csv(filename,header=0)
        tmp['Railway'] = key

        # Take sample from each folder. The fixed seed makes a re-run draw
        # the same rows, so results are reproducible.
        tmp = tmp.sample(frac=sample_size, random_state=42).reset_index(drop=True)
        frames.append(tmp)
    except Exception as e:
        # Best effort: a missing or unreadable csv is reported and skipped.
        print(e)

# ignore_index / reset_index keep the index a clean 0..n-1 range. Without
# them every frame restarts at 0 and dropna leaves gaps, so index labels and
# row positions would disagree.
df = pd.concat(frames, ignore_index=True)

df = df.dropna().reset_index(drop=True)
df['Catenary'] = df['Catenary'].astype(int)

df.head()

[Errno 2] File b'../data/output_images/China/CHN.csv' does not exist: b'../data/output_images/China/CHN.csv'


Unnamed: 0,Name,Longitude,Latitude,Catenary,Railway
0,145.038226_-37.910223200000004,145.038226,-37.910223,1,Australia
1,153.00315669999998_-27.5259447,153.003157,-27.525945,1,Australia
2,151.17792749999998_-33.5829912,151.177927,-33.582991,1,Australia
3,151.1934804_-33.8669032,151.19348,-33.866903,1,Australia
4,152.99821210000002_-27.5279436,152.998212,-27.527944,1,Australia


In [8]:
# Rows per Catenary class in the sampled data
df['Catenary'].value_counts()

1    160
0     83
Name: Catenary, dtype: int64

In [9]:
'''
Open known non-catenary lines and add differential to df
'''

# .get(..., 0) tolerates a class that is absent from the sample
class_counts = df.Catenary.value_counts()
zeros = class_counts.get(0, 0)
ones = class_counts.get(1, 0)

names = [
    'Amtrak_non_cat_1',
    'Amtrak_non_cat_2',
    'Amtrak_non_cat_3'
]

abbr = [
    'ANC',
    'ANC2',
    'ANC3'
]
locations = dict(zip(names,abbr))

# Number of extra rows needed to bring class 0 up to the size of class 1
diff = ones - zeros

if diff > 0:
    frames = []
    for key,value in locations.items():
        try:
            filename = img_folder+key+'/'+value+'.csv'
            tmp = pd.read_csv(filename,header=0)
            tmp['Railway'] = key
            frames.append(tmp)
        except Exception as e:
            # Best effort: a missing or unreadable csv is reported and skipped.
            print(e)

    if frames:
        duds = pd.concat(frames, ignore_index=True).dropna()
        duds['Catenary'] = duds['Catenary'].astype(int)

        # Take `diff` rows when enough are available, otherwise take them all.
        # The row count is decided up front rather than by catching the
        # sampling error. The fixed seed makes a re-run draw the same rows.
        n_extra = min(int(diff), len(duds))
        if n_extra < diff:
            print('Only', n_extra, 'of', diff, 'requested rows available; using all of them.')
        duds = duds.sample(n=n_extra, random_state=42).reset_index(drop=True)
        df = pd.concat([df,duds])
    else:
        # Nothing could be loaded, so there is nothing to add.
        print('No non-catenary csv could be loaded; classes left unbalanced.')

# Reset on every path so the index is always a clean 0..n-1 range,
# including when no rows were added.
df = df.reset_index(drop=True)

df.shape

(320, 5)

In [10]:
# Rows per Catenary class after the balancing step
df['Catenary'].value_counts()

1    160
0    160
Name: Catenary, dtype: int64

In [11]:
'''
Load images into df
'''

images = []
# Iterate over the two columns directly so the loop does not depend on the
# index being a clean 0..n-1 range (index labels are not row positions).
for railway, name in zip(df['Railway'], df['Name']):
    img_path = img_folder+railway+'/'+name+'.png'

    # The context manager closes the file handle; convert() returns a new,
    # fully loaded image that stays usable afterwards.
    with Image.open(img_path) as src:
        img = src.convert('RGBA')

    # Image.ANTIALIAS was an alias of LANCZOS and was removed in Pillow 10.
    img.thumbnail(img_size, Image.LANCZOS)

    # Append img instead of the flattened array if you want it as an image
    images.append(np.asarray(img).flatten())

df['Image'] = images

cols = ['Catenary','Image']
df = df[cols]

df.head()

Unnamed: 0,Catenary,Image
0,1,"[0, 0, 0, 255, 3, 2, 0, 255, 26, 26, 22, 255, ..."
1,1,"[108, 96, 86, 255, 94, 81, 71, 255, 78, 70, 62..."
2,1,"[62, 68, 49, 255, 35, 43, 26, 255, 35, 43, 25,..."
3,1,"[91, 82, 71, 255, 86, 81, 72, 255, 87, 83, 75,..."
4,1,"[30, 31, 23, 255, 66, 66, 60, 255, 84, 82, 78,..."


### Classify

In [12]:
# Turn the label column and the per-row pixel arrays into numpy arrays
labels = np.asarray(df['Catenary'].tolist())
features = np.asarray(df['Image'].tolist())

In [13]:
'''
Setup classifiers
'''

# Each entry pairs a short display name with the estimator to fit.

BGN = {'Name':'BGN',
       'Method': GaussianNB()}

# Seeded so repeated runs give the same tree; the train/test split in
# FitAndScoreCLA is already seeded with the same value.
DTC = {'Name':'DTC',
       'Method': DecisionTreeClassifier(random_state=42)}

KNN = {'Name':'KNN',
       'Method': KNeighborsClassifier()}

SVM = {'Name':'SVM',
       'Method': SVC(gamma=0.001)}


classifiers = [BGN,DTC,KNN,SVM]

In [None]:
# Run Classifier: fit and score every configured classifier and log the run
FitAndScoreCLA(features, labels, classifiers)

Classification report for  - BGN
-----------------------------------------------------------------------------------------------
 GaussianNB(priors=None, var_smoothing=1e-09):
              precision    recall  f1-score   support

           0       0.55      0.61      0.58        28
           1       0.67      0.61      0.64        36

    accuracy                           0.61        64
   macro avg       0.61      0.61      0.61        64
weighted avg       0.61      0.61      0.61        64


BGN Confusion Matrix
   P0 	 P1 
A0 17 	 11
A1 14 	 22


