#### Setup

In [None]:
# 10.2.9.20
#  cd ../../media/nvidia/Mercyhurst/wabtec/rail_classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import json
import sys
from datetime import datetime

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.metrics import r2_score

In [2]:
def WriteJSON(obj,filename):
    with open(filename, 'w+') as outfile:
        try:
            obj_json = json.dumps(obj, sort_keys=True, indent=4,default=str)
            outfile.write(obj_json)
        except Exception as e:
            print(e, file=sys.stderr)
            print('File not written.')

In [3]:
def ReadJSON(filename):
    obj = []
    try: 
        with open(filename, 'r') as infile:
            obj = json.load(infile)
    except Exception as e:
        print(e, file=sys.stderr)
        print('File not found.')
        
    return obj

In [4]:
def FitAndScoreCLA(features,labels,classifiers,testSize=0.20):

    X_train, X_test, y_train, y_test = train_test_split(features,labels,test_size = testSize, random_state=42)
    
    clfs = []
    for classifier in classifiers:
        tmp = {}
        clf = classifier['Method']
        clf.fit(X_train, y_train)
        
        y_pred = clf.predict(X_test)
        
        r2 = r2_score(y_test, y_pred)
        
        # Get report and matrix for display
        print('Classification report for  -',classifier['Name'])
        print('-----------------------------------------------------------------------------------------------')
        print(" %s:\n%s\n"% (clf, classification_report(y_test, y_pred)))
        
        tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
        print(classifier['Name'],'Confusion Matrix')
        print('   P0 \t P1 ')
        print('A0',tn,'\t',fp)
        print('A1',fn,'\t',tp)
        print('\n')
        
        print('r^2: ',r2)
        
        # Get report and matrix for file
        clr = classification_report(y_test, y_pred,output_dict=True)
        cnm = list(confusion_matrix(y_test,y_pred))
        
        tmp[classifier['Name']] = {'Report':clr,
                                  'Matrix':cnm}
        clfs.append(tmp)  
        
    # Open results file, append new result, write to file
    resultsObj = ReadJSON(results_file)
    
    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S")
    
    currResults = {'Description':description,
                   'classifiers':clfs,
                   'Run Time':date_time,
                   'Sample Size':sample_size,
                   'Image Resolution':img_size,
                   'Counts':{'0':dict(df.Catenary.value_counts())[0],'1':dict(df.Catenary.value_counts())[1]},
              }
    
    resultsObj.append(currResults)
    WriteJSON(resultsObj,results_file)

### Start Program

In [10]:
'''
Parameters 
----------
Set for each test. 


img_folder: Root folder of image collection

results_file: JSON file for output of results and metadata

description: String for labeling/notes

sample_size: Sample size to pull from each csv, 0-1

img_size: Native resolution is 1280x1280

'''

img_folder = '../data/output_images/'

img_set = '2'

results_file = '../data/results/'+'results_4.json'

description = 'All ten image sets. Full resolution.'

sample_size = 1.0

img_size = (640,640)

input_folder = '../data/railways/'

In [11]:
'''
Loads csv only, no images.
'''

# Name of folder
names = [
    'Australia',
    'Germany',
    'Netherlands',
    'Switzerland',
    'Amtrak',
    'BostonMTBA',
    'DenverRTD',
    'LosAngelesMR',
    'NewarkLR',
    'SeattleLLR',
]

# Name of csv
abbr = [
    'AUS',
    'GRM',
    'NET',
    'SWZ',
    'AMT',
    'BOS',
    'DEN',
    'LAA',
    'NEW',
    'SEA',
]
locations = dict(zip(names,abbr))

# Collect each csv into one df adding railway name
frames = []
for key,value in locations.items():
    try:
        filename = img_folder+key+'/'+value+'.csv'
        tmp = pd.read_csv(filename,header=0)
        tmp['Railway'] = key
        
        # Take sample from each folder 
        tmp = tmp.sample(frac=sample_size).reset_index(drop=True)
        frames.append(tmp)
    except Exception as e:
        print(key, e)

df = pd.concat(frames)

df = df.dropna()
df['Catenary'] = df['Catenary'].astype(int)

df.head()

Unnamed: 0,Name,Longitude,Latitude,Catenary,Railway
0,153.0843907_-27.4297185,153.084391,-27.429718,1,Australia
1,151.45096240000004_-25.615014300000002,151.450962,-25.615014,0,Australia
2,170.615245_-45.8181506,170.615245,-45.818151,0,Australia
3,151.17792749999998_-33.5829912,151.177927,-33.582991,1,Australia
4,152.20614780000002_-25.199578699999996,152.206148,-25.199579,0,Australia


In [12]:
df.Catenary.value_counts()

0    903
1    815
Name: Catenary, dtype: int64

In [13]:
'''
Open known non-catenary lines and add differntial to df
'''

zeros = df.Catenary.value_counts()[0]
ones = df.Catenary.value_counts()[1]

names = [
    'Amtrak_non_cat_1',
    'Amtrak_non_cat_2',
    'Amtrak_non_cat_3',
    'Random'
]

abbr = [
    'ANC',
    'ANC2',
    'ANC3',
    'RAN'
]
locations = dict(zip(names,abbr))

diff = ones - zeros
# print(diff)

if diff > 0:
    frames = []
    for key,value in locations.items():
        try:
            filename = img_folder+key+'/'+value+'.csv'
#             print(filename)
            tmp = pd.read_csv(filename,header=0)
            tmp['Railway'] = key
            frames.append(tmp)
#             print(tmp)
        except Exception as e:
            print(e)

    try:
        duds = pd.concat(frames)
        duds = duds.dropna()
        duds['Catenary'] = duds['Catenary'].astype(int) 
        
        print(len(duds))
        duds = duds.sample(n=diff).reset_index(drop=True)
        df = pd.concat([df,duds]).reset_index(drop=True)
    except Exception as e:
        print(e)
        duds = duds.sample(len(duds.index.tolist())).reset_index(drop=True)
        df = pd.concat([df,duds]).reset_index(drop=True)
        
else:
    df.sort_values(by='Catenary',inplace=True)
    df = df.iloc[abs(diff):]
    df = df.sample(frac=1.0)
        
df.shape

(1630, 5)

In [14]:
df.Catenary.value_counts()

1    815
0    815
Name: Catenary, dtype: int64

In [10]:
'''
Load images into df
'''
rows = df.index.tolist()

images = []
for row in rows:
    try:
        img_path = img_folder+df.iloc[row]['Railway']+'/set_'+img_set+'/'+df.iloc[row]['Name']+'.png'
        img = Image.open(img_path).convert('RGBA')
        img.thumbnail(img_size, Image.ANTIALIAS)
        data = np.asarray(img)
        data = data/255
        data = data.flatten()
        # Append img instead of data if you want as image       
        images.append(data)
    except Exception as e:
        print(e)
    
df['Image'] = images

cols = ['Catenary','Image']
df = df[cols]

df.head()

Unnamed: 0,Catenary,Image
86,0,"[0.4823529411764706, 0.4745098039215686, 0.427..."
48,1,"[0.33725490196078434, 0.33725490196078434, 0.3..."
9,1,"[0.3843137254901961, 0.36470588235294116, 0.28..."
104,0,"[0.17647058823529413, 0.14901960784313725, 0.1..."
122,1,"[0.4588235294117647, 0.35294117647058826, 0.28..."


### Classify

In [11]:
labels = np.asarray(df.Catenary.tolist())
features = np.asarray(df.Image.tolist())

In [12]:
'''
Setup classifiers
'''

BGN = {'Name':'BGN',
       'Method': GaussianNB()}

DTC = {'Name':'DTC',
       'Method': DecisionTreeClassifier(random_state=0)}

KNN = {'Name':'KNN',
       'Method': KNeighborsClassifier()}

SVM = {'Name':'SVM',
       'Method': SVC(gamma=0.001)}


classifiers = [DTC]

In [13]:
'''
Run Classifier
'''

FitAndScoreCLA(features,labels,classifiers)

Classification report for  - DTC
-----------------------------------------------------------------------------------------------
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best'):
              precision    recall  f1-score   support

           0       0.48      0.59      0.53       129
           1       0.51      0.41      0.45       138

    accuracy                           0.49       267
   macro avg       0.50      0.50      0.49       267
weighted avg       0.50      0.49      0.49       267


DTC Confusion Matrix
   P0 	 P1 
A0 76 	 53
A1 82 	 56


r^2:  -1.0247724974721941
File not found.


[Errno 2] No such file or directory: '../data/results/results_4.json'
