#### Setup

In [None]:
# 10.2.9.108

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import json
import sys
from datetime import datetime

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.metrics import r2_score

In [2]:
def WriteJSON(obj,filename):
    """Serialize ``obj`` to pretty-printed JSON and write it to ``filename``.

    Parameters
    ----------
    obj : object
        Value to serialize. Anything ``json`` cannot encode natively is
        converted with ``str`` (``default=str``).
    filename : str
        Path of the file to (over)write.

    Notes
    -----
    Serialization happens *before* the file is opened. Opening with a write
    mode truncates the file, so serializing first guarantees that an existing
    file is left untouched when ``obj`` cannot be encoded.
    Serialization and write errors are reported (exception on stderr,
    'File not written.' on stdout) rather than raised; errors from opening
    the file propagate to the caller.
    """
    try:
        obj_json = json.dumps(obj, sort_keys=True, indent=4,default=str)
    except Exception as e:
        print(e, file=sys.stderr)
        print('File not written.')
        return

    with open(filename, 'w') as outfile:
        try:
            outfile.write(obj_json)
        except Exception as e:
            print(e, file=sys.stderr)
            print('File not written.')

In [3]:
def ReadJSON(filename):
    """Load and return the JSON content of ``filename``.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON value, or an empty list when the file is missing
        or cannot be read/decoded.

    Notes
    -----
    Never raises: the exception is printed to stderr and a short status
    line to stdout. A missing file and an unreadable/malformed file are
    reported with different messages so the two cases can be told apart.
    """
    obj = []
    try:
        with open(filename, 'r') as infile:
            obj = json.load(infile)
    except FileNotFoundError as e:
        print(e, file=sys.stderr)
        print('File not found.')
    except Exception as e:
        # e.g. malformed JSON or a permission problem: the file exists but is unusable
        print(e, file=sys.stderr)
        print('File could not be read as JSON.')

    return obj

In [4]:
def FitAndScoreCLA(features,labels,classifiers,testSize=0.20):
    """Fit each classifier on a train split, report on the test split, log the run.

    Parameters
    ----------
    features : array-like
        Feature matrix handed to ``train_test_split``.
    labels : array-like
        Class labels, one per row of ``features``. The confusion-matrix
        handling assumes binary labels 0 and 1.
    classifiers : list of dict
        Each entry provides 'Name' (label used in the output) and 'Method'
        (an estimator object with ``fit`` and ``predict``).
    testSize : float, default 0.20
        Fraction of the data held out for testing.

    Side effects
    ------------
    Prints a classification report, confusion matrix and r^2 per classifier,
    then appends one record to the JSON file named by the notebook-level
    ``results_file``. Also reads the notebook-level ``description``,
    ``sample_size`` and ``img_size`` for the record's metadata.
    """

    X_train, X_test, y_train, y_test = train_test_split(features,labels,test_size = testSize, random_state=42)

    clfs = []
    for classifier in classifiers:
        clf = classifier['Method']
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        # NOTE(review): r^2 is a regression metric; kept so the printed output
        # stays comparable with earlier runs.
        r2 = r2_score(y_test, y_pred)

        # Get report and matrix for display
        print('Classification report for  -',classifier['Name'])
        print('-----------------------------------------------------------------------------------------------')
        print(" %s:\n%s\n"% (clf, classification_report(y_test, y_pred)))

        # Pin the label order so the matrix is always 2x2, even when the test
        # split or the predictions contain a single class.
        cnm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cnm.ravel()
        print(classifier['Name'],'Confusion Matrix')
        print('   P0 \t P1 ')
        print('A0',tn,'\t',fp)
        print('A1',fn,'\t',tp)
        print('\n')

        print('r^2: ',r2)

        # Get report and matrix for file
        clr = classification_report(y_test, y_pred,output_dict=True)

        # .tolist() yields nested lists of plain ints; a list of ndarrays would
        # be written as strings by WriteJSON's default=str fallback.
        clfs.append({classifier['Name']: {'Report':clr,
                                          'Matrix':cnm.tolist()}})

    # Open results file, append new result, write to file
    resultsObj = ReadJSON(results_file)
    if not isinstance(resultsObj, list):
        # Keep whatever was stored instead of failing on .append after all fits ran
        resultsObj = [resultsObj]

    now = datetime.now()
    date_time = now.strftime("%m/%d/%Y, %H:%M:%S")

    # Count classes from the labels actually passed in (not a notebook global),
    # as plain ints so they are stored as JSON numbers.
    label_arr = np.asarray(labels)
    counts = {'0':int(np.count_nonzero(label_arr == 0)),
              '1':int(np.count_nonzero(label_arr == 1))}

    currResults = {'Description':description,
                   'classifiers':clfs,
                   'Run Time':date_time,
                   'Sample Size':sample_size,
                   'Image Resolution':img_size,
                   'Counts':counts,
              }

    resultsObj.append(currResults)
    WriteJSON(resultsObj,results_file)

### Start Program

In [5]:
# Parameters
# ----------
# Set for each test.
#
# img_folder:   Root folder of image collection
# results_file: JSON file for output of results and metadata
# description:  String for labeling/notes (stored with each results record)
# sample_size:  Sample size to pull from each csv, 0-1
# img_size:     Target size for the loaded images. Native resolution is 1280x1280

img_folder = '../data/output_images/'

results_file = '../data/results/'+'results3.json'

# Keep this label in sync with img_size: (640,640) is half of the native 1280x1280.
description = 'All ten image sets. Half resolution.'

sample_size = .50

img_size = (640,640)

In [6]:
# Loads csv only, no images.

# Name of folder
names = [
    'Australia',
    'China',
    'Germany',
    'NewarkLR',
    'Switzerland',
    'Amtrak',
    'BostonMTBA',
    'DenverRTD',
    'LosAngelesMR',
    'SeattleLLR',
    'Netherlands'
]

# Name of csv
abbr = [
    'AUS',
    'CHN',
    'GRM',
    'NEW',
    'SWZ',
    'AMT',
    'BOS',
    'DEN',
    'LAA',
    'SEA',
    'NET'
]
locations = dict(zip(names,abbr))

# Collect each csv into one df adding railway name
frames = []
for key,value in locations.items():
    try:
        filename = img_folder+key+'/'+value+'.csv'
        tmp = pd.read_csv(filename,header=0)
        tmp['Railway'] = key

        # Take sample from each folder (seeded so a re-run draws the same rows)
        tmp = tmp.sample(frac=sample_size, random_state=42).reset_index(drop=True)
        frames.append(tmp)
    except Exception as e:
        # Best effort: a railway whose csv cannot be loaded is skipped
        print(e)

# ignore_index / reset_index give df a unique 0..n-1 index; otherwise every
# frame contributes its own 0..k labels and dropna leaves gaps.
df = pd.concat(frames, ignore_index=True)

df = df.dropna().reset_index(drop=True)
df['Catenary'] = df['Catenary'].astype(int)

df.head()

File b'../data/output_images/China/CHN.csv' does not exist


Unnamed: 0,Name,Longitude,Latitude,Catenary,Railway
0,5.680897153322929_52.02627259477863,5.680897,52.026273,1,Netherlands
1,5.966978743706366_50.89235750362882,5.966979,50.892358,1,Netherlands
2,4.514447461059589_51.89960635851382,4.514447,51.899606,1,Netherlands
3,5.679545474955587_52.02653694146236,5.679545,52.026537,1,Netherlands
4,4.870732186384699_52.389796444142426,4.870732,52.389796,1,Netherlands


In [7]:
# Rows per Catenary class in the sampled data
df['Catenary'].value_counts()

1    158
0     85
Name: Catenary, dtype: int64

In [8]:
# Open known non-catenary lines and add the differential to df, so that
# class 0 is topped up towards the size of class 1.

catenary_counts = df.Catenary.value_counts()
# .get avoids a KeyError when one of the classes is absent from df
zeros = int(catenary_counts.get(0, 0))
ones = int(catenary_counts.get(1, 0))

names = [
    'Amtrak_non_cat_1',
    'Amtrak_non_cat_2',
    'Amtrak_non_cat_3'
]

abbr = [
    'ANC',
    'ANC2',
    'ANC3'
]
locations = dict(zip(names,abbr))

# Number of extra class-0 rows needed to match class 1
diff = ones - zeros

if diff > 0:
    frames = []
    for key,value in locations.items():
        try:
            filename = img_folder+key+'/'+value+'.csv'
            tmp = pd.read_csv(filename,header=0)
            tmp['Railway'] = key
            frames.append(tmp)
        except Exception as e:
            # Best effort: a csv that cannot be loaded is skipped
            print(e)

    if frames:
        duds = pd.concat(frames, ignore_index=True)
        duds = duds.dropna()
        duds['Catenary'] = duds['Catenary'].astype(int)

        # sample(n=...) raises when n exceeds the rows available, so cap it
        # and fall back to using every available row.
        n_extra = min(diff, len(duds))
        if n_extra < diff:
            print('Only', n_extra, 'non-catenary rows available;', diff, 'requested.')

        # Seeded so a re-run draws the same rows
        duds = duds.sample(n=n_extra, random_state=42).reset_index(drop=True)
        df = pd.concat([df,duds]).reset_index(drop=True)
    else:
        print('No non-catenary csv could be loaded; classes left unbalanced.')

df.shape

(316, 5)

In [9]:
# Rows per Catenary class at this point in the notebook
df['Catenary'].value_counts()

1    158
0    158
Name: Catenary, dtype: int64

In [10]:
# Load images into df

images = []
# Walk the column values directly instead of feeding index labels to .iloc:
# labels only coincide with positions when the index is exactly 0..n-1.
for railway, name in zip(df['Railway'], df['Name']):
    img_path = img_folder+railway+'/'+name+'.png'

    # The context manager closes the file handle; convert() returns an
    # independent in-memory image that stays valid afterwards.
    with Image.open(img_path) as src:
        img = src.convert('RGBA')

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    img.thumbnail(img_size, Image.LANCZOS)

    # NOTE(review): RGBA keeps an alpha channel in every feature vector; if the
    # source PNGs carry no transparency, 'RGB' would shrink each vector by a
    # quarter - confirm before changing.
    data = np.asarray(img).flatten()
    # Append img instead of data if you want as image
    images.append(data)

df['Image'] = images

cols = ['Catenary','Image']
df = df[cols]

df.head()

Unnamed: 0,Catenary,Image
0,1,"[34, 46, 41, 255, 53, 64, 61, 255, 63, 75, 69,..."
1,1,"[103, 92, 86, 255, 69, 61, 55, 255, 27, 19, 16..."
2,1,"[98, 85, 82, 255, 103, 94, 92, 255, 112, 110, ..."
3,1,"[33, 35, 24, 255, 19, 28, 24, 255, 36, 36, 29,..."
4,1,"[105, 107, 102, 255, 111, 114, 109, 255, 115, ..."


### Classify

In [11]:
# One row per image: the flattened pixel vectors become the feature matrix
features = np.asarray(df['Image'].tolist())
# Matching class label for each row
labels = np.asarray(df['Catenary'].tolist())

In [12]:
# Setup classifiers: each entry pairs a short display name with an
# unfitted estimator instance.

BGN = dict(Name='BGN', Method=GaussianNB())
DTC = dict(Name='DTC', Method=DecisionTreeClassifier())
KNN = dict(Name='KNN', Method=KNeighborsClassifier())
SVM = dict(Name='SVM', Method=SVC(gamma=0.001))

# Order here is the order in which the classifiers are fitted and reported
classifiers = [BGN, DTC, KNN, SVM]

In [13]:
# Run Classifier: fit and score every configured classifier and append
# the run to the results file.
FitAndScoreCLA(features=features, labels=labels, classifiers=classifiers)

MemoryError: 