# Creation of framework to run SKLearn models for the EMNIST Dataset

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import neural_network
import time
import matplotlib.pyplot as plt
from pympler import asizeof
import numpy as np
np.random.seed(1)

In [3]:
def fitModel(model, trainingData, trainingLabels):
    """Fit ``model`` in place and print the fit time and the model's size growth.

    Parameters:
        model: object exposing ``fit(data, labels)``.
        trainingData: forwarded to ``model.fit`` as its first argument.
        trainingLabels: forwarded to ``model.fit`` as its second argument.

    Returns:
        None. The two measurements are printed, not returned.
    """
    #Record the size of the model and a clock reading before fitting so that
    #memory growth and elapsed time can be reported afterwards
    startMemory = asizeof.asizeof(model)
    #perf_counter is monotonic, so a system clock adjustment during a long fit
    #cannot distort (or make negative) the reported duration
    startTime = time.perf_counter()
    #fit the model
    model.fit(trainingData, trainingLabels)
    #Stop the clock before measuring memory so the size walk is not timed
    elapsed = time.perf_counter() - startTime
    endMemory = asizeof.asizeof(model)
    #Print gathered data
    print("Time to fit: " + str(elapsed))
    print("Memory growth fit: " + str(endMemory - startMemory))

In [4]:
def predictModel(model, testData):
    """Run ``model.predict`` on ``testData``, print the time taken, return the result.

    Parameters:
        model: object exposing ``predict(data)``.
        testData: forwarded unchanged to ``model.predict``.

    Returns:
        Whatever ``model.predict(testData)`` returns.
    """
    #perf_counter is monotonic, so a system clock adjustment during a long
    #prediction run cannot distort the reported duration
    startTime = time.perf_counter()
    #Predict
    predictions = model.predict(testData)
    #Gather and print the time spent
    elapsed = time.perf_counter() - startTime
    print("Time to predict " + str(elapsed))
    return predictions

In [5]:
def runOnModelAndData(model, dataTrain, dataTest, labelsTrain, labelsTest):
    """Fit ``model``, predict on the test data, and print the accuracy score.

    Parameters:
        model: estimator handed to ``fitModel`` and ``predictModel``.
        dataTrain, labelsTrain: training inputs forwarded to ``fitModel``.
        dataTest: inputs forwarded to ``predictModel``.
        labelsTest: expected labels; must provide ``tolist()``.

    Returns:
        None. Timing, memory and accuracy figures are printed.
    """
    #Train first; fitModel reports its own timing and memory figures
    fitModel(model, dataTrain, labelsTrain)
    predicted = predictModel(model, dataTest)
    #Score the predictions against the expected labels as a plain list
    expected = labelsTest.tolist()
    print(accuracy_score(expected, predicted))
    #plotBoundary(model, dataTrain, labelsTrain)

In [6]:
def KNNModel():
    """Return a new KNeighborsClassifier built with no explicit arguments."""
    #OOD
    #Alternative configuration left disabled by the original author:
    #neighbors.KNeighborsClassifier(algorithm='brute', weights='distance')
    classifier = neighbors.KNeighborsClassifier()
    return classifier

In [7]:
def NBModel():
    """Return a new GaussianNB classifier built with no explicit arguments."""
    #OOD
    classifier = naive_bayes.GaussianNB()
    return classifier

In [8]:
def SVMModel(modelType):
    """Return a new SVC configured for the requested kernel.

    Parameters:
        modelType: 'linear' for a linear-kernel SVC, or 'rbf' for an RBF-kernel
            SVC built with C=1000 and gamma=1.

    Raises:
        RuntimeError: if ``modelType`` is neither 'linear' nor 'rbf'.
    """
    #OOD
    #Reject unknown kinds up front so the construction code below stays flat
    if modelType not in ('linear', 'rbf'):
        raise RuntimeError('Model must be of type linear or rbf')
    if modelType == 'linear':
        return svm.SVC(kernel='linear')
    return svm.SVC(kernel='rbf', C=1000, gamma=1)

In [9]:
def NNModel(modelType):
    """Return a new MLPClassifier in one of two preset configurations.

    Parameters:
        modelType: 'simple' for two hidden layers of 10 units, or 'complex'
            for six hidden layers of 20 units trained with the 'lbfgs' solver,
            alpha=0 and max_iter=1500.

    Raises:
        RuntimeError: if ``modelType`` is neither 'simple' nor 'complex'.
    """
    #OOD
    if modelType == 'simple':
        return neural_network.MLPClassifier(hidden_layer_sizes=(10, 10))
    elif modelType == 'complex':
        return neural_network.MLPClassifier(hidden_layer_sizes=(20, 20, 20, 20, 20, 20), alpha=0, solver='lbfgs', max_iter=1500)
    else:
        #The message names the values this function accepts; it previously
        #repeated the SVM kernel names, which are not valid here
        raise RuntimeError('Model must be of type simple or complex')

# Running SKLearn KNN and Naive Bayes, plus an attempt at SVM that was moved to Google's servers because it took too long to train

In [28]:
# Fit and score the k-nearest-neighbours model on the EMNIST arrays.
# The EM* arrays are created by the data-loading cell further down in this
# file, so that cell has to be executed first.
runOnModelAndData(KNNModel(), EMtrainDat, EMtestDat, EMtrainLabels, EMtestLabels)

Time to fit: 60.75985288619995
Memory growth fit: 707484016
Time to predict 3037.3791859149933
0.7846276595744681


In [29]:
# Fit and score the Gaussian naive Bayes model on the same EMNIST arrays
# (created by the data-loading cell further down in this file).
runOnModelAndData(NBModel(), EMtrainDat, EMtestDat, EMtrainLabels, EMtestLabels)

Time to fit: 2.6485164165496826
Memory growth fit: 591608
Time to predict 8.455987691879272
0.281968085106383


In [None]:
# Fit and score the linear-kernel SVM on the same EMNIST arrays.
# No output is recorded for this cell; the section heading above notes that
# training took too long here.
runOnModelAndData(SVMModel("linear"), EMtrainDat, EMtestDat, EMtrainLabels, EMtestLabels)

# Framework to import the EMNIST dataset

In [24]:
import csv
import numpy as np
import os, zipfile
#Lookup table from integer label to the character it stands for:
#0-9 map to the digits, 10-35 to the uppercase letters A-Z, and 36-46 to the
#lowercase letters a, b, d, e, f, g, h, n, q, r, t (in that order)
enumToChar = dict(enumerate(
    '0123456789'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abdefghnqrt'
))
#Load the EMNIST data and labels for the data into numpy arrays from the CSV type, ended up unused
def loadEmnistDatasetFromCSV(filename, imageCount, rowLength, verbose=True):
    """Read labelled square images from a CSV file into numpy arrays.

    Each non-empty CSV row is one image: the first field is the label and the
    remaining ``rowLength*rowLength`` fields are the pixel values. The pixels
    of every image are transposed (rows and columns swapped) before being
    stored.

    Parameters:
        filename: path of the CSV file to read.
        imageCount: number of image slots to allocate in the result arrays.
        rowLength: side length of each square image.
        verbose: when True (the default), print a progress line per image.

    Returns:
        ``(data, labels)`` where ``data`` is an int array of shape
        ``(imageCount, rowLength*rowLength)`` and ``labels`` is a float array
        of length ``imageCount``. Slots the file does not fill stay zero.

    Raises:
        ValueError: if a row does not hold exactly ``rowLength*rowLength``
            pixel fields, or a field is not numeric.
        IndexError: if the file holds more images than ``imageCount``.
    """
    pixelCount = rowLength*rowLength
    #Zero-filled rather than uninitialised, so slots the file does not supply
    #hold a defined value instead of whatever was in memory
    data = np.zeros(shape=(imageCount, pixelCount), dtype=int)
    labels = np.zeros(imageCount)
    #newline='' is what the csv module asks for when reading from a file object
    with open(filename, 'r', newline='') as inFile:
        imageNum = 0
        for row in csv.reader(inFile):
            #A blank line (e.g. at the end of the file) is not an image
            if not row:
                continue
            if verbose:
                print("imageNum = "+str(imageNum))
            if len(row) - 1 != pixelCount:
                raise ValueError(
                    "image " + str(imageNum) + " has " + str(len(row) - 1)
                    + " pixel values, expected " + str(pixelCount))
            labels[imageNum] = float(row[0])
            pixels = np.array([int(p) for p in row[1:]], dtype=int)
            #Source pixel t lands at index t//rowLength + rowLength*(t%rowLength),
            #which is exactly a transpose of the rowLength x rowLength grid
            data[imageNum] = pixels.reshape(rowLength, rowLength).T.ravel()
            imageNum += 1
    return data,labels
#Load the EMNIST dataset using the .npy files
def loadEmnistFromNPY(filename, zipPath='../data/EMNIST/balanced-data.zip', extractDir='../data/EMNIST'):
    """Load a ``.npy`` file, unpacking a zip archive first if the file is missing.

    Parameters:
        filename: path of the ``.npy`` file to load.
        zipPath: archive to unpack when ``filename`` does not exist.
        extractDir: directory the archive is unpacked into.

    Returns:
        The array returned by ``np.load(filename)``.

    Raises:
        FileNotFoundError: if ``filename`` is missing and either the archive
            is missing too or unpacking it does not produce ``filename``.
    """
    try:
        return np.load(filename)
    except FileNotFoundError:
        #The with-block closes the archive even if extraction fails part-way
        with zipfile.ZipFile(zipPath) as zipRef:
            zipRef.extractall(extractDir)
    #Second attempt uses the same path as the first; the archive is expected
    #to have placed the file there
    return np.load(filename)
#Print an image as text: pixels above thresh become 'X', all others a space
def printImg(data,rowSize,thresh):
    """Print ``data`` as a ``rowSize`` x ``rowSize`` grid of 'X' and ' ' characters.

    Parameters:
        data: array whose largest dimension is either ``rowSize*rowSize``
            (pixels only) or ``rowSize*rowSize + 1`` (a label followed by the
            pixels). In the second form the label is printed first.
        rowSize: side length of the square image.
        thresh: a pixel is drawn as 'X' when its value is strictly greater
            than this, otherwise as a space.

    Returns:
        None. Any other length prints an error message and nothing else.
    """
    pixelCount = rowSize*rowSize
    length = max(data.shape)
    if length == pixelCount:
        offset = 0
    elif length == pixelCount + 1:
        #The first entry is the label; the pixels start right after it
        print("Label key: " + str(data[0]))
        offset = 1
    else:
        print("Invalid data. Cannot print")
        return
    #Create the actual 'image', one text line per pixel row
    lines = []
    for rowIdx in range(rowSize):
        first = offset + rowIdx*rowSize
        lines.append(''.join(
            'X' if data[first + colIdx] > thresh else ' '
            for colIdx in range(rowSize)))
    #Every row ends with a newline and one more blank line closes the image
    print(''.join(line + '\n' for line in lines) + '\n')


In [25]:
# Read in EMNIST test dataset from CSV (disabled; the .npy files are loaded instead)
# testDat: np.ndarray[18800][28*28]
# testLabels: np.ndarray[18800]
#testDat, testLabels = loadEmnist.loadEmnistDataset('../data/EMNIST/emnist-balanced-test.csv',18800,28)

# Read in EMNIST train data and labels from .npy
EMtrainDat = loadEmnistFromNPY('balanced-train-data.npy')
EMtrainLabels = loadEmnistFromNPY('balanced-train-labels.npy')

# Read in EMNIST train dataset from CSV (disabled; the .npy files are loaded instead)
# trainDat: np.ndarray[112800][28*28]
# trainLabels: np.ndarray[112800]
# trainDat, trainLabels = loadEmnist.loadEmnistDataset('../data/EMNIST/emnist-balanced-train.csv', 112800,28)

# Read in EMNIST test data and labels from .npy
EMtestDat = loadEmnistFromNPY('balanced-test-data.npy')
EMtestLabels = loadEmnistFromNPY('balanced-test-labels.npy')

# Load the array stored in binaryLetters.npy
OurOwnData = loadEmnistFromNPY('binaryLetters.npy')
# Print the first 12 entries of OurOwnData to the terminal as 28x28 text
# images, drawing every pixel above 0.5 as an 'X'
for i in range(0,12):
    #print('Character: ' + enumToChar[EMtestLabels[i]])
    printImg(OurOwnData[i][:],28,.5)

                            
                            
                            
                            
      X                     
      X                     
                     X      
                     X      
       X            X       
       X            X       
       X            X       
       X           X        
       X           X        
        X         X         
        X         X         
        X         X         
        X                   
        X    X   X          
        X   XX   X          
        X     X             
           X  X X           
         X X  X X           
         XX    X            
          X                 
                            
                            
                            
                            


                            
                            
                            
                            
         XXXX               
        X     XXXXXX        
        X   