In [None]:
import math
import os

import numpy as np
import skimage.feature, skimage.io
from sklearn.neighbors import KNeighborsClassifier as KNNC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Folder holding the character dataset; the file-list names and the image
# paths used in this notebook are built by prefixing this string.
NCharacter_dataset_folder = '../exercicios/NCharacter_SD19_BMP/'
# Folder where the extracted feature arrays are saved and later reloaded.
features_folder = "features"

class Zonas(object):
    """Abstract base for strategies that split an image into zones.

    Subclasses are expected to override ``get_zonas``; the base version
    only raises ``NotImplementedError``.
    """

    def __init__(self):
        """Nothing to initialise in the base class."""

    def get_zonas(self, n_linhas, n_colunas):
        """Produce the zones for an image of the given size.

        Parameters
        ----------
        n_linhas : int
            Number of rows of the image.
        n_colunas : int
            Number of columns of the image.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError()

class ZonasRetangulares(Zonas):
    """Splits an image into a regular grid of rectangular zones.

    Parameters
    ----------
    zonas_x : int
        Number of zones along the x direction (cuts are computed over
        ``n_colunas``).
    zonas_y : int
        Number of zones along the y direction (cuts are computed over
        ``n_linhas``).
    """

    def __init__(self, zonas_x, zonas_y):
        super(ZonasRetangulares, self).__init__()
        self.zonas_x = zonas_x
        self.zonas_y = zonas_y

    def get_zonas(self, n_linhas, n_colunas):
        """Yield one tuple per zone of the grid.

        The cut positions are ``zonas_x + 1`` (resp. ``zonas_y + 1``) evenly
        spaced points between 0 and ``n_colunas`` (resp. ``n_linhas``),
        rounded down to integers.

        Parameters
        ----------
        n_linhas : int
            Number of rows of the image.
        n_colunas : int
            Number of columns of the image.

        Yields
        ------
        tuple
            ``(x_start, y_start, x_end, y_end, x_start - x_end,
            y_start - y_end)``. Zones are produced with x as the outer loop
            and y as the inner loop.
        """
        cortes_x = np.floor(np.linspace(0, n_colunas, num=self.zonas_x + 1)).astype(int)
        cortes_y = np.floor(np.linspace(0, n_linhas, num=self.zonas_y + 1)).astype(int)
        for i in range(len(cortes_x) - 1):
            for j in range(len(cortes_y) - 1):
                yield (
                    cortes_x[i],
                    cortes_y[j],
                    cortes_x[i + 1],
                    cortes_y[j + 1],
                    # NOTE(review): the two extents are start minus end, so
                    # they are never positive; the sign is kept as it was --
                    # confirm whether end minus start was intended.
                    cortes_x[i] - cortes_x[i + 1],
                    # The y extent must be indexed with j (the y-loop index).
                    # Indexing cortes_y with i produced the wrong value and
                    # ran past the array when zonas_x > zonas_y.
                    cortes_y[j] - cortes_y[j + 1],
                )


In [None]:
def histograma_cor(imagem):
    """Count how many pixels hold each distinct value of ``imagem``.

    Parameters
    ----------
    imagem : array-like
        Pixel values; any shape accepted by ``np.unique``.

    Returns
    -------
    list or numpy.ndarray
        * ``[0, 0]`` when ``imagem`` has no elements (e.g. an empty zone).
        * A two-element list when only one distinct value is present: the
          count goes in the first slot if that value is 0, otherwise in the
          second slot.
        * Otherwise the array of counts, one entry per distinct value in
          ascending value order. This has more than two entries when the
          image holds more than two distinct values.
    """
    # np.unique already returns the values sorted in ascending order with the
    # counts aligned to them, so no additional sort is required.
    vals, counts = np.unique(imagem, return_counts=True)

    if len(vals) == 0:
        # Empty region: previously this crashed on vals[0].
        return [0, 0]

    if len(vals) == 1:
        # Image with only white or only black pixels.
        if vals[0] == 0:
            return [counts[0], 0]
        return [0, counts[0]]

    return counts

In [None]:
def parse_filelist(path, prefix=''):
    """Read a dataset file list and derive path, label and file name per entry.

    Each non-blank line is split on ``'/'``: the second component,
    upper-cased, is the label and the last component is the file name.

    Parameters
    ----------
    path : str
        Path of the text file with one entry per line.
    prefix : str, optional
        String placed before ``'/<label>/<file name>'`` to build each path.

    Returns
    -------
    list of tuple
        ``(prefix + '/' + label + '/' + file_name, label, file_name)`` for
        every non-blank line, in file order.

    Raises
    ------
    IndexError
        If a non-blank line contains no ``'/'``.
    """
    instancias = []
    with open(path, 'r') as f:
        for linha in f:
            linha = linha.strip()
            if not linha:
                # Blank lines (e.g. trailing ones) carry no entry; without
                # this guard they crashed on the label lookup below.
                continue
            partes = linha.split('/')
            rotulo = partes[1].upper()
            arquivo = partes[-1]
            instancias.append((prefix + '/' + rotulo + '/' + arquivo, rotulo, arquivo))

    return instancias

def extract_features(filelist, dataset_folder, zonas, features=[histograma_cor]):
    """Build the feature matrix for every entry of a file list.

    Parameters
    ----------
    filelist : str
        Path of the file list, parsed with ``parse_filelist``.
    dataset_folder : str
        Prefix handed to ``parse_filelist`` to build each image path.
    zonas : object
        Zoning object forwarded unchanged to ``feature_extraction``.
    features : list of callable, optional
        Extractors forwarded unchanged to ``feature_extraction``. The
        default list object is shared between calls; this function does not
        modify it.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-entry results of ``feature_extraction``, in
        file-list order.
    """
    instancias = parse_filelist(filelist, prefix=dataset_folder)
    vetores = [feature_extraction(instancia, zonas, features) for instancia in instancias]

    return np.array(vetores)
    
def feature_extraction(instancia, zonas, features):
    """Compute the concatenated zone features of a single image.

    Parameters
    ----------
    instancia : sequence
        Entry whose first element is the path of the image to read.
    zonas : object
        Object exposing ``get_zonas``; each yielded zone is indexed at
        positions 0..3 to slice the image.
    features : iterable of callable
        Extractors; each receives one image slice and returns an iterable
        of values.

    Returns
    -------
    numpy.ndarray
        All extractor outputs appended in order: extractors in the outer
        loop, zones in the inner loop.
    """
    imagem = skimage.io.imread(instancia[0])

    valores = []
    for extrator in features:
        # NOTE(review): get_zonas receives (shape[1], shape[0]) and the zone
        # fields 0/2 slice the first image axis while 1/3 slice the second --
        # confirm this matches the zoning object's parameter order.
        for zona in zonas.get_zonas(imagem.shape[1], imagem.shape[0]):
            recorte = imagem[zona[0]:zona[2], zona[1]:zona[3]]
            valores.extend(extrator(recorte))

    return np.array(valores)

# np.save does not create missing folders, so make sure the output folder
# exists before writing any feature file.
if not os.path.isdir(features_folder):
    os.makedirs(features_folder)

# Extract the 2x2 zone features of every split and store them on disk.
for lista_arquivos, particao in [('NIST_Train_Upper.txt', 'train'), ('NIST_Test_Upper.txt', 'test'), ('NIST_Valid_Upper.txt', 'val')]:
    print('extraindo características de %s' % (lista_arquivos))
    feats = extract_features(NCharacter_dataset_folder + lista_arquivos, NCharacter_dataset_folder, ZonasRetangulares(2,2))
    # np.save appends ".npy" to the name, so the file on disk ends up as
    # "<split>_feats.pkl.npy".
    np.save(features_folder + ('/%s_feats.pkl' % (particao)), feats)

print('Fim da extração de características!')

#parse_filelist(NCharacter_dataset_folder + 'NIST_Train_Upper.txt')



In [None]:
# Training split: cached feature matrix plus the labels taken from the second
# field of every file-list entry.
train_features = np.load('%s/train_feats.pkl.npy' % features_folder)
train_rotulos = [entrada[1] for entrada in parse_filelist(NCharacter_dataset_folder + 'NIST_Train_Upper.txt')]
print(len(train_rotulos), train_features.shape)

# 3-nearest-neighbours classifier fitted on the training features.
KNN = KNNC(n_neighbors=3)
KNN.fit(train_features, train_rotulos)

# Test split, loaded the same way as the training split.
test_features = np.load('%s/test_feats.pkl.npy' % features_folder)
test_rotulos = [entrada[1] for entrada in parse_filelist(NCharacter_dataset_folder + 'NIST_Test_Upper.txt')]
print(len(test_rotulos), test_features.shape)

y_pred = KNN.predict(test_features)

# Overall accuracy followed by the per-class report; confusion_matrix can be
# printed with the same two arguments when per-class errors are needed.
print(accuracy_score(test_rotulos, y_pred))
print(classification_report(test_rotulos, y_pred, digits=3))