**Carregando imagens, adicionando ao dataset, normalizando dados e treinando o modelo utilizando técnicas kNN | Aprendizado de Máquina BTI/UFRN @2023.2**
- Tecnologias: HOG, PCA, 10-fold cv e 10-fold holdout.



**Carregando imagens e alimentando o dataset**

In [None]:
## Importando as libs necessárias
from google.colab import files
import os, io, time

# Work inside /content/Imagens, creating the folder on the first run
os.chdir('/content/')
try:
    os.mkdir('Imagens')
except FileExistsError:
    # Only an already-existing folder is expected here; any other failure
    # (e.g. permissions) now surfaces instead of being silently swallowed.
    print('A pasta já existe.')
os.chdir('./Imagens')

# Open the Colab upload dialog; the keys of the returned mapping are the
# uploaded file names, which later cells use as image paths.
uploaded_images = files.upload()

**HOG**

In [None]:

## Reduzir as imagens
from skimage.io import imread, imshow
from skimage.transform import resize
from skimage.feature import hog

# Compute one HOG descriptor per uploaded image.
# NOTE(review): channel_axis=-1 assumes every image has a trailing channel
# axis (colour images); a grayscale upload would need separate handling.
hog_features = []
for filename in uploaded_images:
    image = imread(filename)
    # Bring every image to the same 128x128 size before extracting features
    image_resized = resize(image, (128, 128))
    # visualize is left at its default (False): the rendered HOG image was
    # never used, so only the feature vector is requested.
    # pixels_per_cell can be changed to tune the descriptor granularity.
    fd = hog(image_resized, orientations=9, pixels_per_cell=(16, 16),
             cells_per_block=(2, 2), channel_axis=-1)
    hog_features.append(fd)

In [None]:
import pandas as pd
## Pair each uploaded file name with its HOG descriptor (one row per image)
data = dict(pathfile=uploaded_images.keys(), hog_features=hog_features)
df = pd.DataFrame(data)

# Preview the first 10 rows
df.head(10)

In [None]:
# Expand each HOG descriptor so that every value becomes its own column
df2 = pd.DataFrame(df['hog_features'].tolist())

# Columns are named hog_feature_1, hog_feature_2, ... (1-based)
df2.columns = [f'hog_feature_{idx + 1}' for idx in df2.columns]

# Replace the original array-valued column with the expanded columns
df = pd.concat([df.drop(columns='hog_features'), df2], axis=1)

df.head(10)

In [None]:
# Save the resulting CSV -> the pixels_per_cell size can be changed
# NOTE: the file is written with ';' as the field separator, so it must be
# read back with sep=';'.
df.to_csv('hog_16x16.csv', sep=';', index=False)

# Hand the saved file to Colab's download helper
from google.colab import files
files.download('hog_16x16.csv')

**PCA**

In [None]:
### Carregar as Libraries
import numpy as np
import pandas as pd

### Load the HOG feature table
# 'hog_16x16.csv' is written with sep=';' earlier in this notebook, so the
# same delimiter must be used here; with ',' every row collapses into a single
# text column and no float64 feature columns (nor 'pathfile') are found.
dataset = pd.read_csv('hog_16x16.csv', header=0, sep=';', encoding='utf-8', engine='c')

### Show the dataset columns
dataset.info()

In [None]:
# Keep only the float64 columns (presumably the HOG feature values)
tipos_numericos = ['float64']
atributos_numericos = dataset.select_dtypes(include=tipos_numericos)
colunas_numericas = atributos_numericos.columns.tolist()
print(colunas_numericas)

# Feature matrix fed to PCA
X = dataset[colunas_numericas]
# File-name column: carried along as a row identifier, not a class label
y = dataset['pathfile']

print(X)

In [None]:
### Importando PCA do SkLearn #################

from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as needed to
# reach that share of explained variance (0.90 here; the fraction can be tuned)
pca_obj = PCA(whiten=True, n_components=0.90)
pca_result = pca_obj.fit_transform(X)

print(pca_result.shape)

# One column name per retained component: pca_1, pca_2, ...
columns = [f"pca_{i + 1}" for i in range(pca_result.shape[1])]

print(columns)
pca_dataset = pd.DataFrame(pca_result, columns=columns)

In [None]:
# Preview the first rows of the PCA-transformed feature table
pca_dataset.head()

In [None]:
# Append the file-name column (y) to the PCA components; join aligns rows by index
final_data = pca_dataset.join(y)

# Write the reduced dataset to disk (default ',' separator)
df = pd.DataFrame(final_data)
df.to_csv('hog_16_pca_90.csv', index=False) # the variance percentage can be changed

# Hand the transformed file to Colab's download helper
from google.colab import files
files.download('hog_16_pca_90.csv')

In [None]:
# Preview the PCA components together with the re-attached file-name column
final_data.head()

**NORMALIZANDO OS DADOS E ADICIONANDO O ATRIBUTO CLASSE**
- Neste exemplo o atributo classe será 'Cachorro' ou 'Gato' pois o dataset que está sendo utilizado contém imagens de cachorros e gatos.

In [None]:
### Carregar as Libraries
import pandas as pd
from numpy import mean
from numpy import std

### Load the PCA-reduced dataset from 'hog_16_pca_90.csv'
dataset = pd.read_csv('hog_16_pca_90.csv',encoding='utf-8')

# Inspect the first rows and their attributes
dataset.head()

In [None]:
def determinar_classe(row):
    """Return the class label ('cachorro' or 'gato') for one dataset row.

    The decision looks only at the first character of ``row['pathfile']``:
    a lowercase letter yields 'cachorro'; anything else (uppercase letter,
    digit, punctuation) yields 'gato'.

    Raises:
        IndexError: if ``row['pathfile']`` is an empty string.
    """
    first_char = row['pathfile'][0]
    return 'cachorro' if first_char.islower() else 'gato'
# Label every row by passing it (row-wise) through determinar_classe
dataset['classe'] = dataset.apply(determinar_classe, axis='columns')

# Show the labelled table
dataset

In [None]:
#normaliza os dados numéricos
from sklearn.preprocessing import minmax_scale

# Pick the float64 columns; these are the ones that get rescaled
tipos_numericos = ['float64']
cols_num = dataset.select_dtypes(include=tipos_numericos)
colunas_numericas = cols_num.columns.tolist()

# Work on a copy so the original dataset stays untouched, then apply
# min-max scaling to each numeric column independently
dados_normalizados = dataset.copy()
dados_normalizados[colunas_numericas] = cols_num.apply(minmax_scale)

In [None]:
# Collect the names of the numeric columns of the normalized table
tipos_numericos = ['int32', 'int64', 'float16', 'float32', 'float64']
cols_num = dados_normalizados.select_dtypes(include=tipos_numericos)

## Numeric attributes used as model features
colunas_numericas = list(cols_num.columns)

## Class column ('cachorro' / 'gato')
coluna_classe = dados_normalizados['classe']

## Split features from the class.
# The features are taken from dados_normalizados (not from the raw `dataset`),
# otherwise the min-max normalization computed above would never reach the
# k-NN models trained in the following cells.
X = dados_normalizados[colunas_numericas]  # Features
y = coluna_classe                          # Target variable (class)

dados_normalizados.head()

**10-fold | Cross Validation**

In [None]:

## Carregando o algoritmo / método / técnica k-NN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics

## Implementando k-fold CV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict


# 10-fold CV: shuffled folds with a fixed seed so every k is evaluated on the
# same partitions
kf = KFold(n_splits=10, random_state=1, shuffle=True)
for i in range(1, 11):
    # k-NN classifier with k = i neighbours and Euclidean distance
    knn = KNeighborsClassifier(n_neighbors=i, metric='euclidean')

    # Mean and standard deviation of the per-fold accuracy; the line is
    # prefixed with k so the ten results can be told apart
    scores = cross_val_score(knn, X, y, scoring='accuracy', cv=kf)
    print('%d-NN Accuracy: %.3f (%.3f)' % (i, mean(scores), std(scores)))

    # Confusion matrix built from the out-of-fold predictions. It has to be
    # printed explicitly: a bare expression inside a loop is never displayed.
    y_pred = cross_val_predict(knn, X, y, cv=kf)
    print(confusion_matrix(y, y_pred))

**10-fold | Holdout**

In [None]:

## Carregando o algoritmo / método / técnica k-NN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix


# Split the dataset into two parts: training and test.
# The split is 90% training / 10% test (test_size=0.10). The "_70"/"_30"
# suffixes in the variable names do not reflect the actual proportions; they
# are kept unchanged so any other cell that refers to these names keeps working.
test_fraction = 0.10
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(
    X, y, test_size=test_fraction, random_state=1)


### Fit k-NN for k = 1..10 and report the accuracy on the held-out part
# The header is derived from test_fraction so the printed label always
# matches the split that was actually performed.
print('HoldOut %d-%d' % (round((1 - test_fraction) * 100), round(test_fraction * 100)))
for i in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=i, metric='euclidean')
    knn.fit(X_train_70, y_train_70)

    # Predict on the test part
    y_pred = knn.predict(X_test_30)

    # Model accuracy
    acuracia = metrics.accuracy_score(y_test_30, y_pred)
    print('%d-NN Accuracy: %.3f' % (i, acuracia))