In [1]:
# %pip install opencv-python

import os
from pathlib import Path

import numpy as np
from cv2 import imread, IMREAD_GRAYSCALE
from sklearn.model_selection import StratifiedShuffleSplit
# from matplotlib.image import imread


# Mapeamento das pastas e imagens

In [2]:
# Root folder of the raw dataset, relative to the notebook's working directory.
dataset_path = Path('..') / 'data_raw'

# (label, sub-folder) pairs; the sub-folder is relative to dataset_path.
# The order is kept as-is because later cells iterate over the mapping.
_label_subdirs = (
    ('a', 'a_l/train_61'),
    ('e', 'e_l/train_65'),
    ('i', 'i_l/train_69'),
    ('o', 'o_l/train_6f'),
    ('u', 'u_l/train_75'),
    ('A', 'A_u/train_41'),
    ('E', 'E_u/train_45'),
    ('I', 'I_u/train_49'),
    ('O', 'O_u/train_4f'),
    ('U', 'U_u/train_55'),
)

# Maps each vowel label to the folder that holds its images.
path_labels = {
    label: dataset_path.joinpath(subdir) for label, subdir in _label_subdirs
}

In [3]:
# For every label, the full paths of the entries found in its folder.
# os.listdir returns names in arbitrary order, so they are sorted to keep the
# sample order (and with it the seeded train/test split) reproducible.
label_to_file = {
    k: [path_label / file for file in sorted(os.listdir(path_label))]
    for k, path_label in path_labels.items()
}

In [4]:
# Number of files listed for each label, and the total across all labels.
label_counts = {
    label: len(paths)
    for label, paths in label_to_file.items()
}
grand_total = sum(map(len, label_to_file.values()))

print("Label counts:", label_counts)
print("Grand total:", grand_total)

Label counts: {'a': 11196, 'e': 28299, 'i': 2788, 'o': 2761, 'u': 2837, 'A': 7010, 'E': 5420, 'I': 13179, 'O': 28680, 'U': 14146}
Grand total: 116316


# Conversão dos dados

In [5]:
# Load every listed file as a grayscale image and binarise it
# (True where the pixel is non-zero). X and y are filled in lock-step,
# so X[i] is the mask whose label is y[i].
X = []
y = []
for key, files in label_to_file.items():
    for path in files:
        # str() keeps this working on OpenCV builds that reject pathlib.Path.
        image = imread(str(path), IMREAD_GRAYSCALE)
        if image is None:
            # imread reports an unreadable or non-image file by returning
            # None instead of raising; fail here and name the file.
            raise OSError(f'Could not read image: {path}')

        # The comparison already yields a boolean array.
        binary_matrix = image > 0

        # Append the label only after a successful read so that X and y
        # never get out of step if the loop is interrupted by an error.
        X.append(binary_matrix)
        y.append(key)

In [6]:
# Flatten each image mask into a 1-D vector and collect them in one array,
# one entry per sample.
uni_X = np.array([mask.ravel() for mask in X])

In [7]:
# Collapse the vowel labels into a binary target:
# True for 'i' or 'I', False for every other label.
labels = np.asarray(y)
if labels.dtype != np.bool_:
    # Guard against re-running this cell: once y is already boolean,
    # comparing it with 'i'/'I' again would turn every label into False.
    y = np.isin(labels, ['i', 'I'])

# Train Test Split

In [8]:
# Fraction of the samples held out for testing.
split_test_threshold = 0.2

# Single stratified shuffle split with a fixed seed so that the partition
# can be reproduced. StratifiedShuffleSplit is imported in the notebook's
# top import cell.
selection_iter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=split_test_threshold,
    random_state=42,
)

In [9]:
# The splitter was built with n_splits=1, so the first (and only) item it
# yields is the pair of index arrays for the train and test partitions.
train_index, test_index = next(selection_iter.split(X, y))

# Features come from the flattened array; labels from the binary target.
X_train, X_test = uni_X[train_index], uni_X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [10]:
# Sizes of the full label vector and of each partition.
print(
    f'Total registers y: {len(y)}, '
    f'y_train: {len(y_train)}, '
    f'y_test: {len(y_test)}'
)

Total registers y: 116316, y_train: 93052, y_test: 23264


In [11]:
# Number of positive samples (y is True for 'i'/'I') overall and per partition.
print(
    f'Number of i our I in y: {y.sum()}, '
    f'y_train: {y_train.sum()}, '
    f'y_test: {y_test.sum()}'
)

Number of i our I in y: 15967, y_train: 12773, y_test: 3194


In [12]:
# Share of positive samples overall and per partition, to two decimals.
print(
    f'Ration of i or I in y: {np.sum(y) / len(y):.2f}, '
    f'y_train: {np.sum(y_train) / len(y_train):.2f}, '
    f'y_test: {np.sum(y_test) / len(y_test):.2f}'
)

Ration of i or I in y: 0.14, y_train: 0.14, y_test: 0.14


# Salvando os dados

In [13]:
# Make sure the output folder exists so that saving works on a fresh
# checkout where ../data_processed has not been created yet.
Path('../data_processed').mkdir(parents=True, exist_ok=True)

# Persist the full binary target vector.
with open('../data_processed/y.npy', 'wb') as f:
    np.save(f, y)

In [14]:
# Persist the training features (rows of the flattened array).
with open('../data_processed/X_train.npy', 'wb') as out_file:
    np.save(out_file, X_train)

In [15]:
# Persist the test features (rows of the flattened array).
with open('../data_processed/X_test.npy', 'wb') as out_file:
    np.save(out_file, X_test)

In [16]:
# Persist the training labels.
with open('../data_processed/y_train.npy', 'wb') as out_file:
    np.save(out_file, y_train)

In [17]:
# Persist the test labels.
with open('../data_processed/y_test.npy', 'wb') as out_file:
    np.save(out_file, y_test)

In [18]:
# Persist the complete set of image masks.
# NOTE(review): X is the list of unflattened masks, whereas X_train/X_test
# are taken from the flattened uni_X - confirm this layout is the one
# intended for X.npy.
with open('../data_processed/X.npy', 'wb') as out_file:
    np.save(out_file, X)