### Llibreries

In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os
import pandas as pd
import joblib

from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

### Funcions

Inicialitzem i guardem les funcions per fer el face mesh. El 'index_lips' són tots els punts que defineixen els llavis.

In [2]:
mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils
mp_face_detection = mp.solutions.face_detection
index_lips = [61, 76, 62, 78, 
              185, 184, 183, 191, 95, 96, 77, 146, 
              40, 74, 42, 80, 88, 89, 90, 91, 
              39, 73, 41, 81, 178, 179, 180, 181, 
              37, 72, 38, 82, 87, 86, 85, 84,
              0, 11, 12, 13, 14, 15, 16, 17,
              267, 302, 268, 312, 317, 316, 315, 314, 
              269, 303, 271, 311, 402, 403, 404, 405, 
              270, 304, 272, 310, 318, 319, 320, 321, 
              409, 408, 407, 415, 324, 325, 307, 375, 
              308, 292, 306, 291]

## Vowel Recognition utilitzant 'Orthographic projection' per a les distancies

### Creació del dataset

En aquesta part cal puntualitzar que estic utilitzant la carpeta 'img_webcam' perquè és més petita, ja que si utilitzo la carpeta original amb més imatges, surten problemes perquè aquesta manera de crear un conjunt de dades consumeix molta memòria.

In [3]:
carpeta_principal = 'data/img_webcam/'
vocales_info_b = []
img_lip_info_b = []

img_scale_info = []
scale_info = []

for entrada in os.listdir(carpeta_principal):
    ruta_absoluta = os.path.join(carpeta_principal, entrada)
    # Verifica si la entrada es una carpeta
    if os.path.isdir(ruta_absoluta):
        # Ruta de la carpeta que contiene las imágenes
        carpeta = 'data/img_webcam/' + entrada

    # Itera sobre todos los archivos en la carpeta
    for archivo in os.listdir(carpeta):
        # Verifica si el archivo es una imagen (puedes ajustar esta condición según el tipo de imágenes que tengas)
        if archivo.endswith('.jpg') or archivo.endswith('.png') or archivo.endswith('.jpeg'):
            ruta_imagen = os.path.join(carpeta, archivo)

            face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5)
            face_mesh = mp_face_mesh.FaceMesh(static_image_mode=True, min_detection_confidence=0.5, min_tracking_confidence=0.5)
            
            lip_info_b = []
            image = cv2.imread(ruta_imagen)
            height, width, _ = image.shape
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            results_detection = face_detection.process(image_rgb)
            # Realizar el seguimiento de los puntos faciales si se detecta una cara
            if results_detection.detections:
                for detection in results_detection.detections:
                    bboxC = detection.location_data.relative_bounding_box
                    xmin = int(bboxC.xmin * width)
                    ymin = int(bboxC.ymin * height)
                    w = int(bboxC.width * width)
                    h = int(bboxC.height * height)
                    xmax = xmin + w
                    ymax = ymin + h
                    x_distance = xmax - xmin
                    y_distance = ymax - ymin
                    x_scale = x_distance/width
                    y_scale = y_distance/height
                    
                    # Realizar el seguimiento de los puntos faciales
                    results_mesh = face_mesh.process(image_rgb)
        
                    if results_mesh.multi_face_landmarks is not None:
                        for face_landmarks in results_mesh.multi_face_landmarks:
                            for index in index_lips:
                                lip_info_b.append([face_landmarks.landmark[index].x, face_landmarks.landmark[index].y])
        
                    pos_x = []
                    pos_y = []
                    for i in lip_info_b:
                        pos_x.append(i[0])
                        pos_y.append(i[1])
        
                    aux_x = np.mean(pos_x)
                    aux_y = np.mean(pos_y)
                    lip_info_b.append([aux_x, aux_y])
        
        img_lip_info_b.append(lip_info_b)
        img_scale_info.append([xmin, ymin, x_scale, y_scale])
    vocales_info_b.append(img_lip_info_b)
    scale_info.append(img_scale_info)
    img_lip_info_b = []
    img_scale_info = []

In [4]:
scale_info[0][0]

[363, 251, 0.20078125, 0.35694444444444445]

In [5]:
len_file = len(vocales_info_b[0])
len_file

251

In [6]:
len_vi = len(vocales_info_b)
len_vi

5

In [7]:
for i in vocales_info_b:
    print(len(i))

251
218
268
318
311


In [8]:
for i in scale_info:
    print(len(i))

251
218
268
318
311


In [9]:
min = 1000
for i in vocales_info_b:
    if (len(i) < min) and (len(i) > 0):
        min = len(i)
        
for i in range(len(vocales_info_b)):
    vocales_info_b[i] = vocales_info_b[i][:min]
    scale_info[i] = scale_info[i][:min]
    
for i in vocales_info_b:
    print(len(i))

218
218
218
218
218


In [10]:
for i in scale_info:
    print(len(i))

218
218
218
218
218


In [11]:
vocales_info_np_b = np.array(vocales_info_b)

In [12]:
vocales_info_np_b[0][0][0][0]

0.3319104313850403

In [13]:
scale_info[0][0][0]

363

In [14]:
import math
coordenada_central = 80
vocales_info_b_2 = []
img_lip_info_b_2 = []
for i in range(len(vocales_info_np_b)):
    for j in range(len(vocales_info_np_b[i])):
        lip_info_b_2 = []
        aux_x_1 = (abs(vocales_info_np_b[i][j][coordenada_central][0] - scale_info[i][j][0]) / scale_info[i][j][2])
        aux_y_1 = (abs(vocales_info_np_b[i][j][coordenada_central][1] - scale_info[i][j][1]) / scale_info[i][j][3])
        for k in range(len(vocales_info_np_b[i][j]) - 1):
            aux_x_2 = (abs(vocales_info_np_b[i][j][k][0] - scale_info[i][j][0]) / scale_info[i][j][2])
            aux_y_2 = (abs(vocales_info_np_b[i][j][k][1] - scale_info[i][j][1]) / scale_info[i][j][3])
            distancia = math.sqrt((aux_x_1 - aux_x_2)**2 + (aux_y_1 - aux_y_2)**2)
            lip_info_b_2.append(distancia)
        img_lip_info_b_2.append(lip_info_b_2)
    vocales_info_b_2.append(img_lip_info_b_2)
    img_lip_info_b_2 = []

In [15]:
len(vocales_info_b_2[0][0])

80

In [16]:
vocales_info_b_2[0][0][0]

0.167790216894874

In [17]:
longitud_array_vocals = min*len_vi
aux_a_v = 0

array_vocals = []
for i in range(longitud_array_vocals):
    if i % (longitud_array_vocals/5) == 0:
        aux_a_v += 1  
    array_vocals.append(aux_a_v)

df_array_vocals = pd.DataFrame(array_vocals, columns=['Y'])
df_array_vocals

Unnamed: 0,Y
0,1
1,1
2,1
3,1
4,1
...,...
1085,5
1086,5
1087,5
1088,5


In [18]:
df_2 = pd.DataFrame()
for i in range(len(vocales_info_b_2)):
    df_aux = pd.DataFrame(vocales_info_b_2[i])
    df_2 = pd.concat([df_2, df_aux])

In [19]:
df_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,0.167790,0.156070,0.143436,0.135475,0.166558,0.151888,0.135035,0.124169,0.117359,0.132504,...,0.140482,0.132535,0.135244,0.148129,0.161338,0.174245,0.154687,0.161340,0.173148,0.184620
1,0.171376,0.159939,0.147352,0.139214,0.171121,0.156479,0.139701,0.128851,0.120170,0.135246,...,0.147812,0.139812,0.142198,0.155114,0.168316,0.181086,0.163553,0.170343,0.182302,0.193675
2,0.163407,0.152076,0.139784,0.132115,0.163331,0.148782,0.132162,0.121547,0.114229,0.128830,...,0.140165,0.132408,0.136038,0.148539,0.161627,0.174528,0.156199,0.162691,0.174401,0.185730
3,0.160119,0.148635,0.136355,0.128923,0.158902,0.144326,0.127730,0.117220,0.111936,0.126229,...,0.134417,0.126551,0.130001,0.142256,0.155372,0.168459,0.149177,0.155363,0.166944,0.178300
4,0.167354,0.155795,0.143297,0.135462,0.166632,0.151906,0.135125,0.124409,0.117357,0.132195,...,0.142007,0.134071,0.137112,0.149891,0.163176,0.176062,0.157113,0.163833,0.175785,0.187216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,0.127608,0.116439,0.106359,0.102190,0.122680,0.109130,0.094492,0.085380,0.089213,0.098951,...,0.099217,0.092357,0.099135,0.107058,0.118672,0.132563,0.115133,0.118389,0.127618,0.139263
214,0.136031,0.124972,0.114680,0.109691,0.129874,0.116356,0.101498,0.091569,0.096220,0.106517,...,0.110160,0.103366,0.107122,0.115045,0.126546,0.140461,0.126741,0.129282,0.138108,0.149488
215,0.133580,0.122692,0.112380,0.107287,0.127569,0.114138,0.099347,0.089571,0.094459,0.104803,...,0.108646,0.102254,0.105653,0.113242,0.124583,0.138398,0.125086,0.127263,0.135957,0.147084
216,0.129770,0.118881,0.108515,0.103485,0.125078,0.111690,0.096979,0.087287,0.091050,0.101101,...,0.105429,0.099234,0.103336,0.110740,0.121977,0.135610,0.121661,0.124112,0.132836,0.143987


### Entrenament del model Support Vector Machine

In [20]:
# Definir los parámetros a ajustar
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.01, 0.1, 1], 'kernel': ['linear', 'rbf', 'poly']}

# Realizar la búsqueda en cuadrícula para encontrar la mejor combinación de hiperparámetros
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(df_2, df_array_vocals.values.ravel())

# Mostrar los mejores hiperparámetros encontrados
print("Mejores hiperparámetros:", grid_search.best_params_)

Mejores hiperparámetros: {'C': 100, 'gamma': 0.001, 'kernel': 'linear'}


In [21]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(df_2, df_array_vocals.values.ravel(), test_size=0.2, random_state=42)

# Crear un clasificador SVM
svm_classifier_2 = SVC(kernel='poly', C=1000, gamma=1)

# Entrenar el clasificador
svm_classifier_2.fit(X_train, y_train)

# Predecir las etiquetas de clase para el conjunto de prueba
y_pred = svm_classifier_2.predict(X_test)

# Calcular la precisión del clasificador
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [22]:
# Guardar el modelo
dump(svm_classifier_2, 'models/svm/svm_classifier_OP_webcam.joblib')

['models/svm/svm_classifier_OP_webcam.joblib']

In [23]:
# Realizar validación cruzada para evaluar el rendimiento del modelo SVM
scores = cross_val_score(SVC(), df_2, df_array_vocals.values.ravel(), cv=5)
print("Precisión de validación cruzada:", scores.mean())

Precisión de validación cruzada: 0.9431192660550458


### Entrenament del model Random Forest

In [24]:
X_train, X_test, y_train, y_test = train_test_split(df_2, df_array_vocals.values.ravel(), test_size=0.2, random_state=42)

rf_classifier_2 = RandomForestClassifier(n_estimators=1000, random_state=42)

rf_classifier_2.fit(X_train, y_train)

y_pred = rf_classifier_2.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9908256880733946


In [25]:
# Guardar el modelo
joblib.dump(rf_classifier_2, 'models/rf/rf_classifier_OP_webcam.pkl')

['models/rf/rf_classifier_OP_webcam.pkl']