In [4]:
def lip_recognition_1(image_in):
    """Predict the vowel shown in a face image using the random-forest model.

    The image is run through MediaPipe Face Mesh, 80 selected landmarks are
    reduced to their distances from the landmarks' centroid, and that
    80-value feature row is classified by the model stored in
    'rf_classifier_a_op.pkl'.

    Args:
        image_in: Path of the image file to classify.

    Returns:
        One of "Vocal A" ... "Vocal U", or None when the classifier returns a
        label outside 1-5.

    Raises:
        FileNotFoundError: If `image_in` is empty or does not point to a file.
        ValueError: If the file cannot be decoded as an image or no face is
            detected in it.
    """
    import os

    # Fail fast, before loading the model or MediaPipe, when there is nothing to read.
    if not image_in or not os.path.isfile(image_in):
        raise FileNotFoundError(f"No se encontró la imagen: {image_in!r}")

    import joblib
    import cv2
    import mediapipe as mp
    import numpy as np
    import pandas as pd

    # NOTE(review): joblib.load unpickles the file; only load model files you trust.
    rf_classifier = joblib.load('rf_classifier_a_op.pkl')

    # Face Mesh landmark indices used as features (80 entries).
    index_lips = [61, 76, 62, 78,
                  185, 184, 183, 191, 95, 96, 77, 146,
                  40, 74, 42, 80, 88, 89, 90, 91,
                  39, 73, 41, 81, 178, 179, 180, 181,
                  37, 72, 38, 82, 87, 86, 85, 84,
                  0, 11, 12, 13, 14, 15, 16, 17,
                  267, 302, 268, 312, 317, 316, 315, 314,
                  269, 303, 271, 311, 402, 403, 404, 405,
                  270, 304, 272, 310, 318, 319, 320, 321,
                  409, 408, 407, 415, 324, 325, 307, 375,
                  308, 292, 306, 291]

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_in)
    if image is None:
        raise ValueError(f"No se pudo leer la imagen: {image_in!r}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    with mp.solutions.face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5) as face_mesh:
        results = face_mesh.process(image_rgb)

    # Without a detected face there are no landmarks to build features from.
    if not results.multi_face_landmarks:
        raise ValueError("No se detectó ningún rostro en la imagen")

    # max_num_faces=1, so the first entry is the only face.
    landmarks = results.multi_face_landmarks[0].landmark
    lip_points = [[landmarks[index].x, landmarks[index].y, landmarks[index].z]
                  for index in index_lips]

    # Centroid of the selected landmarks, computed per coordinate axis.
    centroid = np.array([np.mean(axis_values) for axis_values in zip(*lip_points)])

    # Feature vector: Euclidean distance from each landmark to the centroid.
    distances = [np.linalg.norm(np.array(point) - centroid) for point in lip_points]

    # Single-row frame: one sample with one column per landmark distance.
    features = pd.DataFrame([distances])
    prediccion = rf_classifier.predict(features)

    switch = {
        1: "Vocal A",
        2: "Vocal E",
        3: "Vocal I",
        4: "Vocal O",
        5: "Vocal U"
    }

    return switch.get(prediccion[0])

In [2]:
def lip_recognition_2(image_input):
    """Predict the vowel shown in a face image using the model in 'rf_classifier.pkl'.

    The file is read into memory and decoded with OpenCV (no temporary file
    is written), run through MediaPipe Face Mesh, and 80 selected landmarks
    are reduced to their distances from the landmarks' centroid. That
    80-value feature row is then classified by the loaded model.

    Args:
        image_input: Path of the image file to classify.

    Returns:
        One of "Vocal A" ... "Vocal U", or None when the classifier returns a
        label outside 1-5.

    Raises:
        FileNotFoundError: If `image_input` does not exist (raised by `open`).
        ValueError: If the file is empty, cannot be decoded as an image, or no
            face is detected in it.
    """
    # Read the raw bytes first so a bad path fails before any heavy import or model load.
    with open(image_input, "rb") as img_file:
        image_bytes = img_file.read()
    if not image_bytes:
        raise ValueError(f"El archivo de imagen está vacío: {image_input!r}")

    import joblib
    import cv2
    import mediapipe as mp
    import numpy as np
    import pandas as pd

    # Load the previously trained model.
    # NOTE(review): joblib.load unpickles the file; only load model files you trust.
    rf_classifier = joblib.load('rf_classifier.pkl')

    # Face Mesh landmark indices used as features (80 entries).
    index_lips = [61, 76, 62, 78,
                  185, 184, 183, 191, 95, 96, 77, 146,
                  40, 74, 42, 80, 88, 89, 90, 91,
                  39, 73, 41, 81, 178, 179, 180, 181,
                  37, 72, 38, 82, 87, 86, 85, 84,
                  0, 11, 12, 13, 14, 15, 16, 17,
                  267, 302, 268, 312, 317, 316, 315, 314,
                  269, 303, 271, 311, 402, 403, 404, 405,
                  270, 304, 272, 310, 318, 319, 320, 321,
                  409, 408, 407, 415, 324, 325, 307, 375,
                  308, 292, 306, 291]

    # Decode in memory: avoids the lossy JPEG re-encode and the shared
    # fixed-name temporary file. imdecode returns None when decoding fails.
    image = cv2.imdecode(np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"No se pudo decodificar la imagen: {image_input!r}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Process the image with MediaPipe Face Mesh.
    with mp.solutions.face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5) as face_mesh:
        results = face_mesh.process(image_rgb)

    # Without a detected face there are no landmarks to build features from.
    if not results.multi_face_landmarks:
        raise ValueError("No se detectó ningún rostro en la imagen")

    # max_num_faces=1, so the first entry is the only face.
    landmarks = results.multi_face_landmarks[0].landmark
    lip_points = [[landmarks[index].x, landmarks[index].y, landmarks[index].z]
                  for index in index_lips]

    # Centroid of the selected landmarks, computed per coordinate axis.
    centroid = np.array([np.mean(axis_values) for axis_values in zip(*lip_points)])

    # Feature vector: Euclidean distance from each landmark to the centroid.
    distances = [np.linalg.norm(np.array(point) - centroid) for point in lip_points]

    # Single-row frame: one sample with one column per landmark distance.
    lip_info_df = pd.DataFrame([distances])

    # Predict with the loaded model.
    prediccion = rf_classifier.predict(lip_info_df)

    # Map the predicted class label to a vowel name.
    switch = {
        1: "Vocal A",
        2: "Vocal E",
        3: "Vocal I",
        4: "Vocal O",
        5: "Vocal U"
    }

    return switch.get(prediccion[0])


In [9]:
# Smoke test: run the random-forest pipeline on a single sample image.
# Despite its name, `lip_info` receives the function's return value (a label
# string such as "Vocal A", or None for an unmapped class), not landmark data.
lip_info = lip_recognition_1('data/img/U/scene00001.jpg')
print(lip_info)

Vocal E


In [8]:
import gradio as gr

# Gradio UI for the random-forest classifier: the uploaded image is handed to
# lip_recognition_1 as a file path (type="filepath") and the returned label is
# rendered as text.
demo = gr.Interface(
    fn=lip_recognition_1,
    inputs=gr.components.Image(height=300, width=300, type="filepath", label="Input Image"),
    outputs="text",
    title="Clasificador de Vocales",
    description="Carga una imagen y el modelo predecirá su vocal."
)

# Start the local web server for the interface.
demo.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




In [70]:
from keras.models import load_model
import tensorflow as tf

# Load the saved Keras model from disk.
# custom_objects makes the name 'softmax_v2' resolve to tf.nn.softmax while the
# model is being deserialized.
loaded_model = load_model("vrdp.keras", custom_objects={'softmax_v2': tf.nn.softmax})

In [71]:
# Attribute access only: this displays the bound `predict` method and does not
# run any inference.
loaded_model.predict

<bound method TensorFlowTrainer.predict of <Sequential name=sequential_9, built=True>>

In [72]:
import pandas as pd
# Load the tabular data used below for a manual prediction check.
df = pd.read_csv('df_2.csv')

In [73]:
# Show the first row of `df` as a one-row DataFrame
# (Series -> one-column frame -> transposed).
pd.DataFrame(df.iloc[0]).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,0.099192,0.092959,0.086554,0.083164,0.089762,0.082877,0.075147,0.070456,0.06995,0.076266,...,0.070576,0.065389,0.072129,0.078125,0.084359,0.090847,0.0807,0.084646,0.09149,0.098096


In [74]:
# Run the loaded Keras model on the first row of `df`, shaped as a one-row frame.
prediccion = loaded_model.predict(pd.DataFrame(df.iloc[0]).transpose())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step


In [75]:
# Vowel labels; presumably listed in the order of the model's output units — TODO confirm.
class_names = ['A', 'E', 'I', 'O', 'U']

In [99]:
def lip_r_dl(image_in):
    """Run the Keras model in 'vrdp.keras' on the lip features of a face image.

    The image is run through MediaPipe Face Mesh, 80 selected landmarks are
    reduced to their distances from the landmarks' centroid, and that
    80-value feature row is passed to the model's `predict`.

    Args:
        image_in: Path of the image file to classify.

    Returns:
        The value returned by the model's `predict` for the single feature
        row, unchanged. Presumably one row of per-vowel scores in the order
        A, E, I, O, U — TODO confirm against the training code.

    Raises:
        FileNotFoundError: If `image_in` is empty or does not point to a file.
        ValueError: If the file cannot be decoded as an image or no face is
            detected in it.
    """
    import os

    # Fail fast, before loading the model or MediaPipe, when there is nothing to read.
    if not image_in or not os.path.isfile(image_in):
        raise FileNotFoundError(f"No se encontró la imagen: {image_in!r}")

    import cv2
    import mediapipe as mp
    import numpy as np
    import pandas as pd
    from keras.models import load_model
    import tensorflow as tf

    # custom_objects makes the name 'softmax_v2' resolve to tf.nn.softmax on load.
    vrdp_classifier = load_model("vrdp.keras", custom_objects={'softmax_v2': tf.nn.softmax})

    # Face Mesh landmark indices used as features (80 entries).
    index_lips = [61, 76, 62, 78,
                  185, 184, 183, 191, 95, 96, 77, 146,
                  40, 74, 42, 80, 88, 89, 90, 91,
                  39, 73, 41, 81, 178, 179, 180, 181,
                  37, 72, 38, 82, 87, 86, 85, 84,
                  0, 11, 12, 13, 14, 15, 16, 17,
                  267, 302, 268, 312, 317, 316, 315, 314,
                  269, 303, 271, 311, 402, 403, 404, 405,
                  270, 304, 272, 310, 318, 319, 320, 321,
                  409, 408, 407, 415, 324, 325, 307, 375,
                  308, 292, 306, 291]

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_in)
    if image is None:
        raise ValueError(f"No se pudo leer la imagen: {image_in!r}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    with mp.solutions.face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5) as face_mesh:
        results = face_mesh.process(image_rgb)

    # Without a detected face there are no landmarks to build features from.
    if not results.multi_face_landmarks:
        raise ValueError("No se detectó ningún rostro en la imagen")

    # max_num_faces=1, so the first entry is the only face.
    landmarks = results.multi_face_landmarks[0].landmark
    lip_points = [[landmarks[index].x, landmarks[index].y, landmarks[index].z]
                  for index in index_lips]

    # Centroid of the selected landmarks, computed per coordinate axis.
    centroid = np.array([np.mean(axis_values) for axis_values in zip(*lip_points)])

    # Feature vector: Euclidean distance from each landmark to the centroid.
    distances = [np.linalg.norm(np.array(point) - centroid) for point in lip_points]

    # Single-row frame: one sample with one column per landmark distance.
    features = pd.DataFrame([distances])
    prediccion = vrdp_classifier.predict(features)

    # The raw prediction is returned on purpose: other cells apply np.argmax to it.
    return prediccion

In [103]:
# Run the Keras pipeline on one sample image. Despite its name, `lip_info`
# holds the value returned by lip_r_dl (the model's prediction output).
lip_info = lip_r_dl('data/img/I/scene00001.jpg')
print(lip_info)

  trackable.load_own_variables(weights_store.get(inner_path))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[[0.7835485  0.0787283  0.01461062 0.11100323 0.01210934]]


In [96]:
import numpy as np
vocales = ['A', 'E', 'I', 'O', 'U']
# Find the position of the highest value in the input array.
# np.argmax without an axis works on the flattened array. `lip_info` is not
# defined in this cell and must already exist in the kernel.
posicion_maxima = np.argmax(np.array(lip_info))

# Look up the vowel at that position.
vocal_maxima = vocales[posicion_maxima]

# Output
print(vocal_maxima)

A


In [92]:
import gradio as gr

def _lip_r_dl_scores(image_path):
    """Adapt lip_r_dl's raw output to the {label: confidence} mapping gr.Label expects.

    Takes the first (only) row of the prediction and pairs each value with a
    vowel. The A, E, I, O, U order follows the `vocales` list used elsewhere
    in this notebook — TODO confirm it matches the model's training labels.
    """
    scores = lip_r_dl(image_path)[0]
    return {vowel: float(score) for vowel, score in zip(['A', 'E', 'I', 'O', 'U'], scores)}


# Gradio UI for the Keras classifier. The upload reaches the function as a file
# path, and the per-vowel scores are shown as a ranked label instead of the
# raw array text.
demo = gr.Interface(
    fn=_lip_r_dl_scores,
    inputs=gr.components.Image(type="filepath", label="Input Image"),
    outputs=gr.Label(num_top_classes=5),
    title="Clasificador de Vocales",
    description="Carga una imagen y el modelo predecirá su clase."
)

# Start the local web server for the interface.
demo.launch()

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.




In [5]:
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import tensorflow as tf
import mediapipe as mp
import pandas as pd
import pathlib

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [6]:
# Despite the name, `dataset_url` is a relative local directory, not a URL.
dataset_url = "data/img"
data_dir = pathlib.Path(dataset_url)
# Bare expression so the notebook displays the resulting path.
data_dir

WindowsPath('data/img')

In [7]:
# List every entry under the 'E' subfolder and print the first one as a quick check.
E = list(data_dir.glob('E/*'))
print(E[0])

data\img\E\scene00001.jpg


In [8]:
# Target image size and batch size used when building the datasets.
img_height,img_width=180,180
batch_size=32
# Training subset: validation_split=0.2 reserves 20% of the files for validation.
# The validation dataset must use the same seed and split so the two subsets
# do not overlap.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="training",
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size)

Found 717 files belonging to 5 classes.
Using 574 files for training.


In [9]:
# Validation subset: same directory, split fraction and seed as the training
# dataset, with subset="validation" selecting the held-out 20%.
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
  data_dir,
  validation_split=0.2,
  subset="validation",
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size)

Found 717 files belonging to 5 classes.
Using 143 files for validation.


In [10]:
# Class labels as exposed by the training dataset object.
class_names = train_ds.class_names
print(class_names)

['A', 'E', 'I', 'O', 'U']


In [11]:
# Number of output units of the final layer.
num_classes = 5

# Small CNN: three Conv2D + MaxPooling2D stages, then Flatten and a dense head
# ending in a softmax over `num_classes`.
# NOTE(review): there is no rescaling/normalization layer, so pixel values are
# consumed exactly as the dataset yields them — confirm this is intended.
model = Sequential([
  keras.Input(shape=(180, 180, 3)),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(64, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(num_classes,activation='softmax')
])

In [12]:
from keras.losses import sparse_categorical_crossentropy
# Adam optimizer with sparse categorical cross-entropy as the loss; accuracy is
# tracked as the only extra metric.
model.compile(optimizer='adam',
              loss=sparse_categorical_crossentropy,
              metrics=['accuracy'])

In [13]:
# Train for `epochs` passes over train_ds, using val_ds as validation data.
# The value returned by fit is kept in `history`.
epochs=20
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs
)

Epoch 1/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 186ms/step - accuracy: 0.2293 - loss: 127.1779 - val_accuracy: 0.4406 - val_loss: 1.4168
Epoch 2/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 170ms/step - accuracy: 0.5470 - loss: 1.1805 - val_accuracy: 0.5944 - val_loss: 0.9134
Epoch 3/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 171ms/step - accuracy: 0.8192 - loss: 0.5368 - val_accuracy: 0.6923 - val_loss: 0.8747
Epoch 4/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 170ms/step - accuracy: 0.8972 - loss: 0.3022 - val_accuracy: 0.8462 - val_loss: 0.3669
Epoch 5/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 173ms/step - accuracy: 0.9955 - loss: 0.0586 - val_accuracy: 0.9371 - val_loss: 0.1843
Epoch 6/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 171ms/step - accuracy: 0.9914 - loss: 0.0372 - val_accuracy: 0.9441 - val_loss: 0.1600
Epoch 7/20
[1m18/18[0m 

In [14]:
def predict_image(img):
    """Classify an image file with the CNN held in the notebook-level `model`.

    Args:
        img: Path of the image file to classify.

    Returns:
        dict mapping each entry of the notebook-level `class_names` to the
        model's score for it, as a Python float.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    import cv2

    # cv2.imread signals failure by returning None instead of raising.
    bgr_image = cv2.imread(img)
    if bgr_image is None:
        raise ValueError(f"No se pudo leer la imagen: {img!r}")

    # OpenCV loads images as BGR, while the training datasets were built with
    # image_dataset_from_directory (RGB by default). Convert so inference sees
    # the same channel order as training.
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # Match the model's 180x180 input and add the leading batch dimension.
    resized = cv2.resize(rgb_image, (180, 180))
    batch = np.expand_dims(resized, axis=0)

    prediction = model.predict(batch)[0]
    return {name: float(score) for name, score in zip(class_names, prediction)}

In [15]:
# Serve the CNN through Gradio: predict_image receives the upload as a file
# path and gr.Label displays the returned {class: score} mapping.
gr.Interface(fn=predict_image, inputs=gr.Image(height=180, width=180, type="filepath", label="Input Image"), outputs=gr.Label(num_top_classes=5)).launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step


In [16]:
# These metrics are computed on the training split, not on a held-out test
# set, so they are labelled as training metrics.
# score[0] is the loss and score[1] the accuracy metric set in model.compile.
score = model.evaluate(train_ds, verbose=0)
print("Train loss:", score[0])
print("Train accuracy:", score[1])

Test loss: 6.749868771294132e-05
Test accuracy: 1.0


In [17]:
# These metrics are computed on the validation split (the same data passed as
# validation_data during fit), so they are labelled as validation metrics.
# score[0] is the loss and score[1] the accuracy metric set in model.compile.
score = model.evaluate(val_ds, verbose=0)
print("Validation loss:", score[0])
print("Validation accuracy:", score[1])

Test loss: 0.09293024986982346
Test accuracy: 0.9650349617004395
