# Inferencia de red ResNet para clasificación de signo a texto.

---
---

# Índice.

- [Configuración](#configuración)
  - [Configuración de la red](#configuración-de-la-red)
- [Elección del modelo a inferir](#elección-del-modelo-a-inferir)
  - [Carga del modelo](#carga-del-modelo)
- [Inferencia](#inferencia)
  - [Por webcam](#por-webcam-con-pytorch)
  - [Por archivo](#desde-archivo)

## Configuración

---

In [6]:
from config.const import *
from config.torch_config import get_transform
from config.dataset import get_dataset_path
from lib.video_dataset import VideoFrameDataset


### Configuración de la red


In [7]:
# Candidate dataset folders and the saved model that goes with each one.
# Both lists are read at the same position, so keep them aligned.
DATASETS = ["WLASL/videos", "actions/frames"]
MODELS_NAME = ["WLASL_9", "actions_small"]

# Position of the dataset/model pair used by the rest of the notebook.
index = 0


In [8]:
# Resolve the data and model locations for the selected dataset/model pair.
data_path, model_path = get_dataset_path(dataset=DATASETS[index], model_name=MODELS_NAME[index])
# Transform handed to the dataset and to the webcam inference call
# (the actual preprocessing is defined in config.torch_config, not shown here).
multiple_transform = get_transform(IMAGE_SIZE)


In [9]:
# Build the video-frame dataset over the selected data folder.
# NUM_SEGMENTS, FRAMES_PER_SEGMENT and IMAGE_SIZE are not imported by name;
# presumably they come from the `config.const` star import — TODO confirm.
dataset = VideoFrameDataset(
    root_path=data_path,
    transform=multiple_transform,
    num_segments=NUM_SEGMENTS,
    frames_per_segment=FRAMES_PER_SEGMENT,
    image_size=IMAGE_SIZE,
)

# Class labels exposed by the dataset; later cells index this list to turn a
# predicted class index into a label.
classes = dataset.classes
classes  # bare expression so the notebook displays the list

['all', 'before', 'book', 'deaf', 'drink', 'help', 'no', 'walk', 'yes']

## Elección del modelo a inferir

---

### Carga del modelo

In [10]:
from torch import load

In [11]:
from torch import cuda

# With CUDA available, map_location=None keeps the previous behaviour (tensors
# are restored to the device they were saved from). Without CUDA, remap to the
# CPU so a checkpoint saved on a GPU still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
# NOTE(review): if this is a full nn.Module, confirm that eval() is called
# before inference (here or inside video_webcam_inference).
model = load(model_path, map_location=None if cuda.is_available() else "cpu")


### Carga del modelo onnx

In [12]:
import onnx

In [13]:
# ONNX export path: the checkpoint path with ".pth" replaced by ".onnx"
# (str.replace substitutes every occurrence of ".pth" in the path).
onnx_path = model_path.replace(".pth", ".onnx")
# Load the ONNX file into an in-memory model object.
onnx_model = onnx.load(onnx_path)


In [14]:
import onnxruntime as ort

# Execution providers handed to onnxruntime, CUDA listed before CPU.
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]


In [15]:
# Open an inference session on the exported ONNX model with the providers
# listed in `providers`.
ort_session = ort.InferenceSession(
    onnx_path,
    providers=providers,
)


## Inferencia

---

In [16]:
import sys

sys.path.append("../")

from common.inference import video_webcam_inference


### Con ONNX session

#### Test a random input on onnx model

In [17]:
import numpy as np


def oxx_inference(video, session, input_name="input", class_names=None):
    """Classify a video with an ONNX Runtime session and return its label.

    Args:
        video: Array fed to the model unchanged. The notebook passes float32
            arrays with a leading dimension of size 1.
        session: Object exposing ``run(output_names, input_feed)``, e.g. an
            ``onnxruntime.InferenceSession``.
        input_name: Name of the graph input that receives ``video``.
            Defaults to ``"input"``, the name that used to be hard-coded.
        class_names: Sequence mapping a class index to its label. When
            ``None`` the notebook-level ``classes`` list is used, as before.

    Returns:
        The label of the highest-scoring class for the FIRST sample of the
        batch; any further samples in the batch are ignored.
    """
    if class_names is None:
        # Backward-compatible fallback to the notebook-level label list.
        class_names = classes

    # Passing None as output names asks the session for every model output.
    outputs = session.run(None, {input_name: video})

    # outputs[0] is the first model output; [0] selects the first sample's
    # scores, whose arg-max is the predicted class index.
    return class_names[outputs[0][0].argmax(0)]


In [18]:
# Smoke test: feed random float32 noise shaped
# (1, FRAMES_PER_SEGMENT * NUM_SEGMENTS, 3, IMAGE_SIZE, IMAGE_SIZE) through
# the ONNX session. The input is unseeded, so the printed label can vary
# between runs.
target = oxx_inference(
    np.random.randn(
        1, FRAMES_PER_SEGMENT * NUM_SEGMENTS, 3, IMAGE_SIZE, IMAGE_SIZE
    ).astype(np.float32),
    ort_session,
)

print(target)


all


#### Ejemplo de video con onnx

In [19]:
from utils.loader import split_dataset

In [20]:
# Split the dataset into train/test/validation loaders with one video per batch.
# NOTE(review): presumably the share left after train (0.70) and validation
# (0.1) goes to the test loader — confirm in utils.loader.
train_loader, test_loader, validation_loader = split_dataset(
    dataset, train_split=0.70, validation_split=0.1, batch_size=1
)

# Take one batch from the training loader: the frames plus a label pair whose
# second element is not used here.
first_batch, (ground_classes, _) = next(iter(train_loader))
video = first_batch[0]  # first video of the batch
ground = classes[ground_classes[0]]  # label name for that video

In [21]:
# Classify the sampled batch with the non-quantized ONNX session; the batch is
# converted to a NumPy array before being fed to the session.
normal_target = oxx_inference(first_batch.numpy(), ort_session)

In [22]:
# Report the ONNX prediction next to the ground-truth label.
print(f"Target is {normal_target}. Ground truth is {ground}")

# Extra message when the prediction matches the label.
if normal_target == ground:
    print("Letsaaa gooo")


Target is drink. Ground truth is drink
Letsaaa gooo


### Probamos con modelo cuantizado

In [23]:
# Quantized export path: the checkpoint path with ".pth" replaced by
# "_quantized.onnx".
quant_model_path = model_path.replace(".pth", "_quantized.onnx")
# Load the quantized ONNX file into an in-memory model object.
quant_onnx_model = onnx.load(quant_model_path)


In [None]:
# Inference session for the quantized model, using the same provider list as
# the non-quantized session.
quant_ort_session = ort.InferenceSession(
    quant_model_path,
    providers=providers,
)

In [None]:
# Classify the same batch with the quantized session so both predictions can
# be compared.
quant_target = oxx_inference(first_batch.numpy(), quant_ort_session)

In [None]:
# Show the non-quantized prediction, the quantized prediction and the
# ground-truth label side by side.
print(
    f"Normal target is {normal_target}. Quant target is {quant_target}. Ground truth is {ground}"
)


Normal target is book. Quant target is book. Ground truth is book


### Por webcam con PyTorch

In [None]:
from torch import cuda

# Run live webcam inference with the PyTorch model.
# The device is no longer hard-coded to "cuda": it falls back to the CPU when
# CUDA is unavailable, matching the CUDA-then-CPU provider order of the ONNX
# sessions. With CUDA present the call is unchanged.
video_webcam_inference(
    model,
    classes,
    "cuda" if cuda.is_available() else "cpu",
    multiple_transform,
    fps_interval=NUM_SEGMENTS * FRAMES_PER_SEGMENT,
)
