In [27]:
!pip install pandas
!pip install torch
!pip install torchvision
!pip install scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [scikit-learn]0m [32m3/4[0m 

In [1]:
import os
import json
import ast
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset
import torchvision.transforms as T
import pandas as pd

# Função para limpar o campo 'image' no DataFrame
def clean_img_name(name):
    # Se for string no formato de lista (ex: "['nome.jpg']"), converte para lista e pega o primeiro item
    if isinstance(name, str) and name.startswith('[') and name.endswith(']'):
        try:
            parsed = ast.literal_eval(name)
            if isinstance(parsed, list) and len(parsed) > 0:
                return parsed[0]
        except:
            pass
    return name

# Carregar CSV e limpar nomes das imagens
df = pd.read_csv('../dados/dataset_pose.csv')
df['image'] = df['image'].apply(clean_img_name)

# Converter DataFrame em lista de dicionários
samples = df.to_dict(orient='records')

# Mapear ações para índices
actions = sorted(set(s['action'] for s in samples))
action_to_idx = {a: i for i, a in enumerate(actions)}


In [2]:

def collate_fn_pad(batch):
    """
    batch: lista de tuplas (image, keypoints_tensor, label)
    Faz padding dos keypoints para o tamanho do maior do batch.
    """
    images = [item[0] for item in batch]
    keypoints = [item[1] for item in batch]
    labels = [item[2] for item in batch]
    
    # Empilha imagens normalmente (mesmo shape)
    images = torch.stack(images)
    
    # Pega o tamanho máximo de keypoints no batch
    max_kps = max(kp.shape[0] for kp in keypoints)
    
    # Padding dos keypoints com zeros
    padded_kps = []
    for kp in keypoints:
        pad_size = max_kps - kp.shape[0]
        if pad_size > 0:
            padding = torch.zeros((pad_size, 3), dtype=torch.float32)
            kp_padded = torch.cat([kp, padding], dim=0)
        else:
            kp_padded = kp
        padded_kps.append(kp_padded)
    
    keypoints_batch = torch.stack(padded_kps)
    labels = torch.tensor(labels, dtype=torch.long)
    
    return images, keypoints_batch, labels

In [72]:
class PoseDataset(Dataset):
    def __init__(self, data, images_dir, img_key='image', keypoints_key='keypoints', label_key='action',
                 image_size=(256, 256), transform=None):
     
        self.data = data
        self.images_dir = images_dir
        self.img_key = img_key
        self.keypoints_key = keypoints_key
        self.label_key = label_key
        self.image_size = image_size
        self.transform = transform
        
        # Se não passar transform, cria padrão de redimensionar e converter em tensor
        if self.transform is None:
            self.transform = T.Compose([
                T.Resize(self.image_size),
                T.ToTensor(),
            ])
    
    def __len__(self):
        return len(self.data)
    

    
    def __getitem__(self, idx):
        row = self.data[idx]
    
        img_filename = row[self.img_key]
        if isinstance(img_filename, list):
            img_filename = img_filename[0]
        if isinstance(img_filename, str) and img_filename.startswith("[") and img_filename.endswith("]"):
            img_filename = img_filename.strip("[]").replace("'", "").replace('"', "").strip()
    
        img_path = os.path.join(self.images_dir, img_filename)
    
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)
    
        keypoints = row[self.keypoints_key]
    
        if isinstance(keypoints, str):
            keypoints = json.loads(keypoints)
    
        if isinstance(keypoints, list) and isinstance(keypoints[0], dict):
            keypoints = [[kp['x'], kp['y']] for kp in keypoints]
        keypoints = np.array(keypoints)
        if keypoints.shape[1] == 2:
            # Adiciona uma terceira coluna com 1 (visibilidade = 1, por exemplo)
            vis = np.ones((keypoints.shape[0], 1))
            keypoints = np.hstack((keypoints, vis))
        keypoints = torch.tensor(keypoints, dtype=torch.float32)
    
        label = row[self.label_key]
        
        # Se label for string numérica
        try:
            label = int(label)
        except:
            # se for categoria textual, faça seu mapeamento aqui
            label_map = {'fishing and hunting': 0, 'occupation': 1, 'sports': 2}  # ajuste conforme seu caso
            label = label_map[label]
        
        label = torch.tensor(label, dtype=torch.long)
        

        max_kps = 16
        if keypoints.shape[0] < max_kps:
            pad_size = max_kps - keypoints.shape[0]
            padding = torch.zeros(pad_size, keypoints.shape[1])  # assume (N,3)
            keypoints = torch.cat([keypoints, padding], dim=0)
        elif keypoints.shape[0] > max_kps:
            keypoints = keypoints[:max_kps]
    
        return image, keypoints, label

In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# 1. Ler CSV
df = pd.read_csv('../dados/dataset_pose.csv')

# 2. Criar mapeamento das ações para índices
actions = sorted(df['action'].unique())
action_to_idx = {a: i for i, a in enumerate(actions)}

# 3. Mapear coluna 'action' para numérico
df['action_idx'] = df['action'].map(action_to_idx)  # Usa a coluna esperada pela classe

# 4. Separar treino/val (80/20), estratificado pela label
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['action_idx'], random_state=42)

# 5. Converter DataFrame para lista de dicts (records)
train_samples = train_df.to_dict('records')
val_samples = val_df.to_dict('records')


In [74]:
train_dataset = PoseDataset(train_samples, images_dir='../dados/mpii_human_pose_v1/images')
val_dataset = PoseDataset(val_samples, images_dir='../dados/mpii_human_pose_v1/images')


In [75]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2, collate_fn=collate_fn_pad)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2, collate_fn=collate_fn_pad)


In [68]:
import torch.nn as nn
import torch.nn.functional as F

class PoseActionCNN(nn.Module):
    def __init__(self, num_keypoints=16, num_classes=3):
        super(PoseActionCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),  # (B, 32, 256, 256)
            nn.ReLU(),
            nn.MaxPool2d(2),                # (B, 32, 128, 128)

            nn.Conv2d(32, 64, 3, padding=1),# (B, 64, 128, 128)
            nn.ReLU(),
            nn.MaxPool2d(2),                # (B, 64, 64, 64)

            nn.Conv2d(64, 128, 3, padding=1),# (B, 128, 64, 64)
            nn.ReLU(),
            nn.MaxPool2d(2),                # (B, 128, 32, 32)
        )

        self.flatten = nn.Flatten()

        self.fc = nn.Sequential(
            nn.Linear(128 * 32 * 32, 512),
            nn.ReLU()
        )

        self.fc_keypoints = nn.Linear(512, num_keypoints * 2)
        self.fc_action = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.flatten(x)
        x = self.fc(x)

        keypoints_out = self.fc_keypoints(x)
        action_out = self.fc_action(x)

        return keypoints_out, action_out

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = PoseActionCNN(num_keypoints=16, num_classes=3).to(device)

mse_loss = nn.MSELoss()
ce_loss = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_one_epoch(model, dataloader, optimizer, mse_loss, ce_loss, device):
    model.train()
    total_loss = 0

    for images, keypoints, labels in dataloader:
        images = images.to(device)
        keypoints = keypoints[:, :, :2].reshape(images.size(0), -1).to(device)  # (B, 16*2)
        labels = labels.to(device)

        pred_kps, pred_action = model(images)
        
        loss_kps = mse_loss(pred_kps, keypoints)
        loss_action = ce_loss(pred_action, labels)
        loss = loss_kps + loss_action

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def validate(model, dataloader, mse_loss, ce_loss, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, keypoints, labels in dataloader:
            images = images.to(device)
            keypoints = keypoints[:, :, :2].reshape(images.size(0), -1).to(device)
            labels = labels.to(device)

            pred_kps, pred_action = model(images)

            loss_kps = mse_loss(pred_kps, keypoints)
            loss_action = ce_loss(pred_action, labels)
            loss = loss_kps + loss_action

            total_loss += loss.item()

            preds = pred_action.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total
    return total_loss / len(dataloader), acc
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer, mse_loss, ce_loss, device)
    val_loss, val_acc = validate(model, val_loader, mse_loss, ce_loss, device)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")


In [77]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np

# Classe modelo adaptada para filtro variável (parâmetro)
class PoseActionCNN(nn.Module):
    def __init__(self, num_keypoints=16, num_classes=3, base_filters=32):
        super(PoseActionCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, base_filters, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(base_filters, base_filters*2, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(base_filters*2, base_filters*4, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.flatten = nn.Flatten()
        # calcula o tamanho de entrada para fc de forma dinâmica (assumindo 256x256 input)
        fc_input_size = base_filters*4 * 32 * 32

        self.fc = nn.Sequential(
            nn.Linear(fc_input_size, 512),
            nn.ReLU()
        )
        self.fc_keypoints = nn.Linear(512, num_keypoints * 2)
        self.fc_action = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.flatten(x)
        x = self.fc(x)

        keypoints_out = self.fc_keypoints(x)
        action_out = self.fc_action(x)
        return keypoints_out, action_out

# Funções train_one_epoch e validate continuam iguais (copie a sua)

def train_one_epoch(model, dataloader, optimizer, mse_loss, ce_loss, device):
    model.train()
    total_loss = 0
    for images, keypoints, labels in dataloader:
        images = images.to(device)
        keypoints = keypoints[:, :, :2].reshape(images.size(0), -1).to(device)
        labels = labels.to(device)

        pred_kps, pred_action = model(images)

        loss_kps = mse_loss(pred_kps, keypoints)
        loss_action = ce_loss(pred_action, labels)
        loss = loss_kps + loss_action

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

def validate(model, dataloader, mse_loss, ce_loss, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, keypoints, labels in dataloader:
            images = images.to(device)
            keypoints = keypoints[:, :, :2].reshape(images.size(0), -1).to(device)
            labels = labels.to(device)

            pred_kps, pred_action = model(images)

            loss_kps = mse_loss(pred_kps, keypoints)
            loss_action = ce_loss(pred_action, labels)
            loss = loss_kps + loss_action

            total_loss += loss.item()
            preds = pred_action.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total
    return total_loss / len(dataloader), acc

# --- Validação cruzada com busca em hiperparâmetros ---

from torch.utils.data import Subset, DataLoader

def cross_validate(dataset, device, k=5, epochs=10, batch_size=32, param_grid=None):
    if param_grid is None:
        param_grid = [
            {'lr': 1e-3, 'base_filters': 32},
            {'lr': 1e-3, 'base_filters': 64},
            {'lr': 5e-4, 'base_filters': 32},
        ]

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    results = []

    for params in param_grid:
        print(f"\nTestando parâmetros: lr={params['lr']}, base_filters={params['base_filters']}")
        fold_losses = []
        fold_accuracies = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
            print(f"Fold {fold + 1}/{k}")

            train_subset = Subset(dataset, train_idx)
            val_subset = Subset(dataset, val_idx)

            train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=4)
            val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, num_workers=4)

            model = PoseActionCNN(num_keypoints=16, num_classes=3, base_filters=params['base_filters']).to(device)
            optimizer = optim.Adam(model.parameters(), lr=params['lr'])
            mse_loss = nn.MSELoss()
            ce_loss = nn.CrossEntropyLoss()

            for epoch in range(epochs):
                train_loss = train_one_epoch(model, train_loader, optimizer, mse_loss, ce_loss, device)
                val_loss, val_acc = validate(model, val_loader, mse_loss, ce_loss, device)
                print(f"Epoch {epoch+1}/{epochs} - Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

            fold_losses.append(val_loss)
            fold_accuracies.append(val_acc)

        avg_loss = np.mean(fold_losses)
        avg_acc = np.mean(fold_accuracies)
        print(f"Parâmetros lr={params['lr']}, base_filters={params['base_filters']} | "
              f"Média Val Loss: {avg_loss:.4f} | Média Val Acc: {avg_acc:.4f}")

        results.append({'params': params, 'val_loss': avg_loss, 'val_acc': avg_acc})

    return results


# Exemplo de uso



In [None]:
results = cross_validate(train_dataset, device=device, k=5, epochs=10, batch_size=32)


Testando parâmetros: lr=0.001, base_filters=32
Fold 1/5
Epoch 1/10 - Val Loss: 116017.0820 | Val Acc: 0.3958
Epoch 2/10 - Val Loss: 108775.4414 | Val Acc: 0.3958
Epoch 3/10 - Val Loss: 103752.8438 | Val Acc: 0.3333
Epoch 4/10 - Val Loss: 92876.3613 | Val Acc: 0.2708
Epoch 5/10 - Val Loss: 86701.7480 | Val Acc: 0.3958
Epoch 6/10 - Val Loss: 84826.5605 | Val Acc: 0.3958
Epoch 7/10 - Val Loss: 88343.0664 | Val Acc: 0.3958
Epoch 8/10 - Val Loss: 84034.5781 | Val Acc: 0.3333
Epoch 9/10 - Val Loss: 87097.4961 | Val Acc: 0.3958
Epoch 10/10 - Val Loss: 84223.8789 | Val Acc: 0.3333
Fold 2/5
Epoch 1/10 - Val Loss: 74108.2109 | Val Acc: 0.2708
Epoch 2/10 - Val Loss: 82902.1289 | Val Acc: 0.3333
Epoch 3/10 - Val Loss: 76706.7773 | Val Acc: 0.3958
Epoch 4/10 - Val Loss: 61048.3770 | Val Acc: 0.2708
Epoch 5/10 - Val Loss: 61149.6562 | Val Acc: 0.2708
Epoch 6/10 - Val Loss: 56459.2402 | Val Acc: 0.3333
Epoch 7/10 - Val Loss: 58045.3906 | Val Acc: 0.2917
Epoch 8/10 - Val Loss: 54967.0059 | Val Acc: 0