### 使用holistic获取人体关键点位置

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import mediapipe.python.solutions as sol
from IPython.display import clear_output

def draw_styled_landmarks(image, results):
    # Draw face connections
    sol.drawing_utils.draw_landmarks(image, results.face_landmarks, sol.holistic.FACEMESH_TESSELATION,
                              landmark_drawing_spec=None,
                              connection_drawing_spec=sol.drawing_styles.get_default_face_mesh_tesselation_style())
    sol.drawing_utils.draw_landmarks(image, results.face_landmarks, sol.holistic.FACEMESH_CONTOURS,
                              landmark_drawing_spec=None,
                              connection_drawing_spec=sol.drawing_styles.get_default_face_mesh_contours_style()) 
    # Draw pose connections
    sol.drawing_utils.draw_landmarks(image, results.pose_landmarks, sol.holistic.POSE_CONNECTIONS,
                              sol.drawing_styles.get_default_pose_landmarks_style()) 
    # Draw left hand connections
    sol.drawing_utils.draw_landmarks(image, results.left_hand_landmarks, sol.holistic.HAND_CONNECTIONS,
                             sol.drawing_styles.get_default_hand_landmarks_style(),sol.drawing_styles.get_default_hand_connections_style()) 
    # Draw right hand connections  
    sol.drawing_utils.draw_landmarks(image, results.right_hand_landmarks, sol.holistic.HAND_CONNECTIONS,
                              sol.drawing_styles.get_default_hand_landmarks_style(),sol.drawing_styles.get_default_hand_connections_style()) 
def extract_landmarks(x):
    result=[]
    if not x.pose_landmarks is None:
        a=x.pose_landmarks.landmark
        for i in range(len(a)):
            result.append([a[i].x,a[i].y,a[i].z])
    else:
        result+=[[0,0,0]]*33
    if not x.left_hand_landmarks is None:
        a=x.left_hand_landmarks.landmark
        for i in range(len(a)):
            result.append([a[i].x,a[i].y,a[i].z])
    else:
        result+=[[0,0,0]]*21
    if not x.right_hand_landmarks is None:
        a=x.right_hand_landmarks.landmark
        for i in range(len(a)):
            result.append([a[i].x,a[i].y,a[i].z])
    else:
        result+=[[0,0,0]]*21
    if not x.face_landmarks is None:
        a=x.face_landmarks.landmark
        for i in range(len(a)):
            result.append([a[i].x,a[i].y,a[i].z])
    else:
        result+=[[0,0,0]]*468
    assert len(result)==543
    return result
    
def start_listen(detect):
    camera=cv2.VideoCapture(0,cv2.CAP_DSHOW)
    camera.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
    camera.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
    camera.set(cv2.CAP_PROP_FPS,60)
    with sol.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=2) as holistic:
        seq=[]
        while camera.isOpened():
            ret, frame = camera.read()
            clear_output(wait=True)
            frame=frame[:,::-1,:]
            
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
            image.flags.writeable = False                  # Image is no longer writeable
            results = holistic.process(image)                 # Make prediction
            image.flags.writeable = True                   # Image is now writeable 
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    
            draw_styled_landmarks(image, results)
            image=detect(image,extract_landmarks(results))
            
            cv2.imshow('OpenCV Feed', image)
    
            if cv2.waitKey(20) & 0xFF == ord('q'):
                break
        camera.release()
        cv2.destroyAllWindows()

In [4]:
# train_dir="D:\\sjtu\\project\\asl_alphabet_train\\asl_alphabet_train"
train_dir="D:\\sjtu\\project\\dataset\\data\\"
# test_dir="D:\\sjtu\\project\\asl_alphabet_test\\asl_alphabet_test"
train_detect="D:\\sjtu\\project\\dataset\\data_detect\\"
# train_detect="D:\\sjtu\\project\\asl_alphabet_train\\asl_alphabet_train_detect"
token_list=[]

import os
import cv2 as cv
import mediapipe as mp
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
# from mediapipe.tasks import python
# from mediapipe.tasks.python import vision
from tqdm import tqdm

HandLandmarkerResult = mp.tasks.vision.HandLandmarkerResult
hand_path='D:\\sjtu\\project\\hand_landmarker.task'
hand_file=open(hand_path,"rb")
hand_data=hand_file.read()
hand_file.close()
base_options=mp.tasks.BaseOptions(model_asset_buffer=hand_data)
options=mp.tasks.vision.HandLandmarkerOptions(base_options=base_options,num_hands=1)
hand_detector=mp.tasks.vision.HandLandmarker.create_from_options(options)

MARGIN = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green
def draw_hand_landmarks_on_image(rgb_image, detection_result):
    hand_landmarks_list = detection_result.hand_landmarks
    handedness_list = detection_result.handedness
    annotated_image = np.copy(rgb_image)
    # Loop through the detected hands to visualize.
    for idx in range(len(hand_landmarks_list)):
        hand_landmarks = hand_landmarks_list[idx]
        handedness = handedness_list[idx]
        # Draw the hand landmarks.
        hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        hand_landmarks_proto.landmark.extend([
        landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
        ])
        solutions.drawing_utils.draw_landmarks(
        annotated_image,
        hand_landmarks_proto,
        solutions.hands.HAND_CONNECTIONS,
        solutions.drawing_styles.get_default_hand_landmarks_style(),
        solutions.drawing_styles.get_default_hand_connections_style())
        # Get the top left corner of the detected hand's bounding box.
        height, width, _ = annotated_image.shape
        x_coordinates = [landmark.x for landmark in hand_landmarks]
        y_coordinates = [landmark.y for landmark in hand_landmarks]
        text_x = int(min(x_coordinates) * width)
        text_y = int(min(y_coordinates) * height) - MARGIN
        # Draw handedness (left or right hand) on the image.
        cv2.putText(annotated_image, f"{handedness[0].category_name}",
                    (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                    FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)
    return annotated_image
if not os.path.exists(train_detect):
    os.mkdir(train_detect)
token_list=sorted(os.listdir(train_dir))
print(token_list)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'space']


### 数据集收集

In [138]:
for i in range(len(token_list)):
    token=token_list[i]
    camera=cv2.VideoCapture(0,cv2.CAP_DSHOW)
    camera.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
    camera.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)
    camera.set(cv2.CAP_PROP_FPS,60)
    cnt=0
    data=[]
    with sol.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5, model_complexity=2) as holistic:
        seq=[]
        while camera.isOpened():
            ret, frame = camera.read()
            if not ret:
                continue
            clear_output(wait=True)
            frame=frame[:,::-1,:]
            
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
            image.flags.writeable = False                  # Image is no longer writeable
            results = holistic.process(image)                 # Make prediction
            image.flags.writeable = True                   # Image is now writeable 
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
            cv.putText(image,f"Token: {token} cnt {cnt}",(100,100),cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),2)
            
            draw_styled_landmarks(image, results)
            res=extract_landmarks(results)
            if res[33][0]!=0:
                res=res[33:54]
            else:
                res=res[54:75]                
            cv2.imshow('OpenCV Feed', image)
            a=cv2.waitKey(20)
            if a & 0xFF == ord('q'):
                break
            if a & 0xFF == ord(' '):
                cnt+=1
                data.append([i,res])
                cv2.waitKey(100)
                if cnt>=100:
                    break
        camera.release()
        cv2.destroyAllWindows()
with open(os.path.join(train_dir,"./../datas_diy.json"),"w") as f:
    f.write(str(data))

### 处理train数据  
1. 获取图片
2. 识别位点位置

In [3]:
datas=[]
for i in range(len(token_list)):
    token=token_list[i]
    path=os.path.join(train_dir,token)
    detect_path=os.path.join(train_detect,token)
    if not os.path.exists(detect_path):
        os.mkdir(detect_path)
    files=os.listdir(path)
    cnt=0
    for file in tqdm(files):
        image=cv.imread(os.path.join(path,file))
        mp_image=mp.Image(image_format=mp.ImageFormat.SRGB,data=np.array(cv.cvtColor(image,cv.COLOR_BGR2RGB)))
        result=hand_detector.detect(mp_image)
        if len(result.hand_landmarks)==0:
            continue
        points=[]
        for j in range(21):
            points.append([result.hand_landmarks[0][j].x,result.hand_landmarks[0][j].y,result.hand_landmarks[0][j].z])
        datas.append([i,points])
        image=draw_hand_landmarks_on_image(image,result)
        cv.imwrite(os.path.join(detect_path,file),image)
        cnt+=1
    print(f"image for {token}: total {len(files)}, detect {cnt}")
# print(datas)
with open(os.path.join(train_dir,"./../datas_3d.json"),"w") as f:
    f.write(str(datas))
train_datas=datas

  7%|█████▋                                                                         | 290/4000 [00:22<04:41, 13.17it/s]


KeyboardInterrupt: 

### 处理test数据
@Deprecated 现在test由train取样获得

In [48]:
datas=[]
files=os.listdir(test_dir)
cnt=0
for file in files:
    image=cv.imread(os.path.join(test_dir,file))
    mp_image=mp.Image(image_format=mp.ImageFormat.SRGB,data=np.array(cv.cvtColor(image,cv.COLOR_BGR2RGB)))
    result=hand_detector.detect(mp_image)
    if len(result.hand_landmarks)==0:
        continue
    points=[]
    for j in range(21):
        points.append([result.hand_landmarks[0][j].x,result.hand_landmarks[0][j].y,result.hand_landmarks[0][j].z])
    datas.append([i,points])
    image=draw_hand_landmarks_on_image(image,result)
    cv.imshow(f"{file.split('_')[0]}",image)
    cv.waitKey(2000)
    cv.destroyAllWindows()
    cnt+=1
print(f"test total 28, detect {cnt}")
test_datas=datas



test total 28, detect 14


### 读取数据并准备DataLoader  

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
import random
import json
import numpy as np
import random
import os

train_data=[]
with open(os.path.join(train_dir,"./../datas.json"),"r") as f:
    train_data=json.loads(f.read())

def normalize(points):
    res=[]
    maxx,maxy,minx,miny=-1,-1,1,1
    maxz,minz=-1,1
    for point in points:
        maxx=max(maxx,point[0])
        maxy=max(maxy,point[1])
        minx=min(minx,point[0])
        miny=min(miny,point[1])
        maxz=max(maxz,point[2])
        minz=min(minz,point[2])
    max_delta=max(maxx-minx,maxy-miny)
    if max_delta<=1e-5:
        return [[0,0,0]]*21
    for point in points:
        newx=(point[0]-(maxx+minx)/2)/max_delta
        newy=(point[1]-(maxy+miny)/2)/max_delta
        newz=(point[2]-(maxz+minz)/2)/(maxz-minz)
        if newx<-0.5-1e-6 or newx>0.5+1e-6 or newy<-0.5-1e-6 or newy>0.5+1e-6 or newz<-0.5-1e-6 or newz>0.5+1e-6:
            print([point[0],point[1],newx,newy,max_delta,maxx,maxy,minx,miny])
            raise ValueError
        res.append([newx,newy,newz])
    return res
train_data=[[x[0],normalize(x[1])] for x in tqdm(train_data)]
# print(train_data[0])

def rotate_points(points):
    output_rotated_points=[]
    angle=random.randint(-15,15)
    rad_angle=np.deg2rad(angle)
    rotation_matrix = np.array([[np.cos(rad_angle), -np.sin(rad_angle),0],
                                [np.sin(rad_angle), np.cos(rad_angle),0],
                               [0,0,1]])
    for point in points:
        rotated_points = np.dot(point, rotation_matrix)
        output_rotated_points.append(rotated_points.tolist())
    return output_rotated_points

def modify(x):
    normalize(rotate_points(np.array(x)))
    if random.randint(0,1)==0:
        x=[[-i[0],-i[1],i[2]] for i in x]
    return x

class myDataset(Dataset):
    def __init__(self,x):
        self.data=x
    def __getitem__(self,x):
        label=self.data[x][0]
        points=np.array(self.data[x][1])
        return label,points
    def __len__(self):
        return len(self.data)
train_data=train_data
random.shuffle(train_data)
test_data=train_data[:len(train_data)//100]
train_data=train_data[len(train_data)//100+1:]

train_dataset=myDataset(train_data)
train_loader=DataLoader(train_dataset,batch_size=64)
print(f"Loaded train data {len(train_dataset)}({len(train_loader)} batch)")

test_dataset=myDataset(test_data)
test_loader=DataLoader(test_dataset,batch_size=64)
print(f"Loaded test data {len(test_dataset)}({len(test_loader)} batch)")

100%|█████████████████████████████████████████████████████████████████████████| 86785/86785 [00:05<00:00, 15774.90it/s]


Loaded train data 85917(1343 batch)
Loaded test data 867(14 batch)


### 设计并训练模型   
手部点位总计21个，将其映射到~~29~~28个点作为模型输出

In [8]:
class SignClassifier(nn.Module):
    def __init__(self):
        super(SignClassifier,self).__init__()
        self.ff1=nn.Linear(21*3,1024)
        self.ff2=nn.Linear(1024,512)
        # self.ff3=nn.Linear(512,29)
        self.ff3=nn.Linear(512,28)
        self.relu=nn.ReLU()
        self.flatten=nn.Flatten()
        self.dropout1=nn.Dropout(0.2)
        self.dropout2=nn.Dropout(0.2)
    def forward(self,x):
        x=self.flatten(x)
        x=self.dropout1(self.relu(self.ff1(x)))
        x=self.dropout2(self.relu(self.ff2(x)))
        return self.ff3(x)

In [7]:
def train(dataloader,model,loss_fn,optimizer):
    model.train()
    pbar=tqdm(dataloader)
    for y,X in pbar:
        # print(X.shape)
        X=[modify(i) for i in X.tolist()]
        X=torch.Tensor(X).to(device)
        # print(X.shape)
        # print(X)
        y=torch.Tensor(y).to(device)
        # print(y.shape)
        # break
        pred=model(X)
        loss=loss_fn(pred,y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        pbar.set_description(f"loss: {loss.item():.6f}")
def test(dataloader,model,loss_fn):
    model.eval()
    pbar=tqdm(dataloader)
    losssum=0
    cnt=0
    acc=0
    for y,X in pbar:
        X=torch.Tensor(X).type(torch.float).to(device)
        # print(X.shape)
        y=torch.Tensor(y).to(device)
        # print(y.shape)
        # break
        pred=model(X)
        loss=loss_fn(pred,y)
        acc+=(pred.argmax(1) == y).type(torch.float).sum().item()
        losssum+=loss.item()
        cnt+=1
    print(f"loss {losssum/cnt} acc {acc/len(dataloader.dataset)}")
        
device='cuda'
model=SignClassifier()
model.to(device)
loss_fn=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=1e-4)

epoch=5

for i in range(epoch):
    print(f"Epoch {i+1}")
    train(train_loader,model,loss_fn,optimizer)  
    test(test_loader,model,loss_fn)  

Epoch 1


loss: 0.337205: 100%|██████████████████████████████████████████████████████████████| 1343/1343 [00:20<00:00, 66.65it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 257.07it/s]


loss 0.5110419669321605 acc 0.8719723183391004
Epoch 2


loss: 0.218919: 100%|██████████████████████████████████████████████████████████████| 1343/1343 [00:20<00:00, 64.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 252.98it/s]


loss 0.30631272388356073 acc 0.9284890426758939
Epoch 3


loss: 0.118539: 100%|██████████████████████████████████████████████████████████████| 1343/1343 [00:22<00:00, 59.85it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 305.36it/s]


loss 0.23159179144671985 acc 0.9504036908881199
Epoch 4


loss: 0.083978: 100%|██████████████████████████████████████████████████████████████| 1343/1343 [00:21<00:00, 63.80it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 335.23it/s]


loss 0.19141275488904544 acc 0.9550173010380623
Epoch 5


loss: 0.092011: 100%|██████████████████████████████████████████████████████████████| 1343/1343 [00:22<00:00, 58.75it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 217.48it/s]


loss 0.16612651944160461 acc 0.9573241061130334


In [8]:
torch.save(model,'PointDetect_3d.pth')

### 实时检测
通过mediapipe获取到关键点位置之后传入模型进行分类

In [13]:
model=torch.load('PointDetect_3d.pth')
device='cuda'
import math
from datetime import datetime
start_time=0
last_token="@"
sentence="@"
ACTIVATE_RATE=60/100
def detection(image,x):
    global last_token,start_time,sentence,ACTIVATE_RATE
    a=normalize(x[33:54])
    b=normalize(x[54:75])
    print(x[33:54],a)
    print(x[54:75],b)
    data=torch.Tensor([a,b]).to(device)
    # data=torch.Tensor([normalize(x)]).to(device)
    # print(data)
    print(data.shape)
    res=F.softmax(model(data))
    # print(F.softmax(res).tolist())
    for i in res[0]:
        print(f"{i:>.5f}",end=" ")
    print()
    for i in res[1]:
        print(f"{i:>.5f}",end=" ")
    print()
    a=res.argmax(1)
    for i in range(len(token_list)):
        if x[54][0]!=0:
            cv.rectangle(image,(0,25*i+25),(math.ceil(200*res[1][i]),25*i),(0,255,0),-1)
            cv.putText(image,f"{res[1][i]*100:>.3f}%",(210,25*i+25),cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),2)
        cv.putText(image,f"Token {token_list[i]}",(0,25*i+25),cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),2)
    # for i in range(len(token_list)):
    #     cv.rectangle(image,(1300,25*i+25),(math.ceil(200*res[0][i])+1300,25*i),(0,255,0),-1)
    #     cv.putText(image,f"Token {token_list[i]}",(1300,25*i+25),cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),2)
    if x[54][0]==0:
        cv.putText(image,f"{sentence[1:]}",(400,200),cv2.FONT_HERSHEY_SIMPLEX,2,(0,0,255),2)
        start_time=0
        last_token="@"
        return image
    if res[1][a[1]]>=ACTIVATE_RATE:
        now=datetime.now().timestamp()*1000
        if start_time==0 or not last_token==a[1]:
            start_time=now
            last_token=a[1]
        elif now-start_time>1000:
            if token_list[a[1]]=='space':
                sentence+='_'
            else:
                sentence+=token_list[a[1]]
            start_time=1e18
            cv.putText(image,f"Recognize: {token_list[a[1]]}",(400,100),cv2.FONT_HERSHEY_SIMPLEX,2,(0,0,255),2)
    else:
        start_time==0
    cv.putText(image,f"{sentence[1:]}",(400,200),cv2.FONT_HERSHEY_SIMPLEX,2,(0,0,255),2)
    last_rate=res[1][a[1]]
    print(token_list[a[0]])
    print(token_list[a[1]])
    return image
model.eval()
start_listen(detection)
    

[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]] [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]] [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
torch.Size([2, 21, 3])
0.00219 0.00376 0.00250 0.09469 0.00521 0.02640 0

In [17]:
start_time=0
last_token="@"
sentence="@"
start_listen(detection)

[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]] [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]] [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
torch.Size([2, 21, 3])
0.00219 0.00376 0.00250 0.09469 0.00521 0.02640 0