# final

In [1]:
pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-rgrws3ni
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-rgrws3ni
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.8/44.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=8caea93407e14b792007b8bf2cbc3c5aa4dc20155f430be14832f6d07786e2c9
  Stored in 

In [2]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import clip
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# üîπ Chemins des fichiers
DATA_PATH = "/kaggle/input/human-motion-description-hmd-motion-to-text"
MOTIONS_PATH = os.path.join(DATA_PATH, "motions")
TEXTS_PATH = os.path.join(DATA_PATH, "texts")
train_file = os.path.join(DATA_PATH, "train.txt")
test_file = os.path.join(DATA_PATH, "test.txt")

# üîπ Chargement des IDs
print("üîÑ Chargement des IDs...")
train_ids = [line.strip() for line in open(train_file).readlines()]
test_ids = [line.strip() for line in open(test_file).readlines()]
print("‚úÖ Chargement des IDs termin√© !")

# üîπ Chargement du mod√®le CLIP
print("üîÑ Chargement du mod√®le CLIP...")
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, _ = clip.load("ViT-B/32", device=device)
clip_model.eval()
print("‚úÖ CLIP charg√© avec succ√®s !")

# üîπ Chargement du mod√®le GPT-2
print("üîÑ Chargement du mod√®le GPT-2...")
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token  # Ajout du token de padding
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
print("‚úÖ GPT-2 charg√© avec succ√®s !")

# üîπ Fonction pour g√©n√©rer les embeddings texte avec CLIP
def get_text_embedding(text):
    tokenized_text = clip.tokenize([text[:75]]).to(device)
    with torch.no_grad():
        text_embedding = clip_model.encode_text(tokenized_text)
        return text_embedding.float() / text_embedding.norm(dim=-1, keepdim=True)

# üîπ Dataset Motion + Text Embeddings
class MotionTextDataset(Dataset):
    def __init__(self, motion_ids):
        self.motion_ids = motion_ids

    def __len__(self):
        return len(self.motion_ids)

    def __getitem__(self, idx):
        motion_id = self.motion_ids[idx]
        motion = np.load(os.path.join(MOTIONS_PATH, f"{motion_id}.npy"))
        motion_tensor = torch.tensor(motion.reshape(motion.shape[0], -1), dtype=torch.float32)
        
        text_file = os.path.join(TEXTS_PATH, f"{motion_id}.txt")
        text = np.random.choice(open(text_file).readlines()).strip() if os.path.exists(text_file) else "No description available"
        text_embedding = get_text_embedding(text).squeeze()
        
        return motion_tensor, text_embedding, text

# üîπ DataLoader
train_loader = DataLoader(MotionTextDataset(train_ids), batch_size=16, shuffle=True)

# üîπ Encodeur bas√© sur LSTM
class MotionEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, latent_dim):
        super(MotionEncoder, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, latent_dim)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(hidden)

# üîπ Initialisation du mod√®le
encoder = MotionEncoder(input_size=66, hidden_size=128, latent_dim=512).to(device)
optimizer = optim.AdamW(encoder.parameters(), lr=1e-4)
criterion = nn.MSELoss()
num_epochs = 3

# üîπ Entra√Ænement de l'encodeur
print("üöÄ D√©but de l'entra√Ænement...")
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for motion, text_embedding, _ in progress_bar:
        motion, text_embedding = motion.to(device), text_embedding.to(device)
        optimizer.zero_grad()
        z_motion = encoder(motion)
        loss = criterion(z_motion, text_embedding)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")
print("‚úÖ Entra√Ænement termin√© !")

# üîπ Sauvegarde du mod√®le
torch.save(encoder.state_dict(), "motion_encoder.pth")
print("‚úÖ Encodeur sauvegard√© avec succ√®s !")



üîÑ Chargement des IDs...
‚úÖ Chargement des IDs termin√© !
üîÑ Chargement du mod√®le CLIP...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 338M/338M [00:23<00:00, 15.1MiB/s]


‚úÖ CLIP charg√© avec succ√®s !
üîÑ Chargement du mod√®le GPT-2...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

‚úÖ GPT-2 charg√© avec succ√®s !
üöÄ D√©but de l'entra√Ænement...


Epoch 1/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [04:49<00:00,  2.81it/s, loss=0.000693]


Epoch 1/3, Loss: 0.0006930335275102315


Epoch 2/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [02:15<00:00,  6.00it/s, loss=0.000378]


Epoch 2/3, Loss: 0.00037780532630415594


Epoch 3/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [02:16<00:00,  5.95it/s, loss=0.000362]

Epoch 3/3, Loss: 0.00036236785227233766
‚úÖ Entra√Ænement termin√© !
‚úÖ Encodeur sauvegard√© avec succ√®s !





In [3]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2Tokenizer  # On utilisera le tokenizer de GPT-2 pour le vocabulaire

device = "cuda" if torch.cuda.is_available() else "cpu"

# ------------------------------------------------------------
# Chemins des fichiers
# ------------------------------------------------------------
DATA_PATH = "/kaggle/input/human-motion-description-hmd-motion-to-text"
MOTIONS_PATH = os.path.join(DATA_PATH, "motions")
TEXTS_PATH = os.path.join(DATA_PATH, "texts")
train_file = os.path.join(DATA_PATH, "train.txt")
test_file = os.path.join(DATA_PATH, "test.txt")

with open(train_file, "r") as f:
    train_ids = [line.strip() for line in f.readlines()]
with open(test_file, "r") as f:
    test_ids = [line.strip() for line in f.readlines()]

# ------------------------------------------------------------
# Utilisation du tokenizer de GPT-2 (pour le vocabulaire)
# ------------------------------------------------------------
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# On s'assure d'avoir un token de padding. Par d√©faut GPT-2 n'en poss√®de pas.
tokenizer.pad_token = tokenizer.eos_token

# ------------------------------------------------------------
# Dataset pour le d√©codeur
# Pour chaque motion, on charge la motion (.npy) et le texte associ√© (ex: "000009.txt")
# On prend la premi√®re ligne du texte (avant un √©ventuel '#' si pr√©sent)
# On pr√©pare les inputs pour le teacher forcing en d√©calant d'un token.
# ------------------------------------------------------------
class MotionTextDecoderDataset(Dataset):
    def __init__(self, motion_ids, motions_path, texts_path, tokenizer, max_length=64):
        self.motion_ids = motion_ids
        self.motions_path = motions_path
        self.texts_path = texts_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.motion_ids)
    
    def __getitem__(self, idx):
        motion_id = self.motion_ids[idx]
        # Charger la motion
        motion_path = os.path.join(self.motions_path, f"{motion_id}.npy")
        motion = np.load(motion_path)
        motion_tensor = torch.tensor(motion.reshape(motion.shape[0], -1), dtype=torch.float32)
        
        # Charger le texte associ√©
        text_path = os.path.join(self.texts_path, f"{motion_id}.txt")
        if os.path.exists(text_path):
            with open(text_path, encoding="utf-8") as f:
                line = f.readline().strip()
            # Si le texte contient plusieurs descriptions s√©par√©es par '#', on prend la premi√®re
            text = line.split('#')[0].strip()
        else:
            text = "No description available"
        
        # Tokenisation
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length,
                                  padding="max_length", return_tensors="pt")
        input_ids = encoding["input_ids"].squeeze(0)  # s√©quence de tokens (longueur max_length)
        # Pour le teacher forcing, on d√©calle : 
        # On utilise input_ids[:-1] comme entr√©e et input_ids[1:] comme cible.
        decoder_input_ids = input_ids[:-1]
        target_ids = input_ids[1:]
        
        return motion_tensor, decoder_input_ids, target_ids

# ------------------------------------------------------------
# D√©finition de l'encodeur (le m√™me que vous avez entra√Æn√©)
# ------------------------------------------------------------
class MotionEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, latent_dim):
        super(MotionEncoder, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, latent_dim)
    
    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden)

encoder = MotionEncoder(input_size=66, hidden_size=128, latent_dim=512).to(device)
encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))
encoder.eval()
# On g√®le l'encodeur pour l'entra√Ænement du d√©codeur
for param in encoder.parameters():
    param.requires_grad = False

# ------------------------------------------------------------
# D√©finition du d√©codeur LSTM
# ------------------------------------------------------------
class LSTMDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1):
        super(LSTMDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, input_ids, hidden):
        # input_ids : (B, T)
        embeds = self.embedding(input_ids)  # (B, T, embed_dim)
        outputs, hidden = self.lstm(embeds, hidden)  # outputs : (B, T, hidden_dim)
        logits = self.fc(outputs)  # (B, T, vocab_size)
        return logits, hidden

# ------------------------------------------------------------
# Mod√®le complet : l'encodeur (gel√©) + le d√©codeur LSTM
# On initialise l'√©tat cach√© du d√©codeur √† partir de l'embedding de motion.
# ------------------------------------------------------------
class MotionToTextDecoder(nn.Module):
    def __init__(self, encoder, decoder, latent_dim, hidden_dim):
        super(MotionToTextDecoder, self).__init__()
        self.encoder = encoder  # Gel√©
        self.decoder = decoder
        # Projection de l'embedding latent vers l'√©tat initial du LSTM (hidden state)
        self.latent_to_hidden = nn.Linear(latent_dim, hidden_dim)
    
    def forward(self, motion, decoder_input_ids):
        batch_size = motion.size(0)
        latent = self.encoder(motion)  # (B, latent_dim)
        # Initialisation de l'√©tat cach√© et de la cellule
        h0 = self.latent_to_hidden(latent).unsqueeze(0)  # (1, B, hidden_dim)
        c0 = torch.zeros_like(h0)  # (1, B, hidden_dim)
        hidden = (h0, c0)
        logits, _ = self.decoder(decoder_input_ids, hidden)  # (B, T, vocab_size)
        return logits

vocab_size = len(tokenizer)
embed_dim = 512
hidden_dim = 512
decoder = LSTMDecoder(vocab_size, embed_dim, hidden_dim, num_layers=1).to(device)
model_decoder = MotionToTextDecoder(encoder, decoder, latent_dim=512, hidden_dim=hidden_dim).to(device)

# ------------------------------------------------------------
# Cr√©ation du dataset et DataLoader pour l'entra√Ænement du d√©codeur
# ------------------------------------------------------------
train_dataset_decoder = MotionTextDecoderDataset(train_ids, MOTIONS_PATH, TEXTS_PATH, tokenizer, max_length=64)
train_loader_decoder = DataLoader(train_dataset_decoder, batch_size=16, shuffle=True)

# ------------------------------------------------------------
# Entra√Ænement du d√©codeur
# ------------------------------------------------------------
optimizer_decoder = optim.AdamW(model_decoder.parameters(), lr=1e-4)
criterion_decoder = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
num_epochs_decoder = 60

print("üöÄ D√©but de l'entra√Ænement du d√©codeur...")
model_decoder.train()
for epoch in range(num_epochs_decoder):
    total_loss = 0
    pbar = tqdm(train_loader_decoder, desc=f"Epoch {epoch+1}/{num_epochs_decoder}")
    for motion, decoder_input_ids, target_ids in pbar:
        motion = motion.to(device)
        decoder_input_ids = decoder_input_ids.to(device)
        target_ids = target_ids.to(device)
        
        optimizer_decoder.zero_grad()
        logits = model_decoder(motion, decoder_input_ids)  # (B, T, vocab_size)
        loss = criterion_decoder(logits.view(-1, vocab_size), target_ids.view(-1))
        loss.backward()
        optimizer_decoder.step()
        
        total_loss += loss.item()
        pbar.set_postfix(loss=f"{total_loss/(pbar.n+1):.4f}")
    print(f"Epoch {epoch+1}/{num_epochs_decoder}, Loss: {total_loss/len(train_loader_decoder):.4f}")
print("‚úÖ Fin de l'entra√Ænement du d√©codeur!")

# ------------------------------------------------------------
# Fonction d'inf√©rence pour g√©n√©rer une phrase √† partir d'une motion donn√©e
# ------------------------------------------------------------
def generate_sentence_decoder(model, motion_id, max_len=128):
    model.eval()
    # Charger la motion
    motion = np.load(os.path.join(MOTIONS_PATH, f"{motion_id}.npy"))
    motion_tensor = torch.tensor(motion.reshape(motion.shape[0], -1), dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        latent = model.encoder(motion_tensor)  # (1, 512)
        h0 = model.latent_to_hidden(latent).unsqueeze(0)  # (1, 1, hidden_dim)
        c0 = torch.zeros_like(h0)
        hidden = (h0, c0)
    
    # Initialiser avec un token de d√©part.
    # Si le tokenizer dispose d'un token BOS, on l'utilise, sinon on utilise EOS.
    start_token = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.eos_token_id
    input_token = torch.tensor([[start_token]], device=device)
    generated_tokens = [start_token]
    
    # G√©n√©ration token par token
    for _ in range(max_len):
        with torch.no_grad():
            embeds = model.decoder.embedding(input_token)  # (1, 1, embed_dim)
            output, hidden = model.decoder.lstm(embeds, hidden)  # (1, 1, hidden_dim)
            logits = model.decoder.fc(output.squeeze(1))  # (1, vocab_size)
            next_token = torch.argmax(logits, dim=-1).unsqueeze(0)  # (1, 1) - Greedy search
            token_id = next_token.item()
            if token_id == tokenizer.eos_token_id:
                break
            generated_tokens.append(token_id)
            input_token = next_token
    sentence = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return sentence

# ------------------------------------------------------------
# Exemple d'inf√©rence sur une motion du test set
# ------------------------------------------------------------
motion_id_example = test_ids[0]  # Par exemple, "000009"
sentence_generated = generate_sentence_decoder(model_decoder, motion_id_example, max_len=30)
print(f"Motion {motion_id_example} -> {sentence_generated}")


  encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))


üöÄ D√©but de l'entra√Ænement du d√©codeur...


Epoch 1/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.14it/s, loss=5.2163]


Epoch 1/60, Loss: 5.2163


Epoch 2/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.45it/s, loss=3.9945]


Epoch 2/60, Loss: 3.9896


Epoch 3/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.74it/s, loss=3.6808]


Epoch 3/60, Loss: 3.6763


Epoch 4/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.69it/s, loss=3.4708]


Epoch 4/60, Loss: 3.4665


Epoch 5/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.76it/s, loss=3.3077]


Epoch 5/60, Loss: 3.3036


Epoch 6/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.85it/s, loss=3.1646]


Epoch 6/60, Loss: 3.1607


Epoch 7/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.99it/s, loss=3.0345]


Epoch 7/60, Loss: 3.0308


Epoch 8/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.44it/s, loss=2.9189]


Epoch 8/60, Loss: 2.9153


Epoch 9/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:03<00:00, 12.77it/s, loss=2.8159]


Epoch 9/60, Loss: 2.8124


Epoch 10/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.22it/s, loss=2.7119]


Epoch 10/60, Loss: 2.7086


Epoch 11/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.58it/s, loss=2.6207]


Epoch 11/60, Loss: 2.6175


Epoch 12/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.80it/s, loss=2.5285]


Epoch 12/60, Loss: 2.5254


Epoch 13/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.74it/s, loss=2.4439]


Epoch 13/60, Loss: 2.4409


Epoch 14/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.44it/s, loss=2.3610]


Epoch 14/60, Loss: 2.3581


Epoch 15/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.42it/s, loss=2.2807]


Epoch 15/60, Loss: 2.2779


Epoch 16/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.26it/s, loss=2.2039]


Epoch 16/60, Loss: 2.2012


Epoch 17/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.19it/s, loss=2.1301]


Epoch 17/60, Loss: 2.1275


Epoch 18/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.15it/s, loss=2.0578]


Epoch 18/60, Loss: 2.0553


Epoch 19/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.20it/s, loss=1.9889]


Epoch 19/60, Loss: 1.9865


Epoch 20/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.27it/s, loss=1.9212]


Epoch 20/60, Loss: 1.9189


Epoch 21/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.15it/s, loss=1.8580]


Epoch 21/60, Loss: 1.8557


Epoch 22/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.56it/s, loss=1.7943]


Epoch 22/60, Loss: 1.7921


Epoch 23/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.64it/s, loss=1.7380]


Epoch 23/60, Loss: 1.7359


Epoch 24/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.18it/s, loss=1.6779]


Epoch 24/60, Loss: 1.6758


Epoch 25/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.47it/s, loss=1.6234]


Epoch 25/60, Loss: 1.6214


Epoch 26/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.72it/s, loss=1.5712]


Epoch 26/60, Loss: 1.5692


Epoch 27/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.66it/s, loss=1.5198]


Epoch 27/60, Loss: 1.5179


Epoch 28/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.67it/s, loss=1.4696]


Epoch 28/60, Loss: 1.4677


Epoch 29/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.59it/s, loss=1.4244]


Epoch 29/60, Loss: 1.4227


Epoch 30/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:02<00:00, 12.98it/s, loss=1.3794]


Epoch 30/60, Loss: 1.3777


Epoch 31/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:03<00:00, 12.89it/s, loss=1.3355]


Epoch 31/60, Loss: 1.3338


Epoch 32/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.21it/s, loss=1.2952]


Epoch 32/60, Loss: 1.2936


Epoch 33/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:03<00:00, 12.79it/s, loss=1.2559]


Epoch 33/60, Loss: 1.2543


Epoch 34/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:03<00:00, 12.81it/s, loss=1.2193]


Epoch 34/60, Loss: 1.2178


Epoch 35/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:02<00:00, 13.11it/s, loss=1.1841]


Epoch 35/60, Loss: 1.1827


Epoch 36/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:03<00:00, 12.90it/s, loss=1.1482]


Epoch 36/60, Loss: 1.1468


Epoch 37/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.53it/s, loss=1.1180]


Epoch 37/60, Loss: 1.1167


Epoch 38/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.85it/s, loss=1.0856]


Epoch 38/60, Loss: 1.0843


Epoch 39/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.55it/s, loss=1.0574]


Epoch 39/60, Loss: 1.0561


Epoch 40/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.81it/s, loss=1.0307]


Epoch 40/60, Loss: 1.0294


Epoch 41/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.91it/s, loss=1.0026]


Epoch 41/60, Loss: 1.0013


Epoch 42/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.76it/s, loss=0.9783]


Epoch 42/60, Loss: 0.9771


Epoch 43/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.77it/s, loss=0.9547]


Epoch 43/60, Loss: 0.9535


Epoch 44/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.49it/s, loss=0.9326]


Epoch 44/60, Loss: 0.9314


Epoch 45/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.33it/s, loss=0.9087]


Epoch 45/60, Loss: 0.9076


Epoch 46/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.76it/s, loss=0.8871]


Epoch 46/60, Loss: 0.8860


Epoch 47/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.81it/s, loss=0.8686]


Epoch 47/60, Loss: 0.8675


Epoch 48/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.75it/s, loss=0.8493]


Epoch 48/60, Loss: 0.8482


Epoch 49/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.74it/s, loss=0.8315]


Epoch 49/60, Loss: 0.8305


Epoch 50/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.71it/s, loss=0.8167]


Epoch 50/60, Loss: 0.8156


Epoch 51/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.63it/s, loss=0.7995]


Epoch 51/60, Loss: 0.7985


Epoch 52/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:01<00:00, 13.33it/s, loss=0.7849]


Epoch 52/60, Loss: 0.7840


Epoch 53/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:00<00:00, 13.56it/s, loss=0.7716]


Epoch 53/60, Loss: 0.7716


Epoch 54/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.71it/s, loss=0.7551]


Epoch 54/60, Loss: 0.7542


Epoch 55/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.74it/s, loss=0.7410]


Epoch 55/60, Loss: 0.7401


Epoch 56/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.84it/s, loss=0.7302]


Epoch 56/60, Loss: 0.7293


Epoch 57/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.83it/s, loss=0.7199]


Epoch 57/60, Loss: 0.7190


Epoch 58/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:58<00:00, 13.89it/s, loss=0.7078]


Epoch 58/60, Loss: 0.7069


Epoch 59/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.67it/s, loss=0.6960]


Epoch 59/60, Loss: 0.6951


Epoch 60/60: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [00:59<00:00, 13.77it/s, loss=0.6860]


Epoch 60/60, Loss: 0.6852
‚úÖ Fin de l'entra√Ænement du d√©codeur!
Motion 014295 ->  on the leg and start moving. they bring hands up and down while holding the legs apart. they then stand back up and down facing forward. 


In [4]:
# üîπ Chargement du mod√®le sauvegard√© (optionnel, si vous partez d'une nouvelle session)
encoder = MotionEncoder(input_size=66, hidden_size=128, latent_dim=512).to(device)
encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))
encoder.eval()  # Mode √©valuation

# üîπ Pr√©paration du DataLoader pour l'ensemble de test
test_dataset = MotionTextDataset(test_ids)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# üîπ Boucle de test
total_test_loss = 0.0
criterion = nn.MSELoss()

with torch.no_grad():
    for motion, text_embedding, text in tqdm(test_loader, desc="Test"):
        motion, text_embedding = motion.to(device), text_embedding.to(device)
        predicted_embedding = encoder(motion)
        loss = criterion(predicted_embedding, text_embedding)
        total_test_loss += loss.item()

avg_test_loss = total_test_loss / len(test_loader)
print(f"Test Loss moyenne : {avg_test_loss:.4f}")


  encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))
Test: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.28it/s]

Test Loss moyenne : 0.0005





In [21]:
import pandas as pd

# Mettez le mod√®le en mode √©valuation
model_decoder.eval()

results = []
# Parcourir toutes les motions du test
for motion_id in test_ids:
    sentence = generate_sentence_decoder(model_decoder, motion_id, max_len=10)
    results.append({"id": motion_id, "text": sentence})
     # Optionnel : affiche chaque r√©sultat

# Cr√©ation du DataFrame et sauvegarde en CSV
submission = pd.DataFrame(results)
submission.to_csv("submissionv3.csv", index=False)


In [25]:
pd.read_csv("submission.csv")['text'].unique()

array([' on the leg and start moving. they bring hands up and down while holding the legs apart. they then stand back up and down facing forward.  he walks in a complete position before taking a circle back in the way he started to walk back a few times. he turns to the left and walks',
       ' is on the floor and crossing their legs. they are doing the handrail to walk back and forth. then the figure is walking back and forth. the figure turns and turns to the front. then they step back the opposite end of the plane. they are doing the air and turns and takes 3',
       ' the hands and band the legs to the legs. the person is walking and then backwards to the side. then they stop running, stop hands and put something down from behind them and then to stop. he looks like someone else is in fighting but and then they do running and then running from back to',
       ' is walking on a treadmill with arms outstretched. they step over the left foot while moving both arms up and down. he t

# Result : 0.2
# $$$

In [28]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2Tokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

################################################################################
# 1) Param√®tres & chemins
################################################################################
DATA_PATH = "/kaggle/input/human-motion-description-hmd-motion-to-text"
MOTIONS_PATH = os.path.join(DATA_PATH, "motions")
TEXTS_PATH   = os.path.join(DATA_PATH, "texts")

train_file = os.path.join(DATA_PATH, "train.txt")
val_file   = os.path.join(DATA_PATH, "val.txt")   # suppose qu'il existe
test_file  = os.path.join(DATA_PATH, "test.txt")

num_epochs         = 40
batch_size         = 16
embed_dim          = 512
hidden_dim         = 512
latent_dim         = 512
dropout_p          = 0.2
weight_decay_val   = 1e-2
learning_rate      = 1e-4
num_layers_decoder = 2  # => On aura besoin (2, B, hidden_dim) pour (h0, c0)

# Early stopping
patience       = 3
best_val_loss  = float('inf')
no_improve_cnt = 0

max_length_dataset = 128  # on a choisi 128 tokens

################################################################################
# 2) Chargement des IDs
################################################################################
with open(train_file, "r") as f:
    train_ids = [line.strip() for line in f]

with open(val_file, "r") as f:
    val_ids = [line.strip() for line in f]

with open(test_file, "r") as f:
    test_ids = [line.strip() for line in f]

################################################################################
# 3) Tokenizer GPT-2
################################################################################
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  
vocab_size = len(tokenizer)

################################################################################
# 4) Dataset
################################################################################
class MotionTextDecoderDataset(Dataset):
    def __init__(self, motion_ids, motions_path, texts_path, tokenizer, max_length=64):
        self.motion_ids   = motion_ids
        self.motions_path = motions_path
        self.texts_path   = texts_path
        self.tokenizer    = tokenizer
        self.max_length   = max_length

    def __len__(self):
        return len(self.motion_ids)
    
    def __getitem__(self, idx):
        motion_id = self.motion_ids[idx]
        
        # Charger la motion
        motion_path = os.path.join(self.motions_path, f"{motion_id}.npy")
        motion      = np.load(motion_path)
        motion_tensor = torch.tensor(motion.reshape(motion.shape[0], -1), dtype=torch.float32)
        
        # Charger le texte
        text_path = os.path.join(self.texts_path, f"{motion_id}.txt")
        if os.path.exists(text_path):
            with open(text_path, encoding="utf-8") as f:
                line = f.readline().strip()
            text = line.split('#')[0].strip()
        else:
            text = "No description available"
        
        # Tokenisation
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].squeeze(0)

        # D√©calage
        decoder_input_ids = input_ids[:-1]
        target_ids        = input_ids[1:]
        
        return motion_tensor, decoder_input_ids, target_ids

################################################################################
# 5) Encodeur gel√©
################################################################################
class MotionEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, latent_dim):
        super(MotionEncoder, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc   = nn.Linear(hidden_size * 2, latent_dim)
    
    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden_cat)

encoder = MotionEncoder(input_size=66, hidden_size=128, latent_dim=latent_dim).to(device)
encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))
encoder.eval()
for param in encoder.parameters():
    param.requires_grad = False

################################################################################
# 6) D√©codeur LSTM (2 couches + dropout=0.2)
################################################################################
class LSTMDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2, num_layers=2):
        super(LSTMDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Le dropout sera effectif entre les 2 couches
        self.lstm      = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                                 batch_first=True, dropout=dropout)
        self.fc        = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, input_ids, hidden):
        embeds = self.embedding(input_ids)  # (B, T, embed_dim)
        outputs, hidden = self.lstm(embeds, hidden)  # outputs: (B, T, hidden_dim)
        logits = self.fc(outputs)                   # (B, T, vocab_size)
        return logits, hidden

################################################################################
# 7) Mod√®le global : encodeur gel√© + d√©codeur
################################################################################
class MotionToTextDecoder(nn.Module):
    def __init__(self, encoder, decoder, latent_dim, hidden_dim, num_layers=2):
        super(MotionToTextDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.latent_to_hidden = nn.Linear(latent_dim, hidden_dim)
        self.num_layers = num_layers  # pour r√©p√©ter h0, c0

    def forward(self, motion, dec_in):
        latent = self.encoder(motion)      # (B, latent_dim)

        # Au lieu de (1, B, hidden_dim), on cr√©e (num_layers, B, hidden_dim)
        # Si num_layers=2, on r√©p√®te 2 fois la m√™me init (ou on pourrait init diff√©remment).
        h0_1 = self.latent_to_hidden(latent)              # (B, hidden_dim)
        h0   = h0_1.unsqueeze(0).repeat(self.num_layers, 1, 1) # (num_layers, B, hidden_dim)
        c0   = torch.zeros_like(h0)

        logits, _ = self.decoder(dec_in, (h0, c0))
        return logits

################################################################################
# 8) DataLoader (train + val)
################################################################################
train_dataset = MotionTextDecoderDataset(train_ids, MOTIONS_PATH, TEXTS_PATH, tokenizer, max_length_dataset)
val_dataset   = MotionTextDecoderDataset(val_ids,   MOTIONS_PATH, TEXTS_PATH, tokenizer, max_length_dataset)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)

################################################################################
# 9) Instanciation mod√®le, optimizer & loss
################################################################################
decoder_model = LSTMDecoder(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    dropout=dropout_p,
    num_layers=num_layers_decoder
).to(device)

model_decoder = MotionToTextDecoder(
    encoder, 
    decoder_model, 
    latent_dim, 
    hidden_dim,
    num_layers=num_layers_decoder
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.AdamW(model_decoder.parameters(), lr=learning_rate, weight_decay=weight_decay_val)

################################################################################
# 10) Entra√Ænement + Validation + Early Stopping
################################################################################
print("üöÄ D√©but de l'entra√Ænement (avec validation) ...")

for epoch in range(num_epochs):
    # --------------------
    # Phase d'entra√Ænement
    # --------------------
    model_decoder.train()
    total_train_loss = 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for motion, dec_in, targets in pbar:
        motion  = motion.to(device)
        dec_in  = dec_in.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model_decoder(motion, dec_in)  # (B, T, vocab_size)
        loss   = criterion(logits.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        pbar.set_postfix({'loss': f"{total_train_loss/(pbar.n+1):.4f}"})
    
    avg_train_loss = total_train_loss / len(train_loader)
    
    # --------------------
    # Phase de validation
    # --------------------
    model_decoder.eval()
    total_val_loss = 0
    with torch.no_grad():
        for motion, dec_in, targets in val_loader:
            motion  = motion.to(device)
            dec_in  = dec_in.to(device)
            targets = targets.to(device)

            logits = model_decoder(motion, dec_in)
            val_loss = criterion(logits.view(-1, vocab_size), targets.view(-1))
            total_val_loss += val_loss.item()
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Early stopping (simple exemple)
    global best_val_loss, no_improve_cnt
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve_cnt = 0
        torch.save(model_decoder.state_dict(), "best_decoder.pth")
    else:
        no_improve_cnt += 1
        if no_improve_cnt >= patience:
            print(f"Early stopping d√©clench√© (val loss stagne depuis {patience} √©poques).")
            break

print("‚úÖ Fin de l'entra√Ænement !")

################################################################################
# 11) Inf√©rence (optionnel)
################################################################################
def generate_sentence_decoder(model, motion_id, max_len=50):
    model.eval()
    
    motion_path = os.path.join(MOTIONS_PATH, f"{motion_id}.npy")
    motion = np.load(motion_path)
    motion_tensor = torch.tensor(
        motion.reshape(motion.shape[0], -1), 
        dtype=torch.float32
    ).unsqueeze(0).to(device)

    with torch.no_grad():
        latent = model.encoder(motion_tensor)
        # h0 pour num_layers=2
        h0_1 = model.latent_to_hidden(latent)                # (1, hidden_dim)
        h0   = h0_1.unsqueeze(0).repeat(model.num_layers, 1, 1)  # (num_layers, 1, hidden_dim)
        c0   = torch.zeros_like(h0)
        hidden = (h0, c0)

    start_token = tokenizer.bos_token_id if tokenizer.bos_token_id else tokenizer.eos_token_id
    input_token = torch.tensor([[start_token]], device=device)
    generated_tokens = [start_token]

    for _ in range(max_len):
        with torch.no_grad():
            embeds = model.decoder.embedding(input_token)
            output, hidden = model.decoder.lstm(embeds, hidden)
            logits = model.decoder.fc(output.squeeze(1))

            next_token_id = torch.argmax(logits, dim=-1).unsqueeze(0)
            token_id = next_token_id.item()

            if token_id == tokenizer.eos_token_id:
                break

            generated_tokens.append(token_id)
            input_token = next_token_id
    
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return text

# Exemple d'inf√©rence (si tu veux tester sur un ID du val set)
if len(val_ids) > 0:
    sample_id = val_ids[0]
    gen_text = generate_sentence_decoder(model_decoder, sample_id, max_len=50)
    print(f"\nExemple g√©n√©ration (motion {sample_id}):\n{gen_text}")


  encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))


üöÄ D√©but de l'entra√Ænement (avec validation) ...


Epoch 1/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.29it/s, loss=5.5372]


Epoch 1/40 | Train Loss: 5.5372 | Val Loss: 4.9799


Epoch 2/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.26it/s, loss=4.6884]


Epoch 2/40 | Train Loss: 4.6884 | Val Loss: 4.3415


Epoch 3/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.35it/s, loss=4.1202]


Epoch 3/40 | Train Loss: 4.1202 | Val Loss: 3.9528


Epoch 4/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:39<00:00,  8.22it/s, loss=3.8122]


Epoch 4/40 | Train Loss: 3.8122 | Val Loss: 3.7242


Epoch 5/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.31it/s, loss=3.5951]


Epoch 5/40 | Train Loss: 3.5951 | Val Loss: 3.5524


Epoch 6/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.35it/s, loss=3.4138]


Epoch 6/40 | Train Loss: 3.4138 | Val Loss: 3.4067


Epoch 7/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.29it/s, loss=3.2584]


Epoch 7/40 | Train Loss: 3.2584 | Val Loss: 3.2859


Epoch 8/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.28it/s, loss=3.1235]


Epoch 8/40 | Train Loss: 3.1235 | Val Loss: 3.1908


Epoch 9/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.30it/s, loss=2.9965]


Epoch 9/40 | Train Loss: 2.9965 | Val Loss: 3.1034


Epoch 10/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.36it/s, loss=2.8841]


Epoch 10/40 | Train Loss: 2.8841 | Val Loss: 3.0268


Epoch 11/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.33it/s, loss=2.7827]


Epoch 11/40 | Train Loss: 2.7827 | Val Loss: 2.9623


Epoch 12/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:39<00:00,  8.19it/s, loss=2.6788]


Epoch 12/40 | Train Loss: 2.6788 | Val Loss: 2.8966


Epoch 13/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.26it/s, loss=2.5870]


Epoch 13/40 | Train Loss: 2.5870 | Val Loss: 2.8451


Epoch 14/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.32it/s, loss=2.4965]


Epoch 14/40 | Train Loss: 2.4965 | Val Loss: 2.7870


Epoch 15/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.36it/s, loss=2.4077]


Epoch 15/40 | Train Loss: 2.4077 | Val Loss: 2.7383


Epoch 16/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.27it/s, loss=2.3235]


Epoch 16/40 | Train Loss: 2.3235 | Val Loss: 2.6830


Epoch 17/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.35it/s, loss=2.2421]


Epoch 17/40 | Train Loss: 2.2421 | Val Loss: 2.6394


Epoch 18/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.27it/s, loss=2.1628]


Epoch 18/40 | Train Loss: 2.1628 | Val Loss: 2.5924


Epoch 19/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:38<00:00,  8.30it/s, loss=2.0901]


Epoch 19/40 | Train Loss: 2.0901 | Val Loss: 2.5519


Epoch 20/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.37it/s, loss=2.0180]


Epoch 20/40 | Train Loss: 2.0180 | Val Loss: 2.5050


Epoch 21/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.35it/s, loss=1.9496]


Epoch 21/40 | Train Loss: 1.9496 | Val Loss: 2.4657


Epoch 22/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:43<00:00,  7.90it/s, loss=1.8834]


Epoch 22/40 | Train Loss: 1.8834 | Val Loss: 2.4264


Epoch 23/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.43it/s, loss=1.8203]


Epoch 23/40 | Train Loss: 1.8203 | Val Loss: 2.3861


Epoch 24/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.43it/s, loss=1.7576]


Epoch 24/40 | Train Loss: 1.7576 | Val Loss: 2.3457


Epoch 25/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.48it/s, loss=1.7013]


Epoch 25/40 | Train Loss: 1.7013 | Val Loss: 2.3103


Epoch 26/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.45it/s, loss=1.6456]


Epoch 26/40 | Train Loss: 1.6456 | Val Loss: 2.2836


Epoch 27/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.37it/s, loss=1.5922]


Epoch 27/40 | Train Loss: 1.5922 | Val Loss: 2.2449


Epoch 28/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:34<00:00,  8.57it/s, loss=1.5397]


Epoch 28/40 | Train Loss: 1.5397 | Val Loss: 2.2069


Epoch 29/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:34<00:00,  8.59it/s, loss=1.4932]


Epoch 29/40 | Train Loss: 1.4932 | Val Loss: 2.1758


Epoch 30/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.34it/s, loss=1.4450]


Epoch 30/40 | Train Loss: 1.4450 | Val Loss: 2.1473


Epoch 31/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.34it/s, loss=1.4011]


Epoch 31/40 | Train Loss: 1.4011 | Val Loss: 2.1175


Epoch 32/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.48it/s, loss=1.3601]


Epoch 32/40 | Train Loss: 1.3601 | Val Loss: 2.0894


Epoch 33/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.51it/s, loss=1.3197]


Epoch 33/40 | Train Loss: 1.3197 | Val Loss: 2.0696


Epoch 34/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.33it/s, loss=1.2805]


Epoch 34/40 | Train Loss: 1.2805 | Val Loss: 2.0464


Epoch 35/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:39<00:00,  8.20it/s, loss=1.2421]


Epoch 35/40 | Train Loss: 1.2421 | Val Loss: 2.0072


Epoch 36/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.37it/s, loss=1.2054]


Epoch 36/40 | Train Loss: 1.2054 | Val Loss: 1.9811


Epoch 37/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.36it/s, loss=1.1712]


Epoch 37/40 | Train Loss: 1.1712 | Val Loss: 1.9711


Epoch 38/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.49it/s, loss=1.1427]


Epoch 38/40 | Train Loss: 1.1427 | Val Loss: 1.9391


Epoch 39/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.54it/s, loss=1.1089]


Epoch 39/40 | Train Loss: 1.1089 | Val Loss: 1.9172


Epoch 40/40 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.50it/s, loss=1.0817]


Epoch 40/40 | Train Loss: 1.0817 | Val Loss: 1.9017
‚úÖ Fin de l'entra√Ænement !

Exemple g√©n√©ration (motion M006646):
 standing on one leg, the person walks forward, turns left and walks to the right. they then stop facing backwards.  he then steps back to original position. he then stands back up by continuing to walk.  he steps forward and stops once


In [38]:
import pandas as pd

# Mettez le mod√®le en mode √©valuation
model_decoder.eval()

results = []
# Parcourir toutes les motions du test
for motion_id in test_ids:
    sentence = generate_sentence_decoder(model_decoder, motion_id, max_len=50)
    results.append({"id": motion_id, "text": sentence})
     # Optionnel : affiche chaque r√©sultat

# Cr√©ation du DataFrame et sauvegarde en CSV
submission = pd.DataFrame(results)
submission.to_csv("submissionv4.csv", index=False)


In [39]:
pd.read_csv("submissionv4.csv")['text'].unique()

array([' person is doing a dance and stretches.  lifting both arms overhead, he tiptoes in a circle and returning to the spot with his hands. he stops, then returns to his original position. he then takes a few steps forward. he sits',
       ' standing, a person lifts his right hand to his face. he then lifts his hand and makes a small gesture with his right hand. he then raises his left hand, returns the object at his shoulder, and finally repeats a few times. then repeats',
       ' standing, a person steps forward, lifts their left hand up to mouth, then lowers it down and then walks backwards to the original position. they then raise their left hand and pull an object from waist height, placing the item then is swimming for a',
       ' person is walking forward slowly, bracing their hands together, and then their hands are together. they stop at the end of the push, and do them again. the person is using both hands together to see, and finally is in another of them',
       ' per

In [None]:
#

# t

In [None]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2Tokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

################################################################################
# 1) Param√®tres & chemins
################################################################################
DATA_PATH = "/kaggle/input/human-motion-description-hmd-motion-to-text"
MOTIONS_PATH = os.path.join(DATA_PATH, "motions")
TEXTS_PATH   = os.path.join(DATA_PATH, "texts")

train_file = os.path.join(DATA_PATH, "train.txt")
val_file   = os.path.join(DATA_PATH, "val.txt")
test_file  = os.path.join(DATA_PATH, "test.txt")

num_epochs         = 20   # Moins d'√©poques pour gagner du temps
batch_size         = 16
embed_dim          = 512
hidden_dim         = 512
latent_dim         = 512
dropout_p          = 0.2
weight_decay_val   = 1e-2
learning_rate      = 1e-4
num_layers_decoder = 2

# Early stopping
patience       = 3
best_val_loss  = float('inf')
no_improve_cnt = 0

max_length_dataset = 128  # Longueur max tokens

################################################################################
# 2) Chargement des IDs
################################################################################
with open(train_file, "r") as f:
    train_ids = [line.strip() for line in f]
with open(val_file, "r") as f:
    val_ids = [line.strip() for line in f]
with open(test_file, "r") as f:
    test_ids = [line.strip() for line in f]

################################################################################
# 3) Tokenizer GPT-2
################################################################################
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  
vocab_size = len(tokenizer)

################################################################################
# 4) Dataset (choix al√©atoire de la description)
################################################################################
class MotionTextDecoderDataset(Dataset):
    def __init__(self, motion_ids, motions_path, texts_path, tokenizer, max_length=64):
        self.motion_ids   = motion_ids
        self.motions_path = motions_path
        self.texts_path   = texts_path
        self.tokenizer    = tokenizer
        self.max_length   = max_length

    def __len__(self):
        return len(self.motion_ids)
    
    def __getitem__(self, idx):
        motion_id = self.motion_ids[idx]
        
        # Charger la motion
        motion_path = os.path.join(self.motions_path, f"{motion_id}.npy")
        motion = np.load(motion_path)
        motion_tensor = torch.tensor(motion.reshape(motion.shape[0], -1), dtype=torch.float32)
        
        # Charger les lignes du fichier texte
        text_path = os.path.join(self.texts_path, f"{motion_id}.txt")
        if os.path.exists(text_path):
            with open(text_path, encoding="utf-8") as f:
                lines = [l.strip() for l in f.readlines() if l.strip()]
            # Choisir une ligne au hasard
            line = random.choice(lines)
            # Couper au '#' si besoin
            text = line.split('#')[0].strip()
        else:
            text = "No description available"
        
        # Tokenisation
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].squeeze(0)

        # D√©calage
        decoder_input_ids = input_ids[:-1]
        target_ids        = input_ids[1:]
        
        return motion_tensor, decoder_input_ids, target_ids

################################################################################
# 5) Encodeur gel√©
################################################################################
class MotionEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, latent_dim):
        super(MotionEncoder, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc   = nn.Linear(hidden_size * 2, latent_dim)
    
    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden_cat)

encoder = MotionEncoder(input_size=66, hidden_size=128, latent_dim=latent_dim).to(device)
encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))
encoder.eval()
for param in encoder.parameters():
    param.requires_grad = False

################################################################################
# 6) D√©codeur LSTM (2 couches + dropout)
################################################################################
class LSTMDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout=0.2, num_layers=2):
        super(LSTMDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm      = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, 
                                 batch_first=True, dropout=dropout)
        self.fc        = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, input_ids, hidden):
        embeds = self.embedding(input_ids)  
        outputs, hidden = self.lstm(embeds, hidden)
        logits = self.fc(outputs)
        return logits, hidden

################################################################################
# 7) Mod√®le complet
################################################################################
class MotionToTextDecoder(nn.Module):
    def __init__(self, encoder, decoder, latent_dim, hidden_dim, num_layers=2):
        super(MotionToTextDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.latent_to_hidden = nn.Linear(latent_dim, hidden_dim)
        self.num_layers = num_layers

    def forward(self, motion, dec_in):
        latent = self.encoder(motion)  # (B, latent_dim)
        # h0 : (num_layers, B, hidden_dim)
        h0_1 = self.latent_to_hidden(latent)
        h0   = h0_1.unsqueeze(0).repeat(self.num_layers, 1, 1)
        c0   = torch.zeros_like(h0)
        logits, _ = self.decoder(dec_in, (h0, c0))
        return logits

################################################################################
# 8) DataLoader (train + val)
################################################################################
train_dataset = MotionTextDecoderDataset(train_ids, MOTIONS_PATH, TEXTS_PATH, tokenizer, max_length_dataset)
val_dataset   = MotionTextDecoderDataset(val_ids,   MOTIONS_PATH, TEXTS_PATH, tokenizer, max_length_dataset)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)

################################################################################
# 9) Instanciation mod√®le, optimizer & loss
################################################################################
decoder_model = LSTMDecoder(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    dropout=dropout_p,
    num_layers=num_layers_decoder
).to(device)

model_decoder = MotionToTextDecoder(
    encoder, 
    decoder_model, 
    latent_dim, 
    hidden_dim,
    num_layers=num_layers_decoder
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.AdamW(model_decoder.parameters(), lr=learning_rate, weight_decay=weight_decay_val)

################################################################################
# 10) Entra√Ænement + Validation + Early Stopping
################################################################################
print("üöÄ D√©but de l'entra√Ænement (avec validation) ...")

best_val_loss = float('inf')
no_improve_cnt = 0

for epoch in range(num_epochs):
    # --------------------
    # Phase d'entra√Ænement
    # --------------------
    model_decoder.train()
    total_train_loss = 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for motion, dec_in, targets in pbar:
        motion  = motion.to(device)
        dec_in  = dec_in.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model_decoder(motion, dec_in)
        loss   = criterion(logits.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        pbar.set_postfix({'loss': f"{total_train_loss/(pbar.n+1):.4f}"})
    
    avg_train_loss = total_train_loss / len(train_loader)
    
    # --------------------
    # Phase de validation
    # --------------------
    model_decoder.eval()
    total_val_loss = 0
    with torch.no_grad():
        for motion, dec_in, targets in val_loader:
            motion  = motion.to(device)
            dec_in  = dec_in.to(device)
            targets = targets.to(device)

            logits = model_decoder(motion, dec_in)
            val_loss = criterion(logits.view(-1, vocab_size), targets.view(-1))
            total_val_loss += val_loss.item()
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve_cnt = 0
        torch.save(model_decoder.state_dict(), "best_decoder.pth")
    else:
        no_improve_cnt += 1
        if no_improve_cnt >= patience:
            print(f"Early stopping d√©clench√© (val loss ne s'am√©liore plus depuis {patience} √©poques).")
            break

print("‚úÖ Fin de l'entra√Ænement !")

################################################################################
# 11) Inf√©rence (optionnel)
################################################################################
def generate_sentence_decoder(model, motion_id, max_len=50):
    model.eval()
    
    motion_path = os.path.join(MOTIONS_PATH, f"{motion_id}.npy")
    motion = np.load(motion_path)
    motion_tensor = torch.tensor(motion.reshape(motion.shape[0], -1), 
                                 dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        latent = model.encoder(motion_tensor)
        h0_1 = model.latent_to_hidden(latent)  
        h0   = h0_1.unsqueeze(0).repeat(model.num_layers, 1, 1)
        c0   = torch.zeros_like(h0)
        hidden = (h0, c0)

    start_token = tokenizer.bos_token_id if tokenizer.bos_token_id else tokenizer.eos_token_id
    input_token = torch.tensor([[start_token]], device=device)
    generated_tokens = [start_token]

    for _ in range(max_len):
        with torch.no_grad():
            embeds = model.decoder.embedding(input_token)
            output, hidden = model.decoder.lstm(embeds, hidden)
            logits = model.decoder.fc(output.squeeze(1))

            next_token_id = torch.argmax(logits, dim=-1).unsqueeze(0)
            token_id = next_token_id.item()

            if token_id == tokenizer.eos_token_id:
                break

            generated_tokens.append(token_id)
            input_token = next_token_id
    
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return text

# Exemple d'inf√©rence
if len(val_ids) > 0:
    sample_id = val_ids[0]
    gen_text = generate_sentence_decoder(model_decoder, sample_id, max_len=50)
    print(f"\nExemple g√©n√©ration (motion {sample_id}):\n{gen_text}")


  encoder.load_state_dict(torch.load("motion_encoder.pth", map_location=device))


üöÄ D√©but de l'entra√Ænement (avec validation) ...


Epoch 1/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.31it/s, loss=5.5538]


Epoch 1/20 | Train Loss: 5.5538 | Val Loss: 5.0214


Epoch 2/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:34<00:00,  8.60it/s, loss=4.7445]


Epoch 2/20 | Train Loss: 4.7445 | Val Loss: 4.4503


Epoch 3/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.55it/s, loss=4.2019]


Epoch 3/20 | Train Loss: 4.2019 | Val Loss: 4.0218


Epoch 4/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:34<00:00,  8.60it/s, loss=3.8952]


Epoch 4/20 | Train Loss: 3.8952 | Val Loss: 3.8085


Epoch 5/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:34<00:00,  8.60it/s, loss=3.7038]


Epoch 5/20 | Train Loss: 3.7038 | Val Loss: 3.6289


Epoch 6/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:34<00:00,  8.61it/s, loss=3.5546]


Epoch 6/20 | Train Loss: 3.5546 | Val Loss: 3.4970


Epoch 7/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.39it/s, loss=3.4131]


Epoch 7/20 | Train Loss: 3.4131 | Val Loss: 3.3877


Epoch 8/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.49it/s, loss=3.3140]


Epoch 8/20 | Train Loss: 3.3140 | Val Loss: 3.3077


Epoch 9/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.48it/s, loss=3.2193]


Epoch 9/20 | Train Loss: 3.2193 | Val Loss: 3.2191


Epoch 10/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:35<00:00,  8.52it/s, loss=3.1380]


Epoch 10/20 | Train Loss: 3.1380 | Val Loss: 3.1849


Epoch 11/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.47it/s, loss=3.0675]


Epoch 11/20 | Train Loss: 3.0675 | Val Loss: 3.1111


Epoch 12/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:36<00:00,  8.43it/s, loss=3.0169]


Epoch 12/20 | Train Loss: 3.0169 | Val Loss: 3.0224


Epoch 13/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:37<00:00,  8.36it/s, loss=2.9518]


Epoch 13/20 | Train Loss: 2.9518 | Val Loss: 3.0352


Epoch 14/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:41<00:00,  8.03it/s, loss=2.9052]


Epoch 14/20 | Train Loss: 2.9052 | Val Loss: 2.9634


Epoch 15/20 [Train]: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 814/814 [01:40<00:00,  8.11it/s, loss=2.8492]


Epoch 15/20 | Train Loss: 2.8492 | Val Loss: 2.9426


Epoch 16/20 [Train]:  27%|‚ñà‚ñà‚ñã       | 220/814 [00:26<01:11,  8.25it/s, loss=2.7949]