In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import glob
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [None]:
import glob

files = glob.glob("/content/*")
print(files[0])

/content/K01AF__07_123160-124680.csv


In [None]:
import pandas as pd

df = pd.read_csv(files[1])
df = df.drop(columns=['Unnamed: 0'])

In [None]:
max(df['FrameNumber'])

11

In [None]:
frame_number = max(df['FrameNumber'])

video_seq = []
for n in range(frame_number):
  frame_df = df[df['FrameNumber']==n]

  frame_seq=[]
  for row in frame_df.iterrows():
      frame_seq.append([row[1]['X'], row[1]['Y'], row[1]['Z']])

  video_seq.append(torch.tensor(frame_seq, dtype=torch.float))

# video_seq = torch.tensor(video_seq, dtype=torch.float)

In [None]:
# for n in range(frame_number):
#   print(len(video_seq[n]))
# video_seq

my_tensor = torch.cat(video_seq, dim=-2)
print(my_tensor.shape)
print(my_tensor)

torch.Size([363, 3])
tensor([[ 0.4203,  0.4312, -0.8396],
        [ 0.4279,  0.3961, -0.8517],
        [ 0.4340,  0.3908, -0.8517],
        ...,
        [ 0.4368,  1.5453, -0.0426],
        [ 0.5010,  1.6314, -0.1309],
        [ 0.4270,  1.6275, -0.2985]])


In [8]:
# class SequenceEmbedder(nn.Module):
#     def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
#         super(SequenceEmbedder, self).__init__()
#         self.input_dim = input_dim
#         self.hidden_dim = hidden_dim
#         self.embed_dim = embed_dim

#         self.embedding = nn.Linear(input_dim, embed_dim)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
#         self.attention = nn.Linear(hidden_dim, 1)
#         self.fc = nn.Linear(hidden_dim, output_dim)

#     def forward(self, x, lengths):
#         x = self.embedding(x)

#         # Packing the padded sequence
#         x_packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

#         # LSTM
#         lstm_out, _ = self.lstm(x_packed)

#         # Unpacking the sequence
#         lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)

#         # Attention
#         attention_weights = F.softmax(self.attention(lstm_out), dim=1)
#         context_vector = torch.sum(lstm_out * attention_weights, dim=1)

#         output = self.fc(context_vector)

#         return output.reshape(16,16)


# # Example usage
# model = SequenceEmbedder(input_dim=3, embed_dim=64, hidden_dim=128, output_dim=256)

# # Example data
# sequences = [my_tensor]  # Convert to float

# # Rest of your code remains the same
# lengths = [len(seq) for seq in sequences]
# padded_sequences = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
# # print(padded_sequences)
# output = model(padded_sequences, lengths)

In [None]:
output.shape

torch.Size([16, 16])

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.view(N, value_len, self.heads, self.head_dim)
        keys = keys.view(N, key_len, self.heads, self.head_dim)
        queries = query.view(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Einsum does matrix multiplication for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just a way to do batch matrix multiplication
        attention = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        if mask is not None:
            attention = attention.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(attention / (self.embed_size ** (1 / 2)), dim=3)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out

class SequenceEmbedder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim, num_layers=2, heads=4, dropout=0.5):
        super(SequenceEmbedder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers

        self.embedding = nn.Linear(99, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, dropout=dropout, bidirectional=True)
        self.attention = MultiHeadAttention(hidden_dim * 2, heads)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths, mask=None):
    # Embedding
        x = self.embedding(x)  # Shape: [batch_size, seq_len, embed_dim]

        # Pack the sequence
        x_packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)

        # LSTM
        lstm_out, _ = self.lstm(x_packed)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        

        # Transform LSTM output to match attention input requirements
        # Assuming hidden_dim * 2 == embed_size, or use a Linear layer to transform
        lstm_out = lstm_out.contiguous().view(lstm_out.shape[0], lstm_out.shape[1], -1)

        attention_out = self.attention(lstm_out, lstm_out, lstm_out, mask)

        # Further processing
        output = F.relu(self.fc1(attention_out))
        output = self.fc2(output)

        return output

    

model = SequenceEmbedder(input_dim=3, embed_dim=64, hidden_dim=128, output_dim=512).to(device)



In [4]:
class LandmarkDataset(Dataset):
    def __init__(self, file_paths):
        self.data = []
        self.lengths = []
        for file_path in file_paths:
            print("Processing file:", file_path)  # Logging the file path
            df = pd.read_csv(file_path)
            df = df.drop(columns=['Unnamed: 0'], errors='ignore')  # Use errors='ignore' to avoid issues if the column doesn't exist

            if df.empty or 'FrameNumber' not in df.columns:
                print(f"Skipping empty or invalid file: {file_path}")
                continue

            frame_number = max(df['FrameNumber'])

            video_seq = []
            for n in range(frame_number):
                frame_df = df[df['FrameNumber'] == n]
                if frame_df.empty:
                    continue  # Skip empty frames, or you can handle them differently

                # Ensure each frame_seq tensor has consistent dimensions
                frame_seq = frame_df[['X', 'Y', 'Z']].values
                video_seq.append(torch.tensor(frame_seq, dtype=torch.float).unsqueeze(0))

            if not video_seq:
                continue  # Skip if no valid data in video

            my_tensor = torch.cat(video_seq, dim=0)  # Concatenate along the first dimension
            self.data.append(my_tensor)
            self.lengths.append(len(video_seq))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.lengths[idx] 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

    
file_paths = glob.glob("landmarks_all/*/*")
print("Found files:", len(file_paths))

dataset = LandmarkDataset(file_paths)

# Create DataLoader
train_loader = DataLoader(dataset, batch_size=1, shuffle=True)

Found files: 19656
Processing file: landmarks_all\K01AF\K01AF__07_122200-123160.csv
Processing file: landmarks_all\K01AF\K01AF__07_124680-126520.csv
Processing file: landmarks_all\K01AF\K01AF__07_130400-132120.csv
Processing file: landmarks_all\K01AF\K01AF__07_132120-132600.csv
Processing file: landmarks_all\K01AF\K01AF__07_132600-133840.csv
Processing file: landmarks_all\K01AF\K01AF__07_14840-18760.csv
Processing file: landmarks_all\K01AF\K01AF__07_155080-155360.csv
Processing file: landmarks_all\K01AF\K01AF__07_160600-162800.csv
Processing file: landmarks_all\K01AF\K01AF__07_165480-168720.csv
Processing file: landmarks_all\K01AF\K01AF__07_171000-172040.csv
Processing file: landmarks_all\K01AF\K01AF__07_172280-172840.csv
Processing file: landmarks_all\K01AF\K01AF__07_183840-185440.csv
Processing file: landmarks_all\K01AF\K01AF__07_189880-192480.csv
Processing file: landmarks_all\K01AF\K01AF__07_198800-200240.csv
Processing file: landmarks_all\K01AF\K01AF__07_202200-206040.csv
Processi

In [63]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()  # Replace with the appropriate loss function
model.to(device)

num_epochs = 10  # Set the number of epochs
best_loss = float('inf')
epochs_no_improve = 0
n_epochs_stop = 5  # Number of epochs to stop after if no improvement

for epoch in range(num_epochs):
    epoch_loss = 0.0  # Track loss for each epoch

    for data, lengths in train_loader:
        
        inputs = data.squeeze(1)
        inputs = inputs.view(inputs.size(0), inputs.size(1), -1)

        # Move inputs tensor to the device
        inputs = inputs.to(device)

        # Initialize labels tensor on the device
        labels = torch.zeros(inputs.size(0), inputs.size(1), 512).to(device)

        optimizer.zero_grad()

        # Forward pass (ensure lengths is on CPU)
        outputs = model(inputs, lengths.cpu())  # Move lengths to CPU here

        # Calculate loss
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

    # Early stopping check
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    if epochs_no_improve == n_epochs_stop:
        print('Early stopping triggered')
        break

# Save the model
torch.save(model.state_dict(), 'landmark_model.pth')

Epoch 1, Loss: 1.7483399172846248e-09
Epoch 2, Loss: 3.827656414046032e-09
Epoch 3, Loss: 1.3893798334052576e-09
Epoch 4, Loss: 1.8300347903732472e-09
Epoch 5, Loss: 8.750645208444041e-10
Epoch 6, Loss: 5.718143736288539e-10
Early stopping triggered


In [5]:
import json
from collections import defaultdict
import os
import tqdm

model = SequenceEmbedder(input_dim=3, embed_dim=64, hidden_dim=128, output_dim=512).to(device)
model.load_state_dict(torch.load('landmark_model.pth'))
model.to(device)
model.eval()

aggregated_embeddings = defaultdict(list)

with torch.no_grad():
    for i, (data, lengths) in enumerate(train_loader):
        inputs = data.squeeze(1).to(device)
        inputs = inputs.view(inputs.size(0), inputs.size(1), -1)
        outputs = model(inputs, lengths.cpu())  # Get embeddings

        file_name = os.path.basename(file_paths[i])
        video_name, frame_info = file_name.split('__')

        # Flatten and aggregate embeddings for the entire file
        for embedding in outputs:
            flat_embedding = embedding.view(-1).cpu().numpy().tolist()
            aggregated_embeddings[file_name].append(flat_embedding)

# Average the embeddings for each file
final_embeddings = []
for file_name, emb_list in aggregated_embeddings.items():
    avg_emb = [sum(e)/len(emb_list) for e in zip(*emb_list)]  # Average embedding
    final_embeddings.append({
        "CSVName": file_name,
        "Embedding": avg_emb
    })

# Save to JSON
with open('embeddings.json', 'w') as file:
    json.dump(final_embeddings, file)


In [7]:
import ijson

def print_first_n_items(filename, n=20):
    with open(filename, 'rb') as file:
        # Adjust the prefix according to your JSON structure
        items = ijson.items(file, 'item')
        for i, item in enumerate(items):
            if i >= n:
                break
            # Replace 'column1' and 'column2' with the actual keys of the first two columns
            print(item.get('CSVName', 'N/A'))

filename = "embeddings.json"
print_first_n_items(filename)

K01AF__07_122200-123160.csv
K01AF__07_124680-126520.csv
K01AF__07_130400-132120.csv
K01AF__07_132120-132600.csv
K01AF__07_132600-133840.csv
K01AF__07_14840-18760.csv
K01AF__07_155080-155360.csv
K01AF__07_160600-162800.csv
K01AF__07_165480-168720.csv
K01AF__07_171000-172040.csv
K01AF__07_172280-172840.csv
K01AF__07_183840-185440.csv
K01AF__07_189880-192480.csv
K01AF__07_198800-200240.csv
K01AF__07_202200-206040.csv
K01AF__07_206040-208040.csv
K01AF__07_20640-22360.csv
K01AF__07_218160-218600.csv
K01AF__07_22760-24160.csv
K01AF__07_228960-232760.csv


In [1]:
import json

import jsonlines

landmarks_emb_fpath = "embeddings.json"
with open(landmarks_emb_fpath) as landmarks_emb_file:
    landmarks_embeddings = json.load(landmarks_emb_file)
    print("Loaded embeddings.json")

# landmarks_embeddings = [
#     {"CSVName": "a", "Embedding": [1, 2, 3]},
#     {"CSVName": "b", "Embedding": [4, 5, 6]},
#     {"CSVName": "c", "Embedding": [7, 8, 9]},
# ]

with jsonlines.open("./embeddings.jsonl", "w") as writer:
    for _ in range(len(landmarks_embeddings)):
        dict_ = landmarks_embeddings.pop(0)
        writer.write(dict_)

Loaded embeddings.json
