## Load JSON segments

In [None]:
import json
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
# Read the ASR transcript JSON for this video.
asr_json_path = "F:\\AIM Lab\\Project\\whisper_v2_en\\england_epl\\2014-2015\\2015-05-17 - 18-00 Manchester United 1 - 1 Arsenal\\2_asr.json"
with open(asr_json_path, encoding="utf-8") as asr_file:
    data = json.load(asr_file)

# Every value of data["segments"] is read as [start, end, text]; the dict
# keys are not needed. Start and end are converted to float.
segments = []
for entry in data["segments"].values():
    segments.append((float(entry[0]), float(entry[1]), entry[2]))

# Pre-computed ResNet features; the number of rows is used later as the
# number of steps on the time grid.
resnet_embedding_path = "F:\\AIM Lab\\Project\\SoccerNet\\england_epl\\2014-2015\\2015-05-17 - 18-00 Manchester United 1 - 1 Arsenal\\2_ResNET_TF2_PCA512.npy"  # Update with the correct path
resnet_embeddings = np.load(resnet_embedding_path)
rows = resnet_embeddings.shape[0]


## Compute sentence embeddings (MPNet)

In [None]:

# Sentence encoder. all-mpnet-base-v2 is the higher-quality choice;
# all-MiniLM-L6-v2 is a smaller, faster alternative.
model = SentenceTransformer("all-mpnet-base-v2")

# Encode all segment texts in one batched call instead of one call per
# segment; with a list input and convert_to_tensor=True, encode() returns a
# single [num_segments, dim] tensor whose rows follow the input order.
segment_texts = [text for (_, _, text) in segments]
if segment_texts:
    segment_vectors = model.encode(segment_texts, convert_to_tensor=True)
else:
    # Nothing to encode: keep `embeddings` an empty list rather than
    # calling the model on an empty batch.
    segment_vectors = []

# One (start, end, embedding) tuple per segment, in the order of `segments`.
embeddings = [
    (start, end, vector)
    for (start, end, _), vector in zip(segments, segment_vectors)
]


## Build 0.5 s timeline

In [None]:
import numpy as np

dt = 0.5  # grid spacing; one grid point per row of resnet_embeddings
t_min = 0
t_max = (rows - 1) * dt

# Build the grid from an integer range scaled by dt. np.arange with a
# floating-point step and a computed endpoint can produce one element more
# or fewer than expected; this form always yields exactly `rows` points
# running from t_min to t_max.
time_grid = t_min + np.arange(rows) * dt
N = len(time_grid)

# Text embedding width. Read it from the first segment embedding when there
# is one; otherwise ask the encoder so an empty transcript does not raise
# IndexError here.
if embeddings:
    D = embeddings[0][2].shape[0]
else:
    D = model.get_sentence_embedding_dimension()

# One text embedding slot per grid point, all zeros until filled.
timeline_embeddings = torch.zeros((N, D))


## Forward-fill with last embedding

In [None]:
# Fill the timeline: a grid point inside a segment gets that segment's
# embedding; a grid point in silence repeats the most recent embedding for
# up to 5 s after the last speech grid point, and is zero after that.
last_emb = torch.zeros(D)
last_time = -1e9  # sentinel far in the past: no speech seen yet

for i, t in enumerate(time_grid):
    # First segment (in list order) whose [start, end) interval contains t.
    hit = None
    for seg_start, seg_end, seg_emb in embeddings:
        if seg_start <= t < seg_end:
            hit = seg_emb
            break

    if hit is None:
        # Silence: hold the last embedding while the gap is <= 5 s.
        within_hold = t - last_time <= 5.0
        timeline_embeddings[i] = last_emb if within_hold else torch.zeros(D)
        continue

    # Speech: store the embedding and remember it for later silent steps.
    timeline_embeddings[i] = hit
    last_emb = hit
    last_time = t


In [None]:
# Sanity check: overall shape, then the rows for the first five grid points.
first_steps = timeline_embeddings[:5]
print(timeline_embeddings.shape)
print(first_steps)


In [None]:
# Load the ResNet feature array again (so this cell can run on its own) and
# keep a float32 tensor copy for the torch-based steps.
resnet_embedding_path = "F:\\AIM Lab\\Project\\SoccerNet\\england_epl\\2014-2015\\2015-05-17 - 18-00 Manchester United 1 - 1 Arsenal\\2_ResNET_TF2_PCA512.npy"  # Update with the correct path
resnet_embeddings = np.load(resnet_embedding_path)
video_embeddings = torch.tensor(resnet_embeddings, dtype=torch.float32)

# Number of video steps = number of rows in the feature array.
T = video_embeddings.size(0)

# Preview the first 5 rows of the raw NumPy array.
preview_rows = resnet_embeddings[:5]
print(preview_rows)

### Time since last speech scalar

In [None]:
# Per-step scalar feature: time elapsed since the most recent speech segment
# started, scaled so that 10 or more maps to 1.0.
time_since_last_speech = torch.zeros((T, 1))
last_speech_time = -1e9  # sentinel: no speech yet -> feature clamps to 1.0

# Start times of all segments (kept as a tensor for inspection / reuse).
segment_starts = torch.tensor([s[0] for s in segments])

# Transcript start times almost never coincide exactly with a grid time
# i * dt, so testing `start == t` would practically never fire and the
# feature would stay saturated. Instead, sweep the sorted start times and
# remember the latest one that is <= t. When a start does fall exactly on
# the grid this gives the same value as the equality test.
sorted_starts = sorted(s[0] for s in segments)
next_start = 0

for i in range(T):
    t = i * dt
    # Consume every segment that has started by time t.
    while next_start < len(sorted_starts) and sorted_starts[next_start] <= t:
        last_speech_time = sorted_starts[next_start]
        next_start += 1
    time_since_last_speech[i] = t - last_speech_time

# normalize: clip & scale to roughly 0..1
time_since_last_speech = torch.clamp(time_since_last_speech / 10.0, 0, 1)  # e.g. 10 sec -> 1.0


### positional encodings (relative time in match)

In [None]:
import math

def positional_encoding(T, dim_pe, max_time):
    """Return sinusoidal encodings of each step's time.

    Step ``i`` is mapped to the time ``i / T * max_time``; even columns hold
    the sine and odd columns the cosine of that time multiplied by a
    geometrically decreasing set of frequencies.

    Args:
        T: Number of steps (rows of the result).
        dim_pe: Number of encoding columns. May be odd, in which case the
            last sine column has no matching cosine column.
        max_time: Time that step index ``T`` would correspond to.

    Returns:
        A float tensor of shape ``[T, dim_pe]``.
    """
    position = torch.arange(0, T).unsqueeze(1).float()  # [T, 1]
    position = position / T * max_time  # scale step index to actual time

    div_term = torch.exp(torch.arange(0, dim_pe, 2) * (-math.log(10000.0) / dim_pe))
    angles = position * div_term  # [T, ceil(dim_pe / 2)]

    pe = torch.zeros((T, dim_pe))
    pe[:, 0::2] = torch.sin(angles)
    # There are only dim_pe // 2 odd columns; slicing keeps the assignment
    # valid when dim_pe is odd (previously a shape-mismatch error).
    pe[:, 1::2] = torch.cos(angles[:, : dim_pe // 2])
    return pe

# Positional features: dim_pe sinusoidal columns per step, with the step
# index scaled so that index T corresponds to T * dt.
dim_pe = 16
max_time = T * dt
pos_encodings = positional_encoding(T, dim_pe=dim_pe, max_time=max_time)  # [T, 16]


### Cross-modal fusion and concatenation

In [None]:
import torch.nn.functional as F

class CrossModalTransformerFusion(nn.Module):
    """Fuse per-step video and text embeddings with cross-attention.

    Each modality attends to the other through one shared multi-head
    attention module, the attended values are added back to the inputs and
    layer-normalised (one shared LayerNorm), the two results are
    concatenated along the feature axis, and the concatenation is passed
    through a Transformer encoder.

    Args:
        d_model: Feature size of each input modality.
        nhead: Number of attention heads (must divide ``d_model``).
        num_layers: Number of Transformer encoder layers.
        dropout: Dropout probability used in attention and encoder layers.
        dim_feedforward: Hidden size of the encoder feed-forward blocks.
    """

    def __init__(self, d_model=512, nhead=8, num_layers=2, dropout=0.1, dim_feedforward=2048):
        super().__init__()
        self.d_model = d_model

        # Multi-head attention for cross-modal fusion (used in both directions).
        self.cross_attention = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)

        # The encoder consumes the concatenation of both fused streams, so
        # its model width must be 2 * d_model. Building it with d_model made
        # every forward pass fail with an embedding-dimension mismatch.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=2 * d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Layer normalization applied after each residual connection.
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, video_emb, text_emb):
        """Return the fused embedding for aligned video and text sequences.

        Args:
            video_emb: ``[T, d_model]`` or ``[B, T, d_model]`` tensor.
            text_emb: Tensor with the same layout as ``video_emb``.

        Returns:
            ``[T, 2 * d_model]`` when ``video_emb`` was given without a batch
            dimension, otherwise ``[B, T, 2 * d_model]``.
        """
        # Remember whether the caller passed unbatched input so that only a
        # batch dimension added here is removed again at the end.
        unbatched = video_emb.dim() == 2
        if unbatched:
            video_emb = video_emb.unsqueeze(0)  # [1, T, d_model]
        if text_emb.dim() == 2:
            text_emb = text_emb.unsqueeze(0)    # [1, T, d_model]

        # Cross-modal attention: video attends to text.
        video_attended, _ = self.cross_attention(
            query=video_emb,
            key=text_emb,
            value=text_emb,
        )

        # Cross-modal attention: text attends to video.
        text_attended, _ = self.cross_attention(
            query=text_emb,
            key=video_emb,
            value=video_emb,
        )

        # Residual connections followed by layer normalisation.
        video_fused = self.layer_norm(video_emb + video_attended)
        text_fused = self.layer_norm(text_emb + text_attended)

        # Concatenate the fused embeddings along the feature axis.
        multimodal_emb = torch.cat([video_fused, text_fused], dim=-1)  # [B, T, 2*d_model]

        # Apply the Transformer encoder for final fusion.
        fused_output = self.transformer_encoder(multimodal_emb)

        return fused_output.squeeze(0) if unbatched else fused_output

# Fusion module instance: 512-wide inputs, 8 attention heads, 2 encoder layers.
fusion_module = CrossModalTransformerFusion(
    d_model=512,
    nhead=8,
    num_layers=2,
)

In [None]:
import torch.nn as nn

# Make the text timeline exactly as long as the video feature sequence:
# truncate extra rows, or append all-zero rows when it is too short.
T = video_embeddings.shape[0]
n_text_steps = timeline_embeddings.shape[0]
if n_text_steps > T:
    timeline_embeddings = timeline_embeddings[:T]
elif n_text_steps < T:
    padding = torch.zeros(T - n_text_steps, timeline_embeddings.shape[1])
    timeline_embeddings = torch.cat([timeline_embeddings, padding], dim=0)

D_video = video_embeddings.shape[1]     # video feature width
D_text = timeline_embeddings.shape[1]   # text feature width (768 for all-mpnet-base-v2)
D_proj = 512                            # common dimension fed to the fusion module

# Per-modality layer normalisation and linear projection to D_proj.
# NOTE(review): these layers and fusion_module are freshly initialised
# (untrained) and run in training mode with gradients enabled here, so the
# dropout inside fusion_module makes the output differ from run to run.
# Confirm whether eval mode / torch.no_grad() is wanted for feature
# extraction.
video_norm = nn.LayerNorm(D_video)
text_norm = nn.LayerNorm(D_text)
video_proj = nn.Linear(D_video, D_proj)
text_proj = nn.Linear(D_text, D_proj)

# Video branch: normalise, then project.
video_normalized = video_norm(video_embeddings)
video_projected = video_proj(video_normalized)  # [T, 512]

# Text branch: normalise, then project.
text_normalized = text_norm(timeline_embeddings)
text_projected = text_proj(text_normalized)     # [T, 512]

multimodal_embeddings = fusion_module(video_projected, text_projected)

# Append the scalar speech-recency feature and the positional encodings:
# [T, 1024] + [T, 1] + [T, 16] -> [T, 1041]
feature_parts = (multimodal_embeddings, time_since_last_speech, pos_encodings)
full_embeddings = torch.cat(feature_parts, dim=1)

# Final layer normalisation over the concatenated feature vector.
norm = nn.LayerNorm(full_embeddings.shape[1])
full_embeddings = norm(full_embeddings)

print(full_embeddings.shape)
print(full_embeddings[:5])

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import torch

# 1. Load one ASR JSON
json_path = "F:\\AIM Lab\\Project\\whisper_v2_en\\england_epl\\2014-2015\\2015-05-17 - 18-00 Manchester United 1 - 1 Arsenal\\2_asr.json"
with open(json_path, encoding="utf-8") as f:
    data = json.load(f)
segments = [(float(v[0]), float(v[1]), v[2]) for v in data["segments"].values()]

# 2. Build a binary speech/silence mask on the dt grid (a cheap stand-in for
#    the embedding timeline, used only to check alignment visually)
dt = 0.5
T = 1000  # or compute from your corresponding ResNet length
# Integer range scaled by dt: always exactly T points, no float-step rounding.
time_grid = np.arange(T) * dt

mask = np.zeros(T)
for (start, end, _) in segments:
    # Grid point k (time k * dt) counts as speech when start <= k * dt < end,
    # the same rule the forward-fill cell uses, i.e. k runs over
    # [ceil(start / dt), ceil(end / dt)). Flooring instead would mark the
    # grid point *before* the segment starts and drop short segments.
    # Clamp at 0 so a negative time cannot wrap around as a negative index;
    # indices past T are simply cut off by the slice.
    idx_start = max(int(np.ceil(start / dt)), 0)
    idx_end = max(int(np.ceil(end / dt)), 0)
    mask[idx_start:idx_end] = 1

# 3. Plot the mask to verify alignment
times = time_grid
plt.figure()
plt.plot(times, mask)
plt.xlabel("Time (s)")
plt.ylabel("Speech Present")
plt.title("Speech Segment Alignment (1=present, 0=silence)")
plt.show()
