In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import random
from torch.nn.utils.rnn import pad_sequence

# Seed the torch, NumPy and stdlib RNGs with the same value so that repeated
# runs of the notebook draw the same random numbers.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Input CSV locations (relative to the working directory)
trajectory_file_path = 'Simulated_Trajectory_Data.csv'
poi_file_path = 'Simulated_POI_Data.csv'
demographics_file_path = 'Simulated_User_Demographics.csv'

# Read the three tables, in the same order as the paths above
trajectory_df, poi_df, demographics_df = (
    pd.read_csv(csv_path)
    for csv_path in (trajectory_file_path, poi_file_path, demographics_file_path)
)

# Embedding widths used throughout the notebook
embedding_dim = 64                 # place embeddings
poi_type_dim = 8                   # POI type embedding
user_id_dim = 64                   # user ID embedding
demographic_dim = 32               # demographic embeddings
time_encoding_dim = embedding_dim  # time-in-a-day encoding shares the place width

# Number of leading rows kept for quick experiments
subset_size = 1000

# Keep only the first `subset_size` rows of the trajectory and POI tables
trajectory_df_large_subset = trajectory_df.iloc[:subset_size]
poi_df_large_subset = poi_df.iloc[:subset_size]

# Combine unique (latitude, longitude) pairs from both subsets
combined_unique_locations_large_subset = pd.concat([
    trajectory_df_large_subset[['latitude', 'longitude']].drop_duplicates(),
    poi_df_large_subset[['latitude', 'longitude']].drop_duplicates()
]).drop_duplicates().reset_index(drop=True)

# The lookup keys are coordinates rounded to 5 decimals. Round FIRST and only
# then de-duplicate: two rows that differ beyond the 5th decimal collapse onto
# the same key, and de-duplicating before rounding would leave gaps in the
# token ids (ids taken from the row index could then be >= the number of keys,
# which is what `num_places` is derived from).
_rounded_unique_locations = (
    combined_unique_locations_large_subset[['latitude', 'longitude']]
    .round(5)
    .drop_duplicates()
    .reset_index(drop=True)
)
_location_keys = list(zip(
    _rounded_unique_locations['latitude'].tolist(),
    _rounded_unique_locations['longitude'].tolist(),
))

# Location -> random embedding vector of width `embedding_dim`
# (one draw per distinct key, in first-appearance order)
large_subset_location_to_embedding = {
    key: np.random.rand(embedding_dim)
    for key in _location_keys
}

# Location -> contiguous token id in [0, number of distinct keys)
large_subset_trajectory_location_to_token_id = {
    key: token_id
    for token_id, key in enumerate(_location_keys)
}

# Define the dataset class
class TrajectoryDatasetSubset(Dataset):
    """Dataset of trajectory sequences, one item per (user_id, timestamp) group.

    Each item is a pair ``(location_embeddings, demographic_embedding)``:

    * ``location_embeddings`` - float32 tensor with one row per trajectory
      record in the group and two columns (latitude, longitude). Despite the
      name these are the raw coordinates, not learned embeddings.
    * ``demographic_embedding`` - placeholder tensor ``tensor([0])``.

    Args:
        trajectory_df: DataFrame with at least the columns ``user_id``,
            ``timestamp``, ``latitude`` and ``longitude``.
        poi_df: POI table; stored on the instance but not read by this class.
        demographics_df: Demographics table; stored but not read by this class.
    """

    def __init__(self, trajectory_df, poi_df, demographics_df):
        self.trajectory_df = trajectory_df
        self.poi_df = poi_df
        self.demographics_df = demographics_df

        # Placeholder: pattern embeddings are not recalculated for the subset
        self.pattern_embeddings = {}

        # Sequences are formed by grouping on user and on the raw `timestamp`
        # column value.
        # NOTE(review): if sequences are meant to be per calendar day, the
        # timestamp needs to be reduced to a date first - confirm intent.
        self.grouped_sequences = self.trajectory_df.groupby(['user_id', 'timestamp'])

        # Materialise the (key, frame) pairs once. Indexing used to rebuild
        # this list on every __getitem__ call, i.e. every sample fetch
        # re-split the whole DataFrame.
        self._sequence_groups = list(self.grouped_sequences)

    def __len__(self):
        """Return the number of (user_id, timestamp) groups."""
        return len(self._sequence_groups)

    def __getitem__(self, idx):
        """Return ``(location_embeddings, demographic_embedding)`` for group ``idx``."""
        _group_key, group = self._sequence_groups[idx]

        # One (latitude, longitude) row per record of the sequence
        coordinates = group[['latitude', 'longitude']].to_numpy()
        location_embeddings = torch.tensor(coordinates, dtype=torch.float32)

        # Placeholder demographic embedding
        demographic_embedding = torch.tensor([0])

        return location_embeddings, demographic_embedding

# Collate function: pads every sequence in a batch to a common length
def pad_collate_fn(batch):
    """Collate ``(location, demographic)`` samples into two padded tensors.

    Args:
        batch: sequence of ``(location_embeddings, demographic_embedding)``
            tensor pairs, as produced by the dataset.

    Returns:
        A 2-tuple ``(padded_locations, padded_demographics)``. Each tensor is
        batch-first and padded (with ``pad_sequence``'s default padding value)
        to the longest sequence in the batch.
    """
    # Split the list of pairs into one tuple per field
    location_seqs, demographic_seqs = zip(*batch)

    # Locations are padded first, then the (placeholder) demographic tensors
    return tuple(
        pad_sequence(sequences, batch_first=True)
        for sequences in (location_seqs, demographic_seqs)
    )

# Build the subset dataset, then a shuffled DataLoader of 16 sequences per
# batch that pads each batch with `pad_collate_fn`
trajectory_dataset_large_subset = TrajectoryDatasetSubset(
    trajectory_df=trajectory_df_large_subset,
    poi_df=poi_df_large_subset,
    demographics_df=demographics_df,
)
train_loader_large_subset = DataLoader(
    dataset=trajectory_dataset_large_subset,
    batch_size=16,
    shuffle=True,
    collate_fn=pad_collate_fn,
)

# Model Definition
class BERTMLM(nn.Module):
    """Transformer encoder stack followed by a per-position linear classifier.

    Args:
        embedding_dim: feature width of the input (``d_model`` of the encoder).
        num_heads: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        dropout_rate: dropout probability inside each encoder layer.
        num_places: number of output classes of the linear head.
    """

    def __init__(self, embedding_dim, num_heads, num_layers, dropout_rate, num_places):
        super().__init__()
        self.embedding_dim = embedding_dim

        # Prototype layer handed to the encoder stack
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dropout=dropout_rate,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=num_layers,
        )

        # Classification head over the places
        self.output_layer = nn.Linear(embedding_dim, num_places)

    def forward(self, input_sequence, attention_masks):
        """Return logits of shape (batch_size, seq_len, num_places).

        Args:
            input_sequence: tensor laid out as (batch_size, seq_len, embedding_dim).
            attention_masks: (batch_size, seq_len) tensor; it is cast to bool
                and passed as the encoder's ``src_key_padding_mask``.
                NOTE(review): PyTorch ignores positions where that mask is
                True, so callers must mark PADDING (not real tokens) with
                non-zero values - confirm against the caller's convention.
        """
        padding_mask = attention_masks.bool()

        # The encoder is built without batch_first, so it consumes
        # (seq_len, batch_size, embedding_dim)
        sequence_first = input_sequence.transpose(0, 1)
        encoded = self.transformer_encoder(
            sequence_first,
            src_key_padding_mask=padding_mask,
        )

        # Restore (batch_size, seq_len, embedding_dim) and classify each position
        batch_first = encoded.transpose(0, 1)
        return self.output_layer(batch_first)

# Hyper-parameters of the encoder
embedding_dim = 64
num_heads = 4
num_layers = 2
dropout_rate = 0.1
# One output class per entry of the location -> token-id table
num_places = len(large_subset_trajectory_location_to_token_id)

# Build the model, then an Adam optimizer over its parameters and a
# cross-entropy criterion
bert_mlm_model = BERTMLM(embedding_dim, num_heads, num_layers, dropout_rate, num_places)
optimizer = optim.Adam(bert_mlm_model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Function for training on the subset
def train_epoch_subset_corrected(model, train_loader, optimizer, criterion, location_to_index, prepended_length=6):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    NOTE(review): this is a pipeline smoke test, not real training. The
    predictions and the targets are both drawn at random and ``model`` is
    never called, so no gradient reaches the model's parameters and
    ``optimizer.step()`` cannot change them. Making this train for real needs
    the loader to supply model-ready inputs and true target ids.

    Args:
        model: module to put into training mode (not otherwise used).
        train_loader: iterable of ``(location_embeddings, demographic_embedding)``
            batches; only the first two dimensions of ``location_embeddings``
            (batch size and sequence length) are read.
        optimizer: optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: loss taking ``(N, num_places)`` scores and ``(N,)`` targets.
        location_to_index: mapping whose length gives the number of classes.
        prepended_length: currently unused; kept for interface compatibility.

    Returns:
        The average of the per-batch losses, or ``0.0`` when the loader yields
        no batches (previously a ``ZeroDivisionError``).
    """
    model.train()
    num_places = len(location_to_index)
    epoch_loss = 0.0
    # Count batches ourselves: avoids dividing by zero on an empty loader and
    # does not require the loader to implement __len__.
    num_batches = 0

    for batch_idx, (location_embeddings, demographic_embedding) in enumerate(train_loader):
        # Reset gradients before each batch
        optimizer.zero_grad()

        batch_size, seq_len = location_embeddings.shape[0], location_embeddings.shape[1]

        # Simulated scores for every position (random, independent of the model)
        predictions = torch.rand(batch_size, seq_len, num_places, requires_grad=True)

        # Simulated target ids (random, independent of the batch contents)
        target_indices = torch.randint(0, num_places, (batch_size, seq_len))

        # Flatten to (batch_size * seq_len, num_places) scores and
        # (batch_size * seq_len,) targets for the criterion
        loss = criterion(
            predictions.reshape(-1, num_places),
            target_indices.reshape(-1),
        )

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        # Accumulate loss
        epoch_loss += loss.item()
        num_batches += 1

        # Print progress for each batch
        print(f"Batch {batch_idx}: Loss = {loss.item()}")

    if num_batches == 0:
        return 0.0
    return epoch_loss / num_batches

# One training epoch over the larger subset, then report the mean batch loss
train_loss_large_subset = train_epoch_subset_corrected(
    model=bert_mlm_model,
    train_loader=train_loader_large_subset,
    optimizer=optimizer,
    criterion=criterion,
    location_to_index=large_subset_trajectory_location_to_token_id,
)
print(f"Train Loss on Larger Subset: {train_loss_large_subset:.4f}")


  from pandas.core import (


Batch 0: Loss = 7.705090522766113
Batch 1: Loss = 7.585057735443115
Batch 2: Loss = 7.517271995544434
Batch 3: Loss = 7.672882080078125
Batch 4: Loss = 7.634697914123535
Batch 5: Loss = 7.726075172424316
Batch 6: Loss = 7.550123691558838
Batch 7: Loss = 7.663615703582764
Batch 8: Loss = 7.595587730407715
Batch 9: Loss = 7.651079177856445
Batch 10: Loss = 7.624573230743408
Batch 11: Loss = 7.6507158279418945
Batch 12: Loss = 7.524206161499023
Batch 13: Loss = 7.59169340133667
Batch 14: Loss = 7.5776686668396
Batch 15: Loss = 7.67596435546875
Batch 16: Loss = 7.696756839752197
Batch 17: Loss = 7.616678237915039
Batch 18: Loss = 7.6534423828125
Batch 19: Loss = 7.787525177001953
Batch 20: Loss = 7.744343280792236
Batch 21: Loss = 7.591458320617676
Batch 22: Loss = 7.662447452545166
Batch 23: Loss = 7.5820393562316895
Batch 24: Loss = 7.673811912536621
Batch 25: Loss = 7.607086658477783
Batch 26: Loss = 7.679505348205566
Batch 27: Loss = 7.633275985717773
Batch 28: Loss = 7.64718770980835
