In [3]:
import pandas as pd
import os
%matplotlib inline 

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.multioutput import MultiOutputClassifier

from sklearn.model_selection import cross_val_score
import torch
import tqdm

from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.semi_supervised import LabelPropagation
from sklearn.preprocessing import StandardScaler

### Preprocess traindata

In [4]:
def data_pre_processing(file_name):
    df = pd.read_csv(file_name)
    TARGET_COLS = ["sii"]
    
    FEATURE_COLS = [
        "Basic_Demos-Age",
        "Basic_Demos-Sex",
        "CGAS-CGAS_Score",
        "Physical-BMI",
        "Physical-Height",
        "Physical-Weight",
        "Physical-Waist_Circumference",
        "Physical-Diastolic_BP",
        "Physical-HeartRate",
        "Physical-Systolic_BP",
        "Fitness_Endurance-Max_Stage",
        "Fitness_Endurance-Time_Mins",
        "Fitness_Endurance-Time_Sec",
        "FGC-FGC_CU",
        "FGC-FGC_CU_Zone",
        "FGC-FGC_GSND",
        "FGC-FGC_GSND_Zone",
        "FGC-FGC_GSD",
        "FGC-FGC_GSD_Zone",
        "FGC-FGC_PU",
        "FGC-FGC_PU_Zone",
        "FGC-FGC_SRL",
        "FGC-FGC_SRL_Zone",
        "FGC-FGC_SRR",
        "FGC-FGC_SRR_Zone",
        "FGC-FGC_TL",
        "FGC-FGC_TL_Zone",
        "BIA-BIA_Activity_Level_num",
        "BIA-BIA_BMC",
        "BIA-BIA_BMI",
        "BIA-BIA_BMR",
        "BIA-BIA_DEE",
        "BIA-BIA_ECW",
        "BIA-BIA_FFM",
        "BIA-BIA_FFMI",
        "BIA-BIA_FMI",
        "BIA-BIA_Fat",
        "BIA-BIA_Frame_num",
        "BIA-BIA_ICW",
        "BIA-BIA_LDM",
        "BIA-BIA_LST",
        "BIA-BIA_SMM",
        "BIA-BIA_TBW",
        "PAQ_A-PAQ_A_Total",
        "PAQ_C-PAQ_C_Total",
        "SDS-SDS_Total_Raw",
        "SDS-SDS_Total_T",
        "PreInt_EduHx-computerinternet_hoursday"]

    data = df[FEATURE_COLS]
    target = df[TARGET_COLS].fillna(-1).values.flatten()
    patient_ids = df["id"]

    iterative_imputer = IterativeImputer(max_iter=10, random_state=0)
    data_imputed = pd.DataFrame(iterative_imputer.fit_transform(data), columns=FEATURE_COLS)

    scaler = StandardScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(data_imputed))


    label_prop_model = LabelPropagation(kernel='knn', n_neighbors=5)
    label_prop_model.fit(data_scaled, target)
    target_imputed = pd.DataFrame(label_prop_model.transduction_)

    final_data = pd.concat([patient_ids, data_scaled, target_imputed], axis=1)
    final_data.columns = ['id'] + FEATURE_COLS + ["sii"]
    

    return final_data

### Transformer Trained

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils.rnn as rnn_utils
import numpy as np

class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, chunk_size=1000):
        self.sequences = sequences
        self.chunk_size = chunk_size
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        # Convert to tensor and ensure float32
        sequence = torch.FloatTensor(sequence)
        
        # If sequence is longer than chunk_size, average pool it
        if len(sequence) > self.chunk_size:
            # Reshape to (channels, length) for avg_pool1d
            sequence = sequence.transpose(0, 1).unsqueeze(0)
            target_length = self.chunk_size
            kernel_size = len(sequence[0, 0]) // target_length
            if kernel_size > 1:
                sequence = nn.functional.avg_pool1d(sequence, kernel_size)
            sequence = sequence.squeeze(0).transpose(0, 1)
        
        return sequence

def collate_fn(batch):
    # Sort sequences by length in descending order
    batch.sort(key=lambda x: len(x), reverse=True)
    
    # Get lengths of each sequence in the batch
    lengths = torch.LongTensor([len(x) for x in batch])
    
    # Pad sequences to the same length
    padded_seqs = rnn_utils.pad_sequence(batch, batch_first=True)
    
    return padded_seqs, lengths

class TimeSeriesTransformer(nn.Module):
    def __init__(self, input_dim=8, d_model=64, nhead=8, num_layers=3, dim_feedforward=256, max_sequence_length=1000):
        super().__init__()
        
        self.input_projection = nn.Linear(input_dim, d_model)
        
        # Add initial downsampling if needed
        self.downsample = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model),
            nn.GELU()
        )
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers
        )
        
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        
    def forward(self, x, lengths):
        # x shape: (batch_size, max_seq_length, input_dim)
        x = self.input_projection(x)
        x = self.downsample(x)
        
        # Create padding mask for transformer
        max_len = x.size(1)
        device = x.device
        arange_tensor = torch.arange(max_len, device=device)[None, :]
        lengths_tensor = lengths[:, None].to(device)
        padding_mask = arange_tensor >= lengths_tensor
        
        x = self.transformer_encoder(x, src_key_padding_mask=padding_mask)
        
        # Global pooling across sequence length
        x = x.transpose(1, 2)  # (batch_size, d_model, max_seq_length)
        x = self.global_pool(x)  # (batch_size, d_model, 1)
        x = x.squeeze(-1)  # (batch_size, d_model)
        
        return x

class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.5):
        super().__init__()
        self.temperature = temperature
        
    def forward(self, embeddings):
        # Normalize embeddings
        embeddings_normalized = nn.functional.normalize(embeddings, dim=1)
        
        # Compute similarity matrix
        similarity_matrix = torch.matmul(embeddings_normalized, embeddings_normalized.T)
        
        # Mask out self-similarity
        mask = torch.eye(similarity_matrix.size(0), device=similarity_matrix.device)
        mask = 1 - mask
        
        # Scale similarities by temperature
        similarity_matrix = similarity_matrix / self.temperature
        
        # Compute loss
        similarity_matrix = similarity_matrix * mask
        positives = similarity_matrix.exp().sum(dim=1)
        negatives = mask.sum(dim=1)
        
        loss = -torch.log(positives / negatives).mean()
        
        return loss

def train_transformer(all_participant_sequences, device='cuda' if torch.cuda.is_available() else 'cpu', chunk_size=1000):
    """
    Train the transformer model on all participant sequences
    """
    # Create dataset and dataloader with custom collate function
    dataset = TimeSeriesDataset(all_participant_sequences, chunk_size=chunk_size)
    dataloader = DataLoader(
        dataset, 
        batch_size=16,  # Reduced batch size
        shuffle=True,
        collate_fn=collate_fn
    )
    
    # Initialize model with smaller sequence length
    model = TimeSeriesTransformer(max_sequence_length=chunk_size).to(device)
    
    # Initialize optimizer with gradient clipping
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Use contrastive loss instead of reconstruction
    criterion = ContrastiveLoss(temperature=0.5).to(device)
    
    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch, lengths in dataloader:
            batch = batch.to(device)
            lengths = lengths.to(device)
            
            optimizer.zero_grad()
            
            # Get embeddings
            embeddings = model(batch, lengths)
            
            # Compute contrastive loss
            loss = criterion(embeddings)
            
            loss.backward()
            
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            
            total_loss += loss.item()
            
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.6f}")
    
    return model

def prepare_sequence_data(df):
    """
    Prepare the sequence data by selecting relevant features and normalizing
    """
    # Select features for the sequence
    features = ['X', 'Y', 'Z', 'enmo', 'anglez', 'non-wear_flag', 'light', 'battery_voltage']
    sequence_data = df[features].values
    
    # Normalize the data
    mean = np.mean(sequence_data, axis=0)
    std = np.std(sequence_data, axis=0)
    normalized_data = (sequence_data - mean) / (std + 1e-8)
    
    return normalized_data

### Embeddings

In [8]:
from tqdm import tqdm
import os
parquets = os.listdir('./dataset/series_train.parquet')
parquets = [f'./dataset/series_train.parquet/{parquet}/part-0.parquet' for parquet in parquets]

sequences = []

for i in tqdm(range(700)):
    parquet_file = parquets[i]
    df = pd.read_parquet(parquet_file)
    sequence = prepare_sequence_data(df)
    sequences.append(sequence)

100%|██████████| 700/700 [00:45<00:00, 15.55it/s]


In [9]:
def get_embedding(model, sequence, device='cuda', chunk_size=1000):
    # Make sure model is in eval mode
    model.eval()
    
    # Create a single sequence dataset
    dataset = TimeSeriesDataset([sequence], chunk_size=chunk_size)
    # Get the first (and only) item
    processed_sequence = dataset[0]
    
    # Add batch dimension and move to device
    sequence_tensor = processed_sequence.unsqueeze(0).to(device)  # Shape: [1, seq_length, features]
    lengths = torch.LongTensor([len(processed_sequence)]).to(device)
    
    with torch.no_grad():  # Disable gradient computation
        embedding = model(sequence_tensor, lengths)
        
    return embedding.cpu().numpy()  # Return as numpy array


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
checkpoint = torch.load('transformer_checkpoint.pth')
model = TimeSeriesTransformer(
    input_dim=checkpoint['input_dim'],
    d_model=checkpoint['d_model']
).to(device)
model.load_state_dict(checkpoint['model_state_dict'])

single_sequence = sequences[0]
embedding = get_embedding(model, single_sequence)

print(embedding)

cuda
[[ 1.2072891  -1.285907    1.0724608  -0.9647128  -1.1721855   0.59914154
  -1.1655712  -1.1702367   1.3399656   0.8503978   0.36611465  1.1322119
  -1.1436546   0.60717887  0.41804457  0.4304364   1.7853976  -0.7040854
  -0.45579344  1.3113045  -0.16199487 -0.21923763 -1.5629803   1.6914625
   0.0046412  -1.1569494   0.810713   -0.50895745 -1.0304841   1.108252
  -0.22500947  0.6542294  -0.70435965  1.6573403  -0.26517263  1.130759
   1.4824502   0.57356733 -0.8271706  -1.5393142   0.59242177 -1.2127146
  -0.74388164  0.22022718 -2.1491644   0.18578534 -0.7996799   0.9271483
  -0.8307708  -0.06390779 -1.1665395   1.3903908  -1.030352    1.4521902
   1.0732347   0.6108448  -1.2699958  -0.68777955  0.7992131  -1.3558935
  -0.02727532 -1.1247472   1.018531    0.22591838]]


  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


In [11]:
from tqdm import tqdm

def add_time_series_features(static_features):
    all_features = []
    
    # Assuming static_features is a DataFrame and get_embedding returns a numpy array
    for i in tqdm(range(len(sequences))):  # or use the full length of your dataset
        # Get the embedding (assuming it is a numpy array with shape [1, 64])
        embedding = get_embedding(model, sequences[i])
        
        # Convert embedding to a PyTorch tensor and flatten to shape [64]
        embedding = torch.tensor(embedding, dtype=torch.float32).squeeze()  # Shape [64]
        
        # Select the corresponding row from static_features and convert to tensor
        selected_feature = torch.tensor(static_features.iloc[i].values, dtype=torch.float32)  # Shape [48]
        
        # Concatenate the selected feature (48) and embedding (64) along dim=0
        new_feature = torch.cat((selected_feature, embedding), dim=0)  # Shape [112]
        
        # Append the new feature and corresponding label to the lists
        all_features.append(new_feature)
        
    
    # Convert lists to tensors
    features_tensor = torch.stack(all_features)  # Shape [N, 112]
    return features_tensor


In [12]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score


# Define the directory path
directory_path = './dataset/series_train.parquet'
# Example: List of IDs with extra features
ids_with_extra_features  = [name.split('=')[1] for name in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, name)) and name.startswith('id=')][:700]

train_cross_sectional = data_pre_processing("./dataset/train.csv")
# Assuming train_cross_sectional is your DataFrame with 'id' and 'label' columns
train_with_extra_features = train_cross_sectional[train_cross_sectional['id'].isin(ids_with_extra_features)]
train_without_extra_features = train_cross_sectional[~train_cross_sectional['id'].isin(ids_with_extra_features)]

# Separate features and labels
X_with_extra = train_with_extra_features.drop(columns=['id', 'sii'])
y_with_extra = train_with_extra_features['sii']
X_with_extra = add_time_series_features(X_with_extra)

X_without_extra = train_without_extra_features.drop(columns=['id', 'sii'])
y_without_extra = train_without_extra_features['sii']

# Initialize XGBoost classifiers
model_with_extra = xgb.XGBClassifier()
model_without_extra = xgb.XGBClassifier()

# Train the models
model_with_extra.fit(X_with_extra, y_with_extra)
model_without_extra.fit(X_without_extra, y_without_extra)

# Predict on the training data (or use separate test sets)
y_pred_with_extra = model_with_extra.predict(X_with_extra)
y_pred_without_extra = model_without_extra.predict(X_without_extra)

# Calculate accuracy for each model
acc_with_extra = accuracy_score(y_with_extra, y_pred_with_extra)
acc_without_extra = accuracy_score(y_without_extra, y_pred_without_extra)

print(f"Accuracy for models with extra features: {acc_with_extra}")
print(f"Accuracy for models without extra features: {acc_without_extra}")

# Combine predictions for overall accuracy
y_pred_combined = y_pred_with_extra.tolist() + y_pred_without_extra.tolist()
y_true_combined = y_with_extra.tolist() + y_without_extra.tolist()

# Calculate overall accuracy
overall_accuracy = accuracy_score(y_true_combined, y_pred_combined)

print(f"Overall accuracy: {overall_accuracy}")


100%|██████████| 700/700 [00:24<00:00, 28.51it/s]


Accuracy for models with extra features: 1.0
Accuracy for models without extra features: 0.9993865030674847
Overall accuracy: 0.9994949494949495
