In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
from torch.utils import data

In [4]:
from torch.optim.lr_scheduler import ExponentialLR
# Define early stopping parameters
patience = 8
min_delta = 0.00001 # Minimum change in validation loss to be considered an improvement
cumulative_delta = False  # Set to True if min_delta defines increase since last patience reset

best_val_loss = float('inf')
#epochs_no_improve = 0

# Define learning rate scheduler
#scheduler = ExponentialLR(optimizer, gamma=0.9)

In [2]:
TNBC_C= pd.read_csv(r"/Users/xinwang/Dropbox (Choate)/Isabella Dropbox/Topology_ST/TNBC_Slides/TNBC_ST_C/TNBC_C_ITF800_HLAB.csv")
TNBC_C= TNBC_C.iloc[:, 1:]
X = TNBC_C.iloc[:, :-1]
y = TNBC_C.iloc[:,-1]
# Step 1: Feature Scaling/Normalization
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
# Step 2: Target Scaling
target_scaler = MinMaxScaler()
y_scaled = target_scaler.fit_transform(y.values.reshape(-1,1))
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)
batch_size = 32

train_data = data.TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = data.TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32))
val_loader = data.DataLoader(val_data, batch_size=batch_size, shuffle=False)

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x, attn_output
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x, attn_output

In [8]:
class TransformerRegression(nn.Module):
    def __init__(self, src_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(TransformerRegression, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        
        self.fc = nn.Linear(d_model, 1)  # Output layer for regression
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        return src_mask

    def forward(self, src):
        src_mask = self.generate_mask(src)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src.long())))  # Use .long() to convert to integer type

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output, att_score  = enc_layer(enc_output, src_mask)

        output = self.fc(enc_output[:, 0])  # Only using the first token's output for regression
        return output

# Instantiate the model
src_vocab_size = X.shape[1]  # Number of input features
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = X.shape[1]  # Use the same as the number of input features
dropout = 0.1

transformer = TransformerRegression(src_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
scheduler = ExponentialLR(optimizer, gamma=0.9)
epochs_no_improve = 0
num_epochs = 50

for epoch in range(num_epochs):
    transformer.train()  # Set the model to training mode
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = transformer(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
    # Apply learning rate scheduler
    scheduler.step()
    # Print learning rate
    current_lr = optimizer.param_groups[0]['lr']  # Get the current learning rate from the optimizer
    print(f"Epoch: {epoch+1}, Learning Rate: {current_lr:.6f}")

    transformer.eval()  # Switch to evaluation mode for validation
    with torch.no_grad():
        val_mse_sum = 0.0
        val_mae_sum = 0.0
        val_pearson_sum = 0.0  # Initialize Pearson sum
        num_val_batches = 0

        for batch_X, batch_y in val_loader:
            val_output = transformer(batch_X)
            val_mse_sum += mean_squared_error(batch_y, val_output)
            val_mae_sum += mean_absolute_error(batch_y, val_output)
            val_pearson, _ = pearsonr(batch_y.view(-1), val_output.view(-1))  # Calculate Pearson correlation
            val_pearson_sum += val_pearson
            num_val_batches += 1

        # Calculate average metrics over all validation batches
        avg_val_mse = val_mse_sum / num_val_batches
        avg_val_mae = val_mae_sum / num_val_batches
        avg_val_pearson = val_pearson_sum / num_val_batches
    # Check for early stopping
    if avg_val_mse < best_val_loss:
        best_val_loss = avg_val_mse
        epochs_no_improve = 0
    else:
       # best_val_loss = avg_val_mse
        epochs_no_improve += 1

    if epochs_no_improve == patience:
        print("Early stopping triggered.")
        break
    print(f"Epoch: {epoch+1}, Avg. Val MSE: {avg_val_mse:.4f}, Avg. Val MAE: {avg_val_mae:.4f}, Avg. Val Pearson: {avg_val_pearson:.4f}")


Epoch: 1, Learning Rate: 0.000900
Epoch: 1, Avg. Val MSE: 0.0099, Avg. Val MAE: 0.0717, Avg. Val Pearson: 0.1176
Epoch: 2, Learning Rate: 0.000810
Epoch: 2, Avg. Val MSE: 0.0102, Avg. Val MAE: 0.0716, Avg. Val Pearson: -0.0278
Epoch: 3, Learning Rate: 0.000729
Epoch: 3, Avg. Val MSE: 0.0101, Avg. Val MAE: 0.0716, Avg. Val Pearson: 0.3181


In [None]:
torch.save(transformer.state_dict(), r"/Users/xinwang/Dropbox (Choate)/Isabella Dropbox/VSCODE/TransformerAttention800/HLAB_TNBC_C.pty")