# Transformer Architecture for Time Series:

- Self-attention mechanism captures long-range dependencies
- Positional encoding preserves sequence order
- More parallelizable than LSTM (faster training)
- Better at capturing complex patterns in longer sequences

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Device configuration for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Reference date for time-based calculations
REFERENCE_DATE = pd.Timestamp("2015-01-01")

Using device: cuda


In [2]:
import kagglehub

# Download latest version
TRAIN_DF_PATH = kagglehub.dataset_download("sachinchaudhary123/rossman-fe")
TRAIN_FILE_NAME = "/train_with_clusters.csv"
 
# Load dataset from feature engineering
train = pd.read_csv(TRAIN_DF_PATH+TRAIN_FILE_NAME, index_col="Date", parse_dates=True)
display(train.head())

  train = pd.read_csv(TRAIN_DF_PATH+TRAIN_FILE_NAME, index_col="Date", parse_dates=True)


Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,SalePerCustomer,LogSales,...,IsSchoolHoliday,Sales_Rolling_Mean_7,Sales_Rolling_Mean_14,Sales_Rolling_Mean_28,Sales_Lag_7,Sales_Lag_14,Sales_Lag_21,Sales_Lag_28,Lag_SalePerCustomer_7day,StoreCluster
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-07-31,1,4,5263,555,1,1,0,1,9.482883,8.568646,...,1,,,,,,,,,0
2015-07-31,2,4,6064,625,1,1,0,1,9.7024,8.71029,...,1,,,,,,,,,1
2015-07-31,3,4,8314,821,1,1,0,1,10.126675,9.025816,...,1,,,,,,,,,1
2015-07-31,4,4,13995,1498,1,1,0,1,9.342457,9.546527,...,1,,,,,,,,,0
2015-07-31,5,4,4822,559,1,1,0,1,8.626118,8.481151,...,1,,,,,,,,,2


In [3]:
def rmspe(y_true, y_pred):
    """
    Calculate Root Mean Square Percentage Error (Kaggle metric).

    Args:
        y_true: Actual values (numpy array)
        y_pred: Predicted values (numpy array)

    Returns:
        RMSPE score (float)

    Note: Only considers samples where y_true > 0
    """
    # Convert to numpy if tensor
    if isinstance(y_true, torch.Tensor):
        y_true = y_true.cpu().numpy()
    if isinstance(y_pred, torch.Tensor):
        y_pred = y_pred.cpu().numpy()

    # Flatten arrays
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    # Filter out zero sales (as per Kaggle rules)
    mask = y_true > 0
    y_true_filtered = y_true[mask]
    y_pred_filtered = y_pred[mask]

    if len(y_true_filtered) == 0:
        return 0.0

    # Calculate percentage errors
    pct_errors = (y_true_filtered - y_pred_filtered) / y_true_filtered

    # Calculate RMSPE
    rmspe_score = np.sqrt(np.mean(pct_errors ** 2))

    return rmspe_score

In [4]:
class PositionalEncoding(nn.Module):
    """
    Injects positional information into the sequence.
    
    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
    """
    
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        
        pe[:, 0::2] = torch.sin(position * div_term)  # Even indices
        pe[:, 1::2] = torch.cos(position * div_term)  # Odd indices
        
        pe = pe.unsqueeze(0)  # Add batch dimension: (1, max_len, d_model)
        
        # Register as buffer (not a parameter, but part of state)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, d_model)
        Returns:
            Tensor with positional encoding added
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [5]:
import time

start_time = time.time()

# Features for Transformer
transformer_features = [
    "Sales", "Promo", "IsStateHoliday",
    "IsSchoolHoliday", "Sales_Lag_7", "Sales_Rolling_Mean_7"
]

# Create fresh copy with only needed columns
transformer_df = train[transformer_features + ["Store", "StoreCluster"]].copy()

# Step 1: Drop NaN rows (required for sequence creation anyway)
transformer_clean = transformer_df.dropna(subset=transformer_features)
print(f"Original rows: {len(transformer_df):,}")
print(f"After dropna: {len(transformer_clean):,}")

# Step 2: Fit Sales Scaler (for inverse transform)
sales_scaler = MinMaxScaler()
sales_scaler.fit(transformer_clean[["Sales"]])
print(f"\nSales Range: {transformer_clean['Sales'].min():,.0f} to {transformer_clean['Sales'].max():,.0f}")

# Step 3: Scale ALL features at once using a single scaler
feature_scaler = MinMaxScaler()
transformer_clean[transformer_features] = feature_scaler.fit_transform(
    transformer_clean[transformer_features]
)

elapsed = time.time() - start_time
print(f"\nScaling complete in {elapsed:.2f} seconds!")
print(f"Scaled data shape: {transformer_clean.shape}")

display(transformer_clean.head())

Original rows: 1,017,209
After dropna: 1,009,404

Sales Range: 0 to 41,551

Scaling complete in 0.17 seconds!
Scaled data shape: (1009404, 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transformer_clean[transformer_features] = feature_scaler.fit_transform(


Unnamed: 0_level_0,Sales,Promo,IsStateHoliday,IsSchoolHoliday,Sales_Lag_7,Sales_Rolling_Mean_7,Store,StoreCluster
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-07-24,0.089192,0.0,0.0,0.0,0.126664,0.029059,1,0
2015-07-24,0.092753,0.0,0.0,1.0,0.145941,0.045787,2,1
2015-07-24,0.122259,0.0,0.0,1.0,0.200091,0.07161,3,1
2015-07-24,0.200284,0.0,0.0,1.0,0.336815,0.133684,4,0
2015-07-24,0.091815,0.0,0.0,1.0,0.11605,0.147202,5,2


In [6]:

transformer_features = [
    "Sales", "Promo", "IsStateHoliday",
    "IsSchoolHoliday", "Sales_Lag_7", "Sales_Rolling_Mean_7"
]

# Get required columns
cols_needed = transformer_features + ["Store", "StoreCluster"]
transformer_df = train[cols_needed].copy()

# Drop NaN
print(f"\nOriginal rows: {len(transformer_df):,}")
transformer_clean = transformer_df.dropna()
print(f"After dropna: {len(transformer_clean):,}")


# PROPER SCALING - Scale Features before Creating Sequences

# Create a copy for scaling
scaled_df = transformer_clean.copy()

# Scale ALL features together
feature_scaler = MinMaxScaler()
scaled_df[transformer_features] = feature_scaler.fit_transform(
    scaled_df[transformer_features]
)

# Verify scaling worked
print(f"\After Scaling:")
print(f"  Features min: {scaled_df[transformer_features].min().min():.4f}")
print(f"  Features max: {scaled_df[transformer_features].max().max():.4f}")

# Should be 0.0 to 1.0
assert scaled_df[transformer_features].min().min() >= 0.0, "Min should be >= 0"
assert scaled_df[transformer_features].max().max() <= 1.0, "Max should be <= 1"
print("All features properly scaled to [0, 1]")


Original rows: 1,017,209
After dropna: 1,009,404


  print(f"\After Scaling:")


\After Scaling:
  Features min: 0.0000
  Features max: 1.0000
All features properly scaled to [0, 1]


In [7]:
def create_sequences_with_dates(df, features, target, window=30):
    """Create sequences with date tracking for analysis."""
    X, y, store_ids, dates = [], [], [], []
    
    for store_id, store_df in df.groupby("Store"):
        store_df = store_df.dropna().sort_index()
        values = store_df[features].values
        targets = store_df[target].values
        date_index = store_df.index
        
        for i in range(window, len(store_df)):
            X.append(values[i-window:i])
            y.append(targets[i])
            store_ids.append(store_id)
            dates.append(date_index[i])
    
    return np.array(X), np.array(y), np.array(store_ids), dates

WINDOW_SIZE = 30

X_trans, y_trans, store_ids_trans, date_indices_trans = create_sequences_with_dates(
    scaled_df, transformer_features, "Sales", WINDOW_SIZE
)

# Get cluster IDs
cluster_lookup = (
    train[["Store", "StoreCluster"]]
    .drop_duplicates()
    .set_index("Store")["StoreCluster"]
)
cluster_ids_trans = np.array([cluster_lookup[s] for s in store_ids_trans])

print(f"Transformer Dataset:")
print(f"   Sequences: {X_trans.shape[0]:,}")
print(f"   Window size: {X_trans.shape[1]}")
print(f"   Features: {X_trans.shape[2]}")

Transformer Dataset:
   Sequences: 975,954
   Window size: 30
   Features: 6


In [8]:
split_idx = int(len(X_trans) * 0.8)

X_tr = X_trans[:split_idx]
X_va = X_trans[split_idx:]
y_tr = y_trans[:split_idx]
y_va = y_trans[split_idx:]
c_tr = cluster_ids_trans[:split_idx]
c_va = cluster_ids_trans[split_idx:]
dates_va = date_indices_trans[split_idx:]
stores_va = store_ids_trans[split_idx:]

print(f"Split Summary:")
print(f"   Training: {len(X_tr):,} samples")
print(f"   Validation: {len(X_va):,} samples")

Split Summary:
   Training: 780,763 samples
   Validation: 195,191 samples


In [9]:
class TransformerDataset(Dataset):
    """Dataset for Transformer model."""
    
    def __init__(self, X, y, cluster_ids):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.cluster_ids = torch.tensor(cluster_ids, dtype=torch.long)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.cluster_ids[idx], self.y[idx]

BATCH_SIZE = 128

train_ds = TransformerDataset(X_tr, y_tr, c_tr)
val_ds = TransformerDataset(X_va, y_va, c_va)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=3)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=3)

print(f"DataLoader Info:")
print(f"   Training batches: {len(train_loader)}")
print(f"   Validation batches: {len(val_loader)}")

num_features = X_tr.shape[2]
num_clusters = int(cluster_ids_trans.max() + 1)

DataLoader Info:
   Training batches: 6100
   Validation batches: 1525


In [10]:
class SalesTransformer(nn.Module):
    """
    Transformer model for sales forecasting with cluster embeddings.
    
    Architecture:
    1. Input projection: (num_features) -> (d_model)
    2. Positional encoding: Add position information
    3. Transformer encoder: Self-attention layers
    4. Cluster embedding: Store cluster information
    5. Output projection: Predict sales
    """
    
    def __init__(
        self,
        num_features,
        d_model=64,
        nhead=4,
        num_encoder_layers=2,
        dim_feedforward=128,
        dropout=0.1,
        num_clusters=4,
        cluster_embed_dim=8
    ):
        super().__init__()
        
        self.d_model = d_model
        
        # Input projection layer
        self.input_projection = nn.Linear(num_features, d_model)
        
        # Positional encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)
        
        # Transformer encoder layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True  # Input shape: (batch, seq, features)
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
        )
        
        # Cluster embedding
        self.cluster_embedding = nn.Embedding(num_clusters, cluster_embed_dim)
        
        # Output layers
        self.fc = nn.Sequential(
            nn.Linear(d_model + cluster_embed_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        """Initialize weights for better convergence."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def forward(self, x, cluster_id, src_mask=None):
        """
        Args:
            x: Input tensor (batch_size, seq_len, num_features)
            cluster_id: Store cluster IDs (batch_size,)
            src_mask: Optional attention mask
        
        Returns:
            Predicted sales (batch_size,)
        """
        # Project input to d_model dimensions
        x = self.input_projection(x) * math.sqrt(self.d_model)
        
        # Add positional encoding
        x = self.pos_encoder(x)
        
        # Pass through transformer encoder
        x = self.transformer_encoder(x, src_mask)
        
        # Use the last time step's output (like LSTM)
        x = x[:, -1, :]  # (batch_size, d_model)
        
        # Get cluster embedding
        cluster_emb = self.cluster_embedding(cluster_id)  # (batch_size, cluster_embed_dim)
        
        # Concatenate and predict
        combined = torch.cat([x, cluster_emb], dim=1)
        output = self.fc(combined)
        
        return output.squeeze(1)

In [11]:
transformer_model = SalesTransformer(
        num_features=num_features,
        d_model=64,
        nhead=4,
        num_encoder_layers=2,
        dim_feedforward=128,
        dropout=0.1,
        num_clusters=num_clusters,
        cluster_embed_dim=8
    ).to(device)

print("Using: SalesTransformer")

Using: SalesTransformer


In [12]:
# Print model summary
print("TRANSFORMER MODEL ARCHITECTURE")
print(transformer_model)

total_params = sum(p.numel() for p in transformer_model.parameters())
trainable_params = sum(p.numel() for p in transformer_model.parameters() if p.requires_grad)
print(f"Parameters: {total_params:,} total, {trainable_params:,} trainable")

TRANSFORMER MODEL ARCHITECTURE
SalesTransformer(
  (input_projection): Linear(in_features=6, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (cluster_embedding): Embedding(4, 8)
  (fc): Sequential(
    (0): Linear(in_feat

In [13]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(
    transformer_model.parameters(),
    lr=0.001,
    weight_decay=0.01  # L2 regularization
)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3)

In [14]:
EPOCHS = 10
best_val_loss = float('inf')
patience_counter = 0
EARLY_STOPPING_PATIENCE = 7

train_losses = []
val_losses = []
val_rmses = []

print("TRAINING TRANSFORMER")

for epoch in range(EPOCHS):
    #  Training Phase 
    transformer_model.train()
    epoch_train_losses = []
    
    for X_batch, c_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        c_batch = c_batch.to(device)
        y_batch = y_batch.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        predictions = transformer_model(X_batch, c_batch)
        loss = criterion(predictions, y_batch)
        
        # Backward pass
        loss.backward()
        
        # Gradient clipping (important for transformers)
        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), max_norm=1.0)
        
        optimizer.step()
        epoch_train_losses.append(loss.item())
    
    avg_train_loss = np.mean(epoch_train_losses)
    train_losses.append(avg_train_loss)
    
    # Validation Phase 
    transformer_model.eval()
    epoch_val_losses = []
    val_preds_scaled = []
    val_true_scaled = []
    
    with torch.no_grad():
        for X_batch, c_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            c_batch = c_batch.to(device)
            y_batch = y_batch.to(device)
            
            predictions = transformer_model(X_batch, c_batch)
            loss = criterion(predictions, y_batch)
            
            epoch_val_losses.append(loss.item())
            val_preds_scaled.extend(predictions.cpu().numpy())
            val_true_scaled.extend(y_batch.cpu().numpy())
    
    avg_val_loss = np.mean(epoch_val_losses)
    val_losses.append(avg_val_loss)
    
    # Calculate RMSE (scaled)
    scaled_rmse = np.sqrt(mean_squared_error(val_true_scaled, val_preds_scaled))
    val_rmses.append(scaled_rmse)
    
    # Learning rate scheduling
    scheduler.step(avg_val_loss)
    
    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Save best model
        best_model_state = transformer_model.state_dict().copy()
    else:
        patience_counter += 1
    
    # Print progress
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch+1:2d}/{EPOCHS} | "
          f"Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | "
          f"Val RMSE: {scaled_rmse:.4f} | "
          f"LR: {current_lr:.6f}")
    
    # Early stopping
    if patience_counter >= EARLY_STOPPING_PATIENCE:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break

# Load best model
transformer_model.load_state_dict(best_model_state)
print(f"Loaded best model (Val Loss: {best_val_loss:.4f})")

TRAINING TRANSFORMER
Epoch  1/10 | Train Loss: 0.0010 | Val Loss: 0.0001 | Val RMSE: 0.0079 | LR: 0.001000
Epoch  2/10 | Train Loss: 0.0001 | Val Loss: 0.0000 | Val RMSE: 0.0068 | LR: 0.001000
Epoch  3/10 | Train Loss: 0.0001 | Val Loss: 0.0001 | Val RMSE: 0.0085 | LR: 0.001000
Epoch  4/10 | Train Loss: 0.0001 | Val Loss: 0.0002 | Val RMSE: 0.0131 | LR: 0.001000
Epoch  5/10 | Train Loss: 0.0001 | Val Loss: 0.0001 | Val RMSE: 0.0097 | LR: 0.001000
Epoch  6/10 | Train Loss: 0.0001 | Val Loss: 0.0001 | Val RMSE: 0.0092 | LR: 0.000500
Epoch  7/10 | Train Loss: 0.0000 | Val Loss: 0.0001 | Val RMSE: 0.0084 | LR: 0.000500
Epoch  8/10 | Train Loss: 0.0000 | Val Loss: 0.0001 | Val RMSE: 0.0086 | LR: 0.000500
Epoch  9/10 | Train Loss: 0.0000 | Val Loss: 0.0001 | Val RMSE: 0.0088 | LR: 0.000500
Early stopping triggered at epoch 9
Loaded best model (Val Loss: 0.0000)


In [19]:
transformer_model.eval()
val_preds_scaled = []
val_true_scaled = []

with torch.no_grad():
    for X_batch, c_batch, y_batch in val_loader:
        X_batch = X_batch.to(device)
        c_batch = c_batch.to(device)
        
        predictions = transformer_model(X_batch, c_batch)
        
        val_preds_scaled.extend(predictions.cpu().numpy())
        val_true_scaled.extend(y_batch.numpy())

val_preds_scaled = np.array(val_preds_scaled)
val_true_scaled = np.array(val_true_scaled)

print(f"Predictions range: {val_preds_scaled.min():.4f} to {val_preds_scaled.max():.4f}")
print(f"Actual range: {val_true_scaled.min():.4f} to {val_true_scaled.max():.4f}")

Predictions range: 0.0148 to 0.8091
Actual range: 0.0000 to 1.0000


In [16]:
# Clip predictions to valid range before inverse transform
val_preds_clipped = np.clip(val_preds_scaled, 0, 1)

# Inverse transform
val_preds_original = sales_scaler.inverse_transform(
    val_preds_clipped.reshape(-1, 1)
).flatten()

val_true_original = sales_scaler.inverse_transform(
    val_true_scaled.reshape(-1, 1)
).flatten()

print("Original Scale Results")
print(f"Predictions range: {val_preds_original.min():,.0f} to {val_preds_original.max():,.0f}")
print(f"Actual range: {val_true_original.min():,.0f} to {val_true_original.max():,.0f}")

Original Scale Results
Predictions range: 616 to 33,620
Actual range: 0 to 41,551


In [20]:
# RMSE
rmse_transformer = np.sqrt(mean_squared_error(val_true_original, val_preds_original))
rmspe_transformer = rmspe(val_true_original, val_preds_original)
# MAE
mae_transformer = mean_absolute_error(val_true_original, val_preds_original)

# R² Score
r2_transformer = r2_score(val_true_original, val_preds_original)



# Average sales for context
avg_sales = val_true_original.mean()


In [22]:
# Model Error Metrics
print(f"RMSE (Root Mean Square Error): {rmse_transformer:>12,.2f} sales units")
print(f"RMSPE Transformer: {rmspe_transformer:>12,.2f}")
print(f"R² Score: {r2_transformer:>12.4f}")
# Business Context Metrics
print(f"Average Actual Sales: {avg_sales:>12,.2f} units")
print(f"RMSE as % of Average: {(rmse_transformer/avg_sales)*100:>12.2f}%")

RMSE (Root Mean Square Error):       366.86 sales units
RMSPE Transformer:         0.06
R² Score:       0.9903
Average Actual Sales:     5,731.55 units
RMSE as % of Average:         6.40%
