<a href="https://colab.research.google.com/github/hakim733/scania-predictive-maintenance/blob/main/scania_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import gdown
import pandas as pd

# ----------------------------
# 0. Download helper
# ----------------------------
import os


def fetch_csv(file_id, filename, **read_csv_kwargs):
    """Fetch a Google Drive file by ID (unless already on disk) and read it as CSV.

    Parameters
    ----------
    file_id : str
        Google Drive file ID.
    filename : str
        Local path the file is saved to and then read from.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.
    """
    # Skip the download when the file is already present so re-running the
    # notebook does not pull the large readout files again.
    # NOTE: delete the local file to force a fresh download.
    if not os.path.exists(filename):
        gdown.download(f'https://drive.google.com/uc?id={file_id}', filename, quiet=False)
    return pd.read_csv(filename, **read_csv_kwargs)


# ----------------------------
# 1. TRAINING DATA
# ----------------------------
train_specs = fetch_csv('18A955gJOHf0Ay3UComqy3SigMjf2X3Pi', 'train_specifications.csv')

# Training labels (time-to-event): keep only the columns used downstream.
train_tte = fetch_csv('1WG7FYf9KaidtOaM04gBubkwYLi292Ken', 'train_tte.csv')[
    ['vehicle_id', 'in_study_repair', 'length_of_study_time_step']
]

# Training sensor readouts.
train_ops = fetch_csv('1zBeLuuyEB1SOc_Fr06khtC090-cWFQGp', 'train_operational_readouts.csv')

# Merge on vehicle_id and drop rows missing any of the label/time columns.
train_df = (
    train_ops
    .merge(train_tte, on='vehicle_id')
    .merge(train_specs, on='vehicle_id')
    .dropna(subset=['in_study_repair', 'length_of_study_time_step', 'time_step'])
)

# ----------------------------
# 2. VALIDATION DATA
# ----------------------------
specs_val = fetch_csv('1tc-cAlepMP_EbObvxv0J6oqPH3u2DlLL', 'validation_specifications.csv')
ops_val = fetch_csv('1cLKKPAP7sZRE0SxjFc7wGX_AfARcaHGc', 'validation_operational_readouts.csv')
labels_val = fetch_csv('1lnd0VkDb2234csyl1OGt2jecUx6KPbuw', 'validation_labels.csv')

# Merge on vehicle_id; unlike the training frame, rows with a NaN in ANY column are dropped.
val_df = (
    ops_val
    .merge(labels_val, on='vehicle_id')
    .merge(specs_val, on='vehicle_id')
    .dropna()
)

# ----------------------------
# 3. TEST DATA
# ----------------------------
specs_test = fetch_csv('138drtUXNmG4dwrJgIESUNZuW0aHKEPJU', 'test_specifications.csv')
ops_test = fetch_csv('1djLlBBTTyh4l4DmW_TrTtQ6P8HpTl26E', 'test_operational_readouts.csv')
labels_test = fetch_csv('15njyTfKdErU3cvd837nuFoMBIgTtOOgv', 'test_labels.csv')

# Merge test data on vehicle_id.
# NOTE(review): no rows are dropped here, so NaNs remain in test_df — confirm this is intended.
test_df = ops_test.merge(labels_test, on='vehicle_id').merge(specs_test, on='vehicle_id')


Downloading...
From: https://drive.google.com/uc?id=18A955gJOHf0Ay3UComqy3SigMjf2X3Pi
To: /content/train_specifications.csv
100%|██████████| 1.08M/1.08M [00:00<00:00, 143MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WG7FYf9KaidtOaM04gBubkwYLi292Ken
To: /content/train_tte.csv
100%|██████████| 345k/345k [00:00<00:00, 118MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1zBeLuuyEB1SOc_Fr06khtC090-cWFQGp
From (redirected): https://drive.google.com/uc?id=1zBeLuuyEB1SOc_Fr06khtC090-cWFQGp&confirm=t&uuid=a305dc4b-7a2f-4d98-8e55-6adabd6968d9
To: /content/train_operational_readouts.csv
100%|██████████| 1.22G/1.22G [00:16<00:00, 75.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1tc-cAlepMP_EbObvxv0J6oqPH3u2DlLL
To: /content/validation_specifications.csv
100%|██████████| 232k/232k [00:00<00:00, 14.7MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1cLKKPAP7sZRE0SxjFc7wGX_AfARcaHGc
From (redirected): https://drive.google.com/uc?id=1cLK

In [5]:
import numpy as np

# Missing-value audit of the merged train / validation / test frames.
null_reports = (
    ("Train nulls per column:\n", train_df),
    ("Val   nulls per column:\n", val_df),
    ("Test  nulls per column:\n", test_df),
)
for header, frame in null_reports:
    print(header, frame.isna().sum())

# The same check can be run on the windowed arrays (X_train / X_val / X_test)
# with np.isnan(...).any() once those arrays have been built.


Train nulls per column:
 vehicle_id       0
time_step        0
171_0            0
666_0           40
427_0         6405
              ... 
Spec_3           0
Spec_4           0
Spec_5           0
Spec_6           0
Spec_7           0
Length: 117, dtype: int64
Val   nulls per column:
 vehicle_id    0
time_step     0
171_0         0
666_0         0
427_0         0
             ..
Spec_3        0
Spec_4        0
Spec_5        0
Spec_6        0
Spec_7        0
Length: 116, dtype: int64
Test  nulls per column:
 vehicle_id       0
time_step        0
171_0            0
666_0            8
427_0         1025
              ... 
Spec_3           0
Spec_4           0
Spec_5           0
Spec_6           0
Spec_7           0
Length: 116, dtype: int64


In [None]:

import gdown
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix
from joblib import Parallel, delayed

# ========================
# Hyperparameters
# ========================
# Window length and batch size.
SEQ_LEN, BATCH_SIZE = 20, 64
# Recurrent hidden width and dropout probability.
HIDDEN_DIM, DROPOUT = 256, 0.3
# Number of training epochs and optimiser step size.
EPOCHS, LEARNING_RATE = 10, 1e-3

# Misclassification costs: one row per TRUE class (0..5), one column per
# PREDICTED class (0..5). The diagonal (correct prediction) costs nothing.
_cost_rows = [
    [0,   7,   8,   9,   10,  11],   # true class 0
    [200, 0,   7,   8,   9,   10],   # true class 1
    [300, 200, 0,   7,   8,   9],    # true class 2
    [400, 300, 200, 0,   7,   8],    # true class 3
    [500, 400, 300, 200, 0,   7],    # true class 4
    [600, 500, 400, 300, 200, 0],    # true class 5
]
COST_MATRIX = torch.tensor(_cost_rows, dtype=torch.float32)


# ========================
# 1. Enhanced Data Loading with Categorical Handling
# ========================
class DataProcessor:
    """Read the train/validation/test CSVs, encode spec columns and build class labels.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Ordinal encoder for the spec columns; categories not seen during
        # fitting are encoded as -1 instead of raising.
        self.encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        )
        # Flips to True after the encoder has been fitted on the first split loaded.
        self.fitted = False
        # Spec_0 .. Spec_7
        self.spec_cols = [f'Spec_{i}' for i in range(8)]
        # Column-name prefixes identifying the sensor columns of interest.
        self.sensor_prefixes = ("167_", "272_", "459_")

    @staticmethod
    def _fill_sensor_gaps(ops):
        """Sort readouts by vehicle and time, then forward/back-fill NaNs per vehicle.

        Parameters
        ----------
        ops : pandas.DataFrame
            Readouts containing at least ``vehicle_id`` and ``time_step``.

        Returns
        -------
        pandas.DataFrame
            New frame, sorted by ``(vehicle_id, time_step)`` with a fresh
            0..n-1 index and the same columns. Gaps are filled only from
            rows of the same vehicle; a column that is entirely NaN for a
            vehicle stays NaN.
        """
        ordered = ops.sort_values(['vehicle_id', 'time_step']).reset_index(drop=True)
        value_cols = [c for c in ordered.columns if c != 'vehicle_id']
        # Use the built-in grouped fills rather than groupby.apply: apply's
        # handling of the grouping column is deprecated, and losing
        # `vehicle_id` would break the merges that follow.
        filled = ordered.groupby('vehicle_id')[value_cols].ffill()
        filled = filled.groupby(ordered['vehicle_id']).bfill()
        ordered[value_cols] = filled
        return ordered

    def load_data(self):
        """Load and merge the three splits from CSV files in the working directory.

        The encoder is fitted on the first split loaded (training) and only
        applied to the later ones.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(train_df, val_df, test_df)``, each with rows containing any NaN removed.
        """
        def load_and_merge(spec_path, ops_path, tte_path=None, label_path=None):
            specs = pd.read_csv(spec_path)
            ops = self._fill_sensor_gaps(pd.read_csv(ops_path))

            # Encode specs: fit once, afterwards transform only.
            if not self.fitted:
                specs[self.spec_cols] = self.encoder.fit_transform(
                    specs[self.spec_cols]
                )
                self.fitted = True
            else:
                specs[self.spec_cols] = self.encoder.transform(
                    specs[self.spec_cols]
                )

            # Merge in either raw TTE (train) or pre-binned labels (val/test).
            if tte_path is not None:
                extra = pd.read_csv(tte_path)
            elif label_path is not None:
                extra = pd.read_csv(label_path)
            else:
                extra = None

            if extra is not None:
                merged = ops.merge(extra, on='vehicle_id').merge(specs, on='vehicle_id')
            else:
                merged = ops.merge(specs, on='vehicle_id')

            # Anything still NaN after the per-vehicle fill is discarded.
            return merged.dropna()

        train_df = load_and_merge(
            "train_specifications.csv",
            "train_operational_readouts.csv",
            tte_path="train_tte.csv"
        )
        val_df = load_and_merge(
            "validation_specifications.csv",
            "validation_operational_readouts.csv",
            label_path="validation_labels.csv"
        )
        test_df = load_and_merge(
            "test_specifications.csv",
            "test_operational_readouts.csv",
            label_path="test_labels.csv"
        )

        return train_df, val_df, test_df

    def process_classes(self, df, is_training=True):
        """Attach an integer ``class_label`` column to ``df`` and return ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Modified in place. For training it must contain
            ``length_of_study_time_step``, ``time_step`` and
            ``in_study_repair``; otherwise it must already contain ``class_label``.
        is_training : bool, default True
            When True, labels are derived from time-to-event
            (``length_of_study_time_step - time_step``); when False the
            existing ``class_label`` column is only cast to int.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object.
        """
        if is_training:
            # Time-to-event is kept as a local Series, not a temporary column.
            tte = df['length_of_study_time_step'] - df['time_step']
            # Right-closed bins: (-inf,0]->0, (0,6]->5, (6,12]->4,
            # (12,24]->3, (24,48]->2, (48,inf)->1.
            bins = [-np.inf, 0, 6, 12, 24, 48, np.inf]
            labels = [0, 5, 4, 3, 2, 1]
            df['class_label'] = (
                pd.cut(tte, bins=bins, labels=labels)
                  .astype(int)
                  # Rows with in_study_repair != 1 are always class 0.
                  .where(df['in_study_repair'] == 1, 0)
            )
        else:
            # val/test already carry `class_label`; only normalise the dtype.
            df['class_label'] = df['class_label'].astype(int)

        return df

# Build the processor, read the three splits once, then attach integer class
# labels to each. Only the training split derives its labels from the raw
# time-to-event columns; validation and test keep their existing labels.
processor = DataProcessor()
train_df, val_df, test_df = (
    processor.process_classes(split, is_training=derive_from_tte)
    for split, derive_from_tte in zip(processor.load_data(), (True, False, False))
)



# ========================
# 2. Optimized Sequence Generation
# ========================
def prepare_data(df, seq_len=20, sensor_prefixes=None, spec_cols=None):
    """Turn per-vehicle rows into overlapping fixed-length windows with labels.

    Rows are windowed in the order they appear within each ``vehicle_id``
    group, so ``df`` is expected to be ordered by time already.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``vehicle_id``, ``class_label`` and the feature columns.
    seq_len : int, default 20
        Number of consecutive rows per window.
    sensor_prefixes : str or iterable of str, optional
        Column-name prefixes selecting sensor features. Defaults to the
        module-level ``processor.sensor_prefixes``.
    spec_cols : iterable of str, optional
        Extra feature columns appended after the sensor columns. Defaults to
        the module-level ``processor.spec_cols``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape ``(n_windows, seq_len, n_features)`` (float32) and
        ``y`` of shape ``(n_windows,)``, where each label is the
        ``class_label`` of the window's last row. Vehicles with fewer than
        ``seq_len`` rows contribute nothing. With no windows at all, the
        arrays are empty but still correctly shaped.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    # Fall back to the notebook-level processor only for arguments not supplied.
    if sensor_prefixes is None:
        sensor_prefixes = processor.sensor_prefixes
    if spec_cols is None:
        spec_cols = processor.spec_cols
    if isinstance(sensor_prefixes, str):
        prefixes = (sensor_prefixes,)
    else:
        prefixes = tuple(sensor_prefixes)

    sensor_cols = [c for c in df.columns if c.startswith(prefixes)]
    feature_cols = sensor_cols + list(spec_cols)

    window_blocks = []
    label_blocks = []
    for _, group in df.groupby('vehicle_id'):
        values = group[feature_cols].to_numpy(dtype=np.float32)
        if len(values) < seq_len:
            continue
        # sliding_window_view yields (n_windows, n_features, seq_len); swap the
        # last two axes to get the (n_windows, seq_len, n_features) layout.
        windows = np.lib.stride_tricks.sliding_window_view(values, seq_len, axis=0)
        window_blocks.append(np.swapaxes(windows, 1, 2))
        # The label of window i is the label of its last row, i + seq_len - 1.
        label_blocks.append(group['class_label'].to_numpy()[seq_len - 1:])

    if not window_blocks:
        return (
            np.empty((0, seq_len, len(feature_cols)), dtype=np.float32),
            np.empty((0,), dtype=np.int64),
        )

    return np.concatenate(window_blocks), np.concatenate(label_blocks)

# Window each split into (n_windows, seq_len, n_features) arrays plus labels.
X_train, y_train = prepare_data(train_df)
X_val, y_val = prepare_data(val_df)
X_test, y_test = prepare_data(test_df)

# Standardise per feature: flatten every window into rows of features, scale,
# then restore the original 3-D shape. The statistics are learned from the
# training split only and reused unchanged for validation and test.
scaler = StandardScaler()
X_train = scaler.fit_transform(
    X_train.reshape(-1, X_train.shape[-1])
).reshape(X_train.shape)
X_val, X_test = (
    scaler.transform(split.reshape(-1, split.shape[-1])).reshape(split.shape)
    for split in (X_val, X_test)
)

# ========================
# 3. Enhanced Model Architecture
# ========================
class PredictiveMaintenanceModel(nn.Module):
    """Two-layer bidirectional GRU with attention pooling over time and an MLP head.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_dim : int, optional
        GRU hidden size per direction. Defaults to the module-level ``HIDDEN_DIM``.
    dropout : float, optional
        Dropout probability used between GRU layers and in the classifier.
        Defaults to the module-level ``DROPOUT``.
    num_classes : int, default 6
        Number of output logits (classes 0-5 by default).
    """

    def __init__(self, input_size, hidden_dim=None, dropout=None, num_classes=6):
        super().__init__()
        # Resolve the notebook-level defaults lazily so explicit values never
        # require those globals to exist.
        hidden_dim = HIDDEN_DIM if hidden_dim is None else hidden_dim
        dropout = DROPOUT if dropout is None else dropout

        self.gru = nn.GRU(input_size, hidden_dim,
                          bidirectional=True,
                          num_layers=2,
                          dropout=dropout,
                          batch_first=True)
        # Scores each time step; input width is 2 * hidden_dim because the
        # GRU concatenates both directions.
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.Tanh(),
            nn.Linear(128, 1))
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, num_classes))

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` is ``(batch, seq_len, input_size)`` since the GRU is ``batch_first``.
        """
        out, _ = self.gru(x)
        # Softmax over the time axis (dim=1) gives per-step weights summing to 1.
        attn_weights = torch.softmax(self.attention(out), dim=1)
        # Weighted sum over time collapses the sequence into one vector.
        context = torch.sum(attn_weights * out, dim=1)
        return self.classifier(context)

# The last axis of the windowed training array is the per-time-step feature count.
model = PredictiveMaintenanceModel(input_size=X_train.shape[-1])

# ========================
# 4. Cost-Sensitive Training
# ========================
class CostAwareLoss(nn.Module):
    """Cross-entropy where each sample is weighted by the cost of its current prediction.

    Parameters
    ----------
    cost_matrix : array-like or torch.Tensor
        Square matrix indexed as ``cost_matrix[true_class, predicted_class]``.
    """

    def __init__(self, cost_matrix):
        super().__init__()
        # Register as a buffer so the matrix follows the module through
        # .to(device) / .cuda(); it is still reachable as `self.cost_matrix`.
        self.register_buffer(
            'cost_matrix',
            torch.as_tensor(cost_matrix, dtype=torch.float32)
        )

    def forward(self, inputs, targets):
        """Return the mean cost-weighted cross-entropy.

        Parameters
        ----------
        inputs : torch.Tensor
            Logits of shape ``(batch, num_classes)``.
        targets : torch.Tensor
            Integer class indices of shape ``(batch,)``.

        Returns
        -------
        torch.Tensor
            Scalar loss.
        """
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        preds = torch.argmax(inputs, dim=1)
        # Move the lookup table to the logits' device so indexing works even
        # if the module itself was never moved.
        costs = self.cost_matrix.to(inputs.device)[targets, preds]
        # NOTE(review): with a zero diagonal, correctly classified samples get
        # weight 0 and contribute no gradient — confirm this is intended.
        return torch.mean(ce_loss * costs)

# AdamW with light weight decay; the plateau scheduler tracks a metric that
# should go down ('min') and waits 5 non-improving steps before acting.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-4,
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)

# Plain cross-entropy is the active loss. The cost-weighted alternative would be
# CostAwareLoss(COST_MATRIX); `scaled_cost` is computed but not consumed by `criterion`.
criterion = nn.CrossEntropyLoss()
scaled_cost = COST_MATRIX / 100.0

# ========================
# 5. Training Loop with Best-Model Checkpointing
# ========================
def create_loader(X, y, batch_size=64, shuffle=False, num_workers=4, pin_memory=None):
    """Wrap feature and label arrays in a ``DataLoader``.

    Parameters
    ----------
    X : array-like
        Features; converted to a float32 tensor.
    y : array-like
        Labels; converted to an int64 (long) tensor.
    batch_size : int, default 64
        Samples per batch.
    shuffle : bool, default False
        Whether to reshuffle the data every epoch.
    num_workers : int, default 4
        Number of loader worker processes (0 loads in the main process).
    pin_memory : bool, optional
        Whether to pin host memory. ``None`` (default) pins only when CUDA is
        available, since pinning on a CPU-only runtime just raises a warning.

    Returns
    -------
    torch.utils.data.DataLoader
    """
    if pin_memory is None:
        pin_memory = torch.cuda.is_available()

    # as_tensor reuses the input's memory when the dtype already matches,
    # avoiding a second full copy of large window arrays.
    dataset = TensorDataset(
        torch.as_tensor(X, dtype=torch.float32),
        torch.as_tensor(y, dtype=torch.long)
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                      pin_memory=pin_memory, num_workers=num_workers)

# Only the training loader is shuffled; validation and test keep their order.
train_loader = create_loader(X_train, y_train, shuffle=True)
val_loader = create_loader(X_val, y_val)
test_loader = create_loader(X_test, y_test)

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_val_loss = float('inf')
for epoch in range(EPOCHS):
    # Training phase: one full pass over train_loader with gradient updates.
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        # Clip the total gradient norm to 1.0 before the optimiser step.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Accumulate per-batch losses; averaged over batches when printed below.
        train_loss += loss.item()

    # Validation phase: no gradients, accumulate loss and correct-prediction count.
    model.eval()
    val_loss, correct = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            val_loss += criterion(outputs, y_batch).item()
            correct += (outputs.argmax(1) == y_batch).sum().item()

    # Mean of the per-batch losses (len(val_loader) is the number of batches).
    val_loss /= len(val_loader)
    # Accuracy over all validation samples.
    val_acc = correct / len(val_loader.dataset)
    # The scheduler is stepped with the validation loss as its monitored metric.
    scheduler.step(val_loss)

    # Checkpoint whenever validation loss improves. Note: the loop never
    # breaks early, so all EPOCHS epochs always run.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')

    print(f"Epoch {epoch+1:02d} | Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

# Restore the weights from the epoch with the lowest validation loss.
model.load_state_dict(torch.load('best_model.pth'))

# ========================
# 6. Final Evaluation
# ========================
# Run the model over the test loader in inference mode and collect the
# predicted class next to the true class for every sample.
all_preds, all_labels = [], []
model.eval()
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        all_labels += y_batch.tolist()
        all_preds += outputs.argmax(dim=1).tolist()

from sklearn.metrics import classification_report, confusion_matrix

# Display names for classes 0..5, listed in class-index order.
bucket_names = [
    "Healthy (tte ≤ 0)",
    "> 48h remaining",
    "24–48h remaining",
    "12–24h remaining",
    "6–12h remaining",
    "0–6h remaining",
]

# Pass every class index explicitly so classes absent from this run still
# appear in the report and the matrix.
labels = list(range(len(bucket_names)))

# zero_division=0 reports 0 instead of warning when a class has no predictions.
report = classification_report(
    all_labels,
    all_preds,
    labels=labels,
    target_names=bucket_names,
    zero_division=0,
)
print(report)

print("Confusion Matrix:")
matrix = confusion_matrix(all_labels, all_preds, labels=labels)
print(matrix)



  .apply(lambda g: g.ffill().bfill())
  .apply(lambda g: g.ffill().bfill())
  .apply(lambda g: g.ffill().bfill())


Epoch 01 | Train Loss: 0.3365 | Val Loss: 0.4375 | Val Acc: 0.8799
Epoch 02 | Train Loss: 0.1912 | Val Loss: 0.7786 | Val Acc: 0.8406
Epoch 03 | Train Loss: 0.1316 | Val Loss: 0.7809 | Val Acc: 0.8486
Epoch 04 | Train Loss: 0.1072 | Val Loss: 0.9239 | Val Acc: 0.8278
Epoch 05 | Train Loss: 0.0947 | Val Loss: 0.7234 | Val Acc: 0.8822
Epoch 06 | Train Loss: 0.0864 | Val Loss: 0.9594 | Val Acc: 0.8304
Epoch 07 | Train Loss: 0.0825 | Val Loss: 0.9516 | Val Acc: 0.8601
Epoch 08 | Train Loss: 0.0495 | Val Loss: 1.4381 | Val Acc: 0.8422
Epoch 09 | Train Loss: 0.0391 | Val Loss: 1.6509 | Val Acc: 0.8440


In [None]:
# Confirm that the checkpoint written during training exists on disk.
# (The bare expression `best_model.pth` is attribute access on an undefined
# name and raises NameError when the cell is run.)
import os
os.path.exists('best_model.pth')

In [None]:
# Confusion matrix over the classes that actually occur in the test labels or predictions.
confusion_matrix(y_true=all_labels, y_pred=all_preds)