<a href="https://colab.research.google.com/github/hakim733/scania-predictive-maintenance/blob/main/scania_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import gdown
import pandas as pd

# ----------------------------
# 1. TRAINING DATA
# ----------------------------
# Training specifications
file_id = '18A955gJOHf0Ay3UComqy3SigMjf2X3Pi'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'train_specifications.csv', quiet=False)
train_specs = pd.read_csv("train_specifications.csv")

# Training labels (time-to-event)
file_id = '1WG7FYf9KaidtOaM04gBubkwYLi292Ken'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'train_tte.csv', quiet=False)
train_tte = pd.read_csv("train_tte.csv")[['vehicle_id', 'in_study_repair','length_of_study_time_step']]

# Training sensor data (critical fix: corrected file ID)
file_id = '1zBeLuuyEB1SOc_Fr06khtC090-cWFQGp'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'train_operational_readouts.csv', quiet=False)
train_ops = pd.read_csv("train_operational_readouts.csv")

# ----------------------------
# 2. VALIDATION DATA
# ----------------------------
# Validation specifications
url = "https://drive.google.com/file/d/1tc-cAlepMP_EbObvxv0J6oqPH3u2DlLL/view?usp=sharing"
gdown.download(url, 'validation_specifications.csv', fuzzy=True, quiet=False)
specs_val = pd.read_csv("validation_specifications.csv")

# Validation sensor data
file_id = '1cLKKPAP7sZRE0SxjFc7wGX_AfARcaHGc'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'validation_operational_readouts.csv', quiet=False)
ops_val = pd.read_csv("validation_operational_readouts.csv")

# Validation labels
file_id = '1lnd0VkDb2234csyl1OGt2jecUx6KPbuw'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'validation_labels.csv', quiet=False)
labels_val = pd.read_csv("validation_labels.csv")



# ----------------------------
# 3. TEST DATA
# ----------------------------
# Test specifications (corrected)
file_id = '138drtUXNmG4dwrJgIESUNZuW0aHKEPJU'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'test_specifications.csv', quiet=False)
specs_test = pd.read_csv("test_specifications.csv")

# Test sensor data (critical fix: new file ID)
file_id = '1djLlBBTTyh4l4DmW_TrTtQ6P8HpTl26E'  # New correct file ID
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'test_operational_readouts.csv', quiet=False)
ops_test = pd.read_csv("test_operational_readouts.csv")

Downloading...
From: https://drive.google.com/uc?id=18A955gJOHf0Ay3UComqy3SigMjf2X3Pi
To: /content/train_specifications.csv
100%|██████████| 1.08M/1.08M [00:00<00:00, 10.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WG7FYf9KaidtOaM04gBubkwYLi292Ken
To: /content/train_tte.csv
100%|██████████| 345k/345k [00:00<00:00, 5.74MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1zBeLuuyEB1SOc_Fr06khtC090-cWFQGp
From (redirected): https://drive.google.com/uc?id=1zBeLuuyEB1SOc_Fr06khtC090-cWFQGp&confirm=t&uuid=825bc4a4-5336-4f73-a861-75924b64880e
To: /content/train_operational_readouts.csv
100%|██████████| 1.22G/1.22G [00:25<00:00, 48.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1tc-cAlepMP_EbObvxv0J6oqPH3u2DlLL
To: /content/validation_specifications.csv
100%|██████████| 232k/232k [00:00<00:00, 4.32MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1cLKKPAP7sZRE0SxjFc7wGX_AfARcaHGc
From (redirected): https://drive.google.com/uc?id=1c

In [88]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# --- Parameters ---
SEQ_LEN = 10
BATCH_SIZE = 64
HIDDEN_DIM = 128
EPOCHS = 10
spec_cols = [f"Spec_{i}" for i in range(8)]
cost_matrix = np.array([
    [0,   9,   10],
    [300, 0,   8],
    [500, 300, 0]
])

# ========================
# 1. Data Preprocessing
# ========================
df = train_ops.merge(train_tte, on='vehicle_id')
df = df.merge(train_specs, on='vehicle_id')
df.dropna(subset=['in_study_repair', 'length_of_study_time_step', 'time_step'], inplace=True)

# --- Compute TTE + Label ---
df['tte'] = df['length_of_study_time_step'] - df['time_step']
conditions = [
    (df['in_study_repair'] == 1) & (df['tte'] > 48),                     # class 1
    (df['in_study_repair'] == 1) & (df['tte'] > 24) & (df['tte'] <= 48), # class 2
    (df['in_study_repair'] == 1) & (df['tte'] > 12) & (df['tte'] <= 24), # class 3
    (df['in_study_repair'] == 1) & (df['tte'] > 6)  & (df['tte'] <= 12), # class 4
    (df['in_study_repair'] == 1) & (df['tte'] >= 0) & (df['tte'] <= 6),  # class 5
]
df['class_label'] = np.select(conditions, [1, 2, 3, 4, 5], default=0)

# --- Merge rare classes 1–3 into 4 due to ambalanced sampling ---
df['class_label'] = df['class_label'].replace({1: 4, 2: 4, 3: 4})
num_classes = 3  # final labels: 0, 4, 5
# Map class labels to 0, 1, 2 for CrossEntropy compatibility
label_map = {0: 0, 4: 1, 5: 2}
df['class_label'] = df['class_label'].map(label_map)

# --- Encode specs ---
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df[spec_cols] = enc.fit_transform(df[spec_cols])
df[spec_cols] = df[spec_cols].astype(float)

# --- Feature scaling ---
sensor_cols = [c for c in df.columns if c.startswith("Sensor_")]
feature_cols = sensor_cols + spec_cols
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])



# Optional: shuffle
from sklearn.utils import shuffle
X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=42)

# 4. Replace in the DataLoader
tX = torch.tensor(X_balanced, dtype=torch.float32)
ty = torch.tensor(y_balanced, dtype=torch.long)
train_ds = TensorDataset(tX, ty)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)



# --- Sequence building ---
sequences, labels = [], []
for _, group in df.sort_values('time_step').groupby('vehicle_id'):
    if len(group) >= SEQ_LEN:
        seq = group[feature_cols].values[-SEQ_LEN:]
        label = group['class_label'].iloc[-1]
        sequences.append(seq)
        labels.append(label)

X_train = np.array(sequences)
y_train = np.array(labels)

tX = torch.tensor(X_train, dtype=torch.float32)
ty = torch.tensor(y_train, dtype=torch.long)
train_ds = TensorDataset(tX, ty)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)


# --- Upsampling ---
from sklearn.utils import resample

# 1. Separate the data by class
X_majority = X_train[y_train == 0]
X_class1   = X_train[y_train == 1]
X_class2   = X_train[y_train == 2]

y_majority = y_train[y_train == 0]
y_class1   = y_train[y_train == 1]
y_class2   = y_train[y_train == 2]

# 2. Upsample minority classes to match class 0
X_class1_upsampled, y_class1_upsampled = resample(
    X_class1, y_class1,
    replace=True,
    n_samples=len(X_majority),
    random_state=42
)

X_class2_upsampled, y_class2_upsampled = resample(
    X_class2, y_class2,
    replace=True,
    n_samples=len(X_majority),
    random_state=42
)


# 3. Combine all into one balanced dataset
X_balanced = np.vstack((X_majority, X_class1_upsampled, X_class2_upsampled))
y_balanced = np.hstack((y_majority, y_class1_upsampled, y_class2_upsampled))
# ========================
# 2. GRU Model
# ========================
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_classes=3):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        _, h_n = self.gru(x)
        return self.fc(h_n.squeeze(0))

model = GRUClassifier(input_dim=tX.shape[2], hidden_dim=HIDDEN_DIM, num_classes=num_classes)

# ========================
# 3. Loss & Optimizer
# ========================
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, weight=None):
        super().__init__()
        self.gamma = gamma
        self.ce = nn.CrossEntropyLoss(weight=weight)

    def forward(self, input, target):
        logp = self.ce(input, target)
        p = torch.exp(-logp)
        loss = (1 - p) ** self.gamma * logp
        return loss.mean()



class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
criterion = FocalLoss(gamma=2.0, weight=torch.tensor(class_weights, dtype=torch.float32))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ========================
# 4. Training Loop
# ========================
for epoch in range(EPOCHS):
    model.train()
    for xb, yb in train_loader:
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on train (placeholder)
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in train_loader:
            out = model(xb)
            pred = torch.argmax(out, dim=1).cpu().numpy()
            label = yb.cpu().numpy()
            all_preds.extend(pred)
            all_labels.extend(label)

    acc = accuracy_score(all_labels, all_preds)
    print(f"\nEpoch {epoch+1} — Acc: {acc:.4f}")
    print(confusion_matrix(all_labels, all_preds))
    print(classification_report(all_labels, all_preds, digits=4))



Epoch 1 — Acc: 0.5374
[[10981    79  9270]
 [   61     1   183]
 [  826    13  1139]]
              precision    recall  f1-score   support

           0     0.9253    0.5401    0.6821     20330
           1     0.0108    0.0041    0.0059       245
           2     0.1075    0.5758    0.1812      1978

    accuracy                         0.5374     22553
   macro avg     0.3478    0.3734    0.2897     22553
weighted avg     0.8436    0.5374    0.6308     22553


Epoch 2 — Acc: 0.5632
[[11716   339  8275]
 [   75    21   149]
 [  950    63   965]]
              precision    recall  f1-score   support

           0     0.9196    0.5763    0.7085     20330
           1     0.0496    0.0857    0.0629       245
           2     0.1028    0.4879    0.1698      1978

    accuracy                         0.5632     22553
   macro avg     0.3573    0.3833    0.3137     22553
weighted avg     0.8385    0.5632    0.6543     22553


Epoch 3 — Acc: 0.5238
[[10767   640  8923]
 [   69    27   149]


Validation Accuracy: 0.4388 | Cost: 24,655
Confusion Matrix:
 [[2033   35 2434]
 [   0    0    0]
 [   0    0    0]]
              precision    recall  f1-score   support

           0     0.9779    0.4516    0.6178      4502
           1     0.0000    0.0000    0.0000         0
           2     0.0000    0.0000    0.0000         0
           4     0.0000    0.0000    0.0000       131

    accuracy                         0.4388      4633
   macro avg     0.2445    0.1129    0.1545      4633
weighted avg     0.9502    0.4388    0.6004      4633



In [81]:
def process_eval_data(ops_df, specs_df, labels_df=None):
    # Merge operational + specs
    df = ops_df.merge(specs_df, on='vehicle_id', how='left')
    if labels_df is not None:
        df = df.merge(labels_df, on='vehicle_id', how='left')

    # Apply encoder + scaler
    df[spec_cols] = enc.transform(df[spec_cols])
    df[spec_cols] = df[spec_cols].astype(float)
    df[feature_cols] = scaler.transform(df[feature_cols])

    # If class_label exists, re-map classes
    if 'class_label' in df.columns:
        df['class_label'] = df['class_label'].replace({1: 4, 2: 4, 3: 4})
        df['class_label'] = df['class_label'].map({0: 0, 4: 1, 5: 2})

    # Build sequences
    sequences, labels = [], []
    for _, group in df.sort_values('time_step').groupby('vehicle_id'):
        if len(group) >= SEQ_LEN:
            seq = group[feature_cols].values[-SEQ_LEN:]
            sequences.append(seq)
            if 'class_label' in group:
                labels.append(group['class_label'].iloc[-1])

    X_eval = np.array(sequences)
    tX_eval = torch.tensor(X_eval, dtype=torch.float32)

    if labels:
        y_eval = np.array(labels)
        ty_eval = torch.tensor(y_eval, dtype=torch.long)
        ds = TensorDataset(tX_eval, ty_eval)
    else:
        ds = TensorDataset(tX_eval)

    return ds


In [82]:


# Validation set
val_ds = process_eval_data(ops_val, specs_val , labels_val)
val_loader = DataLoader(val_ds, batch_size=200, shuffle=False)

# Test set (no labels)
test_ds = process_eval_data(specs_test, ops_test)
test_loader = DataLoader(test_ds, batch_size=200, shuffle=False)


In [87]:
# --- Validation ---
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in val_loader:
        out = model(xb.to(model.fc.weight.device))
        preds = torch.argmax(out, dim=1).cpu().numpy()
        labels = yb.cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels)

# Metrics
acc = accuracy_score(all_labels, all_preds)
print(f"\nEpoch {epoch+1} — Val Acc: {acc:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds, labels=[0, 1, 2]))
print(classification_report(all_labels, all_preds, labels=[0, 1, 2], digits=4, zero_division=0))



Epoch 10 — Val Acc: 0.4399
Confusion Matrix:
[[2033   35 2434]
 [  46    5   80]
 [   0    0    0]]
              precision    recall  f1-score   support

           0     0.9779    0.4516    0.6178      4502
           1     0.1250    0.0382    0.0585       131
           2     0.0000    0.0000    0.0000         0

    accuracy                         0.4399      4633
   macro avg     0.3676    0.1632    0.2254      4633
weighted avg     0.9538    0.4399    0.6020      4633



In [90]:
# Ensure the model is in evaluation mode
model.eval()

# Generate test predictions
test_preds = []
with torch.no_grad():
    for xb in test_loader:
        xb = xb[0].to(model.fc.weight.device)
        out = model(xb)
        preds = torch.argmax(out, dim=1).cpu().numpy()
        test_preds.extend(preds)

# Map back to original labels (reverse of label_map: {0: 0, 4: 1, 5: 2})
inv_label_map = {0: 0, 1: 4, 2: 5}
final_preds = [inv_label_map[p] for p in test_preds]

# Print first few predictions
print("First 30 test predictions:")
print(final_preds[:30])


First 30 test predictions:
[5, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5]
