### Our Main Problem statement

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

pod_multigpu = pd.read_csv('openb_pod_list_multigpu50.csv')
pod_gpushare = pd.read_csv('openb_pod_list_gpushare100.csv')
pod_cpu = pd.read_csv('openb_pod_list_cpu300.csv')
node_data = pd.read_csv('openb_node_list_all_node.csv')

pod_data = pd.concat([pod_multigpu, pod_gpushare, pod_cpu], ignore_index=True)

time_columns = ['creation_time', 'deletion_time', 'scheduled_time']
for col in time_columns:
    if col in pod_data.columns:
        pod_data[col] = pd.to_numeric(pod_data[col], errors='coerce')

pod_data = pd.merge(pod_data, node_data, left_on='name', right_on='sn', how='left')

pod_data.fillna(0, inplace=True)

  pod_data.fillna(0, inplace=True)


In [27]:
print(pod_data.columns)

Index(['name', 'cpu_milli_x', 'memory_mib_x', 'num_gpu', 'gpu_milli',
       'gpu_spec', 'qos', 'pod_phase', 'creation_time', 'deletion_time',
       'scheduled_time', 'sn', 'cpu_milli_y', 'memory_mib_y', 'gpu', 'model'],
      dtype='object')


In [None]:
pod_data.rename(columns={'cpu_milli_x': 'cpu_milli', 'memory_mib_x': 'memory_mib'}, inplace=True)

pod_data.drop(columns=['cpu_milli_y', 'memory_mib_y'], inplace=True, errors='ignore')

required_columns = ['cpu_milli', 'memory_mib', 'gpu_milli']
for col in required_columns:
    if col not in pod_data.columns:
        pod_data[col] = 0

print("Final Columns:", pod_data.columns)

Final Columns: Index(['name', 'cpu_milli', 'memory_mib', 'num_gpu', 'gpu_milli', 'gpu_spec',
       'qos', 'pod_phase', 'creation_time', 'deletion_time', 'scheduled_time',
       'sn', 'gpu', 'model'],
      dtype='object')


### Pod & Node Failure Prediction  -  pod_phase == Failed or Pending
### Resource Exhaustion (CPU, Memory, Disk)	cpu_utilization > 90%, memory_utilization > 90%
### Network Failure Detection	packet_loss_rate > 3%, network_receive_mbps < 0.5
### Service Disruptions (Logs)	Logs: OOMKilled, CrashLoopBackOff, Evicted

In [None]:
pod_data['cpu_utilization'] = pod_data['cpu_milli'] / 32000  # 32K max CPU
pod_data['memory_utilization'] = pod_data['memory_mib'] / 262144  # 256GB max memory

pod_data['gpu_utilization'] = np.where(pod_data['num_gpu'] > 0, pod_data['gpu_milli'] / 1000, 0)

pod_data['runtime'] = pod_data['deletion_time'] - pod_data['creation_time']  # Pod lifespan
pod_data['scheduling_delay'] = pod_data['scheduled_time'] - pod_data['creation_time']  # Scheduling delay

pod_data.replace([np.inf, -np.inf], 0, inplace=True)
pod_data.fillna(0, inplace=True)

In [None]:
anomaly_features = ['cpu_utilization', 'memory_utilization', 'gpu_utilization', 'runtime', 'scheduling_delay']

iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
pod_data['anomaly_score'] = iso_forest.fit_predict(pod_data[anomaly_features])

pod_data['anomaly'] = (pod_data['anomaly_score'] == -1).astype(int)

In [None]:
features = ['cpu_utilization', 'memory_utilization', 'gpu_utilization', 'runtime', 'scheduling_delay', 'anomaly']

X = pod_data[features]
y = pod_data['failure_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9996338337605273
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2250
           1       1.00      1.00      1.00      3212

    accuracy                           1.00      5462
   macro avg       1.00      1.00      1.00      5462
weighted avg       1.00      1.00      1.00      5462



### Seeing if the dataset is balanced

In [33]:
print(pod_data['failure_label'].value_counts())

failure_label
1    15855
0    11452
Name: count, dtype: int64


In [34]:
new_predictions = model.predict(X_test[:10])
print("Predictions on New Data:", new_predictions)

Predictions on New Data: [1 0 1 0 1 1 1 1 0 1]


In [35]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))

Logistic Regression Accuracy: 0.9018674478213109


In [None]:
if 'anomaly' not in future_data.columns:
    future_data['anomaly'] = 0 

features = ['cpu_utilization', 'memory_utilization', 'gpu_utilization', 'runtime', 'scheduling_delay', 'anomaly']
X_future = future_data[features]

X_future = scaler.transform(X_future)

future_preds = model.predict(X_future)

from sklearn.metrics import accuracy_score, classification_report

print("Future Data Accuracy:", accuracy_score(y_future, future_preds))
print("Future Data Classification Report:\n", classification_report(y_future, future_preds))

Future Data Accuracy: 0.9992676675210546
Future Data Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       910
           1       1.00      1.00      1.00      1821

    accuracy                           1.00      2731
   macro avg       1.00      1.00      1.00      2731
weighted avg       1.00      1.00      1.00      2731



In [None]:
pod_data['event_OOMKilled'] = np.random.choice([0, 1], size=len(pod_data), p=[0.95, 0.05])
pod_data['event_CrashLoopBackOff'] = np.random.choice([0, 1], size=len(pod_data), p=[0.90, 0.10]) 
pod_data['event_Evicted'] = np.random.choice([0, 1], size=len(pod_data), p=[0.97, 0.03])

pod_data['service_failure'] = (
    pod_data['event_OOMKilled'] | 
    pod_data['event_CrashLoopBackOff'] | 
    pod_data['event_Evicted']
).astype(int)

pod_data['failure_label'] = (
    pod_data['failure_label'] | pod_data['service_failure']
)

In [45]:
print("Available columns:", pod_data.columns)

Available columns: Index(['name', 'cpu_milli', 'memory_mib', 'num_gpu', 'gpu_milli', 'gpu_spec',
       'qos', 'pod_phase', 'creation_time', 'deletion_time', 'scheduled_time',
       'sn', 'gpu', 'model', 'cpu_utilization', 'memory_utilization',
       'gpu_utilization', 'runtime', 'scheduling_delay', 'failure_label',
       'event_OOMKilled', 'event_CrashLoopBackOff', 'event_Evicted',
       'service_failure'],
      dtype='object')


In [None]:
import numpy as np

if 'anomaly' not in pod_data.columns:
    pod_data['anomaly'] = 0

for col in ['network_receive_mbps', 'network_transmit_mbps', 'packet_loss_rate']:
    if col not in pod_data.columns:
        pod_data[col] = np.random.uniform(0.1, 100, size=len(pod_data))

In [None]:
features = [
    'cpu_utilization', 'memory_utilization', 'gpu_utilization', 'runtime', 'scheduling_delay', 'anomaly',
    'network_receive_mbps', 'network_transmit_mbps', 'packet_loss_rate',
    'event_OOMKilled', 'event_CrashLoopBackOff', 'event_Evicted'
]

X = pod_data[features]
y = pod_data['failure_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Updated Model Accuracy:", accuracy_score(y_test, y_pred))
print("Updated Classification Report:\n", classification_report(y_test, y_pred))

Updated Model Accuracy: 0.9921274258513365
Updated Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      1850
           1       0.99      1.00      0.99      3612

    accuracy                           0.99      5462
   macro avg       0.99      0.99      0.99      5462
weighted avg       0.99      0.99      0.99      5462



### Failure Type

In [51]:
print(pod_data['pod_phase'].unique())

[0 'Running' 'Pending' 'Succeeded' 'Failed']


In [None]:
from sklearn.utils import resample

majority = pod_data[pod_data['failure_type'] != 4]
minority = pod_data[pod_data['failure_type'] == 4]

minority_upsampled = resample(minority, replace=True, n_samples=500, random_state=42)

pod_data_balanced = pd.concat([majority, minority_upsampled])

print(pod_data_balanced['failure_type'].value_counts())

failure_type
3    17785
1     6672
2     2329
4      500
0      430
Name: count, dtype: int64


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

features = [
    'cpu_utilization', 'memory_utilization', 'gpu_utilization', 'runtime', 
    'scheduling_delay', 'network_receive_mbps', 'packet_loss_rate', 
    'event_OOMKilled', 'event_CrashLoopBackOff', 'event_Evicted'
]

X = pod_data[features]
y = pod_data['failure_type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Multi-Class Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Multi-Class Model Accuracy: 0.9895642621750275
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96        86
           1       0.98      0.98      0.98      1335
           2       0.99      0.99      0.99       466
           3       0.99      0.99      0.99      3557
           4       1.00      0.83      0.91        18

    accuracy                           0.99      5462
   macro avg       0.98      0.95      0.97      5462
weighted avg       0.99      0.99      0.99      5462

