In [1]:
import pickle

# Load the preprocessed train/test split from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load 'preprocessed_data.pkl' when it comes from a trusted source.
with open('preprocessed_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# Fail fast on an inconsistent split instead of erroring later inside model.fit
assert len(X_train) == len(y_train), "X_train and y_train row counts differ"
assert len(X_test) == len(y_test), "X_test and y_test row counts differ"
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"

# Verify the data shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (8000, 8)
X_test shape: (2000, 8)
y_train shape: (8000,)
y_test shape: (2000,)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifier: logistic regression with only max_iter raised.
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out test rows
y_pred = model.predict(X_test)


In [3]:
# Score the predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Overall accuracy first, then the per-class precision / recall / F1 table
print(f"Accuracy: {accuracy}")
print(f"\nClassification Report:\n {report}")


Accuracy: 0.973

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1939
           1       0.64      0.26      0.37        61

    accuracy                           0.97      2000
   macro avg       0.81      0.63      0.68      2000
weighted avg       0.97      0.97      0.97      2000



In [7]:
# Refit the logistic regression with class_weight='balanced';
# fit() returns the estimator, so it can be chained onto the constructor
model_weighted = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the test rows from the reweighted model
y_pred_weighted = model_weighted.predict(X_test)

# Heading and per-class report, emitted on consecutive lines
print(
    "Classification Report with Class Weights:",
    classification_report(y_test, y_pred_weighted),
    sep="\n",
)

Classification Report with Class Weights:
              precision    recall  f1-score   support

           0       0.99      0.82      0.90      1939
           1       0.13      0.85      0.22        61

    accuracy                           0.82      2000
   macro avg       0.56      0.83      0.56      2000
weighted avg       0.97      0.82      0.88      2000



In [8]:
from imblearn.over_sampling import SMOTE

# Resample the training split with SMOTE (seeded so the result is repeatable);
# the test split is not passed through the sampler
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Training-set shape before and after resampling
print(f"Original training set shape: {X_train.shape}")
print(f"Resampled training set shape: {X_train_res.shape}")

# Fit logistic regression on the resampled rows; fit() returns the estimator
model_smote = LogisticRegression(max_iter=1000).fit(X_train_res, y_train_res)

# Evaluate on the original (non-resampled) test split
y_pred_smote = model_smote.predict(X_test)

# Heading and per-class report, emitted on consecutive lines
print(
    "Classification Report with SMOTE:",
    classification_report(y_test, y_pred_smote),
    sep="\n",
)

Original training set shape: (8000, 8)
Resampled training set shape: (15444, 8)
Classification Report with SMOTE:
              precision    recall  f1-score   support

           0       0.99      0.83      0.91      1939
           1       0.13      0.80      0.22        61

    accuracy                           0.83      2000
   macro avg       0.56      0.82      0.56      2000
weighted avg       0.97      0.83      0.88      2000



In [9]:
# Get predicted probabilities for the failure class (class 1).
# predict_proba columns follow model_weighted.classes_, so the column is
# looked up by label instead of assuming label 1 sits at position 1; a
# missing label raises ValueError rather than silently picking another class.
failure_col = list(model_weighted.classes_).index(1)
y_prob = model_weighted.predict_proba(X_test)[:, failure_col]

# Optionally, inspect a few probability values
print("Predicted probabilities (first 10):", y_prob[:10])


Predicted probabilities (first 10): [0.37625996 0.50380729 0.19608927 0.12914718 0.50430679 0.53243277
 0.2997286  0.77857433 0.08661634 0.00750935]


In [11]:
import numpy as np
from sklearn.metrics import classification_report

# Candidate decision cut-offs to compare, lowest to highest
thresholds = [0.2, 0.3, 0.4, 0.5]

# NOTE(review): y_prob is computed from X_test, so choosing a cut-off from
# these reports tunes on the test split — consider a separate validation split.
for t in thresholds:
    # Label a row 1 when its probability reaches the cut-off, otherwise 0
    y_pred_tuned = (y_prob >= t).astype(int)

    # One heading plus report per cut-off, so the trade-off can be compared
    print(
        f"\nClassification Report for Threshold {t}:",
        classification_report(y_test, y_pred_tuned),
        sep="\n",
    )


Classification Report for Threshold 0.2:
              precision    recall  f1-score   support

           0       1.00      0.57      0.73      1939
           1       0.06      0.92      0.12        61

    accuracy                           0.58      2000
   macro avg       0.53      0.75      0.42      2000
weighted avg       0.97      0.58      0.71      2000


Classification Report for Threshold 0.3:
              precision    recall  f1-score   support

           0       0.99      0.68      0.81      1939
           1       0.08      0.89      0.15        61

    accuracy                           0.69      2000
   macro avg       0.54      0.78      0.48      2000
weighted avg       0.97      0.69      0.79      2000


Classification Report for Threshold 0.4:
              precision    recall  f1-score   support

           0       0.99      0.76      0.86      1939
           1       0.10      0.85      0.18        61

    accuracy                           0.76      2000
  