In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module) so that
# files stored in Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# =========================================
# library import
# =========================================
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [21]:
# -----------------------------------------------------
# 1. data load
# -----------------------------------------------------

# Folder on the mounted Google Drive that holds the train/test .npy arrays.
BASE_PATH = "/content/drive/MyDrive/ML_Dataset"

# Multi-class data
X_train = np.load(f"{BASE_PATH}/X_train_mc.npy")
X_test  = np.load(f"{BASE_PATH}/X_test_mc.npy")
y_train = np.load(f"{BASE_PATH}/y_train_mc.npy")
y_test  = np.load(f"{BASE_PATH}/y_test_mc.npy")

# Fail fast if features and labels are misaligned, instead of surfacing the
# problem later as an opaque error during fitting or scoring.
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train sample counts differ"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test sample counts differ"
assert X_train.shape[1:] == X_test.shape[1:], "train / test feature dimensions differ"

print("✅ Data loaded successfully!")
print(f"X_train: {X_train.shape} | y_train: {y_train.shape}")
print(f"X_test : {X_test.shape}  | y_test : {y_test.shape}")

# Check data distribution
def _summarize_labels(labels, split_name):
    """Print and return the monitored / unmonitored breakdown of a label array.

    Labels >= 0 are counted as monitored and labels equal to -1 as
    unmonitored; any other negative value is counted in neither bucket.

    Parameters
    ----------
    labels : array-like
        Class labels of one data split.
    split_name : str
        Name used in the printed header (e.g. "Train").

    Returns
    -------
    tuple
        (Counter of label occurrences, monitored count, unmonitored count).
    """
    labels = np.asarray(labels)
    # Vectorized counting instead of a Python-level loop over every sample.
    monitored = int(np.count_nonzero(labels >= 0))
    unmonitored = int(np.count_nonzero(labels == -1))

    print(f"\n📊 {split_name} data distribution:")
    print(f"  - Monitored (0~94): {monitored} samples")
    print(f"  - Unmonitored (-1): {unmonitored} samples")
    return Counter(labels), monitored, unmonitored


# The module-level names below are kept so that later cells can still use them.
train_counter, monitored_train, unmonitored_train = _summarize_labels(y_train, "Train")
test_counter, monitored_test, unmonitored_test = _summarize_labels(y_test, "Test")

print("Check for labels:", np.unique(y_train))

✅ Data loaded successfully!
X_train: (16500, 25) | y_train: (16500,)
X_test : (5500, 25)  | y_test : (5500,)

📊 Train data distribution:
  - Monitored (0~94): 14250 samples
  - Unmonitored (-1): 2250 samples

📊 Test data distribution:
  - Monitored (0~94): 4750 samples
  - Unmonitored (-1): 750 samples
Check for labels: [-1.  0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.
 17. 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34.
 35. 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52.
 53. 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68. 69. 70.
 71. 72. 73. 74. 75. 76. 77. 78. 79. 80. 81. 82. 83. 84. 85. 86. 87. 88.
 89. 90. 91. 92. 93. 94.]


In [22]:
# -----------------------------------------------------
# 2. Random Forest Model training
# -----------------------------------------------------
# 200 entropy-criterion trees grown without a depth limit. The split/leaf
# minimums are scikit-learn's defaults, spelled out explicitly.
# random_state pins the forest's randomness so repeated runs build the same model;
# n_jobs=-1 uses every available core; verbose=1 emits joblib progress lines.
rf_multi = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print("🚀 Training Random Forest model for Multi-class Classification...")
rf_multi.fit(X_train, y_train)
print("\n✅ Training complete!\n")

🚀 Training Random Forest model for Multi-class Classification...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.6min



✅ Training complete!



[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.6min finished


In [23]:
# -----------------------------------------------------
# 3. Prediction
# -----------------------------------------------------
print("🔮 Making predictions...")
# Evaluate the forest once: the predicted class is the one with the highest
# mean probability across trees, which is what `predict` computes internally
# for a single-output forest. Deriving it from `y_prob` avoids a second pass.
y_prob = rf_multi.predict_proba(X_test)  # Probability for each class
y_pred = rf_multi.classes_[np.argmax(y_prob, axis=1)]

print("✅ Prediction complete!\n")

🔮 Making predictions...


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s


✅ Prediction complete!



[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.6s finished


In [24]:
# -----------------------------------------------------
# 4. Overall performance evaluation
# -----------------------------------------------------
# Fraction of test samples whose predicted label equals the true label.
overall_acc = accuracy_score(y_test, y_pred)

print("="*60)
print("📈 OVERALL PERFORMANCE METRICS")
print("="*60)
print(f"🎯 Overall Accuracy: {overall_acc:.4f}")
print("\n")

# Classification Report
# zero_division=0 reports 0 for a class that is never predicted (the same
# value the default produces) without raising UndefinedMetricWarning.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("\n" + "="*60 + "\n")

📈 OVERALL PERFORMANCE METRICS
🎯 Overall Accuracy: 0.8315


Classification Report:
              precision    recall  f1-score   support

        -1.0       0.75      0.88      0.81       750
         0.0       0.89      0.84      0.87        50
         1.0       0.95      0.72      0.82        50
         2.0       0.96      0.92      0.94        50
         3.0       0.89      0.84      0.87        50
         4.0       0.89      0.94      0.91        50
         5.0       0.84      0.82      0.83        50
         6.0       0.87      0.90      0.88        50
         7.0       0.84      0.92      0.88        50
         8.0       0.90      0.76      0.83        50
         9.0       0.85      0.78      0.81        50
        10.0       0.95      0.82      0.88        50
        11.0       0.95      0.82      0.88        50
        12.0       0.89      0.94      0.91        50
        13.0       0.79      0.60      0.68        50
        14.0       0.66      0.70      0.68    