In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold

# Load Path_CT.csv and coerce every column to a boolean flag.
# NOTE(review): astype(bool) turns NaN (and any non-empty string) into True —
# confirm the CSV has no missing or text-coded values before relying on this.
path_ct_raw = pd.read_csv('Path_CT.csv')
df = path_ct_raw.astype(bool)

In [40]:
# Build the feature matrix X and the target vector y directly from df.
# Column selection + to_numpy() replaces the per-row Python loop: it is
# vectorised, does not depend on the index labels being 0..n-1, and no longer
# needs an unrelated 'fibrosis' column just to obtain the row count.
FEATURE_COLUMNS = [
    'Hypersensitivity pneumonitis',
    'GGO (ground glass opacity)',
    'Mosaic or geographic or air trapping or small airway disease',
    'bronchiectasis_CT',
    'fibrosis/reticular',
    'honeycombing',
    'Emphysema',
    'Air bronchogram/consolidation',
    'Nodules',
    'Interstitial pneumonitis',
]

# Target column for this run. Other columns that were previously listed
# (commented out) as candidate targets: 'granuloma',
# 'hypersensitivity pneumonitis', 'fibrosis', 'organizing pneumonia',
# 'Airway centered/bronchiolitis/small airway disease', 'Nodules',
# 'Honeycomb', 'bronchiectasis_p'.
TARGET_COLUMN = 'Interstitial pneumonia/pneumonitis'

X = df[FEATURE_COLUMNS].to_numpy()   # shape (n_rows, len(FEATURE_COLUMNS))
y = df[TARGET_COLUMN].to_numpy()     # shape (n_rows,)
print(X.shape)
print(y.shape)

(120, 10)
(120,)


In [42]:
# Fixed seed so the train/test split, the forests built during the search and
# therefore the reported metrics are the same on every Restart & Run All.
SEED = 42

# Hold out 20% of the rows for testing, preserving the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Hyperparameters recorded from earlier searches (presumably run with the
# correspondingly named target columns), kept for reference:
# Best Hyperparameters_fibrosis: {'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 20, 'n_estimators': 200}
# Best Hyperparameters_granuloma: {'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 50}

rf_classifier = RandomForestClassifier(random_state=SEED)
param_grid = {
    'n_estimators': [50, 100, 200, 300],     # Number of trees in the forest
    'max_depth': [2, 3, 4],                  # Max depth of each tree
    'max_features': ['sqrt', 'log2'],        # Number of features to consider at each split
    'min_samples_split': [2, 5, 10, 20],     # Minimum number of samples required to split a node
    'min_samples_leaf': [2, 5, 10, 20],      # Minimum number of samples at each leaf node
}

# Exhaustive search over param_grid with 5-fold cross-validation, run only on
# the training split so the held-out test rows never influence model selection.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ has been refit on all of
# X_train using the best parameter combination.
best_rf_classifier = grid_search.best_estimator_

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Predict on the held-out test set
y_pred = best_rf_classifier.predict(X_test)

# Evaluate the classifier performance; zero_division=0 reports 0 (without a
# warning) for any class that receives no predictions.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=0)

# Output results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Best Hyperparameters: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 75.00%
Confusion Matrix:
[[ 5  4]
 [ 2 13]]
Classification Report:
              precision    recall  f1-score   support

       False       0.71      0.56      0.62         9
        True       0.76      0.87      0.81        15

    accuracy                           0.75        24
   macro avg       0.74      0.71      0.72        24
weighted avg       0.75      0.75      0.74        24



  _data = np.array(data, dtype=dtype, copy=copy,


In [43]:
import pickle
from pathlib import Path

# Persist the tuned classifier to disk. The output directory is created first
# so this cell also works on a fresh checkout where RF_classifiers/ does not
# exist yet (open(..., 'wb') would otherwise raise FileNotFoundError).
# NOTE: only unpickle this file from a trusted source; pickle.load can
# execute arbitrary code.
model_path = Path('RF_classifiers/IP_pneumonitis_rf_classifier.pickle')
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(best_rf_classifier, f)

In [44]:
# Score the tuned classifier on the complete dataset (X, y).
# NOTE: X includes the rows the model was trained on, so these figures are
# optimistic; the held-out test metrics are the unbiased estimate.
y_pred_all = best_rf_classifier.predict(X)

# zero_division=0 matches the test-set report: a class with no predictions is
# reported as 0 instead of emitting an UndefinedMetricWarning.
class_report = classification_report(y, y_pred_all, zero_division=0)
accuracy = accuracy_score(y, y_pred_all)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(class_report)

Accuracy: 79.17%
Classification Report:
              precision    recall  f1-score   support

       False       0.73      0.68      0.71        44
        True       0.82      0.86      0.84        76

    accuracy                           0.79       120
   macro avg       0.78      0.77      0.77       120
weighted avg       0.79      0.79      0.79       120

