## Downsampling the features

#### read folder 1 training nodule features

In [4]:
import numpy as np
import os

# Path to the directory containing the training .npy feature files
training_dir = "data/cross_validation/folder1/train"

# os.listdir() returns names in arbitrary order, so the matching names are
# sorted to make the row order of the concatenated array reproducible
# across runs and machines.
npy_files = os.listdir(training_dir)
nodule_npy_files = sorted(f for f in npy_files if f.endswith("nodule_features.npy"))

# Fail with a readable message instead of np.concatenate's generic
# "need at least one array to concatenate".
if not nodule_npy_files:
    raise ValueError(
        "no files ending in 'nodule_features.npy' found in {!r}".format(training_dir)
    )

# Load every matching file
nodule_npy_list = [np.load(os.path.join(training_dir, f)) for f in nodule_npy_files]

# Stack the per-file arrays along axis 0 (rows) into a single array
nodule_features = np.concatenate(nodule_npy_list, axis=0)

print(nodule_features.shape)


(77779, 6)


#### read vessel features

In [5]:
# os.listdir() returns names in arbitrary order, so the matching names are
# sorted to make the row order of the concatenated array reproducible
# across runs and machines.
npy_files = os.listdir(training_dir)
vessel_npy_files = sorted(f for f in npy_files if f.endswith("vessel_features.npy"))

# Fail with a readable message instead of np.concatenate's generic
# "need at least one array to concatenate".
if not vessel_npy_files:
    raise ValueError(
        "no files ending in 'vessel_features.npy' found in {!r}".format(training_dir)
    )

# Load every matching file
vessel_npy_list = [np.load(os.path.join(training_dir, f)) for f in vessel_npy_files]

# Stack the per-file arrays along axis 0 (rows) into a single array
vessel_features = np.concatenate(vessel_npy_list, axis=0)

print(vessel_features.shape)

(1206564, 6)


#### read lung features

In [6]:
# os.listdir() returns names in arbitrary order, so the matching names are
# sorted to make the row order of the concatenated array reproducible
# across runs and machines.
npy_files = os.listdir(training_dir)
lung_npy_files = sorted(f for f in npy_files if f.endswith("lung_features.npy"))

# Fail with a readable message instead of np.concatenate's generic
# "need at least one array to concatenate".
if not lung_npy_files:
    raise ValueError(
        "no files ending in 'lung_features.npy' found in {!r}".format(training_dir)
    )

# Load every matching file
lung_npy_list = [np.load(os.path.join(training_dir, f)) for f in lung_npy_files]

# Stack the per-file arrays along axis 0 (rows) into a single array
lung_features = np.concatenate(lung_npy_list, axis=0)

print(lung_features.shape)

(11143953, 6)


## Divide the data
- vessel features are divided into 15 parts
- lung features are divided into 32 parts (only the first 15 parts are used to build the training sets)

In [7]:
# Fixed seed so that a fresh "Restart & Run All" produces the same
# partitions, and therefore the same training subsets, every time.
SHUFFLE_SEED = 42
N_VESSEL_PARTS = 15
N_LUNG_PARTS = 32

rng = np.random.default_rng(SHUFFLE_SEED)

# Shuffle the rows (axis 0) in place, which avoids copying the large arrays.
# NOTE: because the arrays are mutated, re-running this cell without
# reloading them shuffles already-shuffled data.
rng.shuffle(vessel_features)
rng.shuffle(lung_features)

# divide vessel features into N_VESSEL_PARTS parts
vessel_parts = np.array_split(vessel_features, N_VESSEL_PARTS)

# divide lung features into N_LUNG_PARTS parts
lung_parts = np.array_split(lung_features, N_LUNG_PARTS)

# Generate one training set per vessel part:
#   vessel part i + lung part i + ALL nodule rows.
# Only the first N_VESSEL_PARTS lung parts are consumed; the remaining
# lung parts are left unused.
train15 = [
    np.concatenate((vessel_parts[i], lung_parts[i], nodule_features), axis=0)
    for i in range(N_VESSEL_PARTS)
]


## load testing data

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Directory holding the test-split feature files
testing_dir = "data/cross_validation/folder1/test"

# Keep only the feature files, in the order os.listdir() reports them
npy_files = os.listdir(testing_dir)
testing_npy_files = [name for name in npy_files if name.endswith("features.npy")]

# Read each file, then stack all arrays along axis 0 (rows)
testing_npy_list = [
    np.load(os.path.join(testing_dir, name)) for name in testing_npy_files
]
testing_features = np.concatenate(testing_npy_list, axis=0)

print(testing_features.shape)

(2814212, 6)


### Training the random forests

In [None]:
from joblib import dump

# Seed for the forests so repeated runs on the same data give the same models
RF_SEED = 0

# The test split is identical for every model, so slice it once outside the
# loop. The last column is used as the label, all other columns as inputs.
X_test = testing_features[:, :-1]
y_test = testing_features[:, -1]

acc_list = []

# Train one forest per down-sampled training set
for i, train in enumerate(train15):

    X_train = train[:, :-1]
    y_train = train[:, -1]

    # create a random forest classifier and fit it to the training data
    # (n_jobs=-1 uses all cores; it does not change the fitted model)
    rf = RandomForestClassifier(random_state=RF_SEED, n_jobs=-1)
    rf.fit(X_train, y_train)

    # use the trained model to predict on the testing data
    # NOTE: after the loop, y_pred holds the predictions of the LAST model only
    y_pred = rf.predict(X_test)

    # calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(str(i) + '_Accuracy:', accuracy)

    acc_list.append(accuracy)

    # persist the model; the file name records the index and the accuracy
    dump(rf, str(i) + '_random_forest_'+ str(round(accuracy, 5)) +'.joblib')


### Confusion matrix

In [14]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# load testing data
# Use the same "features.npy" suffix filter as the earlier test-data cell.
# The previous ".npy" filter would also pick up any other .npy file in the
# folder, making y_test disagree with the y_pred computed from that earlier load.
npy_files = os.listdir(testing_dir)
testing_npy_files = [f for f in npy_files if f.endswith("features.npy")]

testing_npy_list = []
for npy_file in testing_npy_files:
    npy_path = os.path.join(testing_dir, npy_file)
    npy_array = np.load(npy_path)
    testing_npy_list.append(npy_array)

testing_features = np.concatenate(testing_npy_list, axis=0)

# last column is used as the label, all other columns as inputs
X_test = testing_features[:, :-1]
y_test = testing_features[:, -1]


In [15]:

# Confusion matrix: rows are true labels, columns are predicted labels.
# y_pred is whatever the training loop assigned last, i.e. the predictions
# of the final model trained there.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(cm)

[[2355916   59057   82393]
 [ 234918   42044   27550]
 [   3615    3355    5364]]


In [18]:
# Row 2 of cm contains the samples whose TRUE label is class 2.
# (off-diagonal entries of that row) / (row total) is therefore the
# false-negative rate (miss rate, 1 - recall) of class 2 -- not a
# false-positive rate, despite the original variable name.
target_class = 2
class_row = cm[target_class]
fnr = (class_row.sum() - class_row[target_class]) / class_row.sum()

# Legacy alias so any later cell that still reads `fpr` keeps working;
# the value is the false-negative rate computed above.
fpr = fnr
fnr

0.5651045889411384