## down sampling the features

#### read folder 1 training nodule features

In [9]:
import numpy as np
import os

# Path to directory containing .npy files
training_dir = "data/cross_validation/folder1/train"

# List all .npy files in directory
npy_files = os.listdir(training_dir)
nodule_npy_files = [f for f in npy_files if f.endswith("nodule_features.npy")]

# Load each .npy file and store in a list
nodule_npy_list = []
for npy_file in nodule_npy_files:
    npy_path = os.path.join(training_dir, npy_file)
    npy_array = np.load(npy_path)
    nodule_npy_list.append(npy_array)

# Combine the list of numpy arrays into a single numpy array
nodule_features = np.concatenate(nodule_npy_list, axis=0)

print(nodule_features.shape)


(75467, 6)


#### read vessel featrues

In [10]:
# List all .npy files in directory
npy_files = os.listdir(training_dir)
vessel_npy_files = [f for f in npy_files if f.endswith("vessel_features.npy")]

# Load each .npy file and store in a list
vessel_npy_list = []
for npy_file in vessel_npy_files:
    npy_path = os.path.join(training_dir, npy_file)
    npy_array = np.load(npy_path)
    vessel_npy_list.append(npy_array)

# Combine the list of numpy arrays into a single numpy array
vessel_features = np.concatenate(vessel_npy_list, axis=0)

print(vessel_features.shape)

(1206799, 6)


## divide the data
- vessel divide into 15 parts

In [11]:
# shuffle the rows of the array in place
np.random.shuffle(vessel_features)

In [12]:
# divide vessel features into 15 parts
vessel_parts = np.array_split(vessel_features, 15)

# generate 15 parts traning data
train15 =[]
for i in range (15):
    combined = np.concatenate((vessel_parts[i], nodule_features), axis=0)
    train15.append(combined)


## load testing data (only load nodules and vessel)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# load testing data
testing_dir = "data/cross_validation/folder1/test"

npy_files = os.listdir(testing_dir)
testing_npy_files = [f for f in npy_files if (f.endswith("nodule_features.npy") or f.endswith("vessel_features.npy"))]

testing_npy_list = []
for npy_file in testing_npy_files:
    npy_path = os.path.join(testing_dir, npy_file)
    npy_array = np.load(npy_path)
    testing_npy_list.append(npy_array)

testing_features = np.concatenate(testing_npy_list, axis=0)

X_test = testing_features[:, :-1]
y_test = testing_features[:, -1]

# adjsut label vessel : 0 and nodule : 1
y_test -= 1

### training random forest

In [14]:
from joblib import dump

acc_list = []

for i in range(15):
    
    train = train15[i]
    X_train = train[:, :-1]
    y_train = train[:, -1]

    # adjust label
    y_train -= 1

    # create a random forest classifier and fit it to the training data
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)

    # use the trained model to predict on the testing data
    y_pred = rf.predict(X_test)

    # calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(str(i) + '_Accuracy:', accuracy)

    acc_list.append(accuracy)

    dump(rf, str(i) + '_random_forest_'+ str(round(accuracy, 5)) +'.joblib')


0_Accuracy: 0.8550809931296677
1_Accuracy: 0.8564846432494185
2_Accuracy: 0.8577592582684749
3_Accuracy: 0.8538850580500593
4_Accuracy: 0.8529944011355088
5_Accuracy: 0.8527709501074768
6_Accuracy: 0.8559842388345298
7_Accuracy: 0.8512382648870314
8_Accuracy: 0.8541714530296497
9_Accuracy: 0.8579984452843965
10_Accuracy: 0.858206160324539
11_Accuracy: 0.8588450414328561
12_Accuracy: 0.8512791784555442
13_Accuracy: 0.8548071869403889
14_Accuracy: 0.8555719559518227


In [15]:
sum(acc_list) / len(acc_list)

0.8551384819387575

## load model plot confusion matrix

In [22]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import joblib
import os
from sklearn.metrics import accuracy_score

testing_dir = "data/cross_validation/folder1/test"

# load testing data
npy_files = os.listdir(testing_dir)
testing_npy_files = [f for f in npy_files if (f.endswith("nodule_features.npy") or f.endswith("vessel_features.npy"))]

testing_npy_list = []
for npy_file in testing_npy_files:
    npy_path = os.path.join(testing_dir, npy_file)
    npy_array = np.load(npy_path)
    testing_npy_list.append(npy_array)

testing_features = np.concatenate(testing_npy_list, axis=0)

X_test = testing_features[:, :-1]
y_test = testing_features[:, -1]

y_test -= 1

# load rf model
rf = joblib.load("vn/0_random_forest_0.8503.joblib")

In [27]:
# generate predictions on the test set
y_pred = rf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

fpr = cm[1][0] / (cm[1][1]+cm[1][0])
print("fpr: ", fpr)

[[261905  42607]
 [  4824   7510]]
fpr:  0.3911139938381709
