In [1]:
import os
import shutil
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from skimage.io import imread_collection
from sklearn.model_selection import GridSearchCV
from skimage.io import imread
from skimage.transform import resize
import warnings

In [2]:
# disable Future Warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# give the images uniform names

def rename_files(folder):
    filenames = os.listdir(folder)
    for i, filename in enumerate(filenames):
        # Extract the file extension
        ext = os.path.splitext(filename)[1]
        # Generate the new filename with three-digit number
        new_filename = str(i+1).zfill(3) + ext
        # Construct the full path to the file
        old_path = os.path.join(folder, filename)
        new_path = os.path.join(folder, new_filename)
        # Rename the file
        os.rename(old_path, new_path)

# Rename files in the 'no' folder
rename_files('no')

# Rename files in the 'yes' folder
rename_files('yes')

In [4]:
# divide dataset in tran/test subsets

def create_train_test_sets(source_folder, train_folder, test_folder, train_proportion):
    # Get the list of filenames in the source folder
    filenames = os.listdir(source_folder)
    # Calculate the number of files for training and testing
    num_files = len(filenames)
    num_train = int(train_proportion * num_files)
    num_test = num_files - num_train
    
    # Shuffle the filenames
    np.random.shuffle(filenames)
    
    # Create the 'train' and 'test' folders if they don't exist
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)
    
    # Copy files to the 'train' folder
    for filename in filenames[:num_train]:
        src_path = os.path.join(source_folder, filename)
        dst_path = os.path.join(train_folder, filename)
        shutil.copyfile(src_path, dst_path)
    
    # Copy files to the 'test' folder
    for filename in filenames[num_train:]:
        src_path = os.path.join(source_folder, filename)
        dst_path = os.path.join(test_folder, filename)
        shutil.copyfile(src_path, dst_path)

# Create 'train' and 'test' subsets for 'no_tumor' class
create_train_test_sets('no', 'train/no', 'test/no', train_proportion=0.8)

# Create 'train' and 'test' subsets for 'tumor' class
create_train_test_sets('yes', 'train/yes', 'test/yes', train_proportion=0.8)

In [5]:
# Load the 'no_tumor' images for training
train_no = imread_collection('train/no/*.jpg')
train_no_labels = np.zeros(len(train_no))  # Class label 0 for 'no_tumor'

# Load the 'tumor' images for training
train_yes = imread_collection('train/yes/*.jpg')
train_yes_labels = np.ones(len(train_yes))  # Class label 1 for 'tumor'

# Concatenate the training data and labels
train_data = np.concatenate((train_no, train_yes), axis=0)
train_labels = np.concatenate((train_no_labels, train_yes_labels), axis=0)

# Flatten the image data into a 1D array and resize images if necessary
reshaped_train_data = []
for image in train_data:
    if image.shape != (224, 224):  # If image shape is not (224, 224), resize it
        image = np.resize(image, (224, 224))
    reshaped_train_data.append(image.flatten())
train_data = np.array(reshaped_train_data)

# Train the Random Forest classifier
clf = RandomForestClassifier(max_depth = None, max_features = 'sqrt', min_samples_leaf = 1, min_samples_split = 5, n_estimators = 100, random_state = 42)
clf.fit(train_data, train_labels)

# Load the 'no_tumor' images for testing
test_no = imread_collection('test/no/*.jpg')
test_no_labels = np.zeros(len(test_no))  # Class label 0 for 'no_tumor'

# Load the 'tumor' images for testing
test_yes = imread_collection('test/yes/*.jpg')
test_yes_labels = np.ones(len(test_yes))  # Class label 1 for 'tumor'

# Concatenate the testing data and labels
test_data = np.concatenate((test_no, test_yes), axis=0)
test_labels = np.concatenate((test_no_labels, test_yes_labels), axis=0)

# Flatten the image data into a 1D array and resize images if necessary
reshaped_test_data = []
for image in test_data:
    if image.shape != (224, 224):  # If image shape is not (224, 224), resize it
        image = np.resize(image, (224, 224))
    reshaped_test_data.append(image.flatten())
test_data = np.array(reshaped_test_data)

# Predict using the trained model
predictions = clf.predict(test_data)

# Calculate accuracy
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)



Accuracy: 0.984251968503937


# Hyperparameter tuning (takes a long time)

In [6]:
# # Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 5, 10],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# # Create the grid search object
# grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

# # Fit the grid search to the training data
# grid_search.fit(train_data, train_labels)

# # Get the best hyperparameters found
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)

In [11]:
# check selected image

# Function to manually check an image
def check_image(image_path):
    # Load and preprocess the image
    image = imread(image_path)
    if image.shape != (224, 224):
        image = np.resize(image, (224, 224))
    flattened_image = image.flatten()

    # Make a prediction using the trained model
    prediction = clf.predict([flattened_image])

    # Map the prediction to the corresponding class label
    if prediction == 0:
        class_label = 'no_tumor'
    else:
        class_label = 'tumor'

    # Display the result
    print('Image:', image_path)
    print('Prediction:', class_label)

# Provide the image path for manual checking
image_path = '006.jpg'
check_image(image_path)

image_path = '078.jpg'
check_image(image_path)

Image: 006.jpg
Prediction: no_tumor
Image: 078.jpg
Prediction: tumor
