# Projects in data science, Project: Skin lesions

# Part 2: Training candidate models and building the final decision tree model

##### Imports

In [2]:
# Import libraries
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


# Fix NumPy's global random state so repeated runs draw the same numbers
np.random.seed(seed=42)

##### File paths, please update before running

In [3]:
# Location of the attributes CSV read by the next cell.
# Set the ATTRIBUTES_CSV_PATH environment variable to point at your own copy;
# when it is not set, the original hard-coded location is used as the default.
Attributescsv_path = os.environ.get(
    "ATTRIBUTES_CSV_PATH",
    "C:\\Users\\elias\\Downloads\\Attributes_final.csv",
)

## Preparing data from csv

In [4]:
# Load the comma-separated attributes file into a DataFrame
df = pd.read_csv(Attributescsv_path, sep=',')

# Columns that are dropped before modelling: the lesion name, the patient id
# and the target itself
non_feature_columns = ['lesion_name', 'is_cancer_bool', 'patient_id']

# Features (X): a new frame with every remaining column
X = df.drop(columns=non_feature_columns)
# Target (y): the cancer flag
y = df['is_cancer_bool']

## Picking training and testing data

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Training models

Random Forest classifier

In [6]:
# Random Forest baseline: 107 trees, seeded so the fit is repeatable
random_forest = RandomForestClassifier(
    n_estimators=107,
    random_state=42,
)
random_forest.fit(X=x_train, y=y_train)

Decision Tree classifier

In [7]:
# Decision Tree baseline with default hyperparameters, seeded so the fit is repeatable
decision_tree = DecisionTreeClassifier(
    random_state=42,
)
decision_tree.fit(X=x_train, y=y_train)


Logistic regression model

In [8]:
# Logistic regression baseline with default settings and a fixed seed
logistic_regression = LogisticRegression(
    random_state=42,
)
logistic_regression.fit(X=x_train, y=y_train)

In [9]:
# k-nearest-neighbours baseline using 3 neighbours (tune n_neighbors as needed)
knn = KNeighborsClassifier(
    n_neighbors=3,
)
# Fit on the training split
knn.fit(X=x_train, y=y_train)

# Make predictions and test model

In [10]:
# Score the fitted Random Forest on the held-out test split
y_pred_random_forest = random_forest.predict(x_test)

# Compute both metrics first, then report them
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_random_forest)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_random_forest)

print("Random Forest Accuracy:", accuracy)
# Confusion matrix: rows are the true labels, columns the predicted labels
print("Confusion Matrix:", cm, sep="\n")

Random Forest Accuracy: 0.5555555555555556
Confusion Matrix:
[[ 9 11]
 [ 9 16]]


In [11]:
# Score the fitted Decision Tree on the held-out test split
y_pred_decision_tree = decision_tree.predict(x_test)

# Compute both metrics first, then report them
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_decision_tree)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_decision_tree)

print("Decision Tree Accuracy:", accuracy)
# Confusion matrix: rows are the true labels, columns the predicted labels
print("Confusion Matrix:", cm, sep="\n")

Decision Tree Accuracy: 0.6
Confusion Matrix:
[[ 8 12]
 [ 6 19]]


In [12]:
# Score the fitted logistic regression model on the held-out test split
y_pred_logistic_regression = logistic_regression.predict(x_test)

# Compute both metrics first, then report them
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_logistic_regression)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_logistic_regression)

print("logistic regression model Accuracy:", accuracy)
# Confusion matrix: rows are the true labels, columns the predicted labels
print("Confusion Matrix:", cm, sep="\n")

logistic regression model Accuracy: 0.6
Confusion Matrix:
[[ 6 14]
 [ 4 21]]


In [13]:
# # Make KNN predictions on the test set
# knn_predictions = knn.predict(x_test)

# # Evaluate the KNN model
# accuracy = accuracy_score(y_test, knn_predictions)
# print("KNN model Accuracy:", accuracy)

# # Compute confusion matrix
# cm = confusion_matrix(y_test, knn_predictions)
# print("Confusion Matrix:")
# print(cm)

# Choosing a decision tree: tuning it with grid-search cross-validation (GridSearchCV) to train the final model, while weighting the cancer class more heavily so that false negatives are punished more strongly.


In [14]:

# Tune a class-weighted decision tree with patient-grouped cross-validation,
# then report accuracy, confusion matrix and recall on the held-out test split.

# Hyperparameter grid for the search (10 * 3 * 3 = 90 candidate combinations)
param_grid = {
    'max_depth': [None, 10,15, 20,25, 30,40,50,60,70],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required to be at a leaf node
}

# Split the data into training and testing sets (80/20, fixed seed)
# NOTE(review): this split is made per row, not per patient, so lesions from the same
# patient can end up in both the train and the test set even though the cross-validation
# below is grouped by patient. Consider a group-aware hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Class weights: errors on class 1 count ten times as much as errors on class 0,
# which pushes the tree away from false negatives
class_weights = {
    0: 1,  # Class 0
    1: 10  # Class 1 (adjust this weight accordingly)
}

# Decision tree with class weights. random_state is fixed (like the other models in this
# notebook) so that the selected tree is reproducible from run to run.
decision_tree = DecisionTreeClassifier(class_weight=class_weights, random_state=42)

#######################################################################

# 5-fold cross-validation that keeps all rows of one patient in the same fold
group_kfold = GroupKFold(n_splits=5)
# One patient_id per row of X_train, aligned through the DataFrame index
groups_train = df.loc[X_train.index, 'patient_id']
# NOTE(review): candidates are ranked by plain accuracy; the class weights only affect
# how each tree is grown, not how the grid search picks the winner.
grid_search = GridSearchCV(decision_tree, param_grid, cv=group_kfold, n_jobs=-1, scoring='accuracy')
#######################################################################
# Run the grid search; `groups` is forwarded to GroupKFold
grid_search.fit(X_train, y_train, groups=groups_train)

# Best estimator found by the search
best_decision_tree = grid_search.best_estimator_

# Predict on the held-out test set
y_pred_best = best_decision_tree.predict(X_test)

# Accuracy of the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Best Decision Tree Accuracy:", accuracy_best)

# Confusion matrix of the best model
cm_best = confusion_matrix(y_test, y_pred_best)
print("Best Model Confusion Matrix:")
print(cm_best)

# Winning hyperparameters (reused by the final-model cell)
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Recall of the best model on the test set
recall_best = recall_score(y_test, y_pred_best)
print("Best Model Recall:", recall_best)


KeyboardInterrupt: 

In [None]:
# Number of distinct patient ids in the data
len({patient for patient in df['patient_id']})

214

In [None]:
    # Final model: a class-weighted decision tree using the hyperparameters chosen by the
    # grid search. random_state is fixed so the exported model is reproducible.
    classifier = DecisionTreeClassifier(class_weight=class_weights,
                                        max_depth=best_params['max_depth'],
                                        min_samples_leaf=best_params['min_samples_leaf'],
                                        min_samples_split=best_params['min_samples_split'],
                                        random_state=42)
    #It will be tested on external data, so we can try to maximize the use of our available data by training on 
    #ALL of x and y
    classifier = classifier.fit(X,y)

    #This is the classifier you need to save using pickle, add this to your zip file submission
    filename = 'groupNJ_classifier.sav'
    # Use a context manager so the file is flushed and closed even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # Get the absolute path of the saved file
    print("Trained model has been exported and saved at:")
    print(os.path.abspath(filename))

In [None]:
# Display the patient id column
df.loc[:, 'patient_id']

0      PAT_1000
1       PAT_101
2      PAT_1021
3      PAT_1022
4      PAT_1026
         ...   
217     PAT_988
218     PAT_990
219     PAT_994
220     PAT_998
221       PAT_9
Name: patient_id, Length: 222, dtype: object

In [None]:
# Example attribute row, displayed for reference. Its first 25 values are the same as the
# `input_array` literal in the next cell; the trailing `1` is presumably the
# `is_cancer_bool` label of that row — TODO confirm against the CSV.
[0.2275020718568712,0.2456734753570906,0.18203242799172492,0.2324703107187357,0.2365877736069809,0.20725136935523025,0.4459639,0.35605845,0.29192758,0.0880147,0.08986005,0.08697982,1,0,1,1,0,1,4,0,21.246067745093534,3.118813039095475,0.30449086882590254,0.03787845838724052,0.965304997317334,1]

In [None]:
# Predict class probabilities for a single, hand-entered sample.

# 25 feature values for one lesion (assumed to be in the same column order as X — TODO confirm)
input_array = np.array([0.2275020718568712,0.2456734753570906,0.18203242799172492,0.2324703107187357,0.2365877736069809,0.20725136935523025,0.4459639,0.35605845,0.29192758,0.0880147,0.08986005,0.08697982,1,0,1,1,0,1,4,0,21.246067745093534,3.118813039095475,0.30449086882590254,0.03787845838724052,0.965304997317334])

# Reshape the input array to be 2-dimensional (one row = one sample)
input_array_reshaped = input_array.reshape(1, -1)

# The classifier was fitted on the DataFrame X, so label the sample with the same columns.
# Passing a bare ndarray makes scikit-learn warn that the input has no valid feature names,
# and a wrong number of values now fails here with a clear error.
input_frame = pd.DataFrame(input_array_reshaped, columns=X.columns)

# Now, predict the probabilities
probabilities = classifier.predict_proba(input_frame)

print("Predicted probabilities:", probabilities)


In [None]:
import pickle
from sklearn.tree import export_graphviz
import graphviz

# Pickled classifier to visualise
filename = "C:\\Users\\elias\\Desktop\\Github repos\\medical-imaging\\Code files\\groupNJ_classifier.sav"

# Unpickle the stored model (pickle can run arbitrary code: only load files you trust)
with open(filename, mode='rb') as file:
    loaded_model = pickle.load(file)

# Export the tree as DOT text (out_file=None returns it instead of writing a file)
dot_data = export_graphviz(
    loaded_model,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Wrap the DOT text and render it to "decision_tree_visualization"
graph = graphviz.Source(dot_data)
graph.render("decision_tree_visualization")


In [None]:
# Render `graph` and open the result in the system's default viewer
graph.view()

In [None]:
import pickle
from sklearn.tree import export_graphviz
import graphviz

# Names shown on the tree's split nodes, one per model feature
feature_names = [
    "symmetry_major", "symmetry_minor", "ssim_major", "ssim_minor",
    "symmetry_score", "color_symmetry_score",
    "avg_red_value", "avg_blue_value", "avg_green_value",
    "avg_red_std_dev", "avg_blue_std_dev", "avg_green_std_dev",
    "white", "red", "light_brown", "dark_brown", "blue_gray", "black",
    "color_sum", "blue_white_veil_score",
    "Haralick_contrast", "Haralick_dissimilarity", "Haralick_homogeneity",
    "Haralick_energy", "Haralick_correlation",
]

# Pickled classifier to visualise
filename = "C:\\Users\\elias\\Desktop\\Github repos\\medical-imaging\\Code files\\groupNJ_classifier.sav"

# Unpickle the stored model (pickle can run arbitrary code: only load files you trust)
with open(filename, mode='rb') as file:
    loaded_model = pickle.load(file)

# Export the tree as DOT text, labelling the splits with the feature names
dot_data = export_graphviz(
    loaded_model,
    out_file=None,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Wrap the DOT text and render it to "decision_tree_visualization"
graph = graphviz.Source(dot_data)
graph.render("decision_tree_visualization")


In [None]:
# Render `graph` and open the result in the system's default viewer
graph.view()

In [15]:
import pickle
from sklearn.tree import export_graphviz
import graphviz

# Load the exported classifier and render it as a labelled decision-tree diagram.

# Names shown on the tree's split nodes, one per model feature
feature_names = [
    "symmetry_major", "symmetry_minor", "ssim_major", "ssim_minor",
    "symmetry_score", "color_symmetry_score",
    "avg_red_value", "avg_blue_value", "avg_green_value",
    "avg_red_std_dev", "avg_blue_std_dev", "avg_green_std_dev",
    "white", "red", "light_brown", "dark_brown", "blue_gray", "black",
    "color_sum", "blue_white_veil_score",
    "Haralick_contrast", "Haralick_dissimilarity", "Haralick_homogeneity",
    "Haralick_energy", "Haralick_correlation",
]

# Define the filename from which to load the model
filename = "C:\\Users\\elias\\Desktop\\Github repos\\medical-imaging\\groupNJ_classifier.sav"

# Load the model from file
# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Class labels in ascending order of the target value (0 = non-cancer, 1 = cancer)
class_labels = ['Non-Cancer', 'Cancer']

# Generate the DOT description of the tree with custom feature names and class labels.
# (The two str.replace calls that used to follow replaced a string with itself and so
# changed nothing; they have been removed.)
dot_data = export_graphviz(loaded_model, out_file=None,
                           feature_names=feature_names,
                           class_names=class_labels,
                           filled=True, rounded=True,
                           special_characters=True)

# Wrap the DOT text and render it to "decision_tree_visualization"
graph = graphviz.Source(dot_data)
graph.render("decision_tree_visualization")


'decision_tree_visualization.pdf'

In [16]:
# Render `graph` and open the result in the system's default viewer
graph.view()

'decision_tree_visualization.pdf'