# Projects in data science, Project: Skin lesions

# Part 2: Training models and building the final decision tree classifier

##### Imports

In [19]:
# Import libraries
import os
import pickle 
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score


# Seed NumPy's legacy global random number generator with a fixed value
np.random.seed(42)

##### File paths, please update before running

In [2]:
# Location of the attributes CSV that the data-preparation cell reads.
# Set the ATTRIBUTES_CSV_PATH environment variable to point at your own copy
# instead of editing the notebook; the original hard-coded path is the fallback.
Attributescsv_path = os.environ.get(
    "ATTRIBUTES_CSV_PATH",
    "C:\\Users\\elias\\Downloads\\Attributes_final.csv",
)

## Preparing data from csv

In [3]:
# Load the comma-separated attribute table into a DataFrame
df = pd.read_csv(Attributescsv_path, sep=',')
# Feature matrix: every column except the lesion name, the label and the patient id
X = df.drop(['lesion_name', 'is_cancer_bool', 'patient_id'], axis=1)
# Label vector: the is_cancer_bool column
y = df['is_cancer_bool']

## Picking training and testing data

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
# NOTE(review): patient_id is dropped before this row-level split, so rows from one
# patient could land in both partitions — confirm whether patients repeat in the data.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Training models

Random Forest classifier

In [25]:
# Random Forest of 107 trees with a fixed seed, fitted on the training partition
random_forest = RandomForestClassifier(
    random_state=42,
    n_estimators=107,
)
random_forest.fit(X=x_train, y=y_train)

Decision Tree classifier

In [26]:
# Baseline decision tree (default hyperparameters, fixed seed) fitted on the training partition
decision_tree = DecisionTreeClassifier(
    random_state=42,
)
decision_tree.fit(X=x_train, y=y_train)


Logistic regression model

In [27]:
# Logistic regression with default settings and a fixed seed, fitted on the training partition
logistic_regression = LogisticRegression(
    random_state=42,
)
logistic_regression.fit(X=x_train, y=y_train)

In [28]:
# k-nearest-neighbours classifier voting over the 3 closest training samples
# (n_neighbors is the value to tune if a different neighbourhood size is wanted)
knn = KNeighborsClassifier(
    n_neighbors=3,
)
# Fit on the training partition
knn.fit(X=x_train, y=y_train)

# Make predictions and test model

In [29]:
# Predict test-set labels with the fitted Random Forest
y_pred_random_forest = random_forest.predict(x_test)

# Fraction of test samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_random_forest)
print("Random Forest Accuracy:", accuracy)

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_random_forest)
print("Confusion Matrix:", cm, sep="\n")

Random Forest Accuracy: 0.5555555555555556
Confusion Matrix:
[[ 9 11]
 [ 9 16]]


In [30]:
# Predict test-set labels with the fitted baseline Decision Tree
y_pred_decision_tree = decision_tree.predict(x_test)

# Fraction of test samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_decision_tree)
print("Decision Tree Accuracy:", accuracy)

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_decision_tree)
print("Confusion Matrix:", cm, sep="\n")

Decision Tree Accuracy: 0.6
Confusion Matrix:
[[ 8 12]
 [ 6 19]]


In [31]:
# Predict test-set labels with the fitted logistic regression model
y_pred_logistic_regression = logistic_regression.predict(x_test)

# Fraction of test samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_logistic_regression)
print("logistic regression model Accuracy:", accuracy)

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_logistic_regression)
print("Confusion Matrix:", cm, sep="\n")

logistic regression model Accuracy: 0.6
Confusion Matrix:
[[ 6 14]
 [ 4 21]]


# Choosing the decision tree and tuning it with grid-search cross-validation (GridSearchCV) to train the final model, while penalising false negatives more heavily through class weights.


In [32]:


# Tune a class-weighted decision tree with 5-fold cross-validated grid search,
# then report accuracy, confusion matrix, chosen hyperparameters and recall on
# the held-out test partition.

# Define the parameter grid for grid search (10 x 3 x 3 = 90 combinations)
param_grid = {
    'max_depth': [None, 10, 15, 20, 25, 30, 40, 50, 60, 70],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required to be at a leaf node
}

# Split the data into training and testing sets (same test_size and seed as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the class weights: class 1 samples weigh ten times as much as class 0 samples
class_weights = {
    0: 1,  # Class 0
    1: 10  # Class 1 (adjust this weight accordingly)
}

# Initialize the Decision Tree classifier with class weights.
# random_state is fixed (matching the baseline tree) so the search is repeatable:
# with n_jobs=-1 the candidate fits run in worker processes, which do not share
# the notebook's np.random.seed(42) state.
decision_tree = DecisionTreeClassifier(class_weight=class_weights, random_state=42)

# Initialize GridSearchCV
# NOTE(review): candidates are ranked by plain accuracy; the false-negative emphasis
# comes only from class_weight. Consider scoring='recall' if recall should drive selection.
grid_search = GridSearchCV(decision_tree, param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Perform grid search cross-validation
grid_search.fit(X_train, y_train)

# Get the best model (refitted on the whole training partition by GridSearchCV)
best_decision_tree = grid_search.best_estimator_

# Make predictions on the test set
y_pred_best = best_decision_tree.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Best Decision Tree Accuracy:", accuracy_best)

# Compute confusion matrix for the best model (rows: true class, columns: predicted class)
cm_best = confusion_matrix(y_test, y_pred_best)
print("Best Model Confusion Matrix:")
print(cm_best)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the model with recall (share of true class-1 samples that were predicted as class 1)
recall_best = recall_score(y_test, y_pred_best)
print("Best Model Recall:", recall_best)


Best Decision Tree Accuracy: 0.7333333333333333
Best Model Confusion Matrix:
[[11  9]
 [ 3 22]]
Best Hyperparameters: {'max_depth': 50, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Model Recall: 0.88


In [33]:
# Train the final class-weighted decision tree on the full dataset and export it with pickle.

# Class 1 samples weigh ten times as much as class 0 samples
class_weights = {
    0: 1,  # Class 0
    1: 10  # Class 1 (adjust this weight accordingly)
}
# NOTE(review): these hyperparameters are hard-coded rather than read from
# grid_search.best_params_ — confirm they are the intended final values.
classifier = DecisionTreeClassifier(
    class_weight=class_weights,
    max_depth=15,
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=42,  # fixed seed so re-running this cell trains the same tree
)

# It will be tested on external data, so we can try to maximize the use of our available data by training on
# ALL of X and y
classifier = classifier.fit(X, y)

# This is the classifier you need to save using pickle, add this to your zip file submission
filename = 'groupNJ_classifier.sav'
# The context manager closes (and flushes) the file even if pickling raises,
# instead of leaving an open handle behind.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Get the absolute path of the saved file
print("Trained model has been exported and saved at:")
print(os.path.abspath(filename))

Trained model has been exported and saved at:
C:\Users\elias\Desktop\Github repos\medical-imaging\Code files\groupNJ_classifier.sav
