In [66]:
'''

Author: Hevra Petekkaya
Project: Data Mining Assignment 2 - Data Classification
Due Date: 11th of May, 2023

'''

# Importing needed libraries
import pandas as pd
import numpy as np
import statistics as st
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from scipy.stats import kendalltau
from scipy.stats import pearsonr
from collections import Counter
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score


In [67]:
def identify_outliers(data):
    """
    Flags potential outliers in one-dimensional data using the 1.5 * IQR rule.

    A value is reported when it lies more than 1.5 * IQR below the first
    quartile or more than 1.5 * IQR above the third quartile.

    Parameters:
    -----------
    data : array-like
        One-dimensional numeric data.

    Returns:
    --------
    outliers : list
        The outlying values, in their original order and with their original
        element types. Empty when data is empty or contains no outliers.

    """
    # Quartiles are undefined for empty input, so report "no outliers"
    # instead of handing an empty array to np.percentile.
    if len(data) == 0:
        return []

    # Calculate the first and third quartiles and the IQR
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1

    # Compute the fences once rather than once per element
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    # Identify potential outliers using the IQR
    return [x for x in data if x < lower_fence or x > upper_fence]


In [68]:
def equal_depth_binning(data, num_bins):
    """
    Divides the data into num_bins equal-frequency bins.

    Bin i starts at the sorted element with index (i * n) // num_bins, where
    n is the number of data points. When n is not divisible by num_bins the
    leftover points are therefore spread across the bins (bin sizes differ by
    at most one) instead of all being pushed into the last bin.

    Parameters:
    -----------
    data : array-like
        The data to be binned. Must contain at least one value.
    num_bins : int
        The number of equal-frequency bins to divide the data into.
        Must be at least 1.

    Returns:
    --------
    bin_edges : list
        A list of num_bins+1 elements containing the edges of the bins.
        The first edge is the minimum and the last edge is the maximum.
        Edges repeat when num_bins exceeds the number of data points.

    Raises:
    -------
    ValueError
        If num_bins is smaller than 1 or data is empty.

    """
    if num_bins < 1:
        raise ValueError("num_bins must be at least 1")

    # Sort the data
    sorted_data = np.sort(data)
    num_points = len(sorted_data)
    if num_points == 0:
        raise ValueError("data must contain at least one value")

    # Determine the lower edge of every bin; proportional indices keep the
    # bin sizes within one point of each other.
    bin_edges = [sorted_data[(i * num_points) // num_bins] for i in range(num_bins)]

    # The maximum closes the last bin
    bin_edges.append(sorted_data[-1])

    # Return the edges of the bins
    return bin_edges

In [69]:
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('cng514-cancer-patient-data-assignment-2.csv')

# Convert the DataFrame to a NumPy array; later cells address columns by position
dataset = df.to_numpy()

# Shape is (rows, columns); the recorded run printed (1000, 14),
# i.e. 1000 patients with 14 attributes
print(dataset.shape)
# print(dataset)

(1000, 14)


In [70]:
# Data Preprocessing

# Risk level has three possible values but first we need to identify a way to map it to a binary form.
# Read the labels from df instead of slicing dataset: dataset[:, 13] is a view,
# so it silently changes to 0/1 once that column of dataset is overwritten with
# the binary labels, and re-running this cell would then count 0/1 instead of
# the original risk levels.
risk_levels = df.iloc[:, 13].to_numpy()

# Count the occurrences of each value in the risk level column
counts = Counter(risk_levels)

# Print the counts
print(counts)

Counter({'High': 365, 'Medium': 332, 'Low': 303})


In [71]:
# Create a LabelBinarizer object
lb = LabelBinarizer()

# Define the mapping of risk levels to binary labels.
# Note that 'High' is encoded as 0, so label 1 means "Medium or Low" risk.
risk_level_map = {'High': 0, 'Medium': 1, 'Low': 1}

# Always start from the untouched labels in df. This cell overwrites
# dataset[:, 13] below, so taking its input from dataset (or from a view of it)
# would make a second run fail with a KeyError on the already-encoded 0/1 values.
risk_levels_raw = df.iloc[:, 13]

# Convert the risk level data to binary labels
risk_levels_binary = [risk_level_map[risk_level] for risk_level in risk_levels_raw]

# Fit the LabelBinarizer object to the binary labels and transform the data.
# fit_transform returns a single-column 2-D array for binary input; ravel()
# flattens it so every patient gets one scalar label.
risk_levels_binary = lb.fit_transform(risk_levels_binary).ravel().tolist()

# Replace the textual risk level column with the binary labels
dataset[:, 13] = risk_levels_binary

In [72]:
# Check for missing values

# Scan every column except column 0 (presumably an identifier column - confirm
# against the data description) for NaN entries.
missing_values_present = False
for i in range(1, dataset.shape[1]):
    # The column is cast to float first because np.isnan needs a numeric array
    if np.isnan(dataset[:, i].astype(float)).any():
        print("Attribute {} has missing values".format(i))
        missing_values_present = True

if not missing_values_present:
    print("There are no missing values in any attributes/features!")

# In the recorded run there were no missing values in the attributes

There are no missing values in any attributes/features!


In [73]:
# Check for noisy data: apply the IQR outlier test to every column after the first
noisy_values_present = False
for col_idx in range(1, dataset.shape[1]):
    outliers = identify_outliers(dataset[:, col_idx])
    if not outliers:
        # Nothing flagged in this column, move on to the next one
        continue
    print("Attribute {} has noisy values".format(col_idx))
    print("Outliers are {}".format(outliers))
    noisy_values_present = True

if not noisy_values_present:
    print("There are no noisy values in any attributes/features!")

# The test flags values in the age attribute; however, they are not really noise.
# Please refer to the report for a detailed explanation.

Attribute 1 has noisy values
Outliers are [73, 73, 73, 73, 73, 73, 73, 73, 73, 73]


In [74]:
# define the hyperparameter grid
param_grid = {
    'max_depth': list(range(2, 14)),  # tree depths 2 through 13
    'min_samples_split': [2, 4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
}

# create a decision tree classifier
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run, which keeps the search result reproducible
dt = DecisionTreeClassifier(random_state=42)

# perform grid search with stratified cross-validation
skf = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(dt, param_grid=param_grid, cv=skf, error_score='raise')

# features: every column except the first and the last; target: the binary risk level
grid_search.fit(dataset[:, 1:-1], dataset[:, 13].astype(int))

# print the best hyperparameters and the corresponding mean cross-validation score
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best hyperparameters: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 2}
Best cross-validation score: 1.0


In [75]:
# Define a custom scoring function to calculate specificity and sensitivity
def specificity(y_true, y_pred):
    """
    True-negative rate for binary 0/1 labels: TN / (TN + FP).

    Parameters:
    -----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).

    Returns:
    --------
    float
        The share of class-0 samples predicted as class 0, or NaN when
        y_true contains no class-0 sample (the rate is undefined then).

    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
    # inputs; without it the four-way unpacking below raises a ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    negatives = tn + fp
    if negatives == 0:
        return float('nan')
    return tn / negatives

def sensitivity(y_true, y_pred):
    """
    True-positive rate for binary 0/1 labels: TP / (TP + FN).

    Parameters:
    -----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).

    Returns:
    --------
    float
        The share of class-1 samples predicted as class 1, or NaN when
        y_true contains no class-1 sample (the rate is undefined then).

    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
    # inputs; without it the four-way unpacking below raises a ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    positives = tp + fn
    if positives == 0:
        return float('nan')
    return tp / positives

In [76]:
# Now that the grid search with stratified cross-validation has identified the
# most promising combination of hyperparameters, we know which settings give
# the best tree.

# To obtain the accuracy, specificity, and sensitivity, we cross-validate a tree
# built with exactly that combination.
# Please refer to the Results section of the report for more detail.

# Separate the features and target variable
X = dataset[:, 1:-1]
y = dataset[:, 13].astype(int)

# Create a decision tree classifier with the best hyperparameters found through
# grid search. They are read from grid_search rather than typed in by hand so
# that this cell cannot drift out of sync with the search result; random_state
# is fixed to keep the scores reproducible.
dt = DecisionTreeClassifier(random_state=42, **grid_search.best_params_)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=10)

# Calculate the mean accuracy, specificity, and sensitivity using cross-validation
scoring = {'accuracy': make_scorer(accuracy_score),
           'specificity': make_scorer(specificity),
           'sensitivity': make_scorer(sensitivity)}
scores = cross_validate(dt, X, y, cv=cv, scoring=scoring)

# Print the mean accuracy, specificity, and sensitivity
print("Mean accuracy:", np.mean(scores['test_accuracy']))
print("Mean specificity:", np.mean(scores['test_specificity']))
print("Mean sensitivity:", np.mean(scores['test_sensitivity']))

Mean accuracy: 1.0
Mean specificity: 1.0
Mean sensitivity: 1.0


In [77]:
# Collect the out-of-fold prediction for every sample, reusing the splitter
# that produced the scores so the confusion matrix describes the same folds
y_pred = cross_val_predict(dt, X, y, cv=cv)

# Rows are the true classes and columns the predicted classes (order: 0, 1)
conf_mat = confusion_matrix(y, y_pred)
print(conf_mat)

[[365   0]
 [  0 635]]


In [78]:
# What we did above was to get a realistic approximation of how well a model can perform given the dataset.
# Consequently, now that we know the performance, we do not need separate data for testing; hence we can use
# the whole dataset for training.
# Please refer to the Results section of the report for a more detailed explanation.

# Take the best hyperparameters straight from the grid search instead of
# copying them by hand, so they always match what the search actually found
best_params = dict(grid_search.best_params_)

# Create the decision tree classifier with the best hyperparameters
# (random_state is fixed so the fitted tree is the same on every run)
dt_final = DecisionTreeClassifier(criterion=best_params['criterion'],
                                  max_depth=best_params['max_depth'],
                                  min_samples_split=best_params['min_samples_split'],
                                  random_state=42)

# Fit the model on the training set that is the whole dataset
dt_final.fit(X, y)