# Cancer Diagnosis Using Machine Learning - CS4661 - Jaquan Jones
(Question 1)

In [1]:
# library imports
import numpy as np
import pandas as pd

import math

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

### A. Read the dataset file “Cancer.csv” ([from GitHub](https://github.com/mpourhoma/CS4661/raw/master/Cancer.csv)), and assign it to a Pandas DataFrame.

 The last column is the binary label (“1” means it is a malignant cancer, “0” means it is a benign tumor). 

In [2]:
# Load the cancer dataset from the course GitHub repository into a DataFrame.
cancer_csv_url = "https://github.com/mpourhoma/CS4661/raw/master/Cancer.csv"
df = pd.read_csv(cancer_csv_url)

# bare last expression -> rich notebook display of the frame
df

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Malignant_Cancer
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
145,3,1,1,1,2,1,2,1,1,0
146,9,7,7,5,5,10,7,8,3,1
147,10,8,8,4,10,10,8,1,1,1
148,1,1,1,1,2,1,3,1,1,0


### B. Use sklearn functions to split the dataset into testing and training sets with the following parameters: **test_size=0.35, random_state=3**.

In [3]:
# Columns of the dataset that are used as model inputs (one per line for easy diffing).
feature_labels = [
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
]

# feature matrix: every row of df, restricted to the feature columns
X = df.loc[:, feature_labels]

X

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
145,3,1,1,1,2,1,2,1,1
146,9,7,7,5,5,10,7,8,3
147,10,8,8,4,10,10,8,1,1
148,1,1,1,1,2,1,3,1,1


In [4]:
# Label vector: the "Malignant_Cancer" column (1 = malignant, 0 = benign per the task statement)
label_column = 'Malignant_Cancer'
y = df[label_column]

y

0      0
1      0
2      0
3      0
4      0
      ..
145    0
146    1
147    1
148    0
149    0
Name: Malignant_Cancer, Length: 150, dtype: int64

In [5]:
# Hold out 35% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=3,
)

# report the shape of each partition (training pair first, then testing pair)
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(97, 9)
(97,)
(53, 9)
(53,)


### C. Use “Decision Tree Classifier” to predict Cancer based on the training/testing datasets that you built in part (B). Then, calculate and report the accuracy of your classifier.

---
**Answer C:** 

Accuracy Score of Decision Tree classifier (random_state=3):
0.8301886792452831


In [6]:
# method to obtain decision tree accuracy
def accuracy_of_decision_tree(rs, X_training, y_training, X_testing, y_testing, verbose=True):
    """Fit a decision tree on the training data and score it on the testing data.

    Parameters
    ----------
    rs
        Value forwarded as ``random_state`` to ``DecisionTreeClassifier``.
    X_training, y_training
        Feature matrix and label vector used to fit the tree.
    X_testing, y_testing
        Feature matrix to predict on and the labels the predictions are
        scored against.
    verbose : bool, default True
        When True (the default, matching the original behaviour) the predicted
        labels for the testing set are printed. Pass False to compute the
        accuracy without producing any output.

    Returns
    -------
    The value returned by ``accuracy_score(y_testing, predictions)``.
    """
    # Create the classifier with the requested seed
    my_DecisionTree = DecisionTreeClassifier(random_state=rs)

    # Train on the training split
    my_DecisionTree.fit(X_training, y_training)

    # Predict labels for the testing split
    y_predictions = my_DecisionTree.predict(X_testing)

    # Printing is optional so the function can be reused silently
    if verbose:
        print('Predictions for testing set:\n')
        print(y_predictions)

    return accuracy_score(y_testing, y_predictions)

In [7]:
# Score a single decision tree (seed 3) on the split built in part (B).
random_state = 3

dt_accuracy = accuracy_of_decision_tree(
    rs=random_state,
    X_training=X_train,
    y_training=y_train,
    X_testing=X_test,
    y_testing=y_test,
)

print(f'\nAccuracy of Decision Tree classifier, random_state={random_state}: {dt_accuracy}')

Predictions for testing set:

[0 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1]

Accuracy of Decision Tree classifier, random_state=3: 0.8301886792452831


### D. Now, we want to perform a new Ensemble Learning method called “Bagging” based on Voting on 19 decision tree classifiers.

---

**Answer D:**

Bagging (majority vote) accuracy score: 0.9056603773584906


In [8]:
# Bagging: train several decision trees, each on its own bootstrap sample of the
# training set, and collect every tree's predictions for the testing set.
n_trees = 19              # number of base learners (odd, so a majority vote cannot tie)
bootstrap_fraction = 0.8  # each bootstrap sample is 80% of the training-set size

original_dataset_size = len(X_train)
# the sample size is identical for every tree, so compute it once outside the loop
bootstrap_size = int(bootstrap_fraction * original_dataset_size)

predictions_lists = []

for seed in range(n_trees):
    # a different seed per tree gives each one a different bootstrap sample
    # (drawn with replacement)
    new_X_train, new_y_train = resample(X_train, y_train, n_samples=bootstrap_size,
                                        random_state=seed, replace=True)

    base_decision_tree = DecisionTreeClassifier(random_state=3)
    base_decision_tree.fit(new_X_train, new_y_train)

    predictions = base_decision_tree.predict(X_test)
    predictions_lists.append(predictions.tolist())


# iterate over what was actually collected instead of repeating the tree count
for tree_number, tree_predictions in enumerate(predictions_lists, start=1):
    print(f'Decision Tree {tree_number} predictions:\n{tree_predictions}\n')


Decision Tree 1 predictions:
[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1]

Decision Tree 2 predictions:
[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]

Decision Tree 3 predictions:
[1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]

Decision Tree 4 predictions:
[0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1]

Decision Tree 5 predictions:
[1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1]

Decision Tree 6 predictions:
[1, 1, 1, 1, 1, 0, 1,

In [9]:
# Stack the per-tree prediction lists into a 2-D array: one row per decision tree
prediction_matrix = np.asarray(predictions_lists)

prediction_matrix

array([[1, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 1, 0, ..., 0, 1, 1],
       ...,
       [1, 1, 0, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 0, 1, 1]])

In [10]:
# Flip the axes so each row holds every tree's prediction for one data sample
voting_matrix = prediction_matrix.T

voting_matrix

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 0, ..., 0, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [11]:
# voting method (positive vote for 1's, negative vote for 0's)

def get_majority_vote(prediction_list):
    """Return 1 when strictly more than half of the votes equal 1, otherwise 0.

    The input is converted with ``np.asarray`` so that a plain Python list is
    compared element-wise; without the conversion ``some_list == 1`` is a single
    ``False`` and every vote would silently be counted as 0. A tie (only
    possible with an even number of votes) and an empty input both yield 0.
    """
    votes = np.asarray(prediction_list)
    positive_votes = np.count_nonzero(votes == 1)
    return 1 if positive_votes > len(votes) // 2 else 0


# one majority vote per test sample (each row of voting_matrix is one sample's votes)
majority_vote_predictions = [get_majority_vote(sample_votes) for sample_votes in voting_matrix]

print(f'Majority votes of all decision trees: \n{majority_vote_predictions}')

Majority votes of all decision trees: 
[1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1]


In [12]:
# calculating accuracy of majority vote predictions
majority_vote_accuracy = accuracy_score(y_test ,majority_vote_predictions)

print(f'Bagging (majority vote) accuracy score: {majority_vote_accuracy}')

Bagging (majority vote) accuracy score: 0.9056603773584906


In [13]:
# alternate voting method (positive vote for 1's, negative vote for 0's) and test

# bagging_vote = lambda prediction_list: np.count_nonzero(prediction_list == 1) - np.count_nonzero(prediction_list == 0)

# get_majority_vote = lambda majority_vote: 1 if majority_vote > 0 else 0

# Since the number of predictions per sample is odd (19), the subtraction can never be 0, so there is no tie edge case to handle


# testing voting method accuracy
# print(f'All Row 2 predictions: {voting_matrix[2]}\n')
# print(f'# of predictions: {len(voting_matrix[2])}\n')
# print(f'Number of positive (1) predictions in Row 2: {np.count_nonzero(voting_matrix[2] == 1)}\n')
# print(f'Number of negative (0) predictions in Row 2: {np.count_nonzero(voting_matrix[2] == 0)}\n')
# print(f'"Positive predictions" - "negative predictions" in Row 2: \n{np.count_nonzero(voting_matrix[2] == 1)} - {np.count_nonzero(voting_matrix[2] == 0)} = {np.count_nonzero(voting_matrix[2] == 1) - np.count_nonzero(voting_matrix[2] == 0)}\n')
# print(f'Bagging vote for row 2: {get_bagging_vote(voting_matrix[2])}')
# print(f'Bagging vote prediction for row 2: {get_majority_vote(bagging_vote(voting_matrix[2]))}')

### E. Use scikit-learn “Random Forest” classifier to predict Cancer based on the training/testing datasets that you built in part (B). Then, calculate and report the accuracy of your classifier.

---
**Answer E:**

Random Forest Classifier accuracy score: 0.9245283018867925

In [14]:
# Random forest with 19 bootstrapped trees and a fixed seed
my_randomforest = RandomForestClassifier(
    n_estimators=19,
    bootstrap=True,
    random_state=3,
)

# fit() returns the fitted estimator, so training and prediction can be chained
y_predictions = my_randomforest.fit(X_train, y_train).predict(X_test)

# fraction of testing samples whose predicted label matches the true label
rf_accuracy_score = accuracy_score(y_true=y_test, y_pred=y_predictions)

print(f'Random Forest Classifier accuracy score: {rf_accuracy_score}')

Random Forest Classifier accuracy score: 0.9245283018867925
