In [1]:
import pandas as pd
from sklearn import datasets, tree
from sklearn.preprocessing import LabelEncoder
# from id3 import Id3Estimator, export_text
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Load the breast cancer dataset
breast_cancer_dataset = datasets.load_breast_cancer()

# Load the play tennis dataset and encode data
play_tennis_dataset = pd.read_csv('./PlayTennis.csv')

In [2]:
# Encode play tennis dataset
label_encoder = LabelEncoder()

play_tennis_dataset_encoded = pd.DataFrame([label_encoder.fit_transform(play_tennis_dataset[column]) for column in play_tennis_dataset.columns]).transpose()
play_tennis_dataset_encoded.columns = play_tennis_dataset.columns

play_tennis_classes = dict()
for column in play_tennis_dataset_encoded.columns:
    play_tennis_classes[column] = label_encoder.fit(play_tennis_dataset[column]).classes_

play_tennis_labels = play_tennis_dataset_encoded.pop('play')

In [3]:
# Parse data into training and testing
def parse_data(dataset, label, training_percentage):
    training_size = round(len(dataset) * training_percentage)

    training_data = dataset[:training_size]
    testing_data = dataset[training_size:]

    training_label = label[:training_size]
    testing_label = label[training_size:]
    return training_data, testing_data, training_label, testing_label

bc_training_data, bc_testing_data, bc_training_label, bc_testing_label = parse_data(breast_cancer_dataset.data, breast_cancer_dataset.target, 0.8)
pt_training_data, pt_testing_data, pt_training_label, pt_testing_label = parse_data(play_tennis_dataset_encoded, play_tennis_labels, 0.8)

## Decision Tree Classifier

In [4]:
# Train model using decision tree

# Breast Cancer
print("Breast Cancer")
tree_decision_bc = tree.DecisionTreeClassifier(random_state=0)
tree_decision_bc = tree_decision_bc.fit(bc_training_data, bc_training_label)
result = tree.export_text(tree_decision_bc, feature_names=breast_cancer_dataset['feature_names'].tolist())
print(result)

Breast Cancer
|--- worst perimeter <= 106.05
|   |--- worst smoothness <= 0.18
|   |   |--- worst concave points <= 0.16
|   |   |   |--- worst fractal dimension <= 0.06
|   |   |   |   |--- class: 0
|   |   |   |--- worst fractal dimension >  0.06
|   |   |   |   |--- worst texture <= 30.15
|   |   |   |   |   |--- area error <= 48.98
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- area error >  48.98
|   |   |   |   |   |   |--- mean compactness <= 0.06
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- mean compactness >  0.06
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- worst texture >  30.15
|   |   |   |   |   |--- mean fractal dimension <= 0.06
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- mean fractal dimension >  0.06
|   |   |   |   |   |   |--- smoothness error <= 0.01
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- smoothness error >  0.01
|   |   |   |   |   |   |   |--- mean radiu

In [5]:

# Play Tennis
print("Play Tennis")
tree_decision_pt = tree.DecisionTreeClassifier(random_state=0)
tree_decision_pt = tree_decision_pt.fit(pt_training_data, pt_training_label)
result = tree.export_text(tree_decision_pt, feature_names=play_tennis_dataset.columns[:-1].tolist())
print(result)

Play Tennis
|--- outlook <= 1.50
|   |--- windy <= 0.50
|   |   |--- class: 1
|   |--- windy >  0.50
|   |   |--- outlook <= 0.50
|   |   |   |--- class: 1
|   |   |--- outlook >  0.50
|   |   |   |--- class: 0
|--- outlook >  1.50
|   |--- humidity <= 0.50
|   |   |--- class: 0
|   |--- humidity >  0.50
|   |   |--- class: 1



In [6]:
# Predict testing data using the model

# Breast Cancer
tree_results_bc = tree_decision_bc.predict(bc_testing_data)
# Evaluate the model
print("Breast Cancer")
print("Accuracy Score:", accuracy_score(bc_testing_label, tree_results_bc))
print("F1 Score:", f1_score(bc_testing_label, tree_results_bc, average='macro'))

# Play Tennis
tree_results_pt = tree_decision_pt.predict(pt_testing_data)
# Evaluate the model
print("\nPlay Tennis")
print("Accuracy Score:", accuracy_score(pt_testing_label, tree_results_pt))
print("F1 Score:", f1_score(pt_testing_label, tree_results_pt, average='macro'))

Breast Cancer
Accuracy Score: 0.8421052631578947
F1 Score: 0.8044969512195121

Play Tennis
Accuracy Score: 1.0
F1 Score: 1.0


## Id3 Estimator

In [7]:
# id3_estimator = Id3Estimator()
# id3_estimator = id3_estimator.fit(training_data, training_label)
# id3_tree = export_text(id3_estimator.tree_, breast_cancer_dataset['feature_names'])
# print(id3_tree)

## K-Means

In [8]:
# Run K-Means clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(bc_training_data)
kmeans_results = kmeans.predict(bc_testing_data)

print("Breast Cancer")
kmeans_centroids_bc = pd.DataFrame(kmeans.cluster_centers_.transpose())
kmeans_centroids_bc.index = breast_cancer_dataset['feature_names'].tolist()
kmeans_centroids_bc.columns = ["Centroid 1", "Centroid 2"]
print(kmeans_centroids_bc)

Breast Cancer
                          Centroid 1  Centroid 2
mean radius                19.222478   12.587553
mean texture               21.637257   18.114649
mean perimeter            127.108850   81.335819
mean area                1164.976106  498.400585
mean smoothness             0.101174    0.094907
mean compactness            0.147851    0.090839
mean concavity              0.173736    0.064169
mean concave points         0.099197    0.034212
mean symmetry               0.192082    0.179094
mean fractal dimension      0.060460    0.063360
radius error                0.718746    0.309820
texture error               1.218749    1.195348
perimeter error             5.091000    2.180759
area error                 91.098673   24.335713
smoothness error            0.006624    0.007115
compactness error           0.032062    0.023753
concavity error             0.042033    0.029298
concave points error        0.015471    0.010760
symmetry error              0.020625    0.021047
fracta

In [9]:
# Evaluate the model
print("Breast Cancer")
print("Accuracy Score:", accuracy_score(bc_testing_label, kmeans_results))
print("F1 Score:", f1_score(bc_testing_label, kmeans_results, average='macro'))

Breast Cancer
Accuracy Score: 0.9122807017543859
F1 Score: 0.8591897233201581


## Logistic Regression

In [10]:
# Run Logistic Regression
logistic_regression = LogisticRegression(random_state=0, max_iter=2500).fit(bc_training_data, bc_training_label)
logistic_regression_results = logistic_regression.predict(bc_testing_data)

# Evaluate the model
print(accuracy_score(bc_testing_label, logistic_regression_results))
print(f1_score(bc_testing_label, logistic_regression_results, average='micro'))

0.9298245614035088
0.9298245614035088


## Neural Network

In [11]:
neural_network = MLPClassifier(max_iter=300).fit(bc_training_data, bc_training_label)
neural_network_results = neural_network.predict(bc_testing_data)
# neural_network.score(testing_data, testing_label)

# Evaluate the model
print(accuracy_score(bc_testing_label, neural_network_results))
print(f1_score(bc_testing_label, neural_network_results, average='macro'))

0.9298245614035088
0.9053156146179402


## SVM

In [12]:
# Create SVM model
svm_bc = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(bc_testing_data, bc_testing_label)
svm_bc_results = svm_bc.predict(bc_testing_data)

# Evaluate the model
print("Breast Cancer")
print(accuracy_score(bc_testing_label, svm_bc_results))
print(f1_score(bc_testing_label, svm_bc_results, average='macro'))
print()

# print("Play Tennis")
# print(accuracy_score(bc_testing_label, svc_results))
# print(f1_score(bc_testing_label, svc_results, average='macro'))
# print()

Breast Cancer
0.9824561403508771
0.9743820224719102

