In [26]:
import pandas as pd
from sklearn import datasets, tree
from sklearn.preprocessing import LabelEncoder
# from id3 import Id3Estimator, export_text
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score

# Load the breast cancer dataset bundled with scikit-learn; later cells read
# its .data, .target and feature_names fields.
breast_cancer_dataset = datasets.load_breast_cancer()

# Load the raw play tennis dataset from a CSV file located next to the notebook.
# The values are label-encoded in a separate step, not here.
play_tennis_dataset = pd.read_csv('./PlayTennis.csv')

In [27]:
# Encode play tennis dataset: replace every column's values with integer codes
label_encoder = LabelEncoder()

# Fit the encoder exactly once per column and record, in the same pass, both
# the integer codes and the class list (position in the list == integer code),
# so the codes can be translated back to the original values later.
play_tennis_classes = dict()
encoded_columns = dict()
for column in play_tennis_dataset.columns:
    encoded_columns[column] = label_encoder.fit_transform(play_tennis_dataset[column])
    # Copy, because the shared encoder is refit on the next column.
    play_tennis_classes[column] = label_encoder.classes_.copy()

# Build the encoded frame directly from the per-column arrays; the column
# order follows the original dataset.
play_tennis_dataset_encoded = pd.DataFrame(encoded_columns)

# Split the target column off; the remaining columns are the features.
play_tennis_labels = play_tennis_dataset_encoded.pop('play')

In [28]:
# Parse data into training and testing
def parse_data(dataset, label, training_percentage):
    """Split a dataset and its labels sequentially into training and testing parts.

    The first ``round(len(dataset) * training_percentage)`` samples form the
    training part and the remainder forms the testing part. The data is NOT
    shuffled, so the split depends on the order of the input.

    Parameters
    ----------
    dataset : sliceable sequence (list, array, DataFrame, ...)
        The samples.
    label : sliceable sequence
        The labels; must have the same length as ``dataset``.
    training_percentage : float
        Fraction of the samples used for training, between 0 and 1 inclusive.

    Returns
    -------
    tuple
        ``(training_data, testing_data, training_label, testing_label)``.

    Raises
    ------
    ValueError
        If ``training_percentage`` is outside ``[0, 1]`` or if ``dataset``
        and ``label`` differ in length.
    """
    # A fraction outside [0, 1] would not fail on its own: a negative size
    # slices from the end and silently yields a wrong split.
    if not 0 <= training_percentage <= 1:
        raise ValueError(
            "training_percentage must be between 0 and 1, got %r" % (training_percentage,)
        )
    if len(dataset) != len(label):
        raise ValueError(
            "dataset and label must have the same length, got %d and %d"
            % (len(dataset), len(label))
        )

    training_size = round(len(dataset) * training_percentage)

    training_data = dataset[:training_size]
    testing_data = dataset[training_size:]

    training_label = label[:training_size]
    testing_label = label[training_size:]
    return training_data, testing_data, training_label, testing_label

# Use the first 80% of every dataset for training and the rest for testing
TRAINING_FRACTION = 0.8

(bc_training_data, bc_testing_data,
 bc_training_label, bc_testing_label) = parse_data(
    breast_cancer_dataset.data,
    breast_cancer_dataset.target,
    TRAINING_FRACTION,
)

(pt_training_data, pt_testing_data,
 pt_training_label, pt_testing_label) = parse_data(
    play_tennis_dataset_encoded,
    play_tennis_labels,
    TRAINING_FRACTION,
)

## Decision Tree Classifier

In [29]:
# Fit a decision tree on the breast cancer training split (fixed seed for
# reproducibility) and print its learned rules as text
bc_feature_names = breast_cancer_dataset.feature_names.tolist()
tree_decision = tree.DecisionTreeClassifier(random_state=0).fit(
    bc_training_data, bc_training_label
)
result = tree.export_text(tree_decision, feature_names=bc_feature_names)
print(result)

|--- worst perimeter <= 106.05
|   |--- worst smoothness <= 0.18
|   |   |--- worst concave points <= 0.16
|   |   |   |--- worst fractal dimension <= 0.06
|   |   |   |   |--- class: 0
|   |   |   |--- worst fractal dimension >  0.06
|   |   |   |   |--- worst texture <= 30.15
|   |   |   |   |   |--- area error <= 48.98
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- area error >  48.98
|   |   |   |   |   |   |--- mean compactness <= 0.06
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- mean compactness >  0.06
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- worst texture >  30.15
|   |   |   |   |   |--- mean fractal dimension <= 0.06
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- mean fractal dimension >  0.06
|   |   |   |   |   |   |--- smoothness error <= 0.01
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- smoothness error >  0.01
|   |   |   |   |   |   |   |--- mean radius <= 12.38
|  

In [30]:
# Classify the held-out samples with the fitted decision tree
tree_results = tree_decision.predict(bc_testing_data)

# Report accuracy first, then the macro-averaged F1 score, on the test split
tree_accuracy = accuracy_score(y_true=bc_testing_label, y_pred=tree_results)
tree_macro_f1 = f1_score(y_true=bc_testing_label, y_pred=tree_results, average='macro')
print(tree_accuracy)
print(tree_macro_f1)

0.8421052631578947
0.8044969512195121


## Id3 Estimator

In [31]:
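# NOTE: this cell is disabled. The `id3` import at the top of the notebook is
# commented out, and `training_data` / `training_label` are not defined in this
# notebook (the breast cancer splits are named `bc_training_data` /
# `bc_training_label`), so the lines below need updating before re-enabling.
# id3_estimator = Id3Estimator()
# id3_estimator = id3_estimator.fit(training_data, training_label)
# id3_tree = export_text(id3_estimator.tree_, breast_cancer_dataset['feature_names'])
# print(id3_tree)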

## K-Means

In [32]:
# Run K-Means clustering (unsupervised: the labels are not used for fitting)
kmeans = KMeans(n_clusters=2, random_state=0).fit(bc_training_data)

# Cluster ids are arbitrary: cluster 0 does not have to correspond to class 0,
# so scoring raw cluster ids against the labels can flip between runs, seeds
# or library versions. Map every cluster to the majority class among the
# training samples assigned to it (rows = cluster id, columns = class label).
cluster_to_class = pd.crosstab(kmeans.labels_, bc_training_label).idxmax(axis=1)

# Translate the predicted cluster ids of the test samples into class labels
kmeans_results = pd.Series(kmeans.predict(bc_testing_data)).map(cluster_to_class).to_numpy()

# Evaluate the model
print(accuracy_score(bc_testing_label, kmeans_results))
print(f1_score(bc_testing_label, kmeans_results, average='macro'))

0.9122807017543859
0.8591897233201581


## Neural Network

In [33]:
# Train a multi-layer perceptron on the breast cancer training split.
# random_state is fixed (as for the other models in this notebook) because the
# weight initialisation is random and the scores would otherwise change on
# every run.
# NOTE(review): the features are passed unscaled; scikit-learn's MLP guide
# recommends scaling the inputs - consider adding it if the scores look poor.
neural_network = MLPClassifier(max_iter=300, random_state=0).fit(bc_training_data, bc_training_label)
neural_network_results = neural_network.predict(bc_testing_data)

# Evaluate the model
print(accuracy_score(bc_testing_label, neural_network_results))
print(f1_score(bc_testing_label, neural_network_results, average='macro'))

0.9035087719298246
0.8742352823187243
