In [164]:
# Import dataset library
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder

# Import algorithm library
from sklearn import tree
import six
import sys
sys.modules['sklearn.externals.six'] = six
from id3 import Id3Estimator, export_text
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Load the breast cancer dataset
breast_cancer_dataset = datasets.load_breast_cancer()

# Load the play tennis dataset and encode data
play_tennis_dataset = pd.read_csv('./PlayTennis.csv')

In [165]:
# Encode play tennis dataset
label_encoder = LabelEncoder()

play_tennis_dataset_encoded = pd.DataFrame([label_encoder.fit_transform(play_tennis_dataset[column]) for column in play_tennis_dataset.columns]).transpose()
play_tennis_dataset_encoded.columns = play_tennis_dataset.columns

play_tennis_classes = dict()
for column in play_tennis_dataset_encoded.columns:
    play_tennis_classes[column] = label_encoder.fit(play_tennis_dataset[column]).classes_

play_tennis_labels = play_tennis_dataset_encoded.pop('play')

In [166]:
# Parse data into training and testing
def parse_data(dataset, label, training_percentage):
    training_size = round(len(dataset) * training_percentage)

    training_data = dataset[:training_size]
    testing_data = dataset[training_size:]

    training_label = label[:training_size]
    testing_label = label[training_size:]
    return training_data, testing_data, training_label, testing_label

bc_training_data, bc_testing_data, bc_training_label, bc_testing_label = parse_data(breast_cancer_dataset.data, breast_cancer_dataset.target, 0.8)
pt_training_data, pt_testing_data, pt_training_label, pt_testing_label = parse_data(play_tennis_dataset_encoded, play_tennis_labels, 0.8)

## Decision Tree Classifier

In [167]:
# Breast Cancer
print("Breast Cancer")
tree_decision_bc = tree.DecisionTreeClassifier(random_state=0)
tree_decision_bc = tree_decision_bc.fit(bc_training_data, bc_training_label)
result = tree.export_text(tree_decision_bc, feature_names=breast_cancer_dataset['feature_names'].tolist())
print(result)

Breast Cancer
|--- worst perimeter <= 106.05
|   |--- worst smoothness <= 0.18
|   |   |--- worst concave points <= 0.16
|   |   |   |--- worst fractal dimension <= 0.06
|   |   |   |   |--- class: 0
|   |   |   |--- worst fractal dimension >  0.06
|   |   |   |   |--- worst texture <= 30.15
|   |   |   |   |   |--- area error <= 48.98
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- area error >  48.98
|   |   |   |   |   |   |--- mean compactness <= 0.06
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- mean compactness >  0.06
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- worst texture >  30.15
|   |   |   |   |   |--- mean fractal dimension <= 0.06
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- mean fractal dimension >  0.06
|   |   |   |   |   |   |--- smoothness error <= 0.01
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- smoothness error >  0.01
|   |   |   |   |   |   |   |--- mean radiu

In [168]:
# Evaluate the model
tree_results_bc = tree_decision_bc.predict(bc_testing_data)
print("Accuracy Score:", accuracy_score(bc_testing_label, tree_results_bc))
print("F1 Score:", f1_score(bc_testing_label, tree_results_bc, average='macro'))

Accuracy Score: 0.8421052631578947
F1 Score: 0.8044969512195121


In [169]:
# Play Tennis
print("Play Tennis")
tree_decision_pt = tree.DecisionTreeClassifier(random_state=0)
tree_decision_pt = tree_decision_pt.fit(pt_training_data, pt_training_label)
result = tree.export_text(tree_decision_pt, feature_names=play_tennis_dataset.columns[:-1].tolist())
print(result)

Play Tennis
|--- outlook <= 1.50
|   |--- windy <= 0.50
|   |   |--- class: 1
|   |--- windy >  0.50
|   |   |--- outlook <= 0.50
|   |   |   |--- class: 1
|   |   |--- outlook >  0.50
|   |   |   |--- class: 0
|--- outlook >  1.50
|   |--- humidity <= 0.50
|   |   |--- class: 0
|   |--- humidity >  0.50
|   |   |--- class: 1



In [170]:
# Evaluate the model
tree_results_pt = tree_decision_pt.predict(pt_testing_data)
print("Accuracy Score:", accuracy_score(pt_testing_label, tree_results_pt))
print("F1 Score:", f1_score(pt_testing_label, tree_results_pt, average='macro'))

Accuracy Score: 1.0
F1 Score: 1.0


## ID3 Estimator

In [171]:
id3_estimator = Id3Estimator()
# Breast Cancer
id3_estimator = id3_estimator.fit(bc_training_data, bc_training_label)
id3_tree = export_text(id3_estimator.tree_, breast_cancer_dataset['feature_names'])
print("Breast Cancer")
print(id3_tree)

Breast Cancer

worst perimeter <=106.05
|   worst concave points <=0.13
|   |   area error <=48.97
|   |   |   worst texture <=30.15: 1 (222) 
|   |   |   worst texture >30.15
|   |   |   |   mean radius <=12.69: 1 (14) 
|   |   |   |   mean radius >12.69
|   |   |   |   |   compactness error <=0.01: 0 (2) 
|   |   |   |   |   compactness error >0.01: 1 (4) 
|   |   area error >48.97
|   |   |   mean smoothness <=0.09: 1 (2) 
|   |   |   mean smoothness >0.09: 0 (2) 
|   worst concave points >0.13
|   |   worst texture <=23.47: 1 (6) 
|   |   worst texture >23.47
|   |   |   worst symmetry <=0.29
|   |   |   |   mean radius <=14.13: 1 (3) 
|   |   |   |   mean radius >14.13: 0 (1) 
|   |   |   worst symmetry >0.29: 0 (11) 
worst perimeter >106.05
|   worst radius <=18.22
|   |   worst smoothness <=0.14
|   |   |   mean texture <=22.26
|   |   |   |   worst texture <=28.82: 1 (17) 
|   |   |   |   worst texture >28.82
|   |   |   |   |   mean compactness <=0.10: 0 (2) 
|   |   |   |   |

In [172]:
# Evaluate the model
id3_results_bc = id3_estimator.predict(bc_testing_data)
print("Accuracy Score:", accuracy_score(bc_testing_label, id3_results_bc))
print("F1 Score:", f1_score(bc_testing_label, id3_results_bc, average='macro'))

Accuracy Score: 0.9210526315789473
F1 Score: 0.8971015946244107


In [173]:
# Play Tennis
id3_estimator = id3_estimator.fit(pt_training_data, pt_training_label)
id3_tree_pt = export_text(id3_estimator.tree_, feature_names=play_tennis_dataset.columns[:-1].tolist())
print("Play Tennis")
print(id3_tree_pt)


Play Tennis

outlook <=1.50
|   windy <=0.50: 1 (4) 
|   windy >0.50: 0 (1/1) 
outlook >1.50
|   humidity <=0.50: 0 (3) 
|   humidity >0.50: 1 (2) 



In [174]:
# Evaluate the model
id3_results_pt = id3_estimator.predict(pt_testing_data)
print("Accuracy Score:", accuracy_score(pt_testing_label, id3_results_pt))
print("F1 Score:", f1_score(pt_testing_label, id3_results_pt, average='macro'))

Accuracy Score: 0.6666666666666666
F1 Score: 0.6666666666666666


## K-Means

In [175]:
# Run K-Means clustering
# Breast Cancer
kmeans_bc = KMeans(n_clusters=2, random_state=0).fit(bc_training_data)

print("Breast Cancer")
kmeans_centroids_bc = pd.DataFrame(kmeans_bc.cluster_centers_.transpose())
kmeans_centroids_bc.index = breast_cancer_dataset['feature_names'].tolist()
kmeans_centroids_bc.columns = ["Centroid 1", "Centroid 2"]
print(kmeans_centroids_bc)

Breast Cancer
                          Centroid 1  Centroid 2
mean radius                19.222478   12.587553
mean texture               21.637257   18.114649
mean perimeter            127.108850   81.335819
mean area                1164.976106  498.400585
mean smoothness             0.101174    0.094907
mean compactness            0.147851    0.090839
mean concavity              0.173736    0.064169
mean concave points         0.099197    0.034212
mean symmetry               0.192082    0.179094
mean fractal dimension      0.060460    0.063360
radius error                0.718746    0.309820
texture error               1.218749    1.195348
perimeter error             5.091000    2.180759
area error                 91.098673   24.335713
smoothness error            0.006624    0.007115
compactness error           0.032062    0.023753
concavity error             0.042033    0.029298
concave points error        0.015471    0.010760
symmetry error              0.020625    0.021047
fracta

In [176]:
# Evaluate the model
kmeans_results_bc = kmeans_bc.predict(bc_testing_data)
print("Accuracy Score:", accuracy_score(bc_testing_label, kmeans_results_bc))
print("F1 Score:", f1_score(bc_testing_label, kmeans_results_bc, average='macro'))

Accuracy Score: 0.9122807017543859
F1 Score: 0.8591897233201581


In [177]:
# Run K-Means clustering
# Play Tennis
kmeans_pt = KMeans(n_clusters=2, random_state=0).fit(pt_training_data)

print("Play Tennis")
kmeans_centroids_pt = pd.DataFrame(kmeans_pt.cluster_centers_.transpose())
kmeans_centroids_pt.index = play_tennis_dataset.columns[:-1].tolist()
kmeans_centroids_pt.columns = ["Centroid 1", "Centroid 2"]
print(kmeans_centroids_pt)

Play Tennis
          Centroid 1  Centroid 2
outlook     1.666667         0.8
temp        1.666667         0.2
humidity    0.333333         0.8
windy       0.333333         0.4


In [178]:
# Evaluate the model
kmeans_results_pt = kmeans_pt.predict(pt_testing_data)
print("Accuracy Score:", accuracy_score(pt_testing_label, kmeans_results_pt))
print("F1 Score:", f1_score(pt_testing_label, kmeans_results_pt, average='macro'))

Accuracy Score: 0.6666666666666666
F1 Score: 0.6666666666666666


## Logistic Regression

In [179]:
# Breast Cancer
print("Breat Cancer")
logistic_regression_bc = LogisticRegression(random_state=0, max_iter=2500).fit(bc_training_data, bc_training_label)
logistic_regression_results_bc = logistic_regression_bc.predict(bc_testing_data)
logistic_regression_koef_bc = pd.DataFrame(logistic_regression_bc.coef_.transpose())
logistic_regression_koef_bc.columns = ["Coefficient"]
logistic_regression_koef_bc.index = breast_cancer_dataset['feature_names'].tolist()
print(logistic_regression_koef_bc)

Breat Cancer
                         Coefficient
mean radius                 0.811754
mean texture               -0.065497
mean perimeter              0.005605
mean area                   0.014761
mean smoothness            -0.145417
mean compactness           -0.178002
mean concavity             -0.347003
mean concave points        -0.220522
mean symmetry              -0.142888
mean fractal dimension     -0.028602
radius error               -0.055970
texture error               1.048760
perimeter error             0.005760
area error                 -0.103339
smoothness error           -0.018520
compactness error           0.048203
concavity error            -0.016115
concave points error       -0.031209
symmetry error             -0.026582
fractal dimension error     0.010018
worst radius                0.256574
worst texture              -0.340738
worst perimeter            -0.274097
worst area                 -0.012889
worst smoothness           -0.280263
worst compactness        

In [180]:
# Evaluate the model
print("Accuracy Score:", accuracy_score(bc_testing_label, logistic_regression_results_bc))
print("F1 Score:", f1_score(bc_testing_label, logistic_regression_results_bc, average='macro'))

Accuracy Score: 0.9385964912280702
F1 Score: 0.9181286549707601


In [181]:
# Run Logistic Regression
print("Play Tennis")
logistic_regression_pt = LogisticRegression(random_state=0, max_iter=2500).fit(pt_training_data, pt_training_label)
logistic_regression_results_pt = logistic_regression_pt.predict(pt_testing_data)
logistic_regression_koef_pt = pd.DataFrame(logistic_regression_pt.coef_.transpose())
logistic_regression_koef_pt.columns = ["Coefficient"]
logistic_regression_koef_pt.index = play_tennis_dataset.columns[:-1].tolist()
print(logistic_regression_koef_pt)

Play Tennis
          Coefficient
outlook     -0.815711
temp         0.223467
humidity     0.768774
windy       -0.415489


In [182]:
# Evaluate the model
print("Accuracy Score:", accuracy_score(pt_testing_label, logistic_regression_results_pt))
print("F1 Score:", f1_score(pt_testing_label, logistic_regression_results_pt, average='macro'))

Accuracy Score: 0.6666666666666666
F1 Score: 0.4


## Neural Network

In [183]:
# Create the Model
neural_network_bc = MLPClassifier(max_iter=300, hidden_layer_sizes=(50,)).fit(bc_training_data, bc_training_label)
neural_network_koef_bc = neural_network_bc.coefs_
print("Breast Cancer")
print("Hidden Layer 1 coefficients")
print(neural_network_koef_bc[0])

print("Hidden Layer 2 coefficients")
print(neural_network_koef_bc[1])

Breast Cancer
Hidden Layer 1 coefficients
[[-1.48973654e-01 -1.95920649e-01  2.19989904e-01 ... -1.65165993e-01
  -1.28673571e-01 -1.96802713e-01]
 [-5.90939031e-02 -1.94411971e-01 -1.93229695e-01 ...  1.69898942e-01
  -2.77885929e-01 -1.94896983e-01]
 [ 8.05903614e-02  2.80975925e-01  8.61210733e-02 ...  8.16350187e-02
   3.67116702e-02  7.36297203e-02]
 ...
 [ 1.76151328e-01 -2.22128587e-01 -2.69349741e-01 ...  1.63182511e-01
   9.54277277e-02 -1.27913419e-01]
 [-1.11657495e-01 -1.36189747e-01 -8.60267686e-02 ...  2.19995663e-01
   3.47279081e-02 -2.98999062e-04]
 [ 2.44877817e-01 -2.71973181e-01 -2.67718191e-01 ...  3.08433986e-01
   1.26952457e-01 -2.53484110e-01]]
Hidden Layer 2 coefficients
[[-3.12945465e-01]
 [ 1.70412612e-01]
 [ 2.90587745e-01]
 [-6.99247564e-03]
 [ 2.45252214e-01]
 [-2.71048992e-05]
 [-2.69553947e-15]
 [ 1.61478519e-01]
 [ 3.14306538e-01]
 [-1.46631403e-01]
 [ 1.85623795e-01]
 [-7.92993245e-02]
 [-3.96753838e-02]
 [-1.30302493e-02]
 [-2.77763520e-01]
 [ 2.9301

In [184]:
neural_network_results_bc = neural_network_bc.predict(bc_testing_data)
print("Accuracy Score:", accuracy_score(bc_testing_label, neural_network_results_bc))
print("F1 Score:", f1_score(bc_testing_label, neural_network_results_bc, average='macro'))

Accuracy Score: 0.8771929824561403
F1 Score: 0.8449280994947532


In [185]:
neural_network_pt = MLPClassifier(max_iter=1000, hidden_layer_sizes=(10,)).fit(pt_training_data, pt_training_label)
neural_network_koef_pt = neural_network_pt.coefs_
print("Play Tennis")
print("Hidden Layer 1 coefficients")
print(neural_network_koef_pt[0])

print("Hidden Layer 2 coefficients")
print(neural_network_koef_pt[1])

Play Tennis
Hidden Layer 1 coefficients
[[-1.09104196e-02  7.34276770e-01 -4.43638473e-01  1.38895124e+00
  -2.62789138e-01 -5.72249870e-01 -7.78865462e-01 -2.18100138e-01
   8.22610551e-25  2.69472702e-01]
 [ 2.14263941e-22 -9.27662025e-02 -2.94613712e-01  3.94278144e-01
   5.27225940e-01 -2.92551812e-01  1.19265215e+00  6.93018969e-01
  -7.58349033e-05  1.61128623e-01]
 [ 1.23731774e-24 -1.53655494e-02  1.55254469e-01 -9.95304121e-01
   1.36221449e+00  3.59961209e-01  1.32862269e+00  8.94992813e-01
  -1.34780232e-08  6.31101145e-01]
 [ 1.00413481e-24  6.53090436e-01 -5.86687669e-01  1.38913467e+00
  -1.25563859e-01 -8.47921946e-01 -5.94286784e-01 -8.84444565e-01
  -1.35802181e-06 -5.09652349e-01]]
Hidden Layer 2 coefficients
[[-5.92055461e-25]
 [-5.82298165e-01]
 [-1.43187576e-01]
 [-8.93296004e-01]
 [ 1.21956258e+00]
 [-4.76189571e-01]
 [ 5.84793787e-01]
 [ 6.91847235e-01]
 [-1.94515477e-26]
 [ 1.17379887e+00]]




In [186]:
# Evaluate the model
neural_network_results_pt = neural_network_pt.predict(pt_testing_data)
print("Accuracy Score:", accuracy_score(pt_testing_label, neural_network_results_pt))
print("F1 Score:", f1_score(pt_testing_label, neural_network_results_pt, average='macro'))

Accuracy Score: 1.0
F1 Score: 1.0


## SVM

In [187]:
# Create SVM model
svm_bc = SVC(kernel='linear', gamma='auto')
svm_bc.fit(bc_testing_data, bc_testing_label)
svm_koef_bc = pd.DataFrame(svm_bc.coef_.transpose())
svm_koef_bc.index = breast_cancer_dataset['feature_names'].tolist()
svm_koef_bc.columns = ['Coeffecient']
print("Breast Cancer")
print(svm_koef_bc)

Breast Cancer
                         Coeffecient
mean radius                -0.085418
mean texture                0.512835
mean perimeter             -1.119029
mean area                   0.086925
mean smoothness            -0.013175
mean compactness           -0.028192
mean concavity             -0.045712
mean concave points        -0.020102
mean symmetry              -0.038129
mean fractal dimension     -0.002567
radius error               -0.016242
texture error              -0.151182
perimeter error             0.210347
area error                  0.050727
smoothness error           -0.000550
compactness error           0.002978
concavity error            -0.010398
concave points error       -0.003096
symmetry error             -0.002752
fractal dimension error    -0.000047
worst radius               -0.083945
worst texture              -0.612404
worst perimeter             0.153567
worst area                 -0.033117
worst smoothness           -0.016473
worst compactness       

In [188]:
svm_bc_results = svm_bc.predict(bc_testing_data)

print(f"Accuracy score: {accuracy_score(bc_testing_label, svm_bc_results)}")
print(f"F1 score: {f1_score(bc_testing_label, svm_bc_results, average='macro')}")

Accuracy score: 1.0
F1 score: 1.0


In [189]:
# Create SVM model
svm_pt = SVC(kernel='linear', gamma='auto').fit(pt_testing_data, pt_testing_label)
svm_koef_pt = pd.DataFrame(svm_pt.coef_.transpose())
svm_koef_pt.index = play_tennis_dataset.columns[:-1].tolist()
svm_koef_pt.columns = ['Coeffecient']

# Evaluate the model
print("Play Tennis")
print(svm_koef_pt)

Play Tennis
          Coeffecient
outlook          -1.0
temp              0.0
humidity          0.0
windy             0.0


In [190]:
svm_pt_results = svm_pt.predict(pt_testing_data)

print(f"Accuracy score: {accuracy_score(pt_testing_label, svm_pt_results)}")
print(f"F1 score: {f1_score(pt_testing_label, svm_pt_results, average='macro')}")

Accuracy score: 0.6666666666666666
F1 score: 0.4
