### Parsing input

In [None]:
#---------------------------------- Parse of input ----------------------------#
import pandas as pd
from matplotlib import pyplot as plt
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV

# Load the ARFF file; the first element of the returned pair holds the records.
raw_data = loadarff('breast.w.arff')
df_data = pd.DataFrame(raw_data[0])  # converting data to a pandas DataFrame
df_data = df_data.dropna()  # all rows with Na values are dropped
# Encode the label as integers (malignant -> 1, benign -> 0).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df_data['Class']`: an in-place method on a
# column selection is chained assignment, which does not update `df_data`
# under pandas Copy-on-Write. The explicit cast avoids relying on the
# deprecated silent downcasting of object columns.
df_data['Class'] = (
    df_data['Class'].map({b'malignant': 1, b'benign': 0}).astype(int)
)

### Decision Tree - mutual information

In [None]:
#-------- Train and test of a decision tree classifier varying #features and max-depth ---------#
# Two experiments share one loop over `values`:
#   * "features": keep the v features with the highest mutual information
#     and train an unrestricted tree on them;
#   * "depth": keep every feature and limit the tree to max_depth=v.
data, target = df_data.drop(columns='Class'), df_data['Class']
# accuracies to be plotted, one entry per element of `values`
tra_acc_features, test_acc_features = [], []
tra_acc_depth, test_acc_depth = [], []
# number of the features and maximum depth to be looped through
values = [1, 3, 5, 9]
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=10)
# NOTE(review): neither mutual_info_classif nor DecisionTreeClassifier is
# seeded here, so the scores may vary slightly between runs.
for v in values:
    # select k best features using MI value; the selector is fitted on the
    # training split only, so no information from the test rows leaks into
    # the choice of features that is later evaluated on those same rows
    kbest = SelectKBest(mutual_info_classif, k=v)
    kbest.fit(x_train, y_train)
    # get the positional indices of the best k features
    cols = kbest.get_support(indices=True)
    # keep only the selected feature columns in both splits
    x_train_features = x_train.iloc[:, cols]
    x_test_features = x_test.iloc[:, cols]
    clf_features = DecisionTreeClassifier(criterion="entropy")
    clf_features.fit(x_train_features, y_train)  # train max_features tree
    # max_depth is defined as parameter
    clf_depth = DecisionTreeClassifier(criterion="entropy", max_depth=v)
    clf_depth.fit(x_train, y_train)  # train max_depth tree
    tra_acc_features.append(clf_features.score(
        x_train_features, y_train))  # test on train set
    test_acc_features.append(clf_features.score(
        x_test_features, y_test))  # test on test set
    tra_acc_depth.append(clf_depth.score(x_train, y_train))
    test_acc_depth.append(clf_depth.score(x_test, y_test))

# Draw the four accuracy curves (train/test for each experiment) against the
# swept value X on a single axes, then write the figure to 'plots.png'.
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(1, 1, 1)
# Each entry: (accuracy series, positional format arguments, keyword options).
curves = [
    (tra_acc_features, ('co:',),
     {'label': "X features training set"}),
    (test_acc_features, ('darkcyan',),
     {'marker': 'D', 'linestyle': ":", 'label': "X features testing set"}),
    (tra_acc_depth, ('ro:',),
     {'label': "Maximum depth = X training set"}),
    (test_acc_depth, (),
     {'color': "firebrick", 'marker': 'D', 'linestyle': ":",
      'label': "Maximum depth = X testing set"}),
]
for accuracies, fmt_args, options in curves:
    ax.plot(values, accuracies, *fmt_args, **options)
ax.set_xlabel('X')
ax.set_ylabel('Accuracy')
ax.legend()
fig.savefig('plots.png')

### Grid-Search with CV to find the best max-depth

In [None]:
# Candidate tree depths to compare through cross-validated grid search.
parameters = {'max_depth': [1, 3, 5, 9]}
# The search is run over the whole dataset (`data`, `target`); the depth with
# the best cross-validated score is exposed through best_params_.
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
)
clf.fit(data, target)
print(
    "Best score", clf.best_score_,
    " Best depth", clf.best_params_,
)