In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import statistics
%matplotlib inline

In [3]:
# (b) i. for abalone dataset, convert Type into numerical format

# Load the datasets into python
abalone_1 = pd.read_csv('sample_data/abalone.csv')

enc = OneHotEncoder()
type_abalone = enc.fit_transform(abalone_1[["Type"]])
abalone_1[["Type"]] = type_abalone


# (b) ii. for abalone dataset, convert Type to categories

# Load the datasets into python
abalone_2 = pd.read_csv('sample_data/abalone.csv')

# change F to 0, M to 1, I to 2
abalone_2 = np.array(abalone_2)

for i in range(abalone_2[:,0].size):
  if abalone_2[:,0][i] == "F":
    abalone_2[:,0][i] = 0
  elif abalone_2[:,0][i] == "M":
    abalone_2[:,0][i] = 1
  else:
     abalone_2[:,0][i] = 2

## 2

In [18]:
#  Plot percentage of instances in each species for abalone
output_classes = abalone_1['Type'].value_counts()
plt.pie(output_classes, labels=output_classes.index, autopct='%1.1f%%')
plt.title("Percentage of instances in each output class for abalone")
plt.savefig(f'abalone-classes.png') # save as .png file
plt.show()

AttributeError: ignored

## 3

In [24]:
# Split the abalone dataset into train and test sets
A_X = abalone_2[:, 1:9].astype(float)
A_y = abalone_2[:, 0].astype(float)
A_X_train, A_X_test, A_y_train, A_y_test = train_test_split(A_X, A_y, random_state=0)

print(abalone_2)

[[0 0.605 0.47 ... 0.2275 0.292 9]
 [1 0.55 0.425 ... 0.1765 0.2165 10]
 [1 0.46 0.345 ... 0.0885 0.1159999999999999 7]
 ...
 [1 0.695 0.55 ... 0.36 0.445 11]
 [0 0.585 0.475 ... 0.217 0.3 11]
 [0 0.565 0.45 ... 0.239 0.249 11]]


## 4

In [26]:
# Instantiate the models

basedt_model = tree.DecisionTreeClassifier(random_state=0)

parameters = {'criterion': ('gini', 'entropy'), 'max_depth': [1, 5, None], 'min_samples_split': [2, 1, 3]}
topdt_model = GridSearchCV(tree.DecisionTreeClassifier(), parameters)

basemlp_model = MLPClassifier(hidden_layer_sizes=(100, 100, ), activation='logistic', solver='sgd')

parameters = {'activation': ('logistic', 'tanh', 'relu'), 'hidden_layer_sizes': [(30, 50, ), (10, 10, 10, )], 'solver': ['adam', 'sgd']}
topmlp_model = GridSearchCV(MLPClassifier(), parameters)

epochs = 6

accuracies_basedt = []
accuracies_topdt = []
accuracies_basemlp = []
accuracies_topmlp = []

macro_averages_f1_basedt = []
macro_averages_f1_topdt = []
macro_averages_f1_basemlp = []
macro_averages_f1_topmlp = []

weighted_averages_f1_basedt = []
weighted_averages_f1_topdt = []
weighted_averages_f1_basemlp = []
weighted_averages_f1_topmlp = []


with open(f"abalone-performance.txt", "w") as out:
  out.write("********************* Abalone Performance ********************* \n\n")

In [None]:
for _ in range(0, epochs):

  with open(f"abalone-performance.txt", "a") as out:
      out.write(f"********************* ITERATION {_+1} *********************\n")

  # (a) Base-DT
  basedt_model = basedt_model.fit(A_X_train, A_y_train.astype(float))
  # tree.plot_tree(basedt_model)
  A_y_pred = basedt_model.predict(A_X_test)
  accuracies_basedt.append(accuracy_score(A_y_test, A_y_pred))
  macro_averages_f1_basedt.append(f1_score(A_y_test, A_y_pred, average='macro'))
  weighted_averages_f1_basedt.append(f1_score(A_y_test, A_y_pred, average='weighted'))
  with open(f"abalone-performance.txt", "a") as out:
      out.write("********************* Base-DT Model *********************\n")
      out.write(f"Model name: Base-DT Model\n")
      out.write(f"Hyperparameters (default):\n")
      out.write(f"  criterion: gini\n")
      out.write(f"  max_depth: None\n")
      out.write(f"  min_samples_split: 2\n")
      out.write("\n")
      out.write(f"Confusion Matrix: \n{confusion_matrix(A_y_test, A_y_pred)}\n")
      out.write("\n")
      out.write(f"Classification Report: \n{classification_report(A_y_test, A_y_pred)}\n")

  # (b) Top-DT
  topdt_model.fit(A_X_train, A_y_train)
  # best_params_dt_model = tree.DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=3)
  # best_params_dt_model = basedt_model.fit(A_X_train, A_y_train)
  # tree.plot_tree(best_params_dt_model)
  A_y_predict = topdt_model.predict(A_X_test)
  accuracies_topdt.append(accuracy_score(A_y_test, A_y_pred))
  macro_averages_f1_topdt.append(f1_score(A_y_test, A_y_pred, average='macro'))
  weighted_averages_f1_topdt.append(f1_score(A_y_test, A_y_pred, average='weighted'))
  with open(f"abalone-performance.txt", "a") as out:
      out.write("********************* Top-DT Model *********************\n")
      out.write(f"Model name: Top-DT Model\n")
      out.write(f"Best Hyperparameters:\n")
      out.write(f"{topdt_model.best_params_}")
      out.write("\n")
      out.write(f"Confusion Matrix: \n{confusion_matrix(A_y_test, A_y_pred)}\n")
      out.write("\n")
      out.write(f"Classification Report: \n{classification_report(A_y_test, A_y_pred)}\n")

  # (c) Base-MLP
  basemlp_model.fit(A_X_train, A_y_train)
  A_y_predict = basemlp_model.predict(A_X_test)
  accuracies_basemlp.append(accuracy_score(A_y_test, A_y_pred))
  macro_averages_f1_basemlp.append(f1_score(A_y_test, A_y_pred, average='macro'))
  weighted_averages_f1_basemlp.append(f1_score(A_y_test, A_y_pred, average='weighted'))
  with open(f"abalone-performance.txt", "a") as out:
      out.write("********************* Base-MLP Model *********************\n")
      out.write(f"Model name: Base-MLP Model\n")
      out.write(f"Hyperparameters (default):\n")
      out.write(f" hidden_layer_sizes=(100, 100, )\n")
      out.write(f" activation='logistic'\n")
      out.write(f" solver='sgd'\n")
      out.write("\n")
      out.write(f"Confusion Matrix: \n{confusion_matrix(A_y_test, A_y_pred)}\n")
      out.write("\n")
      out.write(f"Classification Report: \n{classification_report(A_y_test, A_y_pred)}\n")

  # (d) Top-MLP
  topmlp_model.fit(A_X_train, A_y_train)
  A_y_predict = topmlp_model.predict(A_X_test)
  accuracies_topmlp.append(accuracy_score(A_y_test, A_y_pred))
  macro_averages_f1_topmlp.append(f1_score(A_y_test, A_y_pred, average='macro'))
  weighted_averages_f1_topmlp.append(f1_score(A_y_test, A_y_pred, average='weighted'))
  with open(f"penguin-performance.txt", "a") as out:
    out.write("********************* Top-MLP Model *********************\n")
    out.write(f"Model name: Top-MLP Model\n")
    out.write(f"Best Hyperparameters:\n")
    out.write(f"{topmlp_model.best_params_}\n")
    out.write("\n")
    out.write(f"Confusion Matrix: \n{confusion_matrix(A_y_test, A_y_pred)}\n")
    out.write("\n")
    out.write(f"Classification Report: \n{classification_report(A_y_test, A_y_pred)}\n")

30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ut

In [None]:
# Append averages in performance files
with open(f"penguin-performance.txt", "a") as out:
  out.write(f"Average accuracy for BaseDT-Model = {statistics.mean(accuracies_basedt)}\n")
  out.write(f"Average accuracy for TopDT-Model = {statistics.mean(accuracies_topdt)}\n")
  out.write(f"Average accuracy for BaseMLP-Model = {statistics.mean(accuracies_basemlp)}\n")
  out.write(f"Average accuracy for TopMLP-Model = {statistics.mean(accuracies_topmlp)}\n")

  out.write(f"Varience for BaseDT-Model = {statistics.stdev(accuracies_basedt)}\n")
  out.write(f"Varience for TopDT-Model = {statistics.stdev(accuracies_topdt)}\n")
  out.write(f"Varience for BaseMLP-Model = {statistics.stdev(accuracies_basemlp)}\n")
  out.write(f"Varience for TopMLP-Model = {statistics.stdev(accuracies_topmlp)}\n")

  out.write(f"Average macro-average F1 for BaseDT-Model = {statistics.mean(macro_averages_f1_basedt)}\n")
  out.write(f"Average macro-average F1 for TopDT-Model = {statistics.mean(macro_averages_f1_topdt)}\n")
  out.write(f"Average macro-average F1 for BaseMLP-Model = {statistics.mean(macro_averages_f1_basemlp)}\n")
  out.write(f"Average macro-average F1 for TopMLP-Model = {statistics.mean(macro_averages_f1_topmlp)}\n")

  out.write(f"Average weighted-average F1 for BaseDT-Model = {statistics.mean(weighted_averages_f1_basedt)}\n")
  out.write(f"Average weighted-average F1 for TopDT-Model = {statistics.mean(weighted_averages_f1_topdt)}\n")
  out.write(f"Average weighted-average F1 for BaseMLP-Model = {statistics.mean(weighted_averages_f1_basemlp)}\n")
  out.write(f"Average weighted-average F1 for TopMLP-Model = {statistics.mean(weighted_averages_f1_topmlp)}\n")