Need to import the dataset - can do direct. 
Use environment complete_ml with addition of 
```bash
pip install ucimlrepo
```
 

## Import the data from the UCI ML repository.

In [2]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
secondary_mushroom = fetch_ucirepo(id=848) 


In [3]:

# data (as pandas dataframes) 
X = secondary_mushroom.data.features 
y = secondary_mushroom.data.targets 

# metadata 
print(secondary_mushroom.metadata) 

# variable information 
print(secondary_mushroom.variables) 

X.head()

names = secondary_mushroom.variables['name']

print(names)


{'uci_id': 848, 'name': 'Secondary Mushroom', 'repository_url': 'https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/848/data.csv', 'abstract': 'Dataset of simulated mushrooms for binary classification into edible and poisonous.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 61068, 'num_features': 20, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2021, 'last_updated': 'Wed Apr 10 2024', 'dataset_doi': '10.24432/C5FP5Q', 'creators': ['Dennis Wagner', 'D. Heider', 'Georges Hattab'], 'intro_paper': {'title': 'Mushroom data creation, curation, and simulation to support classification tasks', 'authors': 'Dennis Wagner, D. Heider, Georges Hattab', 'published_in': 'Scientific Reports', 'year': 2021, 'url': 'https://www.semanticscholar.org/

Start by creating a pandas dataframe with the data and the target. Then, create a list of the column names using the variables dictionary. Finally, print the first few rows of the dataframe to see the data.


In [7]:
import pandas as pd

combined_df = pd.concat([X, y], axis=1)

combined_df.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,...,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,class
0,15.26,x,g,o,f,e,,w,16.95,17.09,...,y,w,u,w,t,g,,d,w,p
1,16.6,x,g,o,f,e,,w,17.99,18.19,...,y,w,u,w,t,g,,d,u,p
2,14.07,x,g,o,f,e,,w,17.8,17.74,...,y,w,u,w,t,g,,d,w,p
3,14.17,f,h,e,f,e,,w,15.77,15.98,...,y,w,u,w,t,p,,d,w,p
4,14.64,x,h,o,f,e,,w,16.53,17.2,...,y,w,u,w,t,p,,d,w,p


use similar model to last neural network model.

In [1]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [4]:
## Mac only step
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')


In [12]:
## sklearn  for preprocessing and ML models

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# sklearn for model evaluation
from sklearn.model_selection import learning_curve
from sklearn.metrics import mutual_info_score

# TensorFlow sequential model
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

from xgboost.sklearn import XGBClassifier

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
# Import package to investigate our loaded dataframe
from ydata_profiling import ProfileReport

# Import functions for evaluating model
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report, \
                            confusion_matrix, ConfusionMatrixDisplay, auc, roc_curve
from sklearn.metrics import auc, roc_curve, RocCurveDisplay, f1_score, precision_score, \
                            recall_score, confusion_matrix, ConfusionMatrixDisplay, \
                            classification_report
from sklearn.inspection import permutation_importance

# Imports relating to logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Imports relating to plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import json # For saving model outcomes

In [18]:
# convert numerical to np in case we do k-fold splits

X_np = X.values
y_np = y.values


# Split the data into training and validation sets

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


results_file_path = 'experiment_results.jsonl'  # JSON Lines format


In [8]:
def scale_data(X_train, X_test):
    """Scale data 0-1 based on min and max in training set"""
    
    # Initialise a new scaling object for normalising input data
    sc = MinMaxScaler()

    # Apply the scaler to the training and test sets
    train_sc = sc.fit_transform(X_train)
    test_sc = sc.fit_transform(X_test)
    
    return train_sc, test_sc

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
import pandas as pd

# Assuming X_train and X_test are pandas DataFrames
# List of numerical columns to scale
numerical_columns = ['cap-diameter', 'stem-height', 'stem-width']  

# List of all columns
all_columns = X_train.columns.tolist()

# List of columns to one-hot encode (all columns except the numerical ones)
categorical_columns = [col for col in all_columns if col not in numerical_columns]

# Create the ColumnTransformer
preprocessor = ColumnTransformer(
	transformers=[
		('num', MinMaxScaler(), numerical_columns),
		('cat', OneHotEncoder(), categorical_columns)
	]
)

# Fit the ColumnTransformer to the training data
preprocessor.fit(X_train)

# Transform the training and test data using the ColumnTransformer
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Ensure the transformed data is a dense array
if hasattr(X_train_transformed, "toarray"):
	X_train_transformed = X_train_transformed.toarray()
if hasattr(X_test_transformed, "toarray"):
	X_test_transformed = X_test_transformed.toarray()

# Debugging: Check the shape of the transformed data
print("Shape of X_train_transformed:", X_train_transformed.shape)
print("Shape of X_test_transformed:", X_test_transformed.shape)

# Debugging: Check the feature names
feature_names = preprocessor.get_feature_names_out()
print("Feature names:", feature_names)
print("Number of feature names:", len(feature_names))

# Convert the transformed data back to DataFrame
X_train_ohe = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_ohe = pd.DataFrame(X_test_transformed, columns=feature_names)

# Display the transformed DataFrame
print(X_train_ohe.head())


Shape of X_train_transformed: (39084, 128)
Shape of X_test_transformed: (9771, 128)
Feature names: ['num__cap-diameter' 'num__stem-height' 'num__stem-width'
 'cat__cap-shape_b' 'cat__cap-shape_c' 'cat__cap-shape_f'
 'cat__cap-shape_o' 'cat__cap-shape_p' 'cat__cap-shape_s'
 'cat__cap-shape_x' 'cat__cap-surface_d' 'cat__cap-surface_e'
 'cat__cap-surface_g' 'cat__cap-surface_h' 'cat__cap-surface_i'
 'cat__cap-surface_k' 'cat__cap-surface_l' 'cat__cap-surface_s'
 'cat__cap-surface_t' 'cat__cap-surface_w' 'cat__cap-surface_y'
 'cat__cap-surface_nan' 'cat__cap-color_b' 'cat__cap-color_e'
 'cat__cap-color_g' 'cat__cap-color_k' 'cat__cap-color_l'
 'cat__cap-color_n' 'cat__cap-color_o' 'cat__cap-color_p'
 'cat__cap-color_r' 'cat__cap-color_u' 'cat__cap-color_w'
 'cat__cap-color_y' 'cat__does-bruise-or-bleed_f'
 'cat__does-bruise-or-bleed_t' 'cat__gill-attachment_a'
 'cat__gill-attachment_d' 'cat__gill-attachment_e'
 'cat__gill-attachment_f' 'cat__gill-attachment_p'
 'cat__gill-attachment_s' 'ca

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming X_train_ohe and X_test_ohe are your preprocessed DataFrames
# and y_train, y_test are your target variables

# Initialize the Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
model.fit(X_train_ohe, y_train)

# Predict on the test data
y_pred = model.predict(X_test_ohe)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

  return fit_method(estimator, *args, **kwargs)


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           e       1.00      1.00      1.00      4407
           p       1.00      1.00      1.00      5364

    accuracy                           1.00      9771
   macro avg       1.00      1.00      1.00      9771
weighted avg       1.00      1.00      1.00      9771



In [31]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Assuming the previous code has been executed and y_test, y_pred are available

# Get the predicted probabilities for the positive class
y_pred_proba = model.predict_proba(X_test_ohe)[:, 1]

# Compute the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Compute the AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

ValueError: y_true takes value in {'e', 'p'} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.

setup train and display to try a few different classifiers


In [22]:
def train_and_display(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()
    return f1_score(y_test, y_pred)


In [28]:
model = RandomForestClassifier()

In [29]:
train_and_display(model, X_train, y_train, X_test, y_test)

ValueError: could not convert string to float: 'x'