# CISC3024 Machine Learning Final Project
- Title: Wound Detection
- Groupmates: Huang Yanzhen DC126732, Yang Zhihan DC127992

In [4]:
# Basics
import os
import copy
import time
from itertools import product
from typing import List, Callable, Any, Union

# Pre-processing
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd

# Model Training
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

### Data Retrieval

In [9]:
def get_labels(data_type: str) -> np.ndarray:
    """
    Get ground truth labels from .csv file.
    :param data_type: Type of data: Training or Testing.
    """
    df = pd.read_csv(f"./Wound/{data_type}/myData.csv", delimiter=";")
    return df.to_numpy()

In [10]:
def get_images(data_type: str,
               image_names: np.ndarray,
               augmentation: Union[Callable[[np.ndarray, Any], np.ndarray], None] = None,
               flatten=True,
               **kwargs) -> np.ndarray:
    """
    Get the images from directory.
    :param data_type: Type of data: Training or Testing.
    :param image_names: Names of images from ground truth.
    :param augmentation: Augmentation function.
    :param flatten: Whether to flatten the images.
    :param kwargs: Other arguments to pass to augmentation function.
    """
    images = []
    for i_name in image_names:
        img = Image.open(os.path.join(f"./Wound/{data_type}/", i_name))
        img = img.resize((32, 32), Image.BICUBIC)
        img = np.array(img)
        if augmentation:
            img = augmentation(img, **kwargs)
        images.append(img.flatten() if flatten else img)

    images = np.array(images)

    return images

### Image Augmentation

In [11]:
def add_black_edge(img: np.array, w: int = 4) -> np.array:
    """
    Image augmentation. Add an inner black edge to an image.
    :param img: Image to be processed.
    :param w: Width of the edge.
    """
    if w > min(img.shape[0:2]) // 2:
        raise ValueError("Width of the edge must be smaller than half of the shorter side of an image.")

    new_img = np.zeros_like(img)
    new_img[w:-w, w:-w, :] = img[w:-w, w:-w, :]
    return new_img

In [12]:
def stretch(img: np.ndarray, f: List[float]) -> np.ndarray:
    """
    Image augmentation. Stretch an image on the width and height side.
    :param img: Image to be augmented.
    :param f: Factor tuple. Width and Height.
    """
    fw, fh = f
    if fw < 1 or fh < 1:
        raise ValueError("Width and height factors should be greater than or equal to 1.")

    # New widths
    new_width = int(img.shape[1] * fw)
    new_height = int(img.shape[0] * fh)

    # Adjust image
    img_pil = Image.fromarray(img)
    img_resized = img_pil.resize((new_width, new_height), Image.BICUBIC)

    # Crop regions
    # Keep 32x32 size
    left = (new_width - 32) // 2
    top = (new_height - 32) // 2
    right = left + 32
    bottom = top + 32

    # Crop image
    img_cropped = img_resized.crop((left, top, right, bottom))

    # Convert to numpy array
    img_stretched = np.array(img_cropped)

    return img_stretched

### Train Model
One trainig of model would result in the following structure:
```python
{
    "x": {
        "Best MSE": smallest_mse,
        "Best Fold": best_fold_idx,
        "Avg MSE": avg_mse,
        "model": ModelInstance,
    },
    "y": {
        "Best MSE": smallest_mse,
        "Best Fold": best_fold_idx,
        "Avg MSE": avg_mse,
        "model": ModelInstance,
    },
    "w": {
        "Best MSE": smallest_mse,
        "Best Fold": best_fold_idx,
        "Avg MSE": avg_mse,
        "model": ModelInstance,
    },
    "h": {
        "Best MSE": smallest_mse,
        "Best Fold": best_fold_idx,
        "Avg MSE": avg_mse,
        "model": ModelInstance,
    },
}
```
This is named as an "experiment object".

In [13]:
def train(ModelInstance, X, Y, desc: str = "DESC", n_fold: int = 3, save: bool = False):
    """
    Train the model. Output would be of shape:
    :param ModelInstance: Instance of a model class.
    :param desc: Description of the saved file.
    :param n_fold: Number of folds.
    :param save: Whether to save the experiment object.
    """
    model_name = ModelInstance.__class__.__name__
    semantic_y = ["File Name", "x", "y", "w", "h"]

    # Print Model configurations
    print(f"Training model {model_name}. Description: {desc}\nStarted at: {time.time()}")
    # Predict all for x, y, w, h
    exp = {}
    for i in range(1, Y.shape[1]):
        # Totally 4 labels to predict.
        # Select one of them.
        y = Y[:, i]

        # Split original data into 3 parts
        # to perform cross-validation
        kf = KFold(n_splits=n_fold, shuffle=True, random_state=1919810)
        splits = kf.split(X)

        # Record MSE of each fold.
        # Keep the model with the smallest MSE
        mse_scores = []
        cur_best_model = None
        cur_smallest_MSE = np.inf
        for train_index, val_index in splits:
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            model = copy.deepcopy(ModelInstance)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)
            mse = mean_squared_error(y_val, y_pred)
            mse_scores.append(mse)

            if cur_smallest_MSE > mse_scores[-1]:
                cur_best_model = copy.deepcopy(model)

        exp[semantic_y[i]] = {
            "Best MSE": cur_smallest_MSE,
            "Best Fold": np.argmin(mse_scores),
            "Avg MSE": np.mean(mse_scores),
            "model": copy.deepcopy(cur_best_model)
        }
        del cur_best_model

        print(f"{semantic_y[i]} - Avg MSE={np.mean(mse_scores):.4f}, "
              f"Best MSE={np.min(mse_scores):.4f} at index {np.argmin(mse_scores)}")

    # Save models
    print(f"Ended at {time.time()}\n\n")
    if save:
        time_str = str(time.time()).replace(".", "")
        pickle.dump(exp, open(f"./save_models/{model_name}_{desc}_{time_str}.sav", "wb"))
    return exp

In [14]:
def grid_search(ModelClass, hyper_params, hyper_param_names, kwarg_names, **kwargs):
    """
    Perform grid search for the given model class and hyperparameters.
    :param ModelClass: The class to instantiate the model.
    :param hyper_params: The product of two lists of candidate hyperparameters.
    :hyper_param_names: The display name of two hyperparameters.
    :kwarg_names: Names of keyword arguments to be passed into the model.
    :returns: A list of dictionaries containing hyperparameters and experiment objects.
    """
    param_exps = []
    
    n1, n2 = hyper_param_names
    kw1, kw2 = kwarg_names
    
    model_name = ModelClass.__name__
    
    # Data Augmentation
    # Change some useless information
    Y_ori = get_labels(data_type="Training")
    X_ori = get_images(data_type="Training", image_names=Y_ori[:, 0])

    # Add black edge
    Y_be = get_labels(data_type="Training")
    X_be = get_images(data_type="Training", image_names=Y_be[:, 0], augmentation=add_black_edge, w=4)

    # Stretch height
    Y_sh = get_labels(data_type="Training")
    X_sh = get_images(data_type="Training", image_names=Y_sh[:, 0], augmentation=stretch, f=[1.0, 1.05])
    Y_sh[:, 4] *= 1.05

    # Stretch Width
    Y_sw = get_labels(data_type="Training")
    X_sw = get_images(data_type="Training", image_names=Y_sw[:, 0], augmentation=stretch, f=[1.05, 1.0])
    Y_sw[:, 3] *= 1.05
    
    X = np.concatenate((X_ori, X_be, X_sh, X_sw))
    Y = np.concatenate((Y_ori, Y_be, Y_sh, Y_sw))
    
    for param1, param2 in hyper_params:
        # Dynamically add the hyperparameters to kwargs
        model_kwargs = {
            kw1: param1,
            kw2: param2
        }
        
        model_kwargs.update(kwargs)
        
        model_instance = ModelClass(**model_kwargs)
        exp = train(model_instance, X, Y, desc=f"{n1}-{param1}--{n2}-{param2}", n_fold=3, save=False)
        param_exps.append({
            n1: param1,
            n2: param2,
            "exp": exp
        })

    time_str = str(time.time()).replace(".", "")
    pickle.dump(param_exps, open(f"./save_models/{model_name}_{n1}-{n2}_{time_str}.sav", "wb"))
    return param_exps

In [54]:
def test(exp_list, Y_test, X_test):
    # Y_test = get_labels(data_type="Test")
    # X_test = get_images(data_type="Test", image_names=Y_test[:,0])
    results = []
    for i, exp in enumerate(exp_list):
        param_name1, param_name2, _ = exp.keys()
        param1, param2, models = list(exp.values())
        
        model_x, model_y, model_w, model_h = (models["x"]["model"], 
                                              models["y"]["model"], 
                                              models["w"]["model"],
                                              models["h"]["model"])
        
        y_x, y_y, y_w, y_h = Y_test[:, 1], Y_test[:, 2], Y_test[:, 3], Y_test[:, 4]
        
        y_pred_x, y_pred_y, y_pred_w, y_pred_h = (model_x.predict(X_test), 
                                                  model_y.predict(X_test), 
                                                  model_w.predict(X_test), 
                                                  model_h.predict(X_test))
        
        mse_x, mse_y, mse_w, mse_h = (mean_squared_error(y_x, y_pred_x), 
                                      mean_squared_error(y_y, y_pred_y), 
                                      mean_squared_error(y_w, y_pred_w), 
                                      mean_squared_error(y_h, y_pred_h))

        weighted_avg_mse = (mse_x + mse_y) * 0.3 + (mse_w + mse_h) * 0.2
        
        results.append({
            param_name1: param1,
            param_name2: param2,
            "weighted_avg_mse": weighted_avg_mse
        })
    return results

### Grid Search: RandomForestRegressor
Hyper parameters for Random Forest Regressor:
- n_estimators: Number of estimators.
- bootstrap: Bootstrap or not.
- max_depth: Maximum Depth of the tree.
- min_samples_split: Minimum sample number that allows a leaf to be split again.
- min_samples_leaf: Minimum sample number a leaf requires.

In [62]:
rfr_nest = [10, 20, 30, 40, 50]
rfr_maxd = [11, 13, 15, 17, 19]
rfr_mins = [4, 6, 8, 10, 12]
rfr_minl = [6, 8, 10, 12, 14]
rfr_grid0 = product(rfr_nest, rfr_maxd)
rfr_grid1 = product(rfr_mins, rfr_minl)

### Grid Search 0: Num Estimators + Max Deapth

In [9]:
rfr_grid0_exp = grid_search(ModelClass=RandomForestRegressor, 
                            hyper_params=rfr_grid0, 
                            hyper_param_names=["nest", "maxd"], 
                            kwarg_names=["n_estimators", "max_depth"])

Training model RandomForestRegressor. Description: nest-10--maxd-11
Started at: 1732799242.455379
x - Avg MSE=265.0281, Best MSE=226.6704 at index 1
y - Avg MSE=319.6748, Best MSE=206.1377 at index 1
w - Avg MSE=981.5292, Best MSE=914.7635 at index 0
h - Avg MSE=1024.7640, Best MSE=774.9044 at index 0
Ended at 1732799277.573622

Training model RandomForestRegressor. Description: nest-10--maxd-13
Started at: 1732799303.3531008
x - Avg MSE=303.6308, Best MSE=264.3288 at index 2
y - Avg MSE=330.7782, Best MSE=254.7151 at index 1
w - Avg MSE=1046.2773, Best MSE=935.1649 at index 0
h - Avg MSE=1015.2763, Best MSE=801.2331 at index 1
Ended at 1732799339.3567605

Training model RandomForestRegressor. Description: nest-10--maxd-15
Started at: 1732799364.6094503
x - Avg MSE=300.3605, Best MSE=246.8027 at index 2
y - Avg MSE=316.6453, Best MSE=246.9491 at index 1
w - Avg MSE=1039.2064, Best MSE=971.3801 at index 0
h - Avg MSE=1130.1478, Best MSE=980.0463 at index 1
Ended at 1732799401.170166

Tr

In [55]:
with open("./save_models/RandomForestRegressor_nest-maxd_17328034630916922.sav", "rb") as rfr_grid0_exp_f:
    rfr_grid0_exp_loaded = pickle.load(rfr_grid0_exp_f)

In [56]:
Y_test = get_labels(data_type="Test")
X_test = get_images(data_type="Test", image_names=Y_test[:,0])
rfr_grid0_results = test(exp_list=rfr_grid0_exp_loaded, Y_test=Y_test, X_test=X_test)

In [57]:
rfr_grid0_results

[{'nest': 10, 'maxd': 11, 'weighted_avg_mse': np.float64(1037.866711863164)},
 {'nest': 10, 'maxd': 13, 'weighted_avg_mse': np.float64(1117.2152023979122)},
 {'nest': 10, 'maxd': 15, 'weighted_avg_mse': np.float64(1277.2782813552622)},
 {'nest': 10, 'maxd': 17, 'weighted_avg_mse': np.float64(1105.740378375)},
 {'nest': 10, 'maxd': 19, 'weighted_avg_mse': np.float64(1210.9086421250008)},
 {'nest': 20, 'maxd': 11, 'weighted_avg_mse': np.float64(1041.9624518709904)},
 {'nest': 20, 'maxd': 13, 'weighted_avg_mse': np.float64(1131.6653485552738)},
 {'nest': 20, 'maxd': 15, 'weighted_avg_mse': np.float64(1081.842526424827)},
 {'nest': 20, 'maxd': 17, 'weighted_avg_mse': np.float64(1022.9588525584991)},
 {'nest': 20, 'maxd': 19, 'weighted_avg_mse': np.float64(1022.3114634687497)},
 {'nest': 30, 'maxd': 11, 'weighted_avg_mse': np.float64(1002.6387328184359)},
 {'nest': 30, 'maxd': 13, 'weighted_avg_mse': np.float64(978.7584204580128)},
 {'nest': 30, 'maxd': 15, 'weighted_avg_mse': np.float64(10

In [74]:
rfr_grid0_mean = np.mean([res["weighted_avg_mse"] for res in rfr_grid0_results])
rfr_grid0_best = min(rfr_grid0_results, key=lambda x: x["weighted_avg_mse"])
print(f"Best:{rfr_grid0_best}\nDiff:{rfr_grid0_best["weighted_avg_mse"]-rfr_grid0_mean}")

Best:{'nest': 30, 'maxd': 19, 'weighted_avg_mse': np.float64(977.6827556527779)}
Diff:-66.33404769846572


#### Grid Search 1: Min Samples Split, Min Samples Leaf

In [63]:
rfr_grid1_exp = grid_search(ModelClass=RandomForestRegressor, 
                            hyper_params=rfr_grid1, 
                            hyper_param_names=["mins", "minl"], 
                            kwarg_names=["min_samples_split", "min_samples_leaf"],
                            # Settled Parameters
                            n_estimators=30,
                            max_depth=19)

Training model RandomForestRegressor. Description: mins-4--minl-6
Started at: 1732856800.760463
x - Avg MSE=326.0675, Best MSE=280.6331 at index 1
y - Avg MSE=327.7088, Best MSE=258.5277 at index 1
w - Avg MSE=1074.9534, Best MSE=991.6896 at index 0
h - Avg MSE=1065.0379, Best MSE=880.3963 at index 0
Ended at 1732856888.760798


Training model RandomForestRegressor. Description: mins-4--minl-8
Started at: 1732856888.760947
x - Avg MSE=338.8816, Best MSE=293.7372 at index 2
y - Avg MSE=363.2080, Best MSE=275.7450 at index 1
w - Avg MSE=1130.0578, Best MSE=1012.6302 at index 0
h - Avg MSE=1093.0109, Best MSE=1006.7138 at index 2
Ended at 1732856968.4516952


Training model RandomForestRegressor. Description: mins-4--minl-10
Started at: 1732856968.451767
x - Avg MSE=362.3157, Best MSE=326.9200 at index 1
y - Avg MSE=390.4445, Best MSE=299.2935 at index 1
w - Avg MSE=1202.0076, Best MSE=1027.7504 at index 0
h - Avg MSE=1182.0632, Best MSE=1125.1176 at index 0
Ended at 1732857042.551244


T

In [64]:
with open("./save_models/RandomForestRegressor_mins-minl_173285867054318.sav", "rb") as rfr_grid1_exp_f:
    rfr_grid1_exp_loaded = pickle.load(rfr_grid1_exp_f)

In [65]:
rfr_grid1_exp_loaded

[{'mins': 4,
  'minl': 6,
  'exp': {'x': {'Best MSE': inf,
    'Best Fold': np.int64(1),
    'Avg MSE': np.float64(326.06745338827005),
    'model': RandomForestRegressor(max_depth=19, min_samples_leaf=6, min_samples_split=4,
                          n_estimators=30)},
   'y': {'Best MSE': inf,
    'Best Fold': np.int64(1),
    'Avg MSE': np.float64(327.70875259964413),
    'model': RandomForestRegressor(max_depth=19, min_samples_leaf=6, min_samples_split=4,
                          n_estimators=30)},
   'w': {'Best MSE': inf,
    'Best Fold': np.int64(0),
    'Avg MSE': np.float64(1074.9533748950205),
    'model': RandomForestRegressor(max_depth=19, min_samples_leaf=6, min_samples_split=4,
                          n_estimators=30)},
   'h': {'Best MSE': inf,
    'Best Fold': np.int64(0),
    'Avg MSE': np.float64(1065.0378871162209),
    'model': RandomForestRegressor(max_depth=19, min_samples_leaf=6, min_samples_split=4,
                          n_estimators=30)}}},
 {'mins': 4,


In [66]:
Y_test = get_labels(data_type="Test")
X_test = get_images(data_type="Test", image_names=Y_test[:,0])
rfr_grid1_results = test(exp_list=rfr_grid1_exp_loaded, Y_test=Y_test, X_test=X_test)

In [67]:
rfr_grid1_results

[{'mins': 4, 'minl': 6, 'weighted_avg_mse': np.float64(1030.0319577545959)},
 {'mins': 4, 'minl': 8, 'weighted_avg_mse': np.float64(1077.343572941986)},
 {'mins': 4, 'minl': 10, 'weighted_avg_mse': np.float64(965.7390259597503)},
 {'mins': 4, 'minl': 12, 'weighted_avg_mse': np.float64(1001.5366133034171)},
 {'mins': 4, 'minl': 14, 'weighted_avg_mse': np.float64(1104.5554561859)},
 {'mins': 6, 'minl': 6, 'weighted_avg_mse': np.float64(1059.7983548651566)},
 {'mins': 6, 'minl': 8, 'weighted_avg_mse': np.float64(1042.7518063871962)},
 {'mins': 6, 'minl': 10, 'weighted_avg_mse': np.float64(968.3559043781738)},
 {'mins': 6, 'minl': 12, 'weighted_avg_mse': np.float64(1010.3198934567595)},
 {'mins': 6, 'minl': 14, 'weighted_avg_mse': np.float64(1069.8815027499604)},
 {'mins': 8, 'minl': 6, 'weighted_avg_mse': np.float64(923.2098763833342)},
 {'mins': 8, 'minl': 8, 'weighted_avg_mse': np.float64(1017.9480888058059)},
 {'mins': 8, 'minl': 10, 'weighted_avg_mse': np.float64(1001.0090268207798)},

In [73]:
rfr_grid1_mean = np.mean([res["weighted_avg_mse"] for res in rfr_grid1_results])
rfr_grid1_best = min(rfr_grid1_results, key=lambda x: x["weighted_avg_mse"])
print(f"Best:{rfr_grid1_best}\nDiff:{rfr_grid1_best["weighted_avg_mse"]-rfr_grid1_mean}")

Best:{'mins': 8, 'minl': 6, 'weighted_avg_mse': np.float64(923.2098763833342)}
Diff:-93.51913907602784
