# Import libraries and define functions

In [1]:
import json
import numpy as np
import pandas as pd
from io import BytesIO
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split

# Prepare X, y for Machine Learning

In [None]:
# Count the number of samples in each layer class
# 0(mono), 1(bi), 2(tri), 3(quad), 4(penta), 5(thick)
def calculate_feature_lengths(df, index):
    """Return how many 'RGB_dist' entries each requested layer holds.

    Args:
        df: Object where ``df['data'][0][i]['dist']['RGB_dist']`` is a sized
            collection for every ``i`` in ``index``.
        index: Iterable of layer positions to look up.

    Returns:
        dict mapping each layer position (in iteration order) to the length
        of its ``'RGB_dist'`` collection.
    """
    return {
        layer: len(df['data'][0][layer]['dist']['RGB_dist'])
        for layer in index
    }

# configure X for machine learning
def make_X(df, indices):
    """Assemble the feature matrix for the requested layers.

    For every layer in ``indices`` the values of ``df['data'][0][layer]['dist']``
    are converted to arrays, stacked as columns and transposed, so each dist
    entry occupies one row. The per-layer blocks are then joined along axis 1.

    Args:
        df: Object where ``df['data'][0][layer]['dist']`` is a mapping of
            feature name to a sequence of values.
        indices: Iterable of layer positions to include, in output order.

    Returns:
        np.ndarray with the per-layer blocks concatenated along axis 1.
    """
    layer_blocks = []
    for layer in indices:
        dist = df['data'][0][layer]['dist']
        columns = [np.array(values) for _, values in dist.items()]
        # column_stack(...).T is kept (rather than vstack) so that inputs
        # with more than one dimension are laid out exactly as before.
        layer_blocks.append(np.column_stack(columns).T)

    return np.concatenate(layer_blocks, axis=1)

# configure y for machine learning
def make_Y(index_feature_lengths):
    """Build the label vector from a ``{label: sample_count}`` mapping.

    Each key is repeated ``sample_count`` times, in the mapping's iteration
    order.

    Args:
        index_feature_lengths: Mapping of label to the number of samples
            carrying that label.

    Returns:
        np.ndarray of the repeated labels.
    """
    repeated_labels = [
        label
        for label, count in index_feature_lengths.items()
        for _ in range(count)
    ]
    return np.array(repeated_labels).T

# get File Path

In [19]:
# NOTE: despite its name, `current_file_path` is the kernel's working
# directory; every project path below hangs off that directory's parent.
current_file_path = os.getcwd()
current_dir = os.path.dirname(current_file_path)

# Merged JSON produced by the preprocessing step (input of this notebook).
input_file_path = os.path.join(
    current_dir,
    "data_preprocess",
    "data",
    "json",
    "Graphene_merge.json",
)

# Directory that receives the confusion-matrix images and CSV reports.
output_file_dir = os.path.join(
    current_dir,
    "AI",
    "result",
)

In [20]:
# (material, colormap) pairs; the colormap is used for the confusion matrices
material_and_color = [('Graphene', 'Blues')]
idx = 0

# Read the merged JSON file and flatten it into a pandas DataFrame
with open(input_file_path, 'r') as file:
    json_data = json.load(file)
df = pd.json_normalize(json_data)

# Material / substrate names as plain strings, plus the selected colormap
MATERIAL = df['material'].to_string(index=False)
SUBSTRATE = df['substrate'].to_string(index=False)
COLOR = material_and_color[idx][1]
print(f'material:{MATERIAL}\nsubstrate:{SUBSTRATE}')

# Layer classes: 0(mono), 1(bi), 2(tri), 3(quad), 4(penta), 5(thick)
index = list(range(6))
# Number of samples available for each layer class
index_feature_lengths = calculate_feature_lengths(df, index)
print(index_feature_lengths)

# Only the layer classes that actually have samples are reported later on
data_labels = [
    layer for layer, count in index_feature_lengths.items() if count > 0
]

# Labels and features; make_X output is transposed so its second axis
# becomes the first (presumably one row per sample - confirm against the JSON)
y = make_Y(index_feature_lengths)
X = make_X(df, index).T

material:Graphene
substrate:Gelpack
{0: 8633, 1: 2272, 2: 650, 3: 0, 4: 0, 5: 4975}


# Multi-class Classification

v1. Train / validation / test split, reporting validation and test accuracy

In [None]:
def machineLearning_print_val_test(feature, X=X, y=y, material=MATERIAL, color=COLOR, labels=data_labels):
    """Cross-validate SVM, KNN and decision-tree classifiers and save a report.

    The selected feature columns are split 80/20 (stratified) into train and
    test sets. Every model is fitted on each fold of a 5-fold stratified split
    of the training set and scored on both the held-out fold and the test set.
    Per-fold metrics are written to ``<feature_name>.csv`` and one confusion
    matrix per model (pasted side by side) to ``<feature_name>.png``, both
    inside the notebook-level ``output_file_dir``.

    Note: the defaults of ``X``, ``y``, ``material``, ``color`` and ``labels``
    are captured from the notebook globals when this ``def`` is executed.
    Re-run this cell after reloading the data, or pass the values explicitly.

    Args:
        feature: Feature name, or list of feature names, to train on. Each
            name must be a key of ``feature_dict`` ('RGB_dist' / 'YIQ_dist').
        X: 2-D array whose columns are addressed through ``feature_dict``.
        y: Array of class labels aligned with the rows of ``X``.
        material: Unused; kept so existing callers keep working.
        color: Colormap handed to ``sns.heatmap``.
        labels: Class labels (keys of ``layer_dict``) to report on.

    Raises:
        KeyError: If a feature name or a label is not recognised.
    """

    # merge confusion matrices in one image file
    def combine_images(images):
        opened = [Image.open(x) for x in images]
        widths, heights = zip(*(im.size for im in opened))

        combined_image = Image.new('RGB', (sum(widths), max(heights)))

        x_offset = 0
        for im in opened:
            combined_image.paste(im, (x_offset, 0))
            x_offset += im.size[0]

        return combined_image

    # A bare string would otherwise be iterated character by character.
    if isinstance(feature, str):
        feature = [feature]

    feature_mapping = {
        'rgb': 'RGB', 'yiq': 'YIQ'
    }

    # Names missing from feature_mapping are simply upper-cased.
    feature_names = [feature_mapping.get(f, f.upper()) for f in feature]
    feature_name = ''.join(feature_names)

    # Make sure the destination exists before anything is written to it.
    os.makedirs(output_file_dir, exist_ok=True)
    output_file_path_for_conf_mat = os.path.join(output_file_dir, f"{feature_name}.png")
    output_file_path_for_table = os.path.join(output_file_dir, f"{feature_name}.csv")

    feature_dict = {
        'RGB_dist': 0, 'YIQ_dist': 1
    }

    feature_indices = [feature_dict[f] for f in feature]
    X_selected = X[:, feature_indices]

    # Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, stratify=y, random_state=42
    )

    layer_dict = {0: 'mono', 1: 'bi', 2: 'tri', 3: 'quad', 4: 'penta', 5: 'thick'}
    # Derive the tick labels from the sorted class numbers so that metric
    # columns, confusion-matrix rows/columns and heatmap ticks always share
    # one ordering, whatever order the caller passed `labels` in.
    layer_numbers = sorted(set(labels))
    layer_labels = [layer_dict[number] for number in layer_numbers]

    fold_results_list = []
    images = []

    models = {
        'SVM': SVC(),
        'KNN': KNeighborsClassifier(),
        # Seeded so repeated runs give the same tree.
        'DecisionTree': DecisionTreeClassifier(random_state=42),
    }

    # An integer random_state yields the same folds on every split() call,
    # so one splitter can be shared by all models.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Shared keyword arguments for the per-class metrics; zero_division=0 is
    # applied to all three so classes without predictions never emit warnings.
    per_class_kwargs = dict(average=None, labels=layer_numbers, zero_division=0)

    # Apply the seaborn theme before any figure exists so that every
    # confusion matrix is drawn with the same settings.
    sns.set(font_scale=2.0)

    # Cross-validation for each model
    for model_name, model in models.items():
        print(f'Training {model_name} with 5-Fold Cross Validation...\n')

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):

            # Split data into train/validation for this fold
            X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
            y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

            # Fit the model on the current fold
            model.fit(X_train_fold, y_train_fold)

            # Validate on the validation set
            y_val_pred = model.predict(X_val_fold)
            val_acc = round(accuracy_score(y_val_fold, y_val_pred), 4)

            # Test on the test set
            y_test_pred = model.predict(X_test)
            test_acc = round(accuracy_score(y_test, y_test_pred), 4)

            # Precision, Recall, F1-Score (one value per class)
            precision = precision_score(y_test, y_test_pred, **per_class_kwargs)
            recall = recall_score(y_test, y_test_pred, **per_class_kwargs)
            f1 = f1_score(y_test, y_test_pred, **per_class_kwargs)

            # Cast to built-in float: NumPy 2 scalars would otherwise be
            # written to the CSV as "np.float64(0.98)".
            fold_results_list.append({
                'Model': model_name,
                'Fold': fold,
                'Validation Accuracy': val_acc,
                'Test Accuracy': test_acc,
                'Precision': [round(float(p), 2) for p in precision],
                'Recall': [round(float(r), 2) for r in recall],
                'F1-Score': [round(float(f), 2) for f in f1]
            })

        # Generate confusion matrix. `model` is the estimator as fitted on
        # the last fold; `labels` pins rows/columns to the tick labels.
        y_test_pred = model.predict(X_test)
        conf_matrix = confusion_matrix(y_test, y_test_pred, labels=layer_numbers)

        fig, ax = plt.subplots(figsize=(7, 7))
        sns.heatmap(conf_matrix, annot=True, cmap=color, fmt='d', cbar=False,
                    xticklabels=layer_labels, yticklabels=layer_labels,
                    annot_kws={"size": 36}, ax=ax)

        ax.set_title(model_name, size=48)
        ax.set_xlabel('Predicted', size=24)
        ax.set_ylabel('True', size=24)

        # Render into memory; the images are merged into one file below.
        buf = BytesIO()
        fig.savefig(buf, format='png')
        plt.close(fig)
        buf.seek(0)
        images.append(buf)

    # Write report in CSV file
    fold_results_df = pd.DataFrame(fold_results_list)
    fold_results_df.to_csv(output_file_path_for_table, index=False)

    # Combine and save images
    combined_image = combine_images(images)
    combined_image.save(output_file_path_for_conf_mat)

    print("Done!")

In [31]:
# Train and report once per colour-distance feature
for feature_key in ('RGB_dist', 'YIQ_dist'):
    machineLearning_print_val_test([feature_key])

Training SVM with 5-Fold Cross Validation...

Training KNN with 5-Fold Cross Validation...

Training DecisionTree with 5-Fold Cross Validation...

Training SVM with 5-Fold Cross Validation...

Training KNN with 5-Fold Cross Validation...

Training DecisionTree with 5-Fold Cross Validation...

