In [1]:
# Kết nối vào gg drive nếu dùng gg colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Git clone
# %cd /content/drive/MyDrive/Git Clone #chọn đường dẫn đến thư mục muốn clone đến
# !git clone https://github.com/lbngyn/CS231-final-project.git

In [2]:
# cd đến thư mục project vừa clone về
%cd /content/drive/MyDrive/Git Clone/CS231-final-project

/content/drive/MyDrive/Git Clone/CS231-final-project


# Import necessary libraries

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from skimage.feature import hog
import lightgbm as lgb

# Get paths

In [2]:
# Directories
train_dir = 'Data/train'
val_dir = 'Data/val'
test_dir = 'Data/test'

# Data processing

In [3]:
def compute_color_histogram(image, bins=(8, 8, 8)):
    # Convert the image to HSV color space
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # Compute the color histogram
    hist = cv2.calcHist([hsv_image], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    # Normalize the histogram
    hist = cv2.normalize(hist, hist).flatten()
    return hist

def load_images(folder_path, flower_types):
    features = []
    labels = []
    for flower in flower_types:
        flower_folder = os.path.join(folder_path, flower)
        for img in os.listdir(flower_folder):
            img_path = os.path.join(flower_folder, img)
            image = cv2.imread(img_path)
            if image is None:
                continue
            image = np.array(image).astype('uint8')
            image = cv2.resize(image, (64, 64))
            grey_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Extract HOG features
            hog_features = hog(grey_image, orientations=9, pixels_per_cell=(8, 8),
                               cells_per_block=(2, 2), transform_sqrt=True, block_norm="L2")

            # Extract color histogram features
            color_hist = compute_color_histogram(image)

            # Concatenate HOG and color histogram features
            combined_features = np.hstack((hog_features, color_hist))

            features.append(combined_features)
            labels.append(flower)

    return np.array(features), np.array(labels)

In [4]:
flower_types = [folder for folder in os.listdir(train_dir)]

X_train, y_train = load_images(train_dir, flower_types)
X_val, y_val = load_images(val_dir, flower_types)
X_test, y_test = load_images(test_dir, flower_types)

# Kiểm tra dữ liệu sau khi tải
print(f"Number of training samples: {len(X_train)}")
print(f"Number of validation samples: {len(X_val)}")
print(f"Number of testing samples: {len(X_test)}")

Number of training samples: 5981
Number of validation samples: 1709
Number of testing samples: 857


In [5]:
# Encode labels
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_val_encoded = encoder.transform(y_val)
y_test_encoded = encoder.transform(y_test)

print("Label to Encoded Mapping:")
for original_label, encoded_label in zip(encoder.classes_, range(len(encoder.classes_))):
    print(f"{original_label} -> {encoded_label}")

# Standardize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

Label to Encoded Mapping:
daisy -> 0
lily -> 1
orchid -> 2
sunflower -> 3
tulip -> 4


In [6]:
# Apply PCA
pca = PCA(n_components=0.95)
print(X_train.shape)
X_train_pca = pca.fit_transform(X_train_scaled)
print(X_train_pca.shape)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

(5981, 2276)
(5981, 782)


# Initiating necessary functions

In [7]:
def evaluate_model(model, X_train, X_val, y_train, y_val):
    # Huấn luyện mô hình
    model.fit(X_train, y_train)
    # Dự đoán trên tập validation
    y_pred = model.predict(X_val)

    # Tính các chỉ số đánh giá
    accuracy = accuracy_score(y_val, y_pred)

    return accuracy

In [11]:
# Define parameter grids
param_grid_svm = {
    'C': [0.01, 0.1, 1, 5, 10, 15, 20],
    'kernel': ['rbf', 'poly', 'linear']
}

param_grid_rf = {
    'n_estimators': [300, 500, 800, 1000, 1200]
}

param_grid_lgb = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
}

In [12]:
# Evaluation functions using GridSearchCV
def grid_search_evaluate(model, param_grid, X_train, y_train, X_val, y_val):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_pred)
    
    # Create a DataFrame with all results
    results_df = pd.DataFrame(grid_search.cv_results_)
    return results_df

# SVM

In [None]:
output_dir = os.path.join('finetunning_result', 'svm.csv')
df_svm = grid_search_evaluate(SVC(), param_grid_svm, X_train_pca, y_train_encoded, X_val_pca, y_val_encoded)
df_svm.to_csv(output_dir, index=True)

# Random Forest

In [None]:
output_dir = os.path.join('finetunning_result', 'randomForest.csv')
df_random_forest = grid_search_evaluate(RandomForestClassifier(), param_grid_rf, X_train_pca, y_train, X_val_pca, y_val, 'RandomForest')
df_random_forest.to_csv(output_dir, index=False)

Accuracy with n_estimators = 300: 0.740784084259801
Accuracy with n_estimators = 500: 0.7501462843768285
Accuracy with n_estimators = 800: 0.7595084844938561
Accuracy with n_estimators = 1000: 0.7659449970743125
Accuracy with n_estimators = 1200: 0.7665301345816267


# LightGBM

In [None]:
output_dir = os.path.join('finetunning_result', 'lightGBM.csv')
df_lightGBM = grid_search_evaluate(lgb.LGBMClassifier(), param_grid_lgb, X_train_pca, y_train, X_val_pca, y_val, 'LightGBM')
df_lightGBM.to_csv(output_dir, index=True)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.218954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 199410
[LightGBM] [Info] Number of data points in the train set: 5981, number of used features: 782
[LightGBM] [Info] Start training from score -1.580761
[LightGBM] [Info] Start training from score -1.664602
[LightGBM] [Info] Start training from score -1.530077
[LightGBM] [Info] Start training from score -1.607100
[LightGBM] [Info] Start training from score -1.671694
Accuracy with learning_rate = 0.05: 0.7875950848449386
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 199410
[LightGBM] [Info] Number of data points in the train set: 5981, number of used features: 782
[LightGBM] [Info] Start training from score -1.580761
[LightGBM] [Info] Start training from s