# COMP5318 Assignment 1: Rice Classification

##### Group number: 127
##### Student 1 SID: 550251668
##### Student 2 SID: 540303144 
##### Student 3 SID: 520325185
##### Student 4 SID: 530419471

In [None]:
# Import all libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold

# libraries for KNN and Random Forest.
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# libraries for Decision Tree, Bagging
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score

# libraries for Ada Boost and Gradient Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# libraries for Logistic Regression and Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

: 

In [None]:
# Suppress FutureWarning messages for the rest of the session so that
# library deprecation notices do not clutter the printed results.
from warnings import simplefilter

simplefilter("ignore", category=FutureWarning)

In [None]:
# Load the rice dataset: rice-final2.csv
# The path is relative, so the CSV is looked up in the current working directory.
data = pd.read_csv("rice-final2.csv")

In [None]:
# Pre-process dataset
#
# Produces X_scaled (imputed, min-max scaled feature matrix) and y (integer
# class labels) from the raw `data` frame.

# Missing values are marked with '?' in the raw file; turn them into NaN.
data = data.replace('?', np.nan)

# Every column except the last is a predictor; force them to numeric so that
# anything unparseable also becomes NaN and is handled by the imputer.
X = data.iloc[:, :-1].apply(pd.to_numeric, errors='coerce')

# The last column is the class label: "class1" -> 0, "class2" -> 1.
y = data.iloc[:, -1].replace({"class1": 0, "class2": 1}).astype(int)

# Fill the remaining gaps with the per-column mean (univariate imputation).
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# Rescale every feature with min-max scaling into the [0, 1] interval.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [None]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

def print_data(X, y, n_rows=10):
    """Print the first rows of a dataset as comma-separated lines.

    Each printed line holds one example: its feature values formatted to
    4 decimal places, each followed by a comma, and then the target value.
    A newline ends every line except when the row printed is the very last
    row of X, in which case no trailing newline is written.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: maximum number of rows to print; capped at the number of
            examples in X so short datasets do not raise IndexError
    """
    last_index = len(X) - 1
    # Never index past the end of X when fewer than n_rows examples exist.
    for example_num in range(min(n_rows, len(X))):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        # Suppress the newline only after the final row of the whole dataset.
        line_end = "" if example_num == last_index else "\n"
        print(y[example_num], end=line_end)

# Show the pre-processed features and labels; n_rows is left at its default of 10.
print_data(X_scaled, y.to_numpy())


### Part 1: Cross-validation without parameter tuning

In [None]:
## Shared splitter for cross-validation: 10 stratified folds, shuffled with a
## fixed seed so that repeated runs produce the same folds.
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# The stratified folds from cvKFold should be provided to the classifiers

In [None]:
# Logistic Regression
def logregClassifier(X, y):
    """Return the mean cross-validated accuracy of logistic regression.

    A LogisticRegression with default settings is scored on (X, y) using the
    module-level cvKFold splitter; the per-fold accuracies are averaged.
    """
    fold_accuracies = cross_val_score(
        LogisticRegression(),
        X,
        y,
        scoring='accuracy',
        cv=cvKFold,
    )
    return fold_accuracies.mean()

In [None]:
#Naïve Bayes
def nbClassifier(X, y):
    """Return the mean cross-validated accuracy of Gaussian Naive Bayes.

    A GaussianNB with default settings is scored on (X, y) using the
    module-level cvKFold splitter; the per-fold accuracies are averaged.
    """
    fold_accuracies = cross_val_score(
        GaussianNB(),
        X,
        y,
        scoring='accuracy',
        cv=cvKFold,
    )
    return fold_accuracies.mean()

In [None]:
# Decision Tree
def dtClassifier(X, y):
    """Return the mean cross-validated accuracy of a decision tree.

    The tree splits on entropy and uses random_state=0; it is scored on
    (X, y) with the module-level cvKFold splitter and the per-fold
    accuracies are averaged.
    """
    tree = DecisionTreeClassifier(random_state=0, criterion="entropy")
    fold_accuracies = cross_val_score(
        tree, X, y, scoring="accuracy", cv=cvKFold, n_jobs=None
    )
    return fold_accuracies.mean()

In [None]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting

def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    """Return the mean cross-validated accuracy of bagged decision trees.

    Arguments:
        X: feature matrix
        y: class labels
        n_estimators: number of trees in the bagging ensemble
        max_samples: passed through to BaggingClassifier's max_samples
        max_depth: depth limit of each entropy-based base tree

    The ensemble is scored with the module-level cvKFold splitter and the
    per-fold accuracies are averaged.
    """
    ensemble = BaggingClassifier(
        # Base learner: entropy-criterion tree with a fixed seed.
        estimator=DecisionTreeClassifier(
            criterion="entropy",
            max_depth=max_depth,
            random_state=0,
        ),
        n_estimators=n_estimators,
        max_samples=max_samples,
        bootstrap=True,
        random_state=0,
        n_jobs=None,
    )
    fold_accuracies = cross_val_score(
        ensemble, X, y, scoring="accuracy", cv=cvKFold, n_jobs=None
    )
    return fold_accuracies.mean()


def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    """Return the mean cross-validated accuracy of AdaBoost over decision trees.

    Arguments:
        X: feature matrix
        y: class labels
        n_estimators: number of boosting rounds (coerced to int)
        learning_rate: boosting learning rate (coerced to float)
        max_depth: depth limit of each entropy-based base tree (coerced to int)

    The booster is first evaluated with algorithm="SAMME.R"; if that
    cross-validation raises ValueError the same booster is switched to
    "SAMME" and evaluated again. Scoring uses the module-level cvKFold
    splitter and the per-fold accuracies are averaged.
    """
    weak_learner = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=int(max_depth),
        random_state=0,
    )
    booster = AdaBoostClassifier(
        estimator=weak_learner,
        n_estimators=int(n_estimators),
        learning_rate=float(learning_rate),
        algorithm="SAMME.R",
        random_state=0,
    )

    # Identical evaluation settings for the first attempt and the fallback.
    cv_settings = {"cv": cvKFold, "scoring": "accuracy"}
    try:
        fold_accuracies = cross_val_score(booster, X, y, **cv_settings)
    except ValueError:
        # SAMME.R was rejected; retry with the SAMME algorithm instead.
        booster.set_params(algorithm="SAMME")
        fold_accuracies = cross_val_score(booster, X, y, **cv_settings)
    return fold_accuracies.mean()


def gbClassifier(X, y, n_estimators, learning_rate):
    """Return the mean cross-validated accuracy of gradient boosting.

    Arguments:
        X: feature matrix
        y: class labels
        n_estimators: number of boosting stages (coerced to int)
        learning_rate: boosting learning rate (coerced to float)

    The model uses random_state=0, is scored with the module-level cvKFold
    splitter, and the per-fold accuracies are averaged.
    """
    booster = GradientBoostingClassifier(
        random_state=0,
        learning_rate=float(learning_rate),
        n_estimators=int(n_estimators),
    )
    fold_accuracies = cross_val_score(
        booster, X, y, scoring="accuracy", cv=cvKFold
    )
    return fold_accuracies.mean()

### Part 1 Results

In [None]:
# Parameters for Part 1:

#Bagging
bag_n_estimators = 50
bag_max_samples = 100
bag_max_depth = 5

#AdaBoost
ada_n_estimators = 50
ada_learning_rate = 0.5
ada_bag_max_depth = 5

#GB
gb_n_estimators = 50
gb_learning_rate = 0.5

# Run Logistic Regression
logreg_acc = logregClassifier(X_scaled, y.to_numpy())

# Run Naive Bayes
nb_acc = nbClassifier(X_scaled, y.to_numpy())

# Run Decision Tree
dt_acc = dtClassifier(X_scaled, y.to_numpy())

# Run Bagging (Decision Trees)
bag_acc = bagDTClassifier(
    X_scaled, y.to_numpy(),
    n_estimators=bag_n_estimators,
    max_samples=bag_max_samples,
    max_depth=bag_max_depth
)
# Run Ada Boost (Decision Trees)
ada_acc = adaDTClassifier(
    X_scaled, y.to_numpy(),
    n_estimators=ada_n_estimators,
    learning_rate=ada_learning_rate,
    max_depth=ada_bag_max_depth
)
# Run Gradient Boosting
gb_acc = gbClassifier(
    X_scaled, y.to_numpy(),
    n_estimators=gb_n_estimators,
    learning_rate=gb_learning_rate
)

# Print results for each classifier in part 1 to 4 decimal places here:
# (LogR and NB previously printed only their labels because the classifiers
# were never run; they now report their accuracy like the other models.)
print("LogR average cross-validation accuracy: ",f"{logreg_acc:.4f}")
print("NB average cross-validation accuracy: ",f"{nb_acc:.4f}")
print("DT average cross-validation accuracy: ",f"{dt_acc:.4f}")
print("Bagging average cross-validation accuracy: ",f"{bag_acc:.4f}")
print("AdaBoost average cross-validation accuracy: ",f"{ada_acc:.4f}")
print("GB average cross-validation accuracy: ",f"{gb_acc:.4f}")

### Part 2: Cross-validation with parameter tuning

In [None]:
# KNN
# Candidate hyperparameter values searched by bestKNNClassifier.
k = [1, 3, 5, 7]
p = [1, 2]

cvKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

def bestKNNClassifier(X, y):
    """Tune a k-nearest-neighbours classifier and evaluate it on a held-out set.

    The data is split 70/30 into train and test sets (stratified on y,
    random_state=0). A grid search over the module-level `k` (n_neighbors)
    and `p` value lists is run on the training set using the cvKFold
    splitter, and the refitted best model is scored on the test set.

    Arguments:
        X: feature matrix
        y: class labels

    Returns:
        (best_k, best_p, best_cv_acc, test_acc) where best_k and best_p are
        ints, best_cv_acc is the mean cross-validation accuracy of the best
        parameter combination, and test_acc is the accuracy on the test set.
    """
    # Split train and test data set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=0
    )

    knn = KNeighborsClassifier()

    # Build the grid from the declared candidate lists so that editing
    # `k` or `p` above actually changes what is searched.
    param_grid = {
        "n_neighbors": k,
        "p": p
    }
    # perform grid search
    gs = GridSearchCV(
        estimator=knn,
        param_grid=param_grid,
        scoring="accuracy",
        cv=cvKFold,
        n_jobs=-1,
        refit=True,
        return_train_score=False
    )
    gs.fit(X_train, y_train)

    best_k = int(gs.best_params_["n_neighbors"])
    best_p = int(gs.best_params_["p"])
    best_cv_acc = float(gs.best_score_)

    # refit=True means best_estimator_ is already trained on all of X_train.
    y_pred = gs.best_estimator_.predict(X_test)
    test_acc = float(accuracy_score(y_test, y_pred))

    return best_k, best_p, best_cv_acc, test_acc

In [None]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to 'sqrt'.
# Candidate hyperparameter values searched by bestRFClassifier.
n_estimators = [10, 30, 60, 100]
max_leaf_nodes = [6, 12]

def bestRFClassifier(X, y):
    """Tune a random forest and evaluate it on a held-out set.

    The data is split 70/30 into train and test sets (stratified on y,
    random_state=0). A grid search over the module-level `n_estimators` and
    `max_leaf_nodes` value lists is run on the training set using the
    cvKFold splitter, and the refitted best model is scored on the test set.

    Arguments:
        X: feature matrix
        y: class labels

    Returns:
        (best_n_estimators, best_max_leaf_nodes, best_cv_acc, test_acc,
        macro_f1, weighted_f1): the two best parameters as ints, the mean
        cross-validation accuracy of the best combination, and the test-set
        accuracy, macro-averaged F1 and weighted-averaged F1.
    """
    # split train test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=0
    )

    # define classifier (entropy criterion = information gain)
    rf = RandomForestClassifier(
        criterion="entropy",
        max_features="sqrt",
        random_state=0,
        n_jobs=-1
    )

    # Build the grid from the declared candidate lists so that editing
    # `n_estimators` or `max_leaf_nodes` above actually changes the search.
    param_grid = {
        "n_estimators": n_estimators,
        "max_leaf_nodes": max_leaf_nodes,
    }

    # perform grid search
    gs = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        scoring="accuracy",
        cv=cvKFold,
        n_jobs=-1,
        refit=True,
        return_train_score=False
    )
    gs.fit(X_train, y_train)

    best_n_estimators = int(gs.best_params_["n_estimators"])
    best_max_leaf_nodes = int(gs.best_params_["max_leaf_nodes"])
    best_cv_acc = float(gs.best_score_)

    # refit=True means best_estimator_ is already trained on all of X_train.
    y_pred = gs.best_estimator_.predict(X_test)
    test_acc = float(accuracy_score(y_test, y_pred))
    macro_f1 = float(f1_score(y_test, y_pred, average="macro"))
    weighted_f1 = float(f1_score(y_test, y_pred, average="weighted"))

    return best_n_estimators, best_max_leaf_nodes, best_cv_acc, test_acc, macro_f1, weighted_f1

### Part 2: Results

In [None]:
# Part 2 report: tune KNN and Random Forest with GridSearchCV over the
# 10-fold stratified splitter (cvKFold), then evaluate each tuned model on
# the stratified hold-out set created inside the best*Classifier functions
# (train_test_split with random_state=0).
#
# Accuracies and F1 scores are printed to 4 decimal places; "k", "p",
# "n_estimators" and "max_leaf_nodes" are printed as integers.

# --- KNN ---
best_k, best_p, knn_cv_acc, knn_test_acc = bestKNNClassifier(X_scaled, y)
print(
    f"KNN best k: {best_k:d}",
    f"KNN best p: {best_p:d}",
    f"KNN cross-validation accuracy: {knn_cv_acc:.4f}",
    f"KNN test set accuracy: {knn_test_acc:.4f}",
    sep="\n",
)

# Blank line between the two reports.
print()

# --- Random Forest ---
(
    best_n,
    best_leaf,
    rf_cv_acc,
    rf_test_acc,
    rf_macro_f1,
    rf_weighted_f1,
) = bestRFClassifier(X_scaled, y)
print(
    f"RF best n_estimators: {best_n:d}",
    f"RF best max_leaf_nodes: {best_leaf:d}",
    f"RF cross-validation accuracy: {rf_cv_acc:.4f}",
    f"RF test set accuracy: {rf_test_acc:.4f}",
    f"RF test set macro average F1: {rf_macro_f1:.4f}",
    f"RF test set weighted average F1: {rf_weighted_f1:.4f}",
    sep="\n",
)