# COMP5318 Assignment 1: Rice Classification

Student 2 = Iftikhar Amiri

In [None]:
# Import all libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Ignore future warnings: install a global "ignore" filter for FutureWarning
# so that only the program's own printed results appear in the output.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the rice dataset: rice-final2.csv
# NOTE(review): rice_data is not referenced again in the visible cells; the
# later pre-processing cell reads 'test-before.csv' instead - confirm which
# file the analysis is meant to use.
rice_data = pd.read_csv('rice-final2.csv')

In [None]:
# Pre-process dataset
def load_and_preprocess_data(file_path):
    """Read a CSV file and return a cleaned, normalised DataFrame.

    Every column except the last is treated as a numeric feature; the last
    column is treated as the class label.

    Steps applied, in order:
        1. '?' placeholders are replaced with NaN.
        2. Feature columns are coerced to numeric (unparseable -> NaN).
        3. Missing feature values are filled with the column mean.
        4. Feature columns are rescaled with MinMaxScaler.
        5. Labels are mapped 'class1' -> 0 and 'class2' -> 1 (any other
           label value becomes NaN).

    Arguments:
        file_path: path of the CSV file to load

    Returns:
        The pre-processed pandas DataFrame.
    """
    frame = pd.read_csv(file_path)
    feature_cols = frame.columns[:-1]
    label_col = frame.columns[-1]

    # Missing entries are encoded as '?' in the raw file
    frame = frame.replace('?', np.nan)

    # Force every feature column to a numeric dtype
    for name in feature_cols:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')

    # Mean imputation of the remaining gaps
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    frame[feature_cols] = mean_imputer.fit_transform(frame[feature_cols])

    # Min-max normalisation of the features
    frame[feature_cols] = MinMaxScaler().fit_transform(frame[feature_cols])

    # Encode the class label as an integer
    label_codes = {'class1': 0, 'class2': 1}
    frame[label_col] = frame[label_col].map(label_codes)

    return frame


In [None]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

def print_data(X, y, n_rows=10):
    """Print the first ``n_rows`` examples as comma-separated lines.

    Each feature value is written to 4 decimal places followed by a comma,
    and the target value ends the line. When the very last example of ``X``
    is printed, no trailing newline is written after it.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: number of rows to print; if it exceeds the number of
            examples in X, only the available rows are printed
    """
    # Clamp the request so a short dataset does not raise IndexError.
    n_rows = min(n_rows, len(X))
    last_index = len(X) - 1

    for example_num in range(n_rows):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        # Suppress the newline only after the final example of the data set
        if example_num == last_index:
            print(y[example_num], end="")
        else:
            print(y[example_num])

# Dataset used for the experiments below
file_path = 'test-before.csv'

# Clean, impute, normalise and label-encode the raw file
processed_df = load_and_preprocess_data(file_path)

# Every column but the last is a feature; the last column is the label
X, y = processed_df.iloc[:, :-1].values, processed_df.iloc[:, -1].values

# Show the first 10 pre-processed rows
print_data(X, y, 10)

0.0621,0.4999,0.5410,0.2079,0.2594,0.0613,0
0.8073,0.7474,0.6721,0.2634,0.2038,0.0586,0
0.3105,0.6030,0.4187,0.0000,0.0000,0.0900,0
0.3105,0.5618,0.6148,0.3604,0.0000,0.0950,0
0.1863,0.8144,0.6230,0.4990,0.4539,0.1597,1
0.1863,0.6039,0.4754,0.1525,0.1000,0.0655,1
0.6832,0.7114,0.6230,0.0000,0.0000,0.0877,1
0.5589,0.5258,0.6230,0.5129,0.0000,0.0869,0
0.1242,0.4639,0.5574,0.5822,0.0000,0.1009,1
0.2484,0.5722,0.5902,0.6515,0.3835,0.0979,0


### Part 1: Cross-validation without parameter tuning

In [None]:
## Setting the 10 fold stratified cross-validation
# shuffle=True together with random_state=0 fixes the seed used to shuffle
# the examples before they are split into folds.
cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# The stratified folds from cvKFold should be provided to the classifiers

In [None]:
# Logistic Regression
def logregClassifier(X, y):
    """Cross-validate a default LogisticRegression model.

    Arguments:
        X: feature matrix
        y: target labels

    Returns:
        Mean of the per-fold scores obtained with the cvKFold splitter.
    """
    fold_scores = cross_val_score(LogisticRegression(), X, y, cv=cvKFold)
    return fold_scores.mean()

In [None]:
# Naïve Bayes
def nbClassifier(X, y):
    """Cross-validate a Gaussian Naive Bayes model.

    Arguments:
        X: feature matrix
        y: target labels

    Returns:
        Mean of the per-fold scores obtained with the cvKFold splitter.
    """
    fold_scores = cross_val_score(GaussianNB(), X, y, cv=cvKFold)
    return fold_scores.mean()

In [None]:
# Decision Tree
def dtClassifier(X, y):
    """Cross-validate an entropy-based decision tree (random_state=0).

    Arguments:
        X: feature matrix
        y: target labels

    Returns:
        Mean of the per-fold scores obtained with the cvKFold splitter.
    """
    tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    fold_scores = cross_val_score(tree, X, y, cv=cvKFold)
    return fold_scores.mean()

In [None]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    """Cross-validate a bagging ensemble of entropy-based decision trees.

    Arguments:
        X: feature matrix
        y: target labels
        n_estimators: number of trees in the ensemble
        max_samples: value passed to BaggingClassifier's max_samples
        max_depth: maximum depth of each base tree

    Returns:
        Mean of the per-fold scores obtained with the cvKFold splitter.
    """
    # Depth-limited tree that every bag member is cloned from
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    ensemble = BaggingClassifier(
        estimator=tree,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=0,
    )
    return cross_val_score(ensemble, X, y, cv=cvKFold).mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    """Cross-validate AdaBoost over entropy-based decision trees.

    Arguments:
        X: feature matrix
        y: target labels
        n_estimators: number of boosting rounds
        learning_rate: value passed to AdaBoostClassifier's learning_rate
        max_depth: maximum depth of each base tree

    Returns:
        Mean of the per-fold scores obtained with the cvKFold splitter.
    """
    # Depth-limited tree used as the weak learner
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    booster = AdaBoostClassifier(
        estimator=tree,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    return cross_val_score(booster, X, y, cv=cvKFold).mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    """Cross-validate a gradient boosting classifier (random_state=0).

    Arguments:
        X: feature matrix
        y: target labels
        n_estimators: number of boosting stages
        learning_rate: value passed to GradientBoostingClassifier's
            learning_rate

    Returns:
        Mean of the per-fold scores obtained with the cvKFold splitter.
    """
    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    return cross_val_score(booster, X, y, cv=cvKFold).mean()

### Part 1 Results

In [None]:
# Fixed hyper-parameters for the Part 1 ensembles

# Bagging
bag_n_estimators, bag_max_samples, bag_max_depth = 50, 100, 5

# AdaBoost
ada_n_estimators, ada_learning_rate, ada_bag_max_depth = 50, 0.5, 5

# Gradient Boosting
gb_n_estimators, gb_learning_rate = 50, 0.5

# Mean cross-validation accuracy of each classifier
lr_accuracy = logregClassifier(X, y)
nb_accuracy = nbClassifier(X, y)
dt_accuracy = dtClassifier(X, y)
bag_accuracy = bagDTClassifier(
    X, y,
    n_estimators=bag_n_estimators,
    max_samples=bag_max_samples,
    max_depth=bag_max_depth,
)
ada_accuracy = adaDTClassifier(
    X, y,
    n_estimators=ada_n_estimators,
    learning_rate=ada_learning_rate,
    max_depth=ada_bag_max_depth,
)
gb_accuracy = gbClassifier(
    X, y,
    n_estimators=gb_n_estimators,
    learning_rate=gb_learning_rate,
)

# Report every Part 1 accuracy to 4 decimal places
print(f"LogR average cross-validation accuracy:{lr_accuracy:.4f}")
print(f"NB average cross-validation accuracy:{nb_accuracy:.4f}")
print(f"DT average cross-validation accuracy:{dt_accuracy:.4f}")
print(f"Bagging average cross-validation accuracy:{bag_accuracy:.4f}")
print(f"AdaBoost average cross-validation accuracy:{ada_accuracy:.4f}")
print(f"GB average cross-validation accuracy:{gb_accuracy:.4f}")

LogR average cross-validation accuracy:0.6700
NB average cross-validation accuracy:0.6555
DT average cross-validation accuracy:0.7702
Bagging average cross-validation accuracy:0.7514
AdaBoost average cross-validation accuracy:0.7562
GB average cross-validation accuracy:0.7464


### Part 2: Cross-validation with parameter tuning

In [None]:
# KNN
k = [1, 3, 5, 7]  # candidate values for n_neighbors
p = [1, 2]  # candidate values for the Minkowski power parameter


def bestKNNClassifier(X, y):
    """Tune a k-nearest-neighbours classifier with a grid search.

    The data is split into stratified training and test sets
    (random_state=0). GridSearchCV then explores every combination of the
    module-level ``k`` and ``p`` lists on the training set, using the
    cvKFold splitter and accuracy as the selection metric.

    Arguments:
        X: feature matrix
        y: target labels

    Returns:
        Tuple of (best parameter dict, best cross-validation accuracy,
        accuracy of the best estimator on the held-out test set).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )

    search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid={'n_neighbors': k, 'p': p},
        cv=cvKFold,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)

    # Score the refitted best model on data the search never saw
    tuned_model = search.best_estimator_
    held_out_accuracy = accuracy_score(y_test, tuned_model.predict(X_test))

    return search.best_params_, search.best_score_, held_out_accuracy

In [None]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
C = [0.01, 0.1, 1, 5]  # candidate values for the SVC C parameter
gamma = [0.01, 0.1, 1, 10]  # candidate values for the SVC gamma parameter


def bestSVMClassifier(X, y):
    """Tune an RBF-kernel SVM with a grid search.

    The data is split into stratified training and test sets
    (random_state=0). GridSearchCV then explores every combination of the
    module-level ``C`` and ``gamma`` lists on the training set, using the
    cvKFold splitter and accuracy as the selection metric.

    Arguments:
        X: feature matrix
        y: target labels

    Returns:
        Tuple of (best parameter dict, best cross-validation accuracy,
        accuracy of the best estimator on the held-out test set).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )

    # The kernel is listed in the grid as a single option, so it also
    # appears in the returned best-parameter dict.
    search_space = {'C': C, 'gamma': gamma, 'kernel': ['rbf']}

    search = GridSearchCV(
        estimator=SVC(),
        param_grid=search_space,
        cv=cvKFold,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)

    # Score the refitted best model on data the search never saw
    tuned_model = search.best_estimator_
    held_out_accuracy = accuracy_score(y_test, tuned_model.predict(X_test))

    return search.best_params_, search.best_score_, held_out_accuracy

In [None]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
n_estimators = [10, 30, 60, 100]  # candidate forest sizes
max_leaf_nodes = [6, 12]  # candidate leaf-node limits per tree


def bestRFClassifier(X, y):
    """Tune a random forest with a grid search and report test metrics.

    The data is split into stratified training and test sets
    (random_state=0). GridSearchCV then explores every combination of the
    module-level ``n_estimators`` and ``max_leaf_nodes`` lists on the
    training set, using the cvKFold splitter and accuracy as the selection
    metric. The forest uses the entropy criterion, max_features='sqrt' and
    random_state=0.

    Arguments:
        X: feature matrix
        y: target labels

    Returns:
        Tuple of (best parameter dict, best cross-validation accuracy,
        test set accuracy, test set macro-averaged F1, test set
        weighted-averaged F1).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )

    forest = RandomForestClassifier(
        criterion='entropy', max_features='sqrt', random_state=0
    )
    search = GridSearchCV(
        estimator=forest,
        param_grid={
            'n_estimators': n_estimators,
            'max_leaf_nodes': max_leaf_nodes,
        },
        cv=cvKFold,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)

    # Predict once with the refitted best model, then derive every metric
    predictions = search.best_estimator_.predict(X_test)
    held_out_accuracy = accuracy_score(y_test, predictions)
    f1_macro = f1_score(y_test, predictions, average='macro')
    f1_weighted = f1_score(y_test, predictions, average='weighted')

    return (
        search.best_params_,
        search.best_score_,
        held_out_accuracy,
        f1_macro,
        f1_weighted,
    )

### Part 2: Results

In [None]:
# Run the grid searches (GridSearchCV in sklearn) with 10-fold stratified
# cross-validation; each helper passes the folds from cvKFold to GridSearchCV
# and uses train_test_split with stratification and random_state=0.
#
# All results are printed to 4 decimal places except for "k", "p",
# "n_estimators" and "max_leaf_nodes", which are printed as integers.
best_params_knn, knn_cv_acc, knn_test_acc = bestKNNClassifier(X, y)
best_params_svm, svm_cv_acc, svm_test_acc = bestSVMClassifier(X, y)
best_params_rf, rf_cv_acc, rf_test_acc, rf_macro_f1, rf_weighted_f1 = bestRFClassifier(X, y)

# KNN report
print(f"KNN best k: {best_params_knn['n_neighbors']}")
print(f"KNN best p: {best_params_knn['p']}")
print(f"KNN cross-validation accuracy: {knn_cv_acc:.4f}")
print(f"KNN test set accuracy: {knn_test_acc:.4f}")

print()

# SVM report
print(f"SVM best C: {best_params_svm['C']:.4f}")
print(f"SVM best gamma: {best_params_svm['gamma']:.4f}")
print(f"SVM cross-validation accuracy: {svm_cv_acc:.4f}")
print(f"SVM test set accuracy: {svm_test_acc:.4f}")

print()

# Random Forest report
print(f"RF best n_estimators: {best_params_rf['n_estimators']}")
print(f"RF best max_leaf_nodes: {best_params_rf['max_leaf_nodes']}")
print(f"RF cross-validation accuracy: {rf_cv_acc:.4f}")
print(f"RF test set accuracy: {rf_test_acc:.4f}")
print(f"RF test set macro average F1: {rf_macro_f1:.4f}")
print(f"RF test set weighted average F1: {rf_weighted_f1:.4f}")

KNN best k: 1
KNN best p: 1
KNN cross-validation accuracy: 0.7329
KNN test set accuracy: 0.6415

SVM best C: 5.0000
SVM best gamma: 10.0000
SVM cross-validation accuracy: 0.6858
SVM test set accuracy: 0.5849

RF best n_estimators: 60
RF best max_leaf_nodes: 12
RF cross-validation accuracy: 0.7883
RF test set accuracy: 0.6981
RF test set macro average F1: 0.6845
RF test set weighted average F1: 0.6956


### Part 3: Reflection

##### Write one paragraph describing the most important thing that you have learned throughout this assignment.
##### **Student 1:** The key takeaway from this assignment is the importance of systematic data preparation and model selection in improving machine learning performance. The most important thing I have learned is that cleaning the data by imputing missing values with the mean helps maintain data integrity. Similarly, normalizing the data using MinMaxScaler from the sklearn library ensures consistency across different feature scales before testing. I also discovered that stratified splitting in Part 1 is crucial for preserving class distributions and reducing bias in the training and testing sets. In Part 2, I explored cross-validation with parameter tuning, such as adjusting C and gamma for SVM or n_estimators for ensemble methods, by using tools like GridSearchCV. This approach significantly improves model performance compared to using the default settings. Beyond accuracy, metrics like the macro-F1 and weighted-F1 scores from the Random Forest provide a better evaluation of class-aware performance, especially for imbalanced or multi-class data. In summary, stratified cross-validation and hyperparameter tuning are just as important as model selection in achieving accurate rice classification that can be employed in real-world applications.
##### **Student 2:** Throughout this assignment, one of the most valuable lessons I've learned is the importance of carefully preparing data before applying machine learning algorithms. I've gained practical skills in essential steps such as dividing data into training and test sets and adjusting hyperparameters to boost model accuracy. Additionally, working on this assignment greatly improved my familiarity with powerful tools in scikit-learn, especially GridSearchCV. I found the concept behind AdaBoost particularly fascinating; its approach of sequentially learning by focusing on mistakes from previous iterations was insightful and impressive. Overall, this experience clearly demonstrated how machine learning techniques can effectively address real-world problems.