## Import libraries and load data


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt




seed = 1462474
np.random.seed(seed)

In [6]:
data_path = "data/train/Features/all_features.csv"
train_df = pd.read_csv(data_path) 


In [7]:
# Split the data into features and output
y = train_df["ClassId"]
X = train_df.drop(columns=["ClassId", "image_path"])

## Feature Selection

In [16]:
# Since our data is mostly continuous, we will use mutual information to select features

# CHANGE NN while testing
mi_array = mutual_info_classif(X, y, random_state=0, n_neighbors= 7)

# Convert to series
mi_series = pd.Series(mi_array, index=X.columns)

# Sort features by importance
mi_sorted = mi_series.sort_values(ascending=False)

# Show top features
print(mi_sorted.head(10))  

# Select features with values over 0.15
top_features = mi_sorted.index[mi_sorted > 0.1]
X_selected = X[top_features]



hog_pca_0          0.882652
hog_pca_3          0.796677
Edge_Hist_Bin_6    0.596259
Edge_Hist_Bin_2    0.566049
hog_pca_1          0.554871
Edge_Hist_Bin_7    0.530975
Edge_Hist_Bin_3    0.519416
hog_pca_2          0.493234
Edge_Hist_Bin_5    0.429996
H_hist_bin_16      0.402010
dtype: float64


In [11]:
X_selected.head()

Unnamed: 0,hog_pca_0,hog_pca_3,Edge_Hist_Bin_6,Edge_Hist_Bin_2,hog_pca_1,Edge_Hist_Bin_7,Edge_Hist_Bin_3,hog_pca_2,Edge_Hist_Bin_5,H_hist_bin_16,...,hog_pca_16,ch_15,S_hist_bin_14,ch_34,hog_pca_14,ch_40,V_hist_bin_13,ch_33,ch_17,hog_pca_15
0,-0.763458,-0.638673,0.038306,0.199623,0.92788,0.060268,0.256466,0.264329,0.111521,0.022676,...,-0.213796,0.054894,0.0,0.335851,-0.97962,0.095503,0.0,0.002388,0.10581,0.450825
1,1.049284,0.90438,0.047018,0.185522,3.6082,0.233097,0.071324,-1.81719,0.112443,0.18335,...,-0.886182,0.012467,0.0,0.070214,0.101737,0.024119,0.0,0.0,0.020967,-0.335682
2,-1.55244,0.671877,0.125505,0.058121,-0.432374,0.08851,0.089124,-0.318422,0.146615,0.016829,...,-0.421448,0.268018,0.0,0.269911,0.377116,0.271127,0.061116,0.323407,0.132327,0.501386
3,-1.556871,0.613876,0.113744,0.122933,0.214406,0.117298,0.081,0.973758,0.129953,0.189296,...,-0.481589,0.0,0.0,0.801271,-0.5956,0.056455,0.0,0.509585,0.0,0.285638
4,-0.944294,-0.607014,0.10688,0.130112,-0.334833,0.128805,0.126042,0.415215,0.101269,0.0,...,0.062209,0.032836,0.011342,0.0,0.601939,0.110113,0.017013,0.0,0.065673,-0.520063


In [12]:
# See if we have any NaN values
print(X_selected.isnull().sum().sum())  

0


In [17]:

# Remove constant features, and highly correlated features
selector = VarianceThreshold(threshold=0)
X_var = selector.fit_transform(X_selected)
selected_columns = X_selected.columns[selector.get_support()]

# Remove highly correlated features
X_var_df = pd.DataFrame(X_var, columns=selected_columns)
corr_matrix = X_var_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
X_final = X_var_df.drop(columns=to_drop)


## Setup Stacking Pipeline

In [28]:
# Setup K folds for oof predictions
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
# Generate and store splits
splits = list(skf.split(X_final, y))
num_classes = len(np.unique(y))

#### 1. SVM

In [31]:
# Create pipeline for each model

"""
param_grid_svm = {
    'svc__C': [1, 5, 10, 100],
    'svc__gamma': ['scale'],    
}


# Grid search for SVM pipeline
grid_search_svm = GridSearchCV(
    svm_pipeline,
    param_grid_svm,
    cv= splits,
    scoring='accuracy',
    verbose=4
)

grid_search_svm.fit(X_final, y)
"""

# SVM pipeline
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', C = 5, gamma ='scale', probability=True, random_state=seed))
])



svm_oof_probs = np.zeros((X_final.shape[0], num_classes))

# Fit the model and get out-of-fold predictions
# Ensures we are not using the same data for training and validation
for train_idx, val_idx in splits:
    model = clone(svm_pipeline)
    model.fit(X_final.iloc[train_idx], y.iloc[train_idx])
    svm_oof_probs[val_idx] = model.predict_proba(X_final.iloc[val_idx])

Code for svm graph

In [None]:
"""# After fitting grid_search_svm or any GridSearchCV object
results = grid_search_svm.cv_results_

# Example: Plot mean test accuracy for each C value (assuming you varied 'svc__C')
C_values = results['param_svc__C'].data
mean_scores = results['mean_test_score']

plt.figure(figsize=(8, 5))
plt.plot(C_values, mean_scores, marker='o')
plt.xlabel('SVM: C value')
plt.ylabel('Mean CV Accuracy')
plt.title('SVM Grid Search: Accuracy vs C')
plt.xscale('log')
plt.grid(True)
plt.show()



print("Best SVM parameters:", grid_search_svm.best_params_)
print("Best SVM accuracy:", grid_search_svm.best_score_)

# Use the best SVM pipeline for stacking
svm_pipeline = grid_search_svm.best_estimator_"""



#### 2. RF

In [32]:
# === Commented out grid search block ===
# param_grid = {
#     'rf__n_estimators': [100, 200,300],
#     'rf__max_depth': [None, 10, 20],
#     'rf__min_samples_split': [2, 5, 10],
#     'rf__min_samples_leaf': [1, 2, 4],
#     'rf__max_features': ['sqrt', 'log2'],
# }

# grid_search = GridSearchCV(rf_pipeline, param_grid, cv=splits, scoring='accuracy', verbose=4)
# grid_search.fit(X_final, y)
# print("Best RF params:", grid_search.best_params_)
# print("Best RF accuracy:", grid_search.best_score_)
# rf_pipeline = grid_search.best_estimator_


# Uses best stacking pipeline
rf_pipeline = Pipeline([
    ('rf', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        random_state=seed
    ))
])


# Generate out of fold predicted probabilities for rf
rf_oof_probs = np.zeros((X_final.shape[0], num_classes))

for train_idx, val_idx in splits:
    model = clone(rf_pipeline)
    model.fit(X_final.iloc[train_idx], y.iloc[train_idx])
    rf_oof_probs[val_idx] = model.predict_proba(X_final.iloc[val_idx])



#### CNN

We will train this separately and use its oof probabilities as features for our metamodel

In [2]:
X_nn = np.load("X_nn.npy")           # shape: (N, 64, 64, 3), dtype=uint8
X_nn = X_nn.astype(np.float32) / 255.0

In [33]:
print(f"X_nn shape: {X_nn.shape}")  # (5488, 64, 64, 3)
print(f"y shape: {y.shape}")        # (5488,)
print(f"y dtype: {y.dtype}")        # int or int64
print(f"X_nn dtype: {X_nn.dtype}")  # float32

# Check unique labels
print(np.unique(y))


# Convert to np.int64 and np.float32

y_nn = y.astype(np.int64)
X_nn = X_nn.astype(np.float32)


X_nn shape: (5488, 64, 64, 3)
y shape: (5488,)
y dtype: int64
X_nn dtype: float32
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42]


In [38]:
from cnn import build_cnn
param_grid = {
    "num_filters": [32, 64],
    "kernel_size": [3, 5]
}
fixed_params = {
    "dropout_rate": 0.3,
    "optimizer": "adam",
    "epochs": 10
}

best_score = 0
best_params = None

for num_filters in param_grid["num_filters"]:
    for kernel_size in param_grid["kernel_size"]:
        cv_scores = []
        for train_idx, val_idx in splits:
            X_train, X_val = X_nn[train_idx], X_nn[val_idx]
            y_train = y_nn.iloc[train_idx] if hasattr(y_nn, "iloc") else y_nn[train_idx]
            y_val = y_nn.iloc[val_idx] if hasattr(y_nn, "iloc") else y_nn[val_idx]
            model = build_cnn(
                dropout_rate=fixed_params["dropout_rate"],
                num_filters=num_filters,
                kernel_size=kernel_size,
                optimizer=fixed_params["optimizer"]
            )
            model.fit(X_train, y_train, epochs=fixed_params["epochs"], batch_size=32, verbose=0)
            val_probs = model.predict(X_val)
            val_preds = val_probs.argmax(axis=1)
            acc = np.mean(val_preds == y_val)
            cv_scores.append(acc)
        mean_score = np.mean(cv_scores)
        print(f"Params: nf={num_filters}, ks={kernel_size} | CV Acc: {mean_score:.4f}")
        if mean_score > best_score:
            best_score = mean_score
            best_params = {
                "num_filters": num_filters,
                "kernel_size": kernel_size,
                **fixed_params
            }

print("Best CNN params:", best_params)
print("Best CNN CV accuracy:", best_score)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


KeyboardInterrupt: 

In [36]:
from cnn import build_cnn


cnn_oof_probs = np.zeros((X_nn.shape[0], num_classes))

for train_idx, val_idx in splits:
    X_train, X_val = X_nn[train_idx], X_nn[val_idx]
    y_train = y_nn.iloc[train_idx] 

    # Build and train a new CNN for each fold
    cnn_model = build_cnn()
    cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_val, y_nn[val_idx]))

    # Predict probabilities for the validation fold
    cnn_oof_probs[val_idx] = cnn_model.predict(X_val)

# Now concatenate with your tabular features
X_stacking = np.concatenate([X_final, cnn_oof_probs], axis=1)

Epoch 1/10
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 93ms/step - accuracy: 0.1607 - loss: 3.2679
Epoch 2/10
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 92ms/step - accuracy: 0.5987 - loss: 1.3817
Epoch 3/10


KeyboardInterrupt: 

In [35]:
cnn_oof_probs.shape

(5488, 43)

In [18]:
X_stacking = np.concatenate([X_final, cnn_oof_probs], axis=1)

In [58]:
import sklearn
import scikeras
print("scikit-learn:", sklearn.__version__)
print("scikeras:", scikeras.__version__)

scikit-learn: 1.6.1
scikeras: 0.13.0


In [None]:

# Define classifiers with pipelines for stacking
estimators = [
    ('svm', svm_pipeline),
    ('rf', rf_pipeline), 
]

# Meta classifier
final_classifier = LogisticRegression(random_state=seed, max_iter=1000)

## Setup Stacking Classifier  

In [None]:
# Stratified K-Folds cross-validator 
# Train the stacking classifier (meta classifier)   
stacking_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)



In [None]:
# Fit the stacking classifier
final_classifier.fit(X_stacking, y)

#### Evaluate each base model

In [None]:
# Evaluate base models and stacking model using cross-validation

from sklearn.base import clone


print("Cross-validation accuracy for each model (7-fold):")
for name, estimator in estimators:
    model = clone(estimator)
    scores = cross_val_score(model, X_final, y, cv = skf, scoring='accuracy')
    print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")


Cross-validation accuracy for each model (7-fold):
svm: 0.8260 ± 0.0173
rf: 0.8234 ± 0.0167
lr: 0.7977 ± 0.0105


In [None]:
# Train meta classifier
final

## Validation

In [None]:
# Stacking model (no passthrough)
# K folds, shuffle and properly stratified

scores = cross_val_score(stacking_clf, X_final, y, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {scores.mean():.4f} ± {scores.std():.4f}")



Cross-validation accuracy: 0.8515 ± 0.0102


In [None]:
# Define parameter grid for the meta-model
param_grid = {
    'final_estimator__C': [10,25],
    'final_estimator__penalty': ['l2']
}

# Wrap your stacking classifier in GridSearchCV
stacking_grid = GridSearchCV(
    stacking_clf,
    param_grid,
    cv=stacking_cv,
    scoring='accuracy',
    verbose=4
)

stacking_grid.fit(X_final, y)

print("Best meta-model params:", stacking_grid.best_params_)
print("Best stacking accuracy:", stacking_grid.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END final_estimator__C=10, final_estimator__penalty=l2;, score=0.862 total time= 5.2min
[CV 2/5] END final_estimator__C=10, final_estimator__penalty=l2;, score=0.842 total time= 5.6min
[CV 3/5] END final_estimator__C=10, final_estimator__penalty=l2;, score=0.877 total time= 4.6min
[CV 4/5] END final_estimator__C=10, final_estimator__penalty=l2;, score=0.830 total time= 3.0min
[CV 5/5] END final_estimator__C=10, final_estimator__penalty=l2;, score=0.863 total time= 5.0min
[CV 1/5] END final_estimator__C=25, final_estimator__penalty=l2;, score=0.862 total time= 4.1min
[CV 2/5] END final_estimator__C=25, final_estimator__penalty=l2;, score=0.842 total time= 5.2min
[CV 3/5] END final_estimator__C=25, final_estimator__penalty=l2;, score=0.882 total time= 5.9min
[CV 4/5] END final_estimator__C=25, final_estimator__penalty=l2;, score=0.833 total time= 5.1min
[CV 5/5] END final_estimator__C=25, final_estimator__penalty=l2;, s