## Import libraries and load data


In [182]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

# Single seed for the notebook: passed as random_state to the models and CV
# splitters below, and also applied to NumPy's global RNG.
seed = 1_462_474
np.random.seed(seed)

In [183]:
# CSV holding the training features (path is relative to the working directory).
data_path = "data/train/Features/all_features.csv"
train_df = pd.read_csv(filepath_or_buffer=data_path)


In [184]:
# Target: the ClassId column.
# Predictors: every remaining column except image_path.
non_feature_cols = ["ClassId", "image_path"]
X = train_df.drop(columns=non_feature_cols)
y = train_df["ClassId"]

## Feature Selection

In [185]:
# Rank features by mutual information (MI) with the class label; MI is used
# because the features are mostly continuous.
# NOTE(review): the ranking is computed on the full training table, before any
# cross-validation split, so the CV scores reported further down are obtained
# on features that were chosen with knowledge of the validation folds.
# Consider moving the selection inside the cross-validated pipeline.

# Tunable knobs, named so they can be changed in one place while experimenting.
MI_N_NEIGHBORS = 7   # neighbours used by the MI estimator
MI_THRESHOLD = 0.15  # keep features whose MI score is strictly above this

# random_state reuses the notebook-wide seed (previously a separate hard-coded 0)
# so this step is driven by the same value as the models and CV splitters.
mi_array = mutual_info_classif(X, y, random_state=seed, n_neighbors=MI_N_NEIGHBORS)

# Label the scores with their column names and order them best-first.
mi_series = pd.Series(mi_array, index=X.columns)
mi_sorted = mi_series.sort_values(ascending=False)

# Show top features
print(mi_sorted.head(10))

# Keep the features above the threshold, still in best-first order.
top_features = mi_sorted.index[mi_sorted > MI_THRESHOLD]
# Fail here with a clear message rather than later inside scikit-learn.
assert len(top_features) > 0, f"No feature has mutual information above {MI_THRESHOLD}"
X_selected = X[top_features]



hog_pca_0          0.882652
hog_pca_3          0.796677
Edge_Hist_Bin_6    0.596259
Edge_Hist_Bin_2    0.566049
hog_pca_1          0.554871
Edge_Hist_Bin_7    0.530975
Edge_Hist_Bin_3    0.519416
hog_pca_2          0.493234
Edge_Hist_Bin_5    0.429996
H_hist_bin_16      0.402010
dtype: float64


In [186]:
# Preview the first five rows of the MI-selected feature table.
X_selected.head(n=5)

Unnamed: 0,hog_pca_0,hog_pca_3,Edge_Hist_Bin_6,Edge_Hist_Bin_2,hog_pca_1,Edge_Hist_Bin_7,Edge_Hist_Bin_3,hog_pca_2,Edge_Hist_Bin_5,H_hist_bin_16,...,V_hist_bin_2,S_hist_bin_5,S_hist_bin_13,H_hist_bin_3,aspect_ratio,Hu_1.1,Hu_1,hog_pca_6,V_hist_bin_16,H_hist_bin_12
0,-0.763458,-0.638673,0.038306,0.199623,0.92788,0.060268,0.256466,0.264329,0.111521,0.022676,...,0.068027,0.054422,0.0,0.036281,1.6875,2.870604,2.870604,-0.959387,0.0,0.013605
1,1.049284,0.90438,0.047018,0.185522,3.6082,0.233097,0.071324,-1.81719,0.112443,0.18335,...,0.03667,0.044599,0.0,0.086224,1.12,2.82235,2.82235,-0.51043,0.0,0.022795
2,-1.55244,0.671877,0.125505,0.058121,-0.432374,0.08851,0.089124,-0.318422,0.146615,0.016829,...,0.0,0.058459,0.0,0.098317,1.5625,2.946656,2.946656,0.565182,0.472099,0.014172
3,-1.556871,0.613876,0.113744,0.122933,0.214406,0.117298,0.081,0.973758,0.129953,0.189296,...,0.18335,0.12884,0.0,0.009911,0.956522,2.740183,2.740183,0.140899,0.0,0.065411
4,-0.944294,-0.607014,0.10688,0.130112,-0.334833,0.128805,0.126042,0.415215,0.101269,0.0,...,0.0,0.081285,0.130435,0.00189,0.307692,2.856199,2.856199,-0.692467,0.045369,0.030246


In [187]:
# Total number of missing entries across all selected features
# (0 means there is nothing to impute).
missing_total = X_selected.isna().sum().sum()
print(missing_total)

0


In [188]:

# Prune redundant features in two passes:
#   1. drop constant (zero-variance) columns;
#   2. for every pair whose absolute correlation exceeds CORR_THRESHOLD, drop
#      the column that comes later in X_selected's column order.
CORR_THRESHOLD = 0.95

selector = VarianceThreshold(threshold=0)
X_var = selector.fit_transform(X_selected)
selected_columns = X_selected.columns[selector.get_support()]

# Rebuild a labelled frame from the transformed array. The original row index
# is passed through explicitly so X_final stays label-aligned with y even when
# the index is not the default 0..n-1 range.
X_var_df = pd.DataFrame(X_var, columns=selected_columns, index=X_selected.index)

# Keep only the strict upper triangle of the |correlation| matrix so each pair
# is inspected once and a column is compared only with the columns before it.
corr_matrix = X_var_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# NaN cells (lower triangle / diagonal) compare as False, so they never trigger a drop.
to_drop = upper.columns[(upper > CORR_THRESHOLD).any(axis=0)].tolist()
X_final = X_var_df.drop(columns=to_drop)


## Set Up Stacking Pipeline

In [189]:
# Base learner: an RBF-kernel SVM preceded by standardisation. Wrapping both in
# a Pipeline means the scaler is re-fitted on whatever data the pipeline is
# fitted on. probability=True enables predict_proba on the SVC.
svm_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="rbf", probability=True, random_state=seed)),
    ]
)

# (name, estimator) pairs handed to the stacking classifier.
# More base learners are meant to be appended here later.
estimators = [
    ("svm", svm_pipeline),
]

# Meta-classifier that combines the base learners' outputs.
final_classifier = LogisticRegression(max_iter=1000, random_state=seed)

## Set Up Stacking Classifier

In [190]:
# Build (but do not fit) the stacking classifier.
# stacking_cv is the splitter the stacker uses internally: 5 shuffled,
# stratified folds. passthrough=False means the meta-classifier is given only
# the base learners' predictions, not the original features.
stacking_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_classifier,
    cv=stacking_cv,
    passthrough=False,
)


#### Evaluate each base model

In [None]:
# Score each base learner on its own with stratified k-fold cross-validation.
# Only the base learners are evaluated in this cell, not the stacked model.

# Outer splitter for evaluation: 7 shuffled, stratified folds.
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=seed)

# The fold count in the header is read from the splitter so the printed label
# cannot drift away from n_splits.
print(f"Cross-validation accuracy for each model ({skf.get_n_splits()}-fold):")
for name, estimator in estimators:
    # Work on an unfitted copy so the objects stored in `estimators` stay untouched.
    model = clone(estimator)
    scores = cross_val_score(model, X_final, y, cv=skf, scoring='accuracy')
    print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")


Cross-validation accuracy for each model (5-fold):
svm: 0.8254 ± 0.0149


## Validation

In [192]:
# Cross-validate the stacked model (built with passthrough=False) on skf, the
# same shuffled, stratified folds used for the individual base learners.
scores = cross_val_score(estimator=stacking_clf, X=X_final, y=y, scoring='accuracy', cv=skf)
mean_accuracy = scores.mean()
accuracy_std = scores.std()
print(f"Cross-validation accuracy: {mean_accuracy:.4f} ± {accuracy_std:.4f}")



# TODO: evaluate the stacking model with passthrough=True




KeyboardInterrupt: 