In [2]:
import sys, importlib
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np


sys.modules["numpy._core"] = importlib.import_module("numpy.core")
sys.modules["numpy._core.multiarray"] = importlib.import_module("numpy.core.multiarray")
sys.modules["numpy._core.umath"] = importlib.import_module("numpy.core.umath")

data = pd.read_pickle(r"C:\Users\brian\INFO 2950 - Intro to Data Science\Other Projects\rock-paper-scissors-pt2\data\train.pkl")

In [3]:
data.head()

Unnamed: 0,id,img1,img2,label
0,26018,"[[50.0103, 50.0103, 48.8964, 49.8963, 53.8959,...","[[254.97449999999998, 254.97449999999998, 254....",1
1,16353,"[[250.52349999999998, 249.1215, 250.9965, 250....","[[158.87990000000002, 160.2927, 162.8795, 163....",1
2,3823,"[[86.057, 83.8831, 81.5844, 80.8834, 77.883699...","[[134.3931, 180.7027, 187.06109999999998, 180....",-1
3,27592,"[[18.1571, 19.569899999999997, 40.9761, 52.383...","[[63.937999999999995, 79.90859999999999, 97.11...",1
4,11188,"[[196.4829, 192.9563, 204.3681, 168.324, 170.4...","[[240.97589999999997, 241.9758, 243.0897, 242....",-1


In [2]:
# ─── CELL 2 FIXED ─────────────────────────────────────────────
train = pickle.load(
    open(r"C:\Users\brian\INFO 2950 - Intro to Data Science\Other Projects\rock-paper-scissors-pt2\data\train.pkl", "rb")
)

# each of train['img1'], train['img2'] is a length‑N Series of 24×24 lists
imgs1 = np.stack(train['img1'].values)   # (N, 24, 24)
imgs2 = np.stack(train['img2'].values)   # (N, 24, 24)

X_raw = np.stack([imgs1, imgs2], axis=1) # (N, 2, 24, 24)
y     = train['label'].astype(int)       # ±1

print("X_raw:", X_raw.shape)
print("label counts:", np.bincount(y + 1))


X_raw: (40000, 2, 24, 24)
label counts: [19847     0 20153]


In [3]:
# CELL 3  ──────────────────────────────────────────────
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import numpy as np

class RPSFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Turn (N,2,24,24) image pairs into flat feature vectors.
    Params
    ------
    use_raw  : include img1||img2   (1152 dims)
    use_diff : include img1-img2    ( 576 dims)
    """
    def __init__(self, use_raw=True, use_diff=True):
        self.use_raw  = use_raw
        self.use_diff = use_diff
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # X shape: (N, 2, 24, 24)
        feats = []
        if self.use_raw:
            feats.append(X.reshape(len(X), -1))                     # (N,1152)
        if self.use_diff:
            diff = (X[:,0] - X[:,1]).reshape(len(X), -1)            # (N,576)
            feats.append(diff)
        return np.concatenate(feats, axis=1).astype(np.float32) / 255.0


In [4]:
# CELL 4  ──────────────────────────────────────────────
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from copy import deepcopy

# Pipelines + hyper‑param grids
pipelines = {
    "knn": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("scaler", StandardScaler()),
        ("cls",   KNeighborsClassifier())
    ]),
    "perc": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("scaler", StandardScaler()),
        ("cls",   Perceptron(max_iter=30))
    ]),
    "logreg": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("scaler", StandardScaler()),
        ("cls",   LogisticRegression(max_iter=1000, solver="lbfgs"))
    ]),
    "gnb": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("scaler", None),           # NB handles raw values fine
        ("cls",   GaussianNB())
    ]),
    "lin_svm": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("scaler", StandardScaler()),
        ("cls",   LinearSVC())
    ]),
    "rbf_svm": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("pca",  PCA()),            # cut dims before RBF
        ("scaler", StandardScaler()),
        ("cls",   SVC(kernel="rbf"))
    ]),
    "rf": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("cls",  RandomForestClassifier(n_jobs=-1, random_state=0))
    ]),
    "ada": Pipeline([
        ("feat", RPSFeatureExtractor()),
        ("cls",  AdaBoostClassifier(random_state=0))
    ]),
}

original_pipelines = deepcopy(pipelines)

param_grids = {
    "knn":      {"cls__n_neighbors":[1,3,5]},
    "perc":     {"cls__eta0":[1e-3,1e-2], "cls__penalty":[None,"l2"]},
    "logreg":   {"cls__C":[0.1,1,10]},
    "gnb":      {},                               # no tunables
    "lin_svm":  {"cls__C":[0.1,1,10]},
    "rbf_svm":  {"pca__n_components":[50,100],
                 "cls__C":[1,10],
                 "cls__gamma":["scale",0.01]},
    "rf":       {"cls__n_estimators":[300],
                 "cls__max_depth":[None,15]},
    "ada":      {"cls__n_estimators":[400,800],
                 "cls__learning_rate":[0.5,1.0]},
}


In [5]:
# CELL 5.0  ──────────────────────────────────────────────
# Smaller, faster hyperparam grids
quick_param_grids = {
    "knn":      {"cls__n_neighbors": [3]},
    "perc":     {"cls__eta0": [1e-2], "cls__penalty": [None]},
    "logreg":   {"cls__C": [1]},
    "gnb":      {},  # no tuning needed
    "lin_svm":  {"cls__C": [1]},
    "rbf_svm":  {"pca__n_components": [50], "cls__C": [1], "cls__gamma": ["scale"]},
    "rf":       {"cls__n_estimators": [100], "cls__max_depth": [None]},
    "ada":      {"cls__n_estimators": [50], "cls__learning_rate": [0.5, 1.0]},
}

In [7]:
# CELL 5A  ──────────────────────────────────────────────
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import pandas as pd, time, warnings
warnings.filterwarnings("ignore")

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
results_part1 = []

subset1 = ["knn", "perc", "logreg", "gnb"]

for name in subset1:
    print(f"\n=== {name.upper()} ===")
    pipe = pipelines[name]
    gs = GridSearchCV(pipe,
                      quick_param_grids[name],
                      scoring="accuracy",
                      cv=cv,
                      n_jobs=-1,
                      verbose=1)
    t0 = time.time()
    gs.fit(X_raw, y)
    duration = time.time() - t0

    print(f"Best CV acc = {gs.best_score_:.4f} | Params = {gs.best_params_}")
    results_part1.append({
        "model": name,
        "cv_acc": gs.best_score_,
        "params": gs.best_params_,
        "fit_time_s": round(duration, 1),
        "estimator": gs.best_estimator_
    })



=== KNN ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.5700 | Params = {'cls__n_neighbors': 3}

=== PERC ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.5252 | Params = {'cls__eta0': 0.01, 'cls__penalty': None}

=== LOGREG ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.5472 | Params = {'cls__C': 1}

=== GNB ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.5485 | Params = {}


In [8]:
# CELL 5B  ──────────────────────────────────────────────
results_part2 = []

subset2 = ["lin_svm", "rbf_svm", "rf"]

for name in subset2:
    print(f"\n=== {name.upper()} ===")
    pipe = pipelines[name]
    gs = GridSearchCV(pipe,
                      quick_param_grids[name],
                      scoring="accuracy",
                      cv=cv,
                      n_jobs=-1,
                      verbose=1)
    t0 = time.time()
    gs.fit(X_raw, y)
    duration = time.time() - t0

    print(f"Best CV acc = {gs.best_score_:.4f} | Params = {gs.best_params_}")
    results_part2.append({
        "model": name,
        "cv_acc": gs.best_score_,
        "params": gs.best_params_,
        "fit_time_s": round(duration, 1),
        "estimator": gs.best_estimator_
    })



=== LIN_SVM ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.5478 | Params = {'cls__C': 1}

=== RBF_SVM ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.6040 | Params = {'cls__C': 1, 'cls__gamma': 'scale', 'pca__n_components': 50}

=== RF ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.5970 | Params = {'cls__max_depth': None, 'cls__n_estimators': 100}

=== ADA ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits


KeyboardInterrupt: 

In [9]:
# CELL 5C – Run AdaBoost only with PCA and reduced n_estimators
    
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
import time

results_ada = []

# Redefine AdaBoost pipeline with PCA and allow GridSearch to control hyperparams
pipelines["ada"] = Pipeline([
    ("feat", RPSFeatureExtractor()),
    ("pca",  PCA(n_components=100)),
    ("cls",  AdaBoostClassifier(random_state=0))
])

print(f"\n=== ADABOOST ===")
pipe = pipelines["ada"]
gs = GridSearchCV(pipe,
                  quick_param_grids["ada"],
                  scoring="accuracy",
                  cv=cv,
                  n_jobs=-1,
                  verbose=1)

t0 = time.time()
gs.fit(X_raw, y)
duration = time.time() - t0

print(f"Best CV acc = {gs.best_score_:.4f} | Params = {gs.best_params_}")
results_ada.append({
    "model": "ada",
    "cv_acc": gs.best_score_,
    "params": gs.best_params_,
    "fit_time_s": round(duration, 1),
    "estimator": gs.best_estimator_
})

# Turn into DataFrame to review
import pandas as pd
summary_ada = pd.DataFrame(results_ada)
display(summary_ada[["model", "cv_acc", "fit_time_s"]])



=== ADABOOST ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best CV acc = 0.5449 | Params = {'cls__learning_rate': 1.0, 'cls__n_estimators': 400}


Unnamed: 0,model,cv_acc,fit_time_s
0,ada,0.54495,667.2


In [None]:
# CELL 5  ──────────────────────────────────────────────
# WAY TOO BIG -- I SPLIT IT UP ABOVE
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import pandas as pd, time, warnings
warnings.filterwarnings("ignore")

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
results = []

for name, pipe in pipelines.items():
    print(f"=== {name} ===")
    gs = GridSearchCV(pipe,
                      param_grids[name],
                      scoring="accuracy",
                      cv=cv,
                      n_jobs=-1,
                      verbose=1)
    t0 = time.time()
    gs.fit(X_raw, y)
    duration = time.time() - t0
    
    print(f"best CV acc = {gs.best_score_:0.4f}  |  best params = {gs.best_params_}")
    results.append({
        "model": name,
        "cv_acc": gs.best_score_,
        "params": gs.best_params_,
        "fit_time_s": round(duration,1),
        "estimator": gs.best_estimator_
    })

# tidy summary
summary = pd.DataFrame(results).sort_values("cv_acc", ascending=False)
display(summary[["model","cv_acc","fit_time_s"]])
best_model = summary.iloc[0]["estimator"]   # champion pipeline


=== knn ===
Fitting 3 folds for each of 3 candidates, totalling 9 fits
best CV acc = 0.5702  |  best params = {'cls__n_neighbors': 5}
=== perc ===
Fitting 3 folds for each of 4 candidates, totalling 12 fits
best CV acc = 0.5252  |  best params = {'cls__eta0': 0.01, 'cls__penalty': None}
=== logreg ===
Fitting 3 folds for each of 3 candidates, totalling 9 fits
best CV acc = 0.5480  |  best params = {'cls__C': 0.1}
=== gnb ===
Fitting 3 folds for each of 1 candidates, totalling 3 fits
best CV acc = 0.5485  |  best params = {}
=== lin_svm ===
Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [14]:
# CELL 6 (revised) ──────────────────────────────────────────────
# Prune pipelines, insert PCA, expand grids, add KNN–PCA

# ---- START WITH A CLEAN SLATE ----
from copy import deepcopy

# original_pipelines comes from Cell 4; we make
# a deep copy so that we don't mutate the originals.
# original_pipelines = deepcopy(pipelines)
base = deepcopy(original_pipelines)          # keep a pristine master
pipelines = {k: base[k] for k in ["rbf_svm", "rf"]}

# now continue with the rest (PCA insertion, knn_pca, grids) ...

from sklearn.decomposition import PCA
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline      import Pipeline

# 1) Prune to only the top models
pipelines = { k: pipelines[k] for k in ["rbf_svm", "rf"] }

# 2) Insert PCA only into pipelines that lack a "pca" step
for name, pipe in pipelines.items():
    if "pca" not in pipe.named_steps:
        pipe.steps.insert(1, ("pca", PCA()))  
        # now rf will get a PCA, but rbf_svm keeps its original one

# 3) Expanded hyper-param grids
quick_param_grids = {
    "rbf_svm": {
        "pca__n_components": [100],
        "cls__C":            [1],
        "cls__gamma":        ["scale", 0.01]
    },
    "rf": {
        "pca__n_components": [100],               
        "cls__n_estimators": [300, 500],
        "cls__max_depth":    [None, 10, 15],
        "cls__min_samples_leaf": [1, 3]
    }
}

# 4) Add the PCA‐knn baseline
pipelines["knn_pca"] = Pipeline([
    ("feat",   RPSFeatureExtractor()),
    ("pca",    PCA(n_components=100)),
    ("scaler", StandardScaler()),
    ("cls",    KNeighborsClassifier())
])
quick_param_grids["knn_pca"] = {"cls__n_neighbors": [3]}

print("Pipelines:", pipelines.keys())
print("Grids:    ", quick_param_grids.keys())


Pipelines: dict_keys(['rbf_svm', 'rf', 'knn_pca'])
Grids:     dict_keys(['rbf_svm', 'rf', 'knn_pca'])


In [15]:
# CELL 7  ──────────────────────────────────────────────
# Cross-validate the tuned pipelines
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import pandas as pd, time

cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
results_tuned = []

for name, pipe in pipelines.items():
    print(f"\n=== {name.upper():<8} ===")
    gs = GridSearchCV(pipe,
                      quick_param_grids[name],
                      scoring="accuracy",
                      cv=cv,
                      n_jobs=2,
                      verbose=1)
    
    t0 = time.time()
    gs.fit(X_raw, y)

    
    print(f"→ best CV = {gs.best_score_:.4f}, params = {gs.best_params_}")
    results_tuned.append({
        "model": name,
        "cv_acc": gs.best_score_,
        "params": gs.best_params_,
        "estimator": gs.best_estimator_
    })

# Tabulate
df_tuned = pd.DataFrame(results_tuned).sort_values("cv_acc", ascending=False)
display(df_tuned[["model","cv_acc"]])
best_model = df_tuned.iloc[0]["estimator"]
print("Champion:", df_tuned.iloc[0].to_dict())



=== RBF_SVM  ===
Fitting 2 folds for each of 2 candidates, totalling 4 fits
→ best CV = 0.5997, params = {'cls__C': 1, 'cls__gamma': 0.01, 'pca__n_components': 100}

=== RF       ===
Fitting 2 folds for each of 12 candidates, totalling 24 fits
→ best CV = 0.5787, params = {'cls__max_depth': None, 'cls__min_samples_leaf': 1, 'cls__n_estimators': 500, 'pca__n_components': 100}

=== KNN_PCA  ===
Fitting 2 folds for each of 1 candidates, totalling 2 fits
→ best CV = 0.5557, params = {'cls__n_neighbors': 3}


Unnamed: 0,model,cv_acc
0,rbf_svm,0.599675
1,rf,0.578675
2,knn_pca,0.5557


Champion: {'model': 'rbf_svm', 'cv_acc': 0.599675, 'params': {'cls__C': 1, 'cls__gamma': 0.01, 'pca__n_components': 100}, 'estimator': Pipeline(steps=[('feat', RPSFeatureExtractor()), ('pca', PCA(n_components=100)),
                ('scaler', StandardScaler()), ('cls', SVC(C=1, gamma=0.01))])}


In [19]:
# CELL 8  ──────────────────────────────────────────────
# Build & CV the soft-voting ensemble
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# pick top-2 or top-3 from df_tuned
top3 = df_tuned.head(3)
estimators = [(r["model"], r["estimator"]) for _, r in top3.iterrows()]

vote = VotingClassifier(estimators=estimators, voting="hard")
cv_score = cross_val_score(
    vote, X_raw, y,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring="accuracy", n_jobs=-1
).mean()

print(f"Hard‑vote CV accuracy: {cv_score:.4f}")
print("Ensemble members:", [e[0] for e in estimators])
print(f"Ensemble CV accuracy: {cv_score:.4f}")


Hard‑vote CV accuracy: 0.6025
Ensemble members: ['rbf_svm', 'rf', 'knn_pca']
Ensemble CV accuracy: 0.6025


In [5]:
# CELL 8.1 ──────────────────────────────────────────────

# CELL 8b – Manual import of best‑model params + rebuild vote
import pandas as pd
from sklearn.pipeline     import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.svm           import SVC
from sklearn.ensemble      import RandomForestClassifier, VotingClassifier

# 1) Reconstruct the results DataFrame
df_tuned = pd.DataFrame([
    {
      "model": "rbf_svm",
      "cv_acc": 0.5997,
      "params": {
         "pca__n_components": 100,
         "cls__C": 1,
         "cls__gamma": 0.01
      }
    },
    {
      "model": "rf",
      "cv_acc": 0.5787,
      "params": {
         "pca__n_components": 100,
         "cls__n_estimators": 500,
         "cls__max_depth": None,
         "cls__min_samples_leaf": 1
      }
    },
    {
      "model": "knn_pca",
      "cv_acc": 0.5557,
      "params": {
         "cls__n_neighbors": 3
      }
    }
]).sort_values("cv_acc", ascending=False)

# 2) Rebuild each pipeline (must mirror your Cell 6 definitions)
pipelines = {}

# RBF‑SVM pipeline
pipelines["rbf_svm"] = Pipeline([
    ("feat",   RPSFeatureExtractor()),
    ("pca",    PCA(n_components=100)),
    ("scaler", StandardScaler()),
    ("cls",    SVC())
])
pipelines["rbf_svm"].set_params(**df_tuned.loc[0, "params"])

# Random Forest pipeline
pipelines["rf"] = Pipeline([
    ("feat",   RPSFeatureExtractor()),
    ("pca",    PCA(n_components=100)),
    ("cls",    RandomForestClassifier(random_state=0))
])
pipelines["rf"].set_params(**df_tuned.loc[1, "params"])

# k-NN‑PCA pipeline
pipelines["knn_pca"] = Pipeline([
    ("feat",   RPSFeatureExtractor()),
    ("pca",    PCA(n_components=100)),
    ("scaler", StandardScaler()),
    ("cls",    KNeighborsClassifier())
])
pipelines["knn_pca"].set_params(**df_tuned.loc[2, "params"])

# 3) Build the hard‑voting ensemble
estimators = [(m, pipelines[m]) for m in df_tuned["model"].tolist()]
vote = VotingClassifier(estimators=estimators, voting="hard")

print("Rebuilt vote with:", [m for m,_ in estimators])


Rebuilt vote with: ['rbf_svm', 'rf', 'knn_pca']


In [7]:
# CELL 8.25 – Rebuild vote without needing df_tuned["estimator"]

from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# 1) Manually specify the best params from your printout
best_params = {
    "rbf_svm":  {"pca__n_components":100, "cls__C":1,     "cls__gamma":0.01},
    "rf":       {"pca__n_components":100, "cls__n_estimators":500,
                 "cls__max_depth":None, "cls__min_samples_leaf":1},
    "knn_pca":  {"cls__n_neighbors":3}
}

# 2) Reconstruct each pipeline exactly as in Cell 6
rbf_svm = Pipeline([
    ("feat",   RPSFeatureExtractor()),
    ("pca",    PCA()),            # will set n_components next
    ("scaler", StandardScaler()),
    ("cls",    SVC())
])
rf = Pipeline([
    ("feat",   RPSFeatureExtractor()),
    ("pca",    PCA()),
    ("cls",    RandomForestClassifier(random_state=0))
])
knn_pca = Pipeline([
    ("feat",   RPSFeatureExtractor()),
    ("pca",    PCA()),
    ("scaler", StandardScaler()),
    ("cls",    KNeighborsClassifier())
])

# 3) Apply the best_params to each
rbf_svm.set_params(**best_params["rbf_svm"])
rf.set_params(**best_params["rf"])
knn_pca.set_params(**best_params["knn_pca"])

# 4) Build the voting ensemble
vote = VotingClassifier(
    estimators=[
        ("svm", rbf_svm),
        ("rf",  rf),
        ("knn", knn_pca)
    ],
    voting="hard"
)

print("Rebuilt vote with:", [name for name,_ in vote.estimators])


Rebuilt vote with: ['svm', 'rf', 'knn']


In [9]:
# CELL 8.5 – Generate submission CSV from hard‑vote ensemble
import pickle
import pandas as pd
from sklearn.pipeline import make_pipeline

# 1) Fit the ensemble on the full training set
vote.fit(X_raw, y)

# 2) Load test data
test = pickle.load(open(r"C:\Users\brian\INFO 2950 - Intro to Data Science\Other Projects\rock-paper-scissors-pt2\data\test.pkl", "rb"))
imgs1 = np.stack(test["img1"].values).astype(np.float32)
imgs2 = np.stack(test["img2"].values).astype(np.float32)
X_test = np.stack([imgs1, imgs2], axis=1)  # shape: (20000,2,24,24)

# 3) Predict labels {–1, +1}
y_pred = vote.predict(X_test)
# If it ended up in {0,1}, convert:
y_pred = np.where(y_pred > 0, 1, -1).astype(int)

# 4) Build and save CSV
submission = pd.DataFrame({
    "id":    test["id"],
    "label": y_pred
})

submission.to_csv(
    r"C:\Users\brian\INFO 2950 - Intro to Data Science\Other Projects\rock-paper-scissors-pt2\submissions\ensemble_hard_vote.csv",
    index=False
)
print("Saved submission → submissions/ensemble_hard_vote.csv")



Saved submission → submissions/ensemble_hard_vote.csv


In [20]:
# CELL 9 – Soft-Voting Ensemble (SVM with probability=True)
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# 1) Clone the best SVM pipeline and turn on probability output
best_svm = clone(df_tuned.loc[df_tuned["model"] == "rbf_svm", "estimator"].iloc[0])
best_svm.set_params(cls__probability=True)

# 2) Build new estimator list with modified SVM
estimators = [
    ("svm", best_svm),
    ("rf", df_tuned.loc[df_tuned["model"] == "rf", "estimator"].iloc[0]),
    ("knn", df_tuned.loc[df_tuned["model"] == "knn_pca", "estimator"].iloc[0])
]

# 3) Create and evaluate the soft-voting ensemble
vote = VotingClassifier(estimators=estimators, voting="soft")

cv_score = cross_val_score(
    vote, X_raw, y,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring="accuracy",
    n_jobs=-1
).mean()

print("Soft-vote ensemble members:", [name for name, _ in estimators])
print(f"Soft-vote ensemble CV accuracy: {cv_score:.4f}")


Soft-vote ensemble members: ['svm', 'rf', 'knn']
Soft-vote ensemble CV accuracy: 0.5939


In [None]:
# CELL 10 – Pretrained CNN feature‑extraction setup
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import pickle

# 1) Load raw arrays (ensure you’ve run Cells 1+2)
#    X_raw: (N,2,24,24), y: (N,)
#    test raw if needed similarly.

# 2) Transforms: grayscale→3×RGB, resize→224×224, toTensor, normalize
tf = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.Grayscale(num_output_channels=3),   # replicate to 3 channels
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std =[0.229,0.224,0.225])
])

class RPSPreCNN(Dataset):
    def __init__(self, X_np, y_np=None, transform=None):
        self.X = X_np.astype(np.uint8)   # expected 0–255
        self.y = y_np
        self.tf = transform
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        img1, img2 = self.X[idx]
        # stack the two transforms along channel dimension:
        t1 = self.tf(img1)
        t2 = self.tf(img2)
        x = torch.cat([t1, t2], dim=0)    # shape (6,224,224)
        if self.y is not None:
            return x, (1 if self.y[idx]>0 else 0)
        return x

# Build dataset + loader for feature extraction
train_ds = RPSPreCNN(X_raw, y, transform=tf)
train_ld = DataLoader(train_ds, batch_size=64, shuffle=False, num_workers=0)


In [None]:
# CELL 11 – Extract CNN features (MobileNetV2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.mobilenet_v2(pretrained=True).features.to(device)
model.eval()

def extract_features(loader):
    feats, labs = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            out = model(xb)                   # (B,1280,7,7)
            out = out.mean([2,3])             # global avg pool → (B,1280)
            feats.append(out.cpu().numpy())
            labs .append(yb.numpy())
    return np.vstack(feats), np.hstack(labs)

X_cnn, y_cnn = extract_features(train_ld)
print("CNN features:", X_cnn.shape, "labels:", np.bincount(y_cnn))


In [None]:
# CELL 12 – Logistic Regression on CNN features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

clf = LogisticRegression(max_iter=500)
cv_score = cross_val_score(clf, X_cnn, y_cnn, cv=3, scoring="accuracy", n_jobs=2).mean()
print(f"CV accuracy on CNN features: {cv_score:.4f}")

# Refit on full
clf.fit(X_cnn, y_cnn)
