In [11]:
# === Cell 1: Mount Google Drive and import libraries ===

# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive; the dataset, results, and plot paths used
# later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# === Cell 2: Load the BEED dataset ===

# Location of the BEED CSV on the mounted Drive; edit this if your copy
# has a different name (e.g. "BEED.csv").
beed_path = "/content/drive/MyDrive/COGS118A_Final_Project/Data/BEED_Data.csv"

df_beed = pd.read_csv(beed_path)

# Quick sanity report: dimensions, column names, a preview, and class balance.
print("Shape of BEED dataset:", df_beed.shape)
print("\nColumns:", df_beed.columns, sep="\n")

print("\nFirst 5 rows:")
display(df_beed.head(5))

label_counts = df_beed['y'].value_counts()
print("\nLabel distribution (y):", label_counts, sep="\n")

Shape of BEED dataset: (8000, 17)

Columns:
Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
       'X12', 'X13', 'X14', 'X15', 'X16', 'y'],
      dtype='object')

First 5 rows:


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,y
0,4,7,18,25,28,27,20,10,-10,-18,-20,-16,13,32,12,10,0
1,87,114,120,106,76,54,28,5,-19,-49,-85,-102,-100,-89,-61,-21,0
2,-131,-133,-140,-131,-123,-108,-58,-51,-70,-77,-76,-76,-73,-57,-40,-14,0
3,68,104,73,34,-12,-26,-38,-36,-67,-88,-25,31,18,-4,6,-29,0
4,-67,-90,-97,-94,-86,-71,-43,-11,23,46,58,50,39,19,-9,-41,0



Label distribution (y):
y
0    2000
1    2000
2    2000
3    2000
Name: count, dtype: int64


In [13]:
# === Cell 3: Prepare features and labels for BEED ===

# Labels: the 'y' column as a 1D numpy array
y_beed = df_beed['y'].to_numpy()

# Features: whatever is left once the 'y' column is removed
X_beed = df_beed.drop('y', axis=1)

# Report both shapes, then the distinct label values.
for label, shape in (("X_beed", X_beed.shape), ("y_beed", y_beed.shape)):
    print(f"{label} shape:", shape)
print("Unique labels:", np.unique(y_beed))

X_beed shape: (8000, 16)
y_beed shape: (8000,)
Unique labels: [0 1 2 3]


In [14]:
# === Cell 4: Define function to run all models with GridSearchCV ===

def _build_model_grids():
    """
    Build the four candidate models together with their search grids.

    Returns a dict mapping a model name to a (pipeline, param_grid) tuple.
    New, unfitted Pipeline objects are created on every call. Grid keys use
    the 'clf__' prefix because the classifier is the pipeline step named 'clf'.
    """
    return {
        "LogisticRegression": (
            Pipeline([
                ('scaler', StandardScaler()),
                ('clf', LogisticRegression(max_iter=1000))
            ]),
            {
                'clf__C': [0.01, 0.1, 1, 10, 100]
            }
        ),
        "SVM_RBF": (
            Pipeline([
                ('scaler', StandardScaler()),
                ('clf', SVC(kernel='rbf', random_state=0))
            ]),
            {
                'clf__C': [0.1, 1, 10, 100],
                'clf__gamma': ['scale', 0.01, 0.1, 1]
            }
        ),
        "RandomForest": (
            # No scaler step here: the forest is fit on the raw features.
            Pipeline([
                ('clf', RandomForestClassifier(random_state=0))
            ]),
            {
                'clf__n_estimators': [100, 300],
                'clf__max_depth': [None, 10, 20],
                'clf__min_samples_leaf': [1, 5]
            }
        ),
        "MLP": (
            Pipeline([
                ('scaler', StandardScaler()),
                ('clf', MLPClassifier(
                    max_iter=1000,
                    early_stopping=True,
                    n_iter_no_change=10,
                    random_state=0
                ))
            ]),
            {
                'clf__hidden_layer_sizes': [(50,), (100,), (50, 50)],
                'clf__alpha': [0.0001, 0.001],
                'clf__learning_rate_init': [0.001, 0.01]
            }
        )
    }


def run_all_models_with_gridsearch(X_train, y_train, X_test, y_test,
                                   dataset_name="BEED",
                                   train_size=0.8,
                                   trial=0,
                                   cv=5,
                                   n_jobs=-1):
    """
    Runs Logistic Regression, SVM (RBF), Random Forest, and MLP
    with GridSearchCV on the given train/test split.

    Parameters
    ----------
    X_train, y_train : training features and labels used for the grid search.
    X_test, y_test   : held-out features and labels, used only for scoring.
    dataset_name     : label copied into the log lines and the result rows.
    train_size       : label copied into the result rows; the split itself
                       is done by the caller, not here.
    trial            : label copied into the result rows.
    cv               : passed to GridSearchCV as `cv` (default 5, as before).
    n_jobs           : passed to GridSearchCV as `n_jobs` (default -1, as before).

    Returns a DataFrame with one row per model, including:
    - dataset name
    - train_size
    - trial index
    - model name
    - best hyperparameters
    - CV accuracy
    - train accuracy
    - test accuracy
    """
    results = []

    for name, (pipe, param_grid) in _build_model_grids().items():
        print(f"\n=== {dataset_name} | {name} | train_size={train_size}, trial={trial} ===")

        # Scoring stays fixed at accuracy so the CV score is comparable with
        # the train/test accuracies computed below.
        grid = GridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            scoring='accuracy',
            cv=cv,
            n_jobs=n_jobs,
            verbose=0
        )

        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_

        # Score the selected model on both the data it was tuned on and the
        # held-out split.
        train_acc = accuracy_score(y_train, best_model.predict(X_train))
        test_acc = accuracy_score(y_test, best_model.predict(X_test))

        print("  Best params:", best_params)
        print(f"  CV accuracy:   {best_cv_score:.3f}")
        print(f"  Train accuracy:{train_acc:.3f}")
        print(f"  Test accuracy: {test_acc:.3f}")

        results.append({
            "dataset": dataset_name,
            "train_size": train_size,
            "trial": trial,
            "model": name,
            "best_params": best_params,
            "cv_accuracy": best_cv_score,
            "train_accuracy": train_acc,
            "test_accuracy": test_acc
        })

    return pd.DataFrame(results)

In [None]:
# === Cell 5: Run BEED experiments for train_sizes = [0.2, 0.5, 0.8] and trials = [0, 1, 2] ===

train_sizes = [0.2, 0.5, 0.8]
trials = [0, 1, 2]  # three different random seeds

beed_results_all = []

# Visit every (train_size, trial) pair, train sizes outermost.
for ts, t in ((size, seed) for size in train_sizes for seed in trials):
    print(f"\n######## BEED Dataset: train_size={ts}, trial={t} ########")

    # Stratified split; the trial index is reused as the split's random seed.
    split_options = dict(train_size=ts, stratify=y_beed, random_state=t)
    X_train, X_test, y_train, y_test = train_test_split(X_beed, y_beed, **split_options)

    df_res = run_all_models_with_gridsearch(
        X_train, y_train, X_test, y_test,
        dataset_name="BEED", train_size=ts, trial=t,
    )
    beed_results_all.append(df_res)

# Stack the per-run result frames into one table with a fresh index
beed_results = pd.concat(beed_results_all, ignore_index=True)
print("\n=== First few rows of beed_results ===")
display(beed_results.head())


######## BEED Dataset: train_size=0.2, trial=0 ########

=== BEED | LogisticRegression | train_size=0.2, trial=0 ===
  Best params: {'clf__C': 0.01}
  CV accuracy:   0.480
  Train accuracy:0.487
  Test accuracy: 0.479

=== BEED | SVM_RBF | train_size=0.2, trial=0 ===
  Best params: {'clf__C': 10, 'clf__gamma': 1}
  CV accuracy:   0.902
  Train accuracy:0.974
  Test accuracy: 0.898

=== BEED | RandomForest | train_size=0.2, trial=0 ===
  Best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__n_estimators': 300}
  CV accuracy:   0.887
  Train accuracy:1.000
  Test accuracy: 0.886

=== BEED | MLP | train_size=0.2, trial=0 ===
  Best params: {'clf__alpha': 0.0001, 'clf__hidden_layer_sizes': (100,), 'clf__learning_rate_init': 0.01}
  CV accuracy:   0.884
  Train accuracy:0.938
  Test accuracy: 0.879

######## BEED Dataset: train_size=0.2, trial=1 ########

=== BEED | LogisticRegression | train_size=0.2, trial=1 ===
  Best params: {'clf__C': 0.1}
  CV accuracy:   0.483
  Tr

In [None]:
def run_all_models_with_gridsearch(X_train, y_train, X_test, y_test,
                                   dataset_name="EEG_Eye_State",
                                   train_size=0.8,
                                   trial=0,
                                   cv=5,
                                   n_jobs=-1):
    """
    Grid-search Logistic Regression, SVM (RBF), Random Forest, and MLP on one
    train/test split and report their accuracies.

    NOTE: a function with this same name is already defined earlier in this
    notebook; once this cell runs, this definition is the one in effect. The
    only difference in defaults is dataset_name ("EEG_Eye_State" here).

    Parameters
    ----------
    X_train, y_train : training features and labels used for the grid search.
    X_test, y_test   : held-out features and labels, used only for scoring.
    dataset_name     : label copied into the log lines and the result rows.
    train_size       : label copied into the result rows; the split itself
                       is done by the caller, not here.
    trial            : label copied into the result rows.
    cv               : passed to GridSearchCV as `cv` (default 5, as before).
    n_jobs           : passed to GridSearchCV as `n_jobs` (default -1, as before).

    Returns
    -------
    pandas.DataFrame with one row per model and the columns dataset,
    train_size, trial, model, best_params, cv_accuracy, train_accuracy,
    test_accuracy.
    """
    # Model name -> (pipeline, param_grid). Grid keys use the 'clf__' prefix
    # because the classifier is the pipeline step named 'clf'.
    models_and_parameters = {
        "LogisticRegression": (
            Pipeline([
                ('scaler', StandardScaler()),
                ('clf', LogisticRegression(max_iter=1000))
            ]),
            {
                'clf__C': [0.01, 0.1, 1, 10, 100]
            }
        ),
        "SVM_RBF": (
            Pipeline([
                ('scaler', StandardScaler()),
                ('clf', SVC(kernel='rbf', random_state=0))
            ]),
            {
                'clf__C': [0.1, 1, 10, 100],
                'clf__gamma': ['scale', 0.01, 0.1, 1]
            }
        ),
        "RandomForest": (
            # No scaler step here: the forest is fit on the raw features.
            Pipeline([
                ('clf', RandomForestClassifier(random_state=0))
            ]),
            {
                'clf__n_estimators': [100, 300],
                'clf__max_depth': [None, 10, 20],
                'clf__min_samples_leaf': [1, 5]
            }
        ),
        "MLP": (
            Pipeline([
                ('scaler', StandardScaler()),
                ('clf', MLPClassifier(
                    max_iter=1000,
                    early_stopping=True,
                    n_iter_no_change=10,
                    random_state=0
                ))
            ]),
            {
                'clf__hidden_layer_sizes': [(50,), (100,), (50, 50)],
                'clf__alpha': [0.0001, 0.001],
                'clf__learning_rate_init': [0.001, 0.01]
            }
        )
    }

    results = []

    for name, (pipe, param_grid) in models_and_parameters.items():
        # Include the dataset name in the header (as the earlier definition
        # does) so logs from different datasets can be told apart.
        print(f"\n=== {dataset_name} | {name} | train_size={train_size}, trial={trial} ===")

        # Scoring stays fixed at accuracy so the CV score is comparable with
        # the train/test accuracies computed below.
        grid = GridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            scoring='accuracy',
            cv=cv,
            n_jobs=n_jobs,
            verbose=0
        )

        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        best_params = grid.best_params_
        best_cv_score = grid.best_score_

        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)

        print("  Best params:", best_params)
        print(f"  CV accuracy:   {best_cv_score:.3f}")
        print(f"  Train accuracy:{train_acc:.3f}")
        print(f"  Test accuracy: {test_acc:.3f}")

        results.append({
            "dataset": dataset_name,
            "train_size": train_size,
            "trial": trial,
            "model": name,
            "best_params": best_params,
            "cv_accuracy": best_cv_score,
            "train_accuracy": train_acc,
            "test_accuracy": test_acc
        })

    return pd.DataFrame(results)

In [None]:
# === BEED Summary Cell ===

# Output column -> (source column, aggregation) for the per-group statistics.
summary_spec = {
    "mean_test_acc": ("test_accuracy", "mean"),
    "std_test_acc": ("test_accuracy", "std"),
    "mean_train_acc": ("train_accuracy", "mean"),
    "mean_cv_acc": ("cv_accuracy", "mean"),
}

# One row per (model, train_size) pair, aggregated across the trials.
per_model_and_size = beed_results.groupby(["model", "train_size"])
beed_summary = per_model_and_size.agg(**summary_spec).reset_index()

print("=== BEED summary (averaged over 3 trials) ===")
display(beed_summary)

In [None]:
results_dir = "/content/drive/MyDrive/COGS118A_Final_Project/Results"

# to_csv raises if the target folder is missing, which would lose the output
# of a long grid-search run; create the folder first (no-op when it exists).
os.makedirs(results_dir, exist_ok=True)

beed_summary.to_csv(f"{results_dir}/beed_results_summary.csv", index=False)
print("Saved BEED summary to results folder.")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# set_theme is seaborn's preferred entry point; sns.set is only a legacy alias for it.
sns.set_theme(style="whitegrid", font_scale=1.2)

In [None]:
# Where the figure is written; create the folder first because savefig
# does not create missing parent directories.
plot_path = "/content/drive/MyDrive/COGS118A_Final_Project/Plots/beed_testacc_vs_trainsize.png"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=beed_summary,
    x="train_size",
    y="mean_test_acc",
    hue="model",
    marker="o",
    ax=ax
)
# \u2014 is an em dash; written as an escape so the title cannot be
# mis-decoded into stray characters again.
ax.set_title("BEED Dataset \u2014 Test Accuracy vs Train Size")
ax.set_xlabel("Train Size")
ax.set_ylabel("Mean Test Accuracy")

# A fixed lower limit of 0.5 hides any model scoring below it (Logistic
# Regression logs a test accuracy of about 0.48 in the run above), so lower
# the floor when the data requires it and keep 0.5 otherwise.
lowest_acc = beed_summary["mean_test_acc"].min()
y_floor = max(0.0, min(0.5, lowest_acc - 0.05))
ax.set_ylim(y_floor, 1.0)

ax.legend(title="Model")
fig.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()