# Phishing SMS : CoPhi @ SJTU
**Author**: Napassorn LITCHIOWONG (Pleng/林艺文)
pleng@u.nus.edu

# Phishing SMS : CoPhi @ SJTU
**Author**: Napassorn LITCHIOWONG (Pleng/林艺文)
pleng@u.nus.edu

Edit the config path and output dir before running


In [1]:
!pip install tensorflow gensim



In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os, sys, json, yaml, importlib, traceback, glob, shutil, subprocess, getpass
import pandas as pd
from datetime import datetime

REPO_DIR = "/content/drive/MyDrive/sms-baselines"
MAX_RUNS_PER_BASELINE = 5
OVERWRITE_EXISTING = True
CHECKPOINT_EVERY_RUN = True
PUSH_EVERY_BASELINE = True

if not os.path.exists(REPO_DIR):
    raise FileNotFoundError(f"Repository not found at {REPO_DIR}")

sys.path.insert(0, REPO_DIR)
sys.path.insert(0, os.path.join(REPO_DIR, "src"))
from models.shared import set_seed

train_csv = os.path.join(REPO_DIR, "data", "splits", "train.csv")
test_csv = os.path.join(REPO_DIR, "data", "splits", "test.csv")
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

config_paths = sorted(glob.glob(os.path.join(REPO_DIR, "configs", "bl_*.yaml")))
BASE_EXP_DIR = os.path.join(REPO_DIR, "experiments")
os.makedirs(BASE_EXP_DIR, exist_ok=True)

print(f"Experiments will be saved to: {BASE_EXP_DIR}")
print(f"Working directory: {os.getcwd()}")
print(f"Repository directory: {REPO_DIR}")
print(f"Checkpoint mode: Save after each run = {CHECKPOINT_EVERY_RUN}")
print(f"Push mode: Push after each baseline = {PUSH_EVERY_BASELINE}")

def check_baseline_complete(baseline_dir, expected_runs):
    if not os.path.exists(baseline_dir):
        return False

    summary_csv = os.path.join(baseline_dir, "summary.csv")
    if os.path.exists(summary_csv):
        try:
            df = pd.read_csv(summary_csv)
            successful_runs = df[df['accuracy'].notna()]
            return len(successful_runs) >= expected_runs
        except:
            pass

    run_dirs = [item for item in os.listdir(baseline_dir)
                if os.path.isdir(os.path.join(baseline_dir, item))]

    successful_runs = 0
    for run_dir in run_dirs:
        results_path = os.path.join(baseline_dir, run_dir, "results.json")
        if os.path.exists(results_path):
            try:
                with open(results_path) as f:
                    results = json.load(f)
                if results.get("accuracy") is not None:
                    successful_runs += 1
            except:
                pass

    return successful_runs >= expected_runs

def cleanup_old_runs(baseline_dir, max_runs=MAX_RUNS_PER_BASELINE):
    if not os.path.exists(baseline_dir):
        return
    run_dirs = [(item, os.path.join(baseline_dir, item), os.path.getmtime(os.path.join(baseline_dir, item)))
                for item in os.listdir(baseline_dir)
                if os.path.isdir(os.path.join(baseline_dir, item))]
    run_dirs.sort(key=lambda x: x[2], reverse=True)
    for run_name, run_path, _ in run_dirs[max_runs:]:
        shutil.rmtree(run_path, ignore_errors=True)

def get_run_dir(baseline_dir, baseline_id, seed):
    os.makedirs(baseline_dir, exist_ok=True)
    if OVERWRITE_EXISTING:
        for item in os.listdir(baseline_dir):
            item_path = os.path.join(baseline_dir, item)
            if os.path.isdir(item_path):
                meta_path = os.path.join(item_path, "run_meta.json")
                if os.path.exists(meta_path):
                    try:
                        with open(meta_path) as f:
                            meta = json.load(f)
                        if meta.get("seed") == seed:
                            return item_path
                    except:
                        pass
    now = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
    run_id = f"{baseline_id}_seed{seed}_{now}"
    run_dir = os.path.join(baseline_dir, run_id)
    os.makedirs(run_dir, exist_ok=True)
    return run_dir

auth_setup_done = False
def setup_github_auth():
    global auth_setup_done
    if auth_setup_done:
        return True

    print("Setting up GitHub authentication...")
    token = getpass.getpass("Enter your GitHub Personal Access Token: ")
    username = "iamdiluxedbutcooler"
    remote_url = f"https://{username}:{token}@github.com/{username}/sms-baselines.git"
    try:
        os.chdir(REPO_DIR)
        subprocess.run(["git", "remote", "remove", "origin"], capture_output=True)
        subprocess.run(["git", "remote", "add", "origin", remote_url], check=True)
        print("GitHub authentication configured successfully!")
        auth_setup_done = True
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to configure GitHub auth: {e}")
        return False

def commit_and_push_checkpoint(message_suffix=""):
    try:
        os.chdir(REPO_DIR)
        result = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True)
        if not result.stdout.strip():
            print("No changes to commit")
            return True

        subprocess.run(["git", "add", "experiments/"], check=True)
        subprocess.run(["git", "add", "src/"], check=True)
        subprocess.run(["git", "add", "configs/"], check=True)

        result = subprocess.run(["git", "diff", "--cached", "--name-only"], capture_output=True, text=True)
        if not result.stdout.strip():
            print("No changes staged for commit")
            return True

        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        commit_msg = f"Checkpoint {timestamp}{message_suffix}"
        subprocess.run(["git", "commit", "-m", commit_msg], check=True)
        subprocess.run(["git", "push", "origin", "main"], check=True)
        print(f"Checkpoint pushed at {timestamp}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Checkpoint push failed: {e}")
        return False

def save_progress_state(baseline_id, completed_runs, all_results):
    progress_file = os.path.join(BASE_EXP_DIR, "training_progress.json")
    progress_data = {
        "last_baseline": baseline_id,
        "completed_runs": completed_runs,
        "timestamp": datetime.utcnow().isoformat(),
        "total_results": len(all_results)
    }
    with open(progress_file, "w") as f:
        json.dump(progress_data, f, indent=2)

def load_progress_state():
    progress_file = os.path.join(BASE_EXP_DIR, "training_progress.json")
    if os.path.exists(progress_file):
        try:
            with open(progress_file) as f:
                return json.load(f)
        except:
            pass
    return {}

progress_state = load_progress_state()
if progress_state:
    print(f"Previous session found: {progress_state.get('last_baseline')} - {progress_state.get('completed_runs')} runs")

all_results = []
skipped_baselines = []
github_auth_configured = False

for cfg_idx, cfg_path in enumerate(config_paths):
    print(f"\nProcessing ({cfg_idx+1}/{len(config_paths)}): {os.path.basename(cfg_path)}")
    with open(cfg_path) as f:
        config = yaml.safe_load(f)
    baseline_id = config.get("baseline_id", os.path.splitext(os.path.basename(cfg_path))[0])
    module_path = config.get("module")
    if not module_path:
        print(f"  No module specified, skipping")
        continue

    baseline_dir = os.path.join(BASE_EXP_DIR, baseline_id)
    repeats = config.get("training", {}).get("repeats_for_variance", 1)

    if check_baseline_complete(baseline_dir, repeats):
        print(f"  Baseline {baseline_id} already complete ({repeats} runs), skipping")
        skipped_baselines.append(baseline_id)

        summary_csv = os.path.join(baseline_dir, "summary.csv")
        if os.path.exists(summary_csv):
            try:
                df = pd.read_csv(summary_csv)
                for _, row in df.iterrows():
                    if pd.notna(row.get('accuracy')):
                        all_results.append({
                            "baseline": baseline_id,
                            "seed": row.get("seed"),
                            "accuracy": row.get("accuracy"),
                            "outdir": row.get("outdir")
                        })
            except:
                pass
        continue

    try:
        module = importlib.import_module(module_path)
    except Exception as e:
        print(f"  Import failed for {module_path}: {e}")
        continue

    baseline_completed_runs = 0
    seed_base = config.get("training", {}).get("seed_base", 42)

    for i in range(repeats):
        seed = seed_base + i
        set_seed(seed)
        print(f"  Running seed {seed} ({i+1}/{repeats})...")
        outdir = get_run_dir(baseline_dir, baseline_id, seed)
        run_meta = {"baseline": baseline_id, "seed": seed, "timestamp": datetime.utcnow().isoformat()}
        with open(os.path.join(outdir, "run_meta.json"), "w") as f:
            json.dump(run_meta, f, indent=2)

        try:
            entry_fn = None
            expected = f"run_{baseline_id}"
            if hasattr(module, expected):
                entry_fn = getattr(module, expected)
            else:
                for name in dir(module):
                    if name.startswith("run_") and callable(getattr(module, name)):
                        entry_fn = getattr(module, name)
                        break
            if not entry_fn:
                raise Exception(f"No run function found in {module_path}")

            results = entry_fn(
                train_df["text"].astype(str).tolist(),
                train_df["label"].astype(str).tolist(),
                test_df["text"].astype(str).tolist(),
                test_df["label"].astype(str).tolist(),
                outdir, seed, config
            )
            accuracy = results.get("accuracy")
            print(f"    Accuracy: {accuracy}")
            all_results.append({
                "baseline": baseline_id,
                "seed": seed,
                "accuracy": accuracy,
                "outdir": outdir
            })
            baseline_completed_runs += 1

            if CHECKPOINT_EVERY_RUN:
                baseline_results = [r for r in all_results if r["baseline"] == baseline_id]
                if baseline_results:
                    pd.DataFrame(baseline_results).to_csv(os.path.join(baseline_dir, "summary.csv"), index=False)

                save_progress_state(baseline_id, baseline_completed_runs, all_results)

                if not github_auth_configured:
                    github_auth_configured = setup_github_auth()

                if github_auth_configured:
                    commit_and_push_checkpoint(f" - {baseline_id} run {i+1}/{repeats}")

        except Exception as e:
            print(f"    Error: {e}")
            with open(os.path.join(outdir, "error.txt"), "w") as f:
                f.write(traceback.format_exc())
            all_results.append({
                "baseline": baseline_id,
                "seed": seed,
                "accuracy": None,
                "outdir": outdir,
                "error": str(e)
            })

    cleanup_old_runs(baseline_dir)
    baseline_results = [r for r in all_results if r["baseline"] == baseline_id]
    if baseline_results:
        pd.DataFrame(baseline_results).to_csv(os.path.join(baseline_dir, "summary.csv"), index=False)

    if PUSH_EVERY_BASELINE and github_auth_configured:
        commit_and_push_checkpoint(f" - {baseline_id} completed")
        print(f"  Baseline {baseline_id} results pushed to GitHub")

    save_progress_state(baseline_id, baseline_completed_runs, all_results)

if all_results:
    pd.DataFrame(all_results).to_csv(os.path.join(BASE_EXP_DIR, "experiments_summary.csv"), index=False)

print(f"\nCompleted! Results in {BASE_EXP_DIR}")
print(f"Each baseline keeps max {MAX_RUNS_PER_BASELINE} runs")

if skipped_baselines:
    print(f"\nSkipped baselines (already complete): {', '.join(skipped_baselines)}")

print(f"\nExperiments directory contents:")
if os.path.exists(BASE_EXP_DIR):
    for item in os.listdir(BASE_EXP_DIR):
        item_path = os.path.join(BASE_EXP_DIR, item)
        if os.path.isdir(item_path):
            run_count = len([x for x in os.listdir(item_path) if os.path.isdir(os.path.join(item_path, x))])
            status = "(SKIPPED)" if item in skipped_baselines else "(TRAINED)"
            print(f"  {item}: {run_count} runs {status}")
        else:
            print(f"  {item}: file")

if not github_auth_configured:
    print("\nSetting up final GitHub push...")
    if setup_github_auth():
        commit_and_push_checkpoint(" - Final results")
        print("Final results pushed to GitHub!")
    else:
        print("GitHub authentication failed. Results saved locally only.")

progress_file = os.path.join(BASE_EXP_DIR, "training_progress.json")
if os.path.exists(progress_file):
    os.remove(progress_file)
    print("Training progress file cleaned up.")

Mounted at /content/drive
Experiments will be saved to: /content/drive/MyDrive/sms-baselines/experiments
Working directory: /content
Repository directory: /content/drive/MyDrive/sms-baselines
Checkpoint mode: Save after each run = True
Push mode: Push after each baseline = True

Processing (1/3): bl_nlp_01.yaml
  Baseline bl_nlp_01 already complete (5 runs), skipping

Processing (2/3): bl_nlp_02.yaml
  Baseline bl_nlp_02 already complete (5 runs), skipping

Processing (3/3): bl_nn_01.yaml
  Running seed 42 (1/5)...


  run_meta = {"baseline": baseline_id, "seed": seed, "timestamp": datetime.utcnow().isoformat()}


Original train labels: {'smishing', 'ham', 'spam'}
Original test labels: {'smishing', 'ham', 'spam'}




Training model...
Epoch 1/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 291ms/step - accuracy: 0.7995 - loss: 0.5459 - val_accuracy: 0.6799 - val_loss: 0.4328
Epoch 2/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 287ms/step - accuracy: 0.7555 - loss: 0.3958 - val_accuracy: 0.6695 - val_loss: 0.4221
Epoch 3/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 293ms/step - accuracy: 0.7459 - loss: 0.3813 - val_accuracy: 0.7050 - val_loss: 0.3964
Epoch 4/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 286ms/step - accuracy: 0.7546 - loss: 0.3702 - val_accuracy: 0.7385 - val_loss: 0.3709
Epoch 5/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 282ms/step - accuracy: 0.7603 - loss: 0.3533 - val_accuracy: 0.7741 - val_loss: 0.3235
Epoch 6/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 281ms/step - accuracy: 0.7821 - loss: 0.3131 - val_accuracy: 0.8347 - val_loss:



Accuracy: 0.8209205020920503


  "timestamp": datetime.utcnow().isoformat(),


Neural network training completed successfully
    Accuracy: 0.8209205020920503
Setting up GitHub authentication...
Enter your GitHub Personal Access Token: ··········
GitHub authentication configured successfully!


  timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


Checkpoint push failed: Command '['git', 'commit', '-m', 'Checkpoint 20250923_001626 - bl_nn_01 run 1/5']' returned non-zero exit status 128.
  Running seed 43 (2/5)...
Original train labels: {'smishing', 'ham', 'spam'}
Original test labels: {'smishing', 'ham', 'spam'}


  run_meta = {"baseline": baseline_id, "seed": seed, "timestamp": datetime.utcnow().isoformat()}


Training model...
Epoch 1/50




[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 288ms/step - accuracy: 0.7523 - loss: 0.5512 - val_accuracy: 0.7992 - val_loss: 0.4294
Epoch 2/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 280ms/step - accuracy: 0.7365 - loss: 0.4146 - val_accuracy: 0.7406 - val_loss: 0.4059
Epoch 3/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 282ms/step - accuracy: 0.7314 - loss: 0.3977 - val_accuracy: 0.7803 - val_loss: 0.3910
Epoch 4/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 282ms/step - accuracy: 0.7506 - loss: 0.3795 - val_accuracy: 0.7678 - val_loss: 0.3788
Epoch 5/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 282ms/step - accuracy: 0.7663 - loss: 0.3611 - val_accuracy: 0.8159 - val_loss: 0.3378
Epoch 6/50
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 281ms/step - accuracy: 0.7753 - loss: 0.3289 - val_accuracy: 0.7866 - val_loss: 0.2316
Epoch 7/50
[1m135/13