In [None]:
from pathlib import Path
import os
import getpass
import shutil
import subprocess

# Location of the checkout on the Colab VM; later cells reference the same path.
REPO_DIR = Path('/content/AnomalyDetection')
# Clone source; set ANOMALY_REPO_URL to point at a fork or mirror.
REPO_URL = os.getenv('ANOMALY_REPO_URL', 'https://github.com/kh87joo2/AnomalyDetection.git')
# Only the exact value '1' opts in to deleting an invalid REPO_DIR before cloning.
FORCE_RECLONE = os.getenv('ANOMALY_FORCE_RECLONE', '0') == '1'


def is_repo_root(path: Path) -> bool:
    """Tell whether ``path`` contains every marker of a usable checkout.

    The markers are ``requirements.txt``, the ``configs`` and ``trainers``
    entries, and the two config files ``configs/patchtst_ssl.yaml`` and
    ``configs/swinmae_ssl.yaml``. Only existence is checked, never content.

    Args:
        path: Candidate repository root.

    Returns:
        True when all markers exist below ``path``, otherwise False.
    """
    markers = (
        ('requirements.txt',),
        ('configs',),
        ('trainers',),
        ('configs', 'patchtst_ssl.yaml'),
        ('configs', 'swinmae_ssl.yaml'),
    )
    for parts in markers:
        # Stop at the first missing marker.
        if not path.joinpath(*parts).exists():
            return False
    return True


def clone_repo(repo_url: str, token: str = '') -> subprocess.CompletedProcess:
    """Run ``git clone`` of ``repo_url`` into ``REPO_DIR`` and return the result.

    Args:
        repo_url: Repository URL to clone.
        token: Optional access token. When non-empty and ``repo_url`` starts
            with ``https://``, it is inserted into the URL as
            ``https://<token>@...`` for the clone command.

    Returns:
        The ``CompletedProcess`` of the git invocation with text-mode
        ``stdout``/``stderr`` captured. A failed clone is reported through
        ``returncode``; no exception is raised for it. Any occurrence of
        ``token`` in ``args``, ``stdout`` and ``stderr`` is replaced by
        ``***`` so callers can print them safely.
    """
    clone_url = repo_url
    if token and repo_url.startswith('https://'):
        clone_url = repo_url.replace('https://', f'https://{token}@', 1)
    # NOTE(review): the token-bearing URL is handed to git as-is, so it is
    # presumably also what git stores as the clone's remote URL; confirm
    # before sharing or archiving the checkout.
    result = subprocess.run(['git', 'clone', clone_url, str(REPO_DIR)], text=True, capture_output=True)

    if token:
        # git may echo the remote URL in its messages; scrub the secret so it
        # cannot end up in notebook output when the caller prints stderr.
        def scrub(text):
            return text.replace(token, '***') if isinstance(text, str) else text

        result.args = [scrub(arg) for arg in result.args]
        result.stdout = scrub(result.stdout)
        result.stderr = scrub(result.stderr)
    return result


# Bootstrap: make sure REPO_DIR holds a usable checkout, cloning it if needed.
if is_repo_root(REPO_DIR):
    print(f'[info] using existing repo: {REPO_DIR}')
elif REPO_DIR.exists():
    # Something occupies REPO_DIR but lacks the expected markers. Removing it
    # is destructive, so it only happens on explicit opt-in.
    if FORCE_RECLONE:
        print(f'[warn] removing existing directory because ANOMALY_FORCE_RECLONE=1: {REPO_DIR}')
        if REPO_DIR.is_dir() and not REPO_DIR.is_symlink():
            shutil.rmtree(REPO_DIR)
        else:
            # shutil.rmtree raises for regular files and symlinks; unlink
            # them so the opt-in recovery path works for those too.
            REPO_DIR.unlink()
    else:
        raise RuntimeError(
            f'{REPO_DIR} exists but is not a valid repo root.\n'
            'Set ANOMALY_FORCE_RECLONE=1 to allow removal and reclone, or fix the directory manually.'
        )

if not is_repo_root(REPO_DIR):
    print(f'[info] cloning repo: {REPO_URL}')
    # A token from the environment is tried first.
    token = os.environ.get('ANOMALY_GH_TOKEN', '').strip()
    result = clone_repo(REPO_URL, token=token)

    # The anonymous clone failed: ask once for a token and retry with it.
    if result.returncode != 0 and not token:
        print('[warn] public clone failed. If repo is private, enter a GitHub PAT.')
        token = getpass.getpass('GitHub PAT (private repo only): ').strip()
        if token:
            result = clone_repo(REPO_URL, token=token)

    if result.returncode != 0:
        if result.stderr:
            # Only the last stderr line is shown.
            print('[git]', result.stderr.strip().splitlines()[-1])
        raise RuntimeError('git clone failed. Check repo URL, network, and token permissions.')

# Re-validate: a successful clone of the wrong repository must not pass.
if not is_repo_root(REPO_DIR):
    raise FileNotFoundError('Repo root validation failed after clone.')

# Later cells use paths relative to the repository root.
os.chdir(REPO_DIR)
print('cwd:', Path.cwd())
print('requirements.txt exists:', Path('requirements.txt').exists())
print('configs exists:', Path('configs').exists())
print('trainers exists:', Path('trainers').exists())


In [None]:
# KAGGLE_DATA_DOWNLOAD_PATCHTST
from pathlib import Path
import json
import os
import subprocess
import sys
import zipfile

import pandas as pd
import yaml

REPO_DIR = Path('/content/AnomalyDetection')
RAW_DIR = REPO_DIR / 'data' / 'raw' / 'swat'  # downloaded archives and extracted files
OUT_DIR = REPO_DIR / 'data' / 'fdc'  # prepared training CSV
DATASET = 'vishala28/swat-dataset-secure-water-treatment-system'

# Fail fast when the checkout is missing. Without this check,
# mkdir(parents=True) below would create REPO_DIR itself; the bootstrap cell
# then rejects it as "exists but is not a valid repo root", and this cell would
# only fail after the download, when it reads configs/patchtst_ssl.yaml.
if not (REPO_DIR / 'configs' / 'patchtst_ssl.yaml').is_file():
    raise FileNotFoundError(
        f'{REPO_DIR} is not a prepared checkout (configs/patchtst_ssl.yaml is missing). '
        'Run the bootstrap cell first.'
    )

RAW_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)


def run(cmd):
    """Echo a command, then execute it and fail loudly on a non-zero exit.

    Args:
        cmd: Iterable of command parts. Each part is converted with ``str``,
            so ``Path`` objects can be passed directly.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    argv = list(map(str, cmd))
    print('$', ' '.join(argv))
    subprocess.run(argv, check=True)


# Install the Kaggle CLI and the libraries this cell uses into the kernel's interpreter.
run([sys.executable, '-m', 'pip', 'install', '-q', 'kaggle', 'pandas', 'pyyaml'])

# The Kaggle CLI looks for credentials in ~/.kaggle/kaggle.json. Resolving the
# home directory instead of hardcoding /root keeps this working for non-root
# users as well.
kaggle_dir = Path.home() / '.kaggle'
kaggle_dir.mkdir(parents=True, exist_ok=True)
kaggle_json = kaggle_dir / 'kaggle.json'

if not kaggle_json.exists():
    # Prefer credentials from the environment; otherwise ask for an upload.
    user = os.environ.get('KAGGLE_USERNAME', '').strip()
    key = os.environ.get('KAGGLE_KEY', '').strip()
    if user and key:
        payload = json.dumps({'username': user, 'key': key}).encode('utf-8')
    else:
        from google.colab import files

        print('Upload kaggle.json from https://www.kaggle.com/settings/account')
        uploaded = files.upload()
        if 'kaggle.json' not in uploaded:
            raise FileNotFoundError('kaggle.json not uploaded')
        payload = uploaded['kaggle.json']

    # Create the file owner-only from the start so the secret is never
    # readable by other users, not even between write and chmod. The file is
    # only created once the payload is available, so a failed upload leaves
    # nothing behind.
    fd = os.open(kaggle_json, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'wb') as f:
        f.write(payload)

# Also tighten permissions on a credentials file that already existed.
os.chmod(kaggle_json, 0o600)

run(['kaggle', 'datasets', 'download', '-d', DATASET, '-p', str(RAW_DIR), '--force'])

# Unpack every downloaded archive into RAW_DIR.
for zip_path in sorted(RAW_DIR.glob('*.zip')):
    print('[extract]', zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(RAW_DIR)

csv_files = sorted(RAW_DIR.rglob('*.csv'))
if not csv_files:
    raise FileNotFoundError(f'No CSV found under {RAW_DIR}')

print('csv_found:', len(csv_files))
# Show at most the first 20 paths to keep the output short.
for p in csv_files[:20]:
    print('-', p)


def pick_fdc_csv(files):
    """Pick the CSV whose file name looks most like normal-operation data.

    File names are compared case-insensitively. A name containing ``normal``
    ranks first, then ``train``, then names with none of the keywords, and
    ``attack`` last; the keywords are tested in that order, so the first match
    decides. Within a rank the shorter file name wins, and remaining ties keep
    the input order.

    Args:
        files: Non-empty sequence of ``Path`` objects.

    Returns:
        The best-ranked path.

    Raises:
        IndexError: If ``files`` is empty.
    """
    keyword_ranks = (('normal', 0), ('train', 1), ('attack', 9))
    default_rank = 5

    def score(path: Path):
        lowered = path.name.lower()
        for keyword, rank in keyword_ranks:
            if keyword in lowered:
                return (rank, len(lowered))
        return (default_rank, len(lowered))

    ranked = sorted(files, key=score)
    return ranked[0]


# Select one source CSV from the download and load it.
src = pick_fdc_csv(csv_files)
print('selected_source:', src)

df = pd.read_csv(src)
orig_cols = df.columns.tolist()

# Index the headers by their stripped, lower-cased form so the time column is
# found regardless of case or surrounding whitespace.
col_map = {}
for c in df.columns:
    col_map[str(c).strip().lower()] = c

# The first candidate name that is present wins.
ts_col = None
for key in ('timestamp', 'time', 'datetime', 'date'):
    if key in col_map:
        ts_col = col_map[key]
        break

if ts_col is not None:
    df = df.rename(columns={ts_col: 'timestamp'})
else:
    # No time-like header: use the row position as a synthetic timestamp.
    df.insert(0, 'timestamp', range(len(df)))

# Remove common label columns so only sensor features remain.
drop_names = {'label', 'attack', 'normal/attack', 'normal_attack', 'class', 'status'}
label_cols = [c for c in df.columns if str(c).strip().lower() in drop_names]
df = df.drop(columns=label_cols)

# Cast every non-timestamp column to numeric; unparsable values become NaN.
feature_cols = [c for c in df.columns if c != 'timestamp']
for c in feature_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

out_csv = OUT_DIR / 'swat_fdc_train.csv'
df.to_csv(out_csv, index=False)

print('prepared_csv:', out_csv)
print('shape:', df.shape)
print('columns_in:', len(orig_cols), 'columns_out:', len(df.columns))

# Derive the "real data" config from the base config, changing only the data
# source settings.
base_cfg = REPO_DIR / 'configs' / 'patchtst_ssl.yaml'
real_cfg = REPO_DIR / 'configs' / 'patchtst_ssl_real.yaml'
cfg = yaml.safe_load(base_cfg.read_text(encoding='utf-8'))
cfg['data'].update(
    source='csv',
    path='/content/AnomalyDetection/data/fdc/*.csv',
    timestamp_col='timestamp',
)
real_cfg.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding='utf-8')

print('real_config_written:', real_cfg)
print('now run the training cell below; it uses *_real.yaml if present')


In [None]:
# DATA_CHECK_PATCHTST
from pathlib import Path
import pandas as pd

# Sanity-check the prepared data: inspect the first CSV in the FDC directory.
fdc_dir = Path('/content/AnomalyDetection/data/fdc')
csv_files = sorted(fdc_dir.glob('*.csv'))

print('fdc_dir:', fdc_dir)
print('csv_count:', len(csv_files))
if len(csv_files) == 0:
    raise FileNotFoundError(f'No CSV files found in {fdc_dir}. Run the Kaggle download cell first.')

sample = csv_files[0]
print('sample_file:', sample)

df = pd.read_csv(sample)
print('shape:', df.shape)
print('columns[:20]:', list(df.columns[:20]))
print(df.head(3))

# A missing required column is reported as a warning, not an error.
required = {'timestamp'}
missing = [name for name in required if name not in df.columns]
if not missing:
    print('[ok] required columns exist')
else:
    print('[warn] missing required columns:', missing)

# Mean over columns of the per-column share of values that are not numeric.
non_ts_cols = [name for name in df.columns if name != 'timestamp']
if non_ts_cols:
    coerced = df[non_ts_cols].apply(pd.to_numeric, errors='coerce')
    numeric_na_ratio = coerced.isna().mean().mean()
    print('non_timestamp_numeric_na_ratio:', float(numeric_na_ratio))


In [None]:
from pathlib import Path
import subprocess
import sys
import shlex


def run(cmd):
    """Echo a command, run it, and stream its output line by line.

    stderr is merged into stdout and every line is printed as it arrives.

    Args:
        cmd: Iterable of command parts; each part is converted with ``str``.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    cmd = [str(x) for x in cmd]
    print('$', ' '.join(shlex.quote(x) for x in cmd))

    # The context manager closes the pipe and waits for the child even when
    # streaming is interrupted (for example by a kernel interrupt).
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,  # line-buffered so output appears as it is produced
    ) as proc:
        assert proc.stdout is not None
        for line in proc.stdout:
            print(line, end='')
        code = proc.wait()

    if code != 0:
        raise RuntimeError(f"Command failed ({code}): {' '.join(cmd)}")


# requirements.txt is looked up relative to the current working directory.
req = Path('requirements.txt')
if not req.exists():
    raise FileNotFoundError(
        f"requirements.txt not found in cwd={Path.cwd()}. Run bootstrap cell first or fix repo path."
    )

# Upgrade pip first, then install the project requirements with it.
pip_install = [sys.executable, '-m', 'pip', 'install']
run([*pip_install, '-U', 'pip'])
run([*pip_install, '-r', str(req)])


In [None]:
import torch

# Report the installed torch build and the visible CUDA devices.
print('torch:', torch.__version__)
has_cuda = torch.cuda.is_available()
print('cuda_available:', has_cuda)
if has_cuda:
    print('cuda_device_count:', torch.cuda.device_count())
    print('cuda_device_0:', torch.cuda.get_device_name(0))


In [None]:
import subprocess
import sys
from pathlib import Path

# Prefer the config generated from real data; otherwise use the base config.
# Both paths are relative to the current working directory.
cfg = Path('configs/patchtst_ssl_real.yaml')
if not cfg.exists():
    print('[warn] real config not found, fallback to synthetic config')
    cfg = Path('configs/patchtst_ssl.yaml')

# Launch the trainer module with the kernel's interpreter.
cmd = [sys.executable, '-m', 'trainers.train_patchtst_ssl']
cmd += ['--config', str(cfg)]
print('$', *cmd)
result = subprocess.run(cmd)
if result.returncode:
    raise RuntimeError(f"Training failed with exit code {result.returncode}")


In [None]:
from pathlib import Path

# Verify that the checkpoint file exists and report its size.
checkpoint_path = Path('checkpoints/patchtst_ssl.pt')
checkpoint_found = checkpoint_path.exists()
print('checkpoint_exists:', checkpoint_found, checkpoint_path)
assert checkpoint_found, f'Missing checkpoint: {checkpoint_path}'
print('checkpoint_size_bytes:', checkpoint_path.stat().st_size)
