# Sequence Classification Template – Time-Ordered Sequences

This template is for **sequence classification** problems, where each sample is a **sequence of observations**:

- Classifying a time window of sensor readings  
- Detecting fraud from a series of transactions  
- Labeling a short time series as "normal" vs "abnormal"  

We show two approaches:

1. **Feature-based**: hand-crafted features from sequences → tabular classifier  
2. **Deep sequence model** (optional): simple 1D CNN via Keras  


In [None]:
# ========== 1. Imports & Config (Sequence Classification) ==========

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Optional deep learning
try:
    import tensorflow as tf
    from tensorflow import keras
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False

# Notebook-wide display limits for pandas output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Plot styling: seaborn theme first, then the figure defaults set afterwards.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 100})

# ---- Config ----
# Directory and file name of the input CSV; they are joined in load_data().
DATA_DIR = Path("../input")
DATA_FILE = "sequence_data.csv"

# Expected layout: one sequence per row, spread over columns
# seq_0, seq_1, ..., seq_T-1, plus one target column.
SEQ_PREFIX = "seq_"
TARGET_COL = "label"

# Shared seed passed to the train/validation splits and the random forest.
RANDOM_STATE = 42


In [None]:
# ========== 2. Load Sequence Data ==========

def load_data(data_dir: Path = DATA_DIR, data_file: str = DATA_FILE) -> pd.DataFrame:
    """Read the sequence CSV into a DataFrame and show a short preview.

    Parameters
    ----------
    data_dir : Path
        Directory that contains the data file.
    data_file : str
        File name of the CSV inside ``data_dir``.

    Returns
    -------
    pd.DataFrame
        The parsed CSV contents.

    Raises
    ------
    FileNotFoundError
        If ``data_dir / data_file`` does not exist.
    """
    csv_path = data_dir / data_file
    if csv_path.exists():
        frame = pd.read_csv(csv_path)
        # Quick sanity check: overall size plus the first few rows.
        print("Data shape:", frame.shape)
        display(frame.head())
        return frame
    raise FileNotFoundError(f"Data file not found: {csv_path}")


# Load the dataset once at notebook scope; the cells below read `df`.
df = load_data()


### 3️⃣ Identify Sequence Columns & Visualize

We treat columns like `seq_0 ... seq_T-1` as an ordered sequence.

We will:

- Extract those columns into an array `(n_samples, T)`  
- Plot a few random sequences to see shapes  


In [None]:
# Collect every column whose name starts with SEQ_PREFIX.
seq_cols = [c for c in df.columns if str(c).startswith(SEQ_PREFIX)]
if not seq_cols:
    raise ValueError(f"No columns starting with {SEQ_PREFIX!r} found in the data.")

# When every suffix after the prefix is a plain integer (seq_0, seq_1, ...),
# order the columns by that integer so the array's column axis follows the
# time-step order even if the CSV columns are shuffled. Otherwise keep the
# DataFrame's own column order untouched.
_suffixes = [str(c)[len(SEQ_PREFIX):] for c in seq_cols]
if all(s.isascii() and s.isdigit() for s in _suffixes):
    seq_cols = sorted(seq_cols, key=lambda c: int(str(c)[len(SEQ_PREFIX):]))
print("Sequence columns:", seq_cols[:10], "..." if len(seq_cols) > 10 else "" )

# X_seq: (n_samples, T) array of sequence values; y: one label per row.
X_seq = df[seq_cols].values
y = df[TARGET_COL].values

print("Sequence array shape:", X_seq.shape)

# Plot a few random sequences. A generator seeded with RANDOM_STATE makes the
# picks repeatable, and sampling without replacement avoids drawing the same
# row twice. Never ask for more rows than the dataset holds.
n_plot = min(3, X_seq.shape[0])
if n_plot > 0:
    rng = np.random.default_rng(RANDOM_STATE)
    for idx in rng.choice(X_seq.shape[0], size=n_plot, replace=False):
        plt.plot(X_seq[idx], label=f"sample {idx}, label={y[idx]}")
    plt.title("Random sequence samples")
    plt.xlabel("Time step")
    plt.ylabel("Value")
    plt.legend()
    plt.show()
else:
    print("No samples to plot.")


### 4️⃣ Approach 1 – Feature-Based Classification

We engineer features from each sequence, e.g.:

- mean, std, min, max, median, range  
- simple slope ((last - first) / (T - 1), i.e. average change per time step)  

Then train any tabular classifier (RF, XGBoost, etc.).  
This is a strong, simple baseline.


In [None]:
def sequence_to_features(X):
    """Summarise each sequence (row) with a handful of scalar statistics.

    Parameters
    ----------
    X : array-like of shape (n_samples, T)
        One sequence per row, one time step per column. Anything
        ``np.asarray`` accepts (ndarray, DataFrame, nested lists) works.

    Returns
    -------
    pandas.DataFrame
        One row per sequence with the columns ``mean``, ``std`` (NumPy's
        default, i.e. population std), ``min``, ``max``, ``median``,
        ``range`` (max - min) and ``slope`` ((last - first) / (T - 1)).
        For sequences of length 1 the slope is defined as 0.0.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or contains no time steps.
    """
    # Convert up front so positional slicing below also works for DataFrames.
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, T); got {X.ndim} dimension(s).")

    n_steps = X.shape[1]
    if n_steps == 0:
        raise ValueError("X must contain at least one time step per sequence.")

    row_min = X.min(axis=1)
    row_max = X.max(axis=1)

    if n_steps > 1:
        slope = (X[:, -1] - X[:, 0]) / (n_steps - 1)
    else:
        # A single observation has no trend; avoid the 0 / 0 division that
        # would otherwise fill the column with NaN.
        slope = np.zeros(X.shape[0], dtype=float)

    return pd.DataFrame(
        {
            "mean": X.mean(axis=1),
            "std": X.std(axis=1),
            "min": row_min,
            "max": row_max,
            "median": np.median(X, axis=1),
            "range": row_max - row_min,
            "slope": slope,
        }
    )


# Turn every raw sequence into its row of summary statistics and preview it.
X_feats = sequence_to_features(X_seq)
display(X_feats.head())

# Stratified 80/20 hold-out split on the engineered features.
X_train_f, X_valid_f, y_train, y_valid = train_test_split(
    X_feats,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Random-forest baseline: 300 trees, no depth limit, all available cores.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train_f, y_train)
y_pred = rf.predict(X_valid_f)

# Report hold-out metrics for the baseline.
print("Feature-based RF:")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_valid, y_pred)),
    ("F1 (weighted):", f1_score(y_valid, y_pred, average="weighted")),
):
    print(metric_name, metric_value)
print(classification_report(y_valid, y_pred, digits=4))


### 5️⃣ Approach 2 – Simple 1D CNN (Optional)

If Keras is available, we can treat sequences as `(T, 1)` and use a 1D CNN.

This can capture local temporal patterns more flexibly than hand-crafted features, 
but needs more data and tuning.


In [None]:
if KERAS_AVAILABLE:
    # Seed TensorFlow's global RNG with the notebook-wide seed so training is
    # more repeatable from run to run.
    tf.random.set_seed(RANDOM_STATE)

    # Conv1D consumes (batch, steps, channels); add a single channel axis.
    X_seq_3d = X_seq[..., np.newaxis]

    # sparse_categorical_crossentropy needs integer class ids in
    # [0, num_classes). Map arbitrary labels (strings, non-contiguous ints)
    # onto such ids and keep `classes` to translate predictions back.
    classes, y_encoded = np.unique(y, return_inverse=True)
    num_classes = len(classes)

    X_train_s, X_valid_s, y_train_s, y_valid_s = train_test_split(
        X_seq_3d, y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=y_encoded
    )

    # Small 1D CNN: two conv layers, global pooling, dense head with softmax.
    # The input shape is declared with keras.Input rather than the
    # `input_shape=` layer argument.
    model = keras.Sequential([
        keras.Input(shape=X_train_s.shape[1:]),
        keras.layers.Conv1D(32, kernel_size=3, activation="relu"),
        keras.layers.MaxPooling1D(pool_size=2),
        keras.layers.Conv1D(64, kernel_size=3, activation="relu"),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(num_classes, activation="softmax"),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    history = model.fit(
        X_train_s, y_train_s,
        validation_data=(X_valid_s, y_valid_s),
        epochs=20,
        batch_size=64,
        verbose=0,
    )

    # Learning curves: training vs. validation accuracy per epoch.
    plt.plot(history.history["accuracy"], label="train_acc")
    plt.plot(history.history["val_accuracy"], label="val_acc")
    plt.legend()
    plt.title("Sequence CNN accuracy")
    plt.show()

    # argmax yields encoded class ids; convert both predictions and targets
    # back to the original label values so the report shows real labels.
    pred_ids = np.argmax(model.predict(X_valid_s, verbose=0), axis=1)
    y_pred_s = classes[pred_ids]
    y_true_s = classes[y_valid_s]

    print("Sequence CNN:")
    print("Accuracy:", accuracy_score(y_true_s, y_pred_s))
    print("F1 (weighted):", f1_score(y_true_s, y_pred_s, average="weighted"))
    print(classification_report(y_true_s, y_pred_s, digits=4))
else:
    print("Keras not available; skipping deep sequence model.")


### 6️⃣ Next Steps

- Add richer sequence features (autocorrelation, FFT, etc.).  
- Tune CNN architecture (depth, filters, regularization).  
- For multi-sensor data, treat each sensor as a separate channel `(T, n_channels)`.
