In [None]:
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# edaflow-lite: Violation Classifier (Baseline)\n",
        "\n",
        "Goal: classify violation type (setup / transition / max_capacitance) using simple features.\n",
        "\n",
        "This notebook is a scaffold for expansion into a real ML baseline."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "from parser.timing_parser import load_report, parse_timing_report\n",
        "from parser.violation_summary import infer_violation_type\n",
        "\n",
        "# Load the report text and parse it into path objects; each one is read below\n",
        "# for its path_group, slack and notes attributes.\n",
        "text = load_report('../reports/timing_report.txt')\n",
        "paths = parse_timing_report(text)\n",
        "\n",
        "# One record per parsed path; the label is whatever infer_violation_type returns for it.\n",
        "rows = [\n",
        "    dict(\n",
        "        path_group=p.path_group,\n",
        "        slack=p.slack,\n",
        "        notes=' '.join(p.notes),\n",
        "        label=infer_violation_type(p),\n",
        "    )\n",
        "    for p in paths\n",
        "]\n",
        "\n",
        "df = pd.DataFrame(rows)\n",
        "df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from math import ceil\n",
        "\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.preprocessing import OneHotEncoder\n",
        "from sklearn.compose import ColumnTransformer\n",
        "from sklearn.pipeline import Pipeline\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Tunables for the baseline experiment.\n",
        "TEST_SIZE = 0.4  # fraction of rows held out for evaluation\n",
        "SEED = 42        # fixed seed so the split is reproducible\n",
        "\n",
        "X = df[['path_group','slack','notes']]\n",
        "y = df['label']\n",
        "\n",
        "# Fail fast with a readable message: a classifier cannot be fit on fewer than two classes.\n",
        "class_counts = y.value_counts()\n",
        "assert len(class_counts) >= 2, f'need at least 2 violation classes to train, got {len(class_counts)}'\n",
        "\n",
        "# path_group is one-hot encoded, notes are TF-IDF vectorized, and the only remaining\n",
        "# column of X (slack) is passed through unchanged.\n",
        "pre = ColumnTransformer(\n",
        "    transformers=[\n",
        "        ('group', OneHotEncoder(handle_unknown='ignore'), ['path_group']),\n",
        "        ('notes', TfidfVectorizer(), 'notes'),\n",
        "    ],\n",
        "    remainder='passthrough'\n",
        ")\n",
        "\n",
        "clf = Pipeline([\n",
        "    ('pre', pre),\n",
        "    ('lr', LogisticRegression(max_iter=200))\n",
        "])\n",
        "\n",
        "# A stratified split raises ValueError when a class has a single sample or when either\n",
        "# side of the split holds fewer rows than there are classes, which is easy to hit on a\n",
        "# small report. Only stratify when it is feasible; otherwise use a plain shuffled split.\n",
        "n_test = ceil(TEST_SIZE * len(y))\n",
        "n_train = len(y) - n_test\n",
        "can_stratify = class_counts.min() >= 2 and min(n_train, n_test) >= len(class_counts)\n",
        "if not can_stratify:\n",
        "    print('warning: too few samples per class for a stratified split; using a plain random split')\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y if can_stratify else None\n",
        ")\n",
        "clf.fit(X_train, y_train)\n",
        "pred = clf.predict(X_test)\n",
        "# zero_division=0 reports 0.0 for classes that were never predicted without emitting warnings.\n",
        "print(classification_report(y_test, pred, zero_division=0))"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.10"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}