# Toxic Comment Detection (Jupyter Notebook Edition)

This notebook reproduces the full toxic comment detection workflow—data loading, preprocessing, feature extraction, model training, hyper-parameter tuning, evaluation, and inference—in a single, self-contained environment. Run the notebook top-to-bottom inside Jupyter or JupyterLab to generate all artefacts and visualisations.


## 1. Environment Setup

The following cell imports every library used throughout the workflow, defines project-wide constants, and ensures that output folders exist for metrics and figures.


In [None]:
import os
from pathlib import Path
from IPython.display import display, Markdown
import ipywidgets as widgets
import re
from typing import Iterable, List, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    ConfusionMatrixDisplay,
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Seed shared by every stochastic component in the notebook.
RANDOM_STATE = 42

# Input files.
DATA_PATH = Path("train.csv")
EXTERNAL_TEST_PATH = Path("test.csv")

# Output folders for intermediate artefacts and evaluation results.
ARTIFACTS_DIR = Path("artifacts")
RESULTS_DIR = Path("results")
EXTERNAL_RESULTS_DIR = RESULTS_DIR / "external_test"

# Parent folders come first so the nested results folder can be created.
for _output_dir in (ARTIFACTS_DIR, RESULTS_DIR, EXTERNAL_RESULTS_DIR):
    _output_dir.mkdir(exist_ok=True)

# NLTK resources used by the stop-word filter and the WordNet lemmatiser.
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

np.random.seed(RANDOM_STATE)


## 2. Load the Dataset

The Kaggle-style dataset ships with the repository as `train.csv`. It contains two columns: `comment_text` and the binary target `toxic`. We normalise the column names for convenience and perform a quick integrity check.


In [None]:
# Read the training data and normalise the headers (trimmed, lower-case).
df_raw = pd.read_csv(DATA_PATH)
df_raw.columns = [header.strip().lower() for header in df_raw.columns]

# Fail fast when either required column is absent after normalisation.
if any(required not in df_raw.columns for required in ("comment_text", "toxic")):
    raise ValueError("Expected 'comment_text' and 'toxic' columns in train.csv")

# Keep only the two modelling columns and store the target as plain integers.
df_raw = df_raw[["comment_text", "toxic"]].assign(
    toxic=lambda frame: frame["toxic"].astype(int)
)
print(df_raw.shape)
df_raw.head()


## 3. Text Preprocessing Utilities

We replicate the five-stage cleaning pipeline from the original project: lower-casing, punctuation removal, tokenisation, stop-word removal, and lemmatisation. Each helper function is pure so that the transformations can be chained or reused individually.


In [None]:
# English stop words as a set, so the membership test in remove_stopwords is O(1).
STOP_WORDS = set(stopwords.words('english'))
# Single shared lemmatiser instance reused by lemmatise() for every token.
LEMMATIZER = WordNetLemmatizer()


def lowercase(text: str) -> str:
    """Return *text* coerced to ``str`` and converted to lower case."""
    as_text = str(text)
    return as_text.lower()


def remove_punctuation(text: str) -> str:
    """Delete every character that is not an ASCII letter or whitespace.

    Punctuation, digits and non-ASCII characters are dropped; they are removed
    outright rather than replaced by a space, so ``"don't"`` becomes ``"dont"``.

    Both letter cases are kept so the helper is also correct when used on its
    own. Inside ``clean_comment`` the text is already lower-cased, so the
    pipeline output is unaffected.
    """
    return re.sub(r"[^A-Za-z\s]", "", str(text))


def tokenize(text: str) -> List[str]:
    """Split *text* (coerced to ``str``) on runs of whitespace."""
    words: List[str] = str(text).split()
    return words


def remove_stopwords(tokens: Iterable[str]) -> List[str]:
    """Return the tokens that are non-empty and not in ``STOP_WORDS``, in order."""
    kept: List[str] = []
    for token in tokens:
        # Empty tokens are skipped before the stop-word lookup.
        if not token or token in STOP_WORDS:
            continue
        kept.append(token)
    return kept


def lemmatise(tokens: Iterable[str]) -> List[str]:
    """Lemmatise each token with the shared ``LEMMATIZER``, keeping the order."""
    lemmas: List[str] = []
    for token in tokens:
        lemmas.append(LEMMATIZER.lemmatize(token))
    return lemmas


def join_tokens(tokens: Iterable[str]) -> str:
    """Concatenate the tokens into one string separated by single spaces."""
    separator = " "
    return separator.join(tokens)


def clean_comment(text: str) -> Tuple[str, List[str]]:
    """Run the full cleaning pipeline on a single comment.

    Stages, in order: lower-casing, punctuation removal, whitespace
    tokenisation, stop-word removal and lemmatisation.

    Returns:
        A ``(cleaned, tokens)`` pair where ``tokens`` is the list of
        lemmatised tokens and ``cleaned`` is those tokens joined by spaces.
    """
    raw_tokens = tokenize(remove_punctuation(lowercase(text)))
    lemmas = lemmatise(remove_stopwords(raw_tokens))
    return join_tokens(lemmas), lemmas


def preprocess_comments(
    comments: Iterable[str],
    labels: Optional[Iterable[int]] = None,
) -> pd.DataFrame:
    """Clean every comment and collect the results in a DataFrame.

    Args:
        comments: Raw comment texts. Non-string values are stringified;
            missing values (``None``, ``NaN``, ``pd.NA``) are treated as empty
            comments.
        labels: Optional target values aligned with ``comments``; they are
            converted to an integer array.

    Returns:
        A DataFrame with a ``comment`` column (cleaned text), a ``tokens``
        column (list of lemmatised tokens) and, when ``labels`` is given, a
        ``toxic`` column.
    """
    processed_records = []
    lemmatised_tokens = []
    for comment in comments:
        if isinstance(comment, str):
            text = comment
        elif comment is None or (pd.api.types.is_scalar(comment) and pd.isna(comment)):
            # Stringifying a missing value would inject the literal words
            # "none" / "nan" into the corpus as if the author had typed them.
            text = ""
        else:
            text = str(comment)
        cleaned, tokens = clean_comment(text)
        processed_records.append(cleaned)
        lemmatised_tokens.append(tokens)
    data = {
        "comment": processed_records,
        "tokens": lemmatised_tokens,
    }
    if labels is not None:
        data["toxic"] = np.asarray(labels, dtype=int)
    return pd.DataFrame(data)


## 4. Run the Preprocessing Pipeline

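This step materialises the first intermediate artefact, mirroring the original script: the cleaned comments (together with their token lists and labels) are written to `artifacts/final_preprocessed.csv`. The TF–IDF matrices and the label CSV are persisted later, in Section 5, so that downstream analysis can reuse them if needed.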


In [None]:
# Clean every training comment; the labels travel along so the resulting frame
# has "comment", "tokens" and "toxic" columns.
preprocessed_df = preprocess_comments(
    df_raw["comment_text"],
    labels=df_raw["toxic"].values,
)

# Persist the cleaned data set, then preview the first rows as the cell output.
preprocessed_df.to_csv(ARTIFACTS_DIR / "final_preprocessed.csv", index=False)
preprocessed_df.head()


### Optional: External Test Set

If a `test.csv` file is present (with at least `comment_text` and optional `id`/`toxic` columns), the next cell will preprocess it so that baseline and tuned models can be compared on the held-out data.


In [None]:
# Both stay None when no external test file exists; later cells check for None
# to decide whether external evaluation runs.
external_test_df = None
external_test_labels = None

if EXTERNAL_TEST_PATH.exists():
    external_raw = pd.read_csv(EXTERNAL_TEST_PATH)
    # Same header normalisation as for train.csv (trimmed, lower-case).
    external_raw.columns = [c.strip().lower() for c in external_raw.columns]
    if "comment_text" not in external_raw.columns:
        raise ValueError("Expected 'comment_text' column in test.csv")
    if "id" not in external_raw.columns:
        # No identifier supplied: fall back to the positional row number.
        external_raw["id"] = np.arange(len(external_raw))
    # Labels are optional; without them only predictions can be produced.
    label_values = external_raw["toxic"].values if "toxic" in external_raw.columns else None
    external_test_df = preprocess_comments(
        external_raw["comment_text"],
        labels=label_values,
    )
    # Both inserts target position 0, so the final column order starts with
    # "id", then the raw "comment_text", followed by the preprocessed columns.
    external_test_df.insert(0, "comment_text", external_raw["comment_text"].astype(str).values)
    external_test_df.insert(0, "id", external_raw["id"].values)
    if "toxic" in external_test_df.columns:
        external_test_labels = external_test_df["toxic"].values.astype(int)
    external_test_df.to_csv(ARTIFACTS_DIR / "external_test_preprocessed.csv", index=False)
    print(f"External test set loaded with {len(external_test_df)} rows.")
else:
    print(f"No external test set found at {EXTERNAL_TEST_PATH.resolve()}. Skipping external evaluation.")


We also inspect token counts to ensure the cleaning behaves as expected.


In [None]:
# Number of whitespace-separated tokens left in each cleaned comment.
word_counts = preprocessed_df["comment"].map(lambda cleaned: len(cleaned.split()))

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(word_counts, bins=50, color="#4C72B0")
ax.set_title("Word Count Distribution After Cleaning")
ax.set_xlabel("Number of words")
ax.set_ylabel("Frequency")
fig.tight_layout()
# Save before showing so the file contains the rendered figure.
fig.savefig(ARTIFACTS_DIR / "wordcount_hist.png", bbox_inches="tight")
plt.show()


## 5. TF–IDF Feature Extraction

The notebook limits the vocabulary to 5,000 terms (minimum document frequency of 2) to remain faithful to the Python package version.


In [None]:
# Learn the vocabulary and IDF weights from every cleaned training comment and
# vectorise them in one pass.
# NOTE(review): the vectorizer is fitted on the full data set here, before the
# train/test split in Section 6, so held-out rows influence the vocabulary and
# IDF weights. Consider fitting on the training portion only.
vectorizer = TfidfVectorizer(max_features=5000, min_df=2)
X_tfidf = vectorizer.fit_transform(preprocessed_df["comment"].values)
y = preprocessed_df["toxic"].values

# The external test set is only transformed with the already-fitted vectorizer;
# the matrix stays None when no test.csv was loaded.
external_test_matrix = None
if external_test_df is not None:
    external_test_matrix = vectorizer.transform(external_test_df["comment"].values)
    sparse.save_npz(ARTIFACTS_DIR / "external_test_X_tfidf.npz", external_test_matrix)
    print("External test TF-IDF matrix shape:", external_test_matrix.shape)

# Persist the training features (sparse .npz) and the labels (CSV).
sparse.save_npz(ARTIFACTS_DIR / "X_tfidf.npz", X_tfidf)
pd.DataFrame({"toxic": y}).to_csv(ARTIFACTS_DIR / "y.csv", index=False)

print("TF-IDF matrix shape:", X_tfidf.shape)


## 6. Train/Test Split

We stratify the split to preserve the 50/50 class balance.


In [None]:
# Hold out 20% of the rows; stratifying on y keeps the class proportions the
# same in both partitions, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Cell output: shapes of the training and held-out matrices.
X_train.shape, X_test.shape


## 7. Evaluation Helpers

Utility functions for metric computation, ROC handling, and confusion-matrix plotting are defined below; the reusable training loop that builds on them follows in Section 8.


In [None]:
def get_scores_for_roc(model, X):
    """Return continuous scores for ROC analysis, or ``None`` if unavailable.

    ``predict_proba`` is preferred (second column of its output); otherwise
    ``decision_function`` is used. Models exposing neither yield ``None``.
    """
    if hasattr(model, "predict_proba"):
        probabilities = model.predict_proba(X)
        return probabilities[:, 1]
    return model.decision_function(X) if hasattr(model, "decision_function") else None


def compute_metrics(y_true, y_pred, y_score=None):
    """Compute accuracy, precision, recall, F1 and ROC AUC for one prediction set.

    Precision, recall and F1 use ``zero_division=0``. ``roc_auc`` is NaN when
    no scores are supplied or when ``roc_auc_score`` raises ``ValueError``.
    """
    zero_safe = {"zero_division": 0}
    metrics = dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision_score(y_true, y_pred, **zero_safe),
        recall=recall_score(y_true, y_pred, **zero_safe),
        f1=f1_score(y_true, y_pred, **zero_safe),
        roc_auc=np.nan,
    )
    if y_score is None:
        return metrics
    try:
        metrics["roc_auc"] = roc_auc_score(y_true, y_score)
    except ValueError:
        # AUC could not be computed for these inputs; the NaN placeholder stays.
        pass
    return metrics


def summarise_metrics(model_name, phase, metrics):
    """Build one summary row: the model name and phase followed by the metrics."""
    return {"model": model_name, "phase": phase, **metrics}


def display_confusion_matrix(y_true, y_pred, title, ax=None):
    """Draw the confusion matrix for ``y_pred`` against ``y_true``.

    When ``ax`` is omitted a new 5x4 figure is created, laid out and shown;
    when an axis is supplied the caller remains responsible for showing it.
    Returns the axis that was drawn on.
    """
    matrix = confusion_matrix(y_true, y_pred)
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots(figsize=(5, 4))
    ConfusionMatrixDisplay(matrix).plot(
        ax=ax, values_format='d', cmap='Blues', colorbar=False
    )
    ax.set_title(title)
    ax.grid(False)
    if standalone:
        fig.tight_layout()
        plt.show()
    return ax


def display_roc_curve(y_true, y_score, title, ax=None):
    """Plot the ROC curve for ``y_score``, or a placeholder when scores are missing.

    When ``ax`` is omitted a new 5x4 figure is created, laid out and shown;
    when an axis is supplied the caller remains responsible for showing it.
    Returns the axis that was drawn on.
    """
    standalone = ax is None
    if standalone:
        fig, ax = plt.subplots(figsize=(5, 4))

    if y_score is not None:
        false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
        ax.plot(false_pos_rate, true_pos_rate, label='Model', color='#4C72B0')
        # Diagonal reference line for a random classifier.
        ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(title)
        ax.legend()
    else:
        # No continuous scores available: show a message instead of a curve.
        ax.axis('off')
        ax.text(0.5, 0.5, 'ROC curve not available', ha='center', va='center', fontsize=11)
        ax.set_title(title)

    if standalone:
        fig.tight_layout()
        plt.show()
    return ax

## 8. Model-Specific Training Blocks

Each classifier now runs inside its own code cell so you can inspect the baseline training, cross-validation, hyper-parameter tuning, validation metrics, comparison plots, and optional external-test evaluation independently.


In [None]:
# Row order used whenever per-metric tables are displayed.
metric_order = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# Defaults for cross-validation folds and the scoring metric in run_model_workflow.
CV_FOLDS = 3
SCORING_METRIC = 'f1'

# Shared notebook state filled in by the model cells; re-running this cell
# resets all of it.
all_model_results = {}  # model name -> full result dictionary
metrics_summary_rows = []  # one row per (model, phase) for the summary table
tuned_models = {}  # model name -> tuned estimator, used for inference
# Set once the consolidated ranking in Section 9 has run.
best_model_name: Optional[str] = None
best_model_metrics: Optional[dict] = None


In [None]:
def _evaluate_on_external_test(short_name, baseline_model, tuned_model):
    # Predict on the external test matrix with both model variants, compute
    # metrics when labels exist, and write the predictions to a per-model CSV.
    baseline_external_pred = baseline_model.predict(external_test_matrix)
    tuned_external_pred = tuned_model.predict(external_test_matrix)
    baseline_external_score = get_scores_for_roc(baseline_model, external_test_matrix)
    tuned_external_score = get_scores_for_roc(tuned_model, external_test_matrix)
    external_bundle = {
        'baseline': {
            'pred': baseline_external_pred,
            'score': baseline_external_score,
        },
        'tuned': {
            'pred': tuned_external_pred,
            'score': tuned_external_score,
        },
    }
    if external_test_labels is not None:
        external_bundle['baseline']['metrics'] = compute_metrics(
            external_test_labels, baseline_external_pred, baseline_external_score
        )
        external_bundle['tuned']['metrics'] = compute_metrics(
            external_test_labels, tuned_external_pred, tuned_external_score
        )
    output_columns = {
        'id': external_test_df['id'],
        'comment_text': external_test_df['comment_text'],
        'clean_comment': external_test_df['comment'],
        'baseline_prediction': baseline_external_pred,
        'tuned_prediction': tuned_external_pred,
    }
    # Score columns are only written for models that expose continuous scores.
    if baseline_external_score is not None:
        output_columns['baseline_score'] = baseline_external_score
    if tuned_external_score is not None:
        output_columns['tuned_score'] = tuned_external_score
    predictions_df = pd.DataFrame(output_columns)
    predictions_path = EXTERNAL_RESULTS_DIR / f"{short_name}_predictions.csv"
    predictions_df.to_csv(predictions_path, index=False)
    external_bundle['predictions_path'] = predictions_path
    return external_bundle


def run_model_workflow(name, short_name, factory, param_grid, scoring=SCORING_METRIC, cv=CV_FOLDS):
    """Train baseline and tuned variants of a model, returning a rich results dictionary.

    Args:
        name: Display name; also the key under which results are registered in
            ``all_model_results`` and ``tuned_models``.
        short_name: Used to build the external predictions file name.
        factory: Zero-argument callable returning a fresh, unfitted estimator.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        scoring: Metric for cross-validation and grid search. The default is
            the value ``SCORING_METRIC`` had when this function was defined.
        cv: Cross-validation setting for both steps. The default is the value
            ``CV_FOLDS`` had when this function was defined.

    Returns:
        A dictionary with ``name``, ``short_name``, ``baseline`` and ``tuned``
        bundles (model, metrics, predictions, scores), ``cv_scores``,
        ``grid_search`` and, when an external test matrix exists, ``external``.

    Side effects:
        Replaces this model's rows in ``metrics_summary_rows``, registers the
        tuned estimator in ``tuned_models``, stores the result in
        ``all_model_results`` and, for an external test set, writes a
        predictions CSV.
    """
    baseline_model = factory()
    baseline_model.fit(X_train, y_train)
    baseline_pred = baseline_model.predict(X_test)
    baseline_score = get_scores_for_roc(baseline_model, X_test)
    baseline_metrics = compute_metrics(y_test, baseline_pred, baseline_score)
    # Cross-validate a fresh baseline estimator on the training split only.
    cv_scores = cross_val_score(factory(), X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    grid_search = GridSearchCV(
        factory(),
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=True,
    )
    grid_search.fit(X_train, y_train)
    tuned_model = grid_search.best_estimator_
    tuned_pred = tuned_model.predict(X_test)
    tuned_score = get_scores_for_roc(tuned_model, X_test)
    tuned_metrics = compute_metrics(y_test, tuned_pred, tuned_score)
    result = {
        'name': name,
        'short_name': short_name,
        'baseline': {
            'model': baseline_model,
            'metrics': baseline_metrics,
            'y_pred': baseline_pred,
            'y_score': baseline_score,
        },
        'tuned': {
            'model': tuned_model,
            'metrics': tuned_metrics,
            'y_pred': tuned_pred,
            'y_score': tuned_score,
            'best_params': grid_search.best_params_,
        },
        'cv_scores': cv_scores,
        'grid_search': grid_search,
    }
    # Re-running a model cell must replace, not duplicate, that model's rows:
    # repeated (model, phase) pairs make the later DataFrame.pivot call fail.
    # Slice assignment mutates the shared list in place.
    metrics_summary_rows[:] = [
        row for row in metrics_summary_rows if row.get('model') != name
    ]
    metrics_summary_rows.append(summarise_metrics(name, 'baseline', baseline_metrics))
    metrics_summary_rows.append(summarise_metrics(name, 'tuned', tuned_metrics))
    tuned_models[name] = tuned_model
    if external_test_matrix is not None:
        result['external'] = _evaluate_on_external_test(short_name, baseline_model, tuned_model)
    all_model_results[name] = result
    return result


def present_model_results(result):
    """Display tables and plots for one result from ``run_model_workflow``.

    Shows the cross-validation scores, the best hyper-parameters with the
    top-ranked grid-search candidates, baseline-vs-tuned metrics on the
    held-out split (table and bar chart), confusion matrices, ROC curves and,
    when present, external test metrics.

    Returns:
        The ``result`` dictionary, unchanged.
    """
    model_name = result['name']
    cv_scores = result['cv_scores']
    grid_search = result['grid_search']
    # Report the folds and metric actually used for this run rather than the
    # notebook-level defaults, which the caller may have overridden.
    scoring_used = grid_search.scoring
    scoring_label = scoring_used.upper() if isinstance(scoring_used, str) else str(scoring_used)
    display(Markdown(f"**{len(cv_scores)}-fold cross-validation ({scoring_label})**"))
    print(f"Scores: {np.round(cv_scores, 4)}")
    print(f"Mean: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    display(Markdown("**Best hyper-parameters from tuning**"))
    print(result['tuned']['best_params'])
    cv_results_df = pd.DataFrame(grid_search.cv_results_)
    display(
        cv_results_df[
            ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
        ].sort_values('rank_test_score').head()
    )
    comparison_df = pd.DataFrame({
        'Baseline': result['baseline']['metrics'],
        'Tuned': result['tuned']['metrics'],
    }).loc[metric_order]
    display(Markdown("**Validation set metrics**"))
    display(comparison_df)
    fig, ax = plt.subplots(figsize=(8, 5))
    # Metrics go on the x-axis with Baseline/Tuned as the two coloured series.
    # Plotting the transpose would spread five metric series over two colours.
    comparison_df.plot(kind='bar', ax=ax, color=['#4C72B0', '#55A868'])
    ax.set_title(f"{model_name} – Baseline vs Tuned Metrics")
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)
    ax.legend(loc='lower right')
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    display_confusion_matrix(y_test, result['baseline']['y_pred'], f"{model_name} – Baseline", ax=axes[0])
    display_confusion_matrix(y_test, result['tuned']['y_pred'], f"{model_name} – Tuned", ax=axes[1])
    fig.suptitle(f"{model_name} – Confusion Matrices")
    # Leave room at the top of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    display_roc_curve(y_test, result['baseline']['y_score'], f"{model_name} – Baseline ROC", ax=axes[0])
    display_roc_curve(y_test, result['tuned']['y_score'], f"{model_name} – Tuned ROC", ax=axes[1])
    fig.suptitle(f"{model_name} – ROC Curves")
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
    external = result.get('external')
    if external is None:
        print('External test evaluation skipped (no test.csv detected).')
    else:
        baseline_metrics = external['baseline'].get('metrics')
        tuned_metrics = external['tuned'].get('metrics')
        if baseline_metrics and tuned_metrics:
            external_df = pd.DataFrame({
                'Baseline': baseline_metrics,
                'Tuned': tuned_metrics,
            }).loc[metric_order]
            display(Markdown("**External test metrics**"))
            display(external_df)
        else:
            print('External test set does not provide labels; predictions saved to:')
            print(str(external['predictions_path']))
    return result


### Logistic Regression


In [None]:
# Grid: 3 values of C x 2 solvers x 1 penalty = 6 candidates.
logreg_results = run_model_workflow(
    name='Logistic Regression',
    short_name='logreg',
    factory=lambda: LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    param_grid={
        'C': [0.1, 1.0, 5.0],
        'solver': ['liblinear', 'lbfgs'],
        'penalty': ['l2'],
    },
)
# The trailing semicolon stops Jupyter from echoing the results dictionary
# that present_model_results returns as the cell output.
present_model_results(logreg_results);


### Multinomial Naive Bayes


In [None]:
# Grid: 3 smoothing values x 2 fit_prior settings = 6 candidates.
nb_results = run_model_workflow(
    name='Multinomial Naive Bayes',
    short_name='nb',
    factory=lambda: MultinomialNB(),
    param_grid={
        'alpha': [0.1, 1.0, 10.0],
        'fit_prior': [True, False],
    },
)
# The trailing semicolon stops Jupyter from echoing the results dictionary
# that present_model_results returns as the cell output.
present_model_results(nb_results);


### Linear Support Vector Machine


In [None]:
# Grid: 3 values of C x 1 loss x 2 class-weight settings = 6 candidates.
svm_results = run_model_workflow(
    name='Linear SVM (LinearSVC)',
    short_name='svm',
    factory=lambda: LinearSVC(random_state=RANDOM_STATE, dual=False),
    param_grid={
        'C': [0.5, 1.0, 2.0],
        'loss': ['squared_hinge'],
        'class_weight': [None, 'balanced'],
    },
)
# The trailing semicolon stops Jupyter from echoing the results dictionary
# that present_model_results returns as the cell output.
present_model_results(svm_results);


### Decision Tree


In [None]:
# Grid: 2 criteria x 3 depths x 2 split sizes x 2 leaf sizes = 24 candidates.
dt_results = run_model_workflow(
    name='Decision Tree',
    short_name='dt',
    factory=lambda: DecisionTreeClassifier(random_state=RANDOM_STATE),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 20, 40],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5],
    },
)
# The trailing semicolon stops Jupyter from echoing the results dictionary
# that present_model_results returns as the cell output.
present_model_results(dt_results);


### Random Forest


In [None]:
# Grid: 2 forest sizes x 3 depths x 2 split sizes x 2 leaf sizes = 24 candidates.
rf_results = run_model_workflow(
    name='Random Forest',
    short_name='rf',
    factory=lambda: RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=200),
    param_grid={
        'n_estimators': [200, 400],
        'max_depth': [None, 20, 40],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5],
    },
)
# The trailing semicolon stops Jupyter from echoing the results dictionary
# that present_model_results returns as the cell output.
present_model_results(rf_results);


### k-Nearest Neighbours


In [None]:
# Grid: 3 neighbourhood sizes x 2 weightings x 2 distance metrics = 12 candidates.
knn_results = run_model_workflow(
    name='k-Nearest Neighbours',
    short_name='knn',
    factory=lambda: KNeighborsClassifier(),
    param_grid={
        'n_neighbors': [3, 5, 11],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'manhattan'],
    },
)
# The trailing semicolon stops Jupyter from echoing the results dictionary
# that present_model_results returns as the cell output.
present_model_results(knn_results);


## 9. Consolidated Metrics and Model Ranking

The table below aggregates validation metrics for every baseline and tuned model. It also identifies the best-performing tuned estimator by F1 score.


In [None]:
metrics_df = pd.DataFrame(metrics_summary_rows)
if metrics_df.empty:
    raise RuntimeError('No metrics captured—ensure each model cell executed successfully.')
# A model cell executed more than once contributes repeated (model, phase)
# rows. Keep only the most recent run so the ranking reflects the current
# models and the per-metric pivot in the next cell sees unique entries.
metrics_df = metrics_df.drop_duplicates(subset=['model', 'phase'], keep='last')
metrics_df = metrics_df.sort_values(['model', 'phase']).reset_index(drop=True)
metrics_df.to_csv(RESULTS_DIR / 'model_metrics.csv', index=False)
display(metrics_df)

# Rank the tuned variants by F1 on the held-out split; the top row wins.
tuned_only = metrics_df[metrics_df['phase'] == 'tuned'].copy()
if tuned_only.empty:
    raise RuntimeError('Tuned metrics missing.')
best_row = tuned_only.sort_values('f1', ascending=False).iloc[0]
best_model_name = best_row['model']
best_model_metrics = best_row.to_dict()
display(Markdown(
    f"**Best tuned model:** {best_model_name} (F1 = {best_row['f1']:.4f}, Accuracy = {best_row['accuracy']:.4f}, "
    f"Precision = {best_row['precision']:.4f}, Recall = {best_row['recall']:.4f}, AUC = {best_row['roc_auc']:.4f})"
))


In [None]:
# Wide table: one row per model, one (metric, phase) column pair per metric,
# with the phases ordered baseline -> tuned.
comparison_table = (
    metrics_df
    .pivot_table(index='model', columns='phase', values=metric_order)
    .reindex(columns=['baseline', 'tuned'], level=1)
    .sort_index()
)
display(comparison_table)

# One stacked subplot per metric, each comparing baseline and tuned bars.
fig, axes = plt.subplots(len(metric_order), 1, figsize=(10, 3 * len(metric_order)))
for ax, metric in zip(axes, metric_order):
    subset = metrics_df.pivot(index='model', columns='phase', values=metric)
    subset = subset.reindex(comparison_table.index)
    subset.plot(kind='bar', ax=ax, color=['#4C72B0', '#55A868'])
    ax.set_title(f'{metric.title()} by Model (Baseline vs Tuned)')
    ax.set_ylabel(metric.title())
    ax.legend(loc='lower right')
    ax.set_xlabel('')
fig.tight_layout()
plt.show()


## 10. External Test Evaluation Summary

When `test.csv` is available, predictions and metrics (if labels are present) are collected below for a side-by-side comparison of baseline versus tuned estimators.


In [None]:
if external_test_matrix is None:
    print(f'External test evaluation skipped (no test.csv at {EXTERNAL_TEST_PATH.resolve()}).')
else:
    summary_rows = []
    for name, result in all_model_results.items():
        external = result.get('external')
        if external:
            baseline_pred = external['baseline']['pred']
            tuned_pred = external['tuned']['pred']
            # Share of rows predicted positive by each variant.
            baseline_rate = np.mean(baseline_pred)
            tuned_rate = np.mean(tuned_pred)
            row = {
                'model': name,
                'baseline_positive_rate': float(baseline_rate),
                'tuned_positive_rate': float(tuned_rate),
                'prediction_shift': float(tuned_rate - baseline_rate),
                'prediction_agreement': float(np.mean(baseline_pred == tuned_pred)),
                'predictions_path': str(external.get('predictions_path', '')),
            }
            baseline_metrics = external['baseline'].get('metrics')
            tuned_metrics = external['tuned'].get('metrics')
            # Metric columns exist only when the external set carried labels.
            if baseline_metrics and tuned_metrics:
                for metric in metric_order:
                    for phase_label, phase_metrics in (
                        ('baseline', baseline_metrics),
                        ('tuned', tuned_metrics),
                    ):
                        row[f'{metric}_{phase_label}'] = phase_metrics[metric]
            summary_rows.append(row)
    if not summary_rows:
        print('External test set detected but no predictions recorded.')
    else:
        external_summary_df = pd.DataFrame(summary_rows)
        external_summary_df.to_csv(EXTERNAL_RESULTS_DIR / 'summary.csv', index=False)
        display(external_summary_df)
        print(f'External predictions written to {EXTERNAL_RESULTS_DIR.resolve()}')


## 11. Interactive Inference Helper

Select the best tuned estimator (highest F1 score) or experiment with any trained model to classify arbitrary comments.


In [None]:
# Utility helpers for single-comment inference across tuned models
def predict_single_comment(model_name: str, comment: str) -> Tuple[int, Optional[float]]:
    """Classify one raw comment with the named tuned model.

    The comment goes through ``clean_comment`` and the fitted ``vectorizer``
    before prediction.

    Returns:
        ``(prediction, score)`` where ``score`` is the second-column
        ``predict_proba`` value when the model offers it, otherwise the
        ``decision_function`` value, otherwise ``None``.

    Raises:
        RuntimeError: If no tuned models have been registered yet.
        ValueError: If ``model_name`` is unknown or the comment is blank.
    """
    if not tuned_models:
        raise RuntimeError('No trained tuned models detected. Run the training cells first.')
    if model_name not in tuned_models:
        raise ValueError(f'Unknown model: {model_name}')
    if not comment.strip():
        raise ValueError('Comment must not be empty.')

    estimator = tuned_models[model_name]
    cleaned_text = clean_comment(comment)[0]
    feature_row = vectorizer.transform([cleaned_text])
    predicted_label = int(estimator.predict(feature_row)[0])

    score: Optional[float]
    if hasattr(estimator, 'predict_proba'):
        score = float(estimator.predict_proba(feature_row)[0, 1])
    elif hasattr(estimator, 'decision_function'):
        score = float(estimator.decision_function(feature_row)[0])
    else:
        score = None
    return predicted_label, score

# Alphabetical list of every tuned model registered by the training cells.
model_names = sorted(tuned_models.keys())
if not model_names:
    raise RuntimeError('No trained tuned models detected. Run the training cells first.')

# Prefer the best model chosen in Section 9; otherwise fall back to the first name.
default_model = best_model_name if best_model_name in tuned_models else model_names[0]

print('Available tuned models:')
for idx, name in enumerate(model_names, start=1):
    marker = ' (default)' if name == default_model else ''
    print(f'  {idx}. {name}{marker}')

# Any failure while reading or validating the input is reported as a message
# instead of a traceback; the else branch runs only after a successful prediction.
try:
    selection = input(f'Select a model by number or name [{default_model}]: ').strip()
    if not selection:
        # Empty answer: accept the default shown in the prompt.
        model_name = default_model
    elif selection.isdigit():
        # Numeric answer: 1-based position in the list printed above.
        selection_index = int(selection)
        if not 1 <= selection_index <= len(model_names):
            raise ValueError('Selection index out of range.')
        model_name = model_names[selection_index - 1]
    else:
        # Anything else must match a registered model name exactly.
        if selection not in tuned_models:
            raise ValueError('Unknown model selection.')
        model_name = selection

    comment = input('Enter a comment to classify: ').strip()
    prediction, confidence = predict_single_comment(model_name, comment)
except Exception as exc:
    print(f'Error: {exc}')
else:
    label = 'Toxic' if prediction else 'Not toxic'
    print(f'Prediction ({model_name}): {label}')
    # The score is a predict_proba or decision_function value depending on the
    # model; it is absent for models that expose neither.
    if confidence is not None:
        print(f'Confidence score: {confidence:.3f}')


The notebook remains intentionally linear: rerun it from the top whenever you adjust preprocessing, parameter grids, or add a new external test file to keep every artefact in sync.
