In [3]:
import re
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt


In [None]:
# Newsgroup -> demo label. Each 20-newsgroups category is used as a stand-in
# (proxy) for one of the categories the rest of the notebook works with.
MAP_TO_LABEL = {
    "sci.crypt": "Cyber",
    "talk.politics.misc": "Sanctions/Compliance",
    "talk.politics.guns": "Physical Threat",
    "soc.religion.christian": "Harassment",
    "rec.autos": "Other",
    "misc.forsale": "Fraud",
}

# The newsgroups to download are exactly the keys of the mapping, in the same order.
CATEGORIES_RAW = list(MAP_TO_LABEL)

# Headers, footers and quoted replies are stripped by the loader.
raw = fetch_20newsgroups(subset="all", categories=CATEGORIES_RAW, remove=("headers", "footers", "quotes"))

# Translate once per newsgroup name, then look the result up per document.
label_by_target = [MAP_TO_LABEL[name] for name in raw.target_names]
labels = [label_by_target[t] for t in raw.target]

# Basic cleanup: keep only documents whose text is longer than 50 characters.
df = (
    pd.DataFrame({"text": raw.data, "label": labels})
    .loc[lambda frame: frame["text"].str.len() > 50]
    .reset_index(drop=True)
)
df["label"].value_counts()


ValueError: 'comp.security' is not in list

In [None]:
def clean_text(s: str) -> str:
    """Normalise a raw document into lowercase, single-spaced text.

    Steps, in order: lowercase the input; blank out URL-like runs (anything
    starting with "http" or "www."); blank out every character that is not
    a-z, 0-9, whitespace, a hyphen or an apostrophe; finally collapse each
    whitespace run to a single space and trim both ends.

    Args:
        s: The text to normalise.

    Returns:
        The normalised text (possibly an empty string).
    """
    blanked = s.lower()
    # URLs must be removed before the character filter, which would otherwise
    # break them into stray word fragments.
    for pattern in (r"http\S+|www\.\S+", r"[^a-z0-9\s\-\']"):
        blanked = re.sub(pattern, " ", blanked)
    return re.sub(r"\s+", " ", blanked).strip()

# Keep the normalised text next to the raw text; astype(str) makes sure
# clean_text always receives a string.
df["clean"] = df["text"].astype(str).apply(clean_text)
df.loc[:, ["label", "clean"]].head()


In [None]:
# Hold out 20% of the documents; the split is stratified on the label and
# uses a fixed seed so it is repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": df["label"]}
X_train, X_test, y_train, y_test = train_test_split(df["clean"], df["label"], **split_options)

# Split sizes plus the label distribution of the training part.
(len(X_train), len(X_test), y_train.value_counts())


In [None]:
# Text features: TF-IDF over unigrams and bigrams, with English stop words
# removed and document-frequency cut-offs (min_df=3, max_df=0.9).
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, stop_words="english")
# Classifier: logistic regression with a raised iteration cap.
model_step = LogisticRegression(max_iter=2000, n_jobs=None)

clf = Pipeline(steps=[("tfidf", tfidf_step), ("model", model_step)])

# Fit on the training split only, then predict the held-out split.
pred = clf.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, pred))


In [None]:
# Fix the class order explicitly so matrix rows/columns line up with the tick labels.
labels_sorted = sorted(df["label"].unique())
cm = confusion_matrix(y_test, pred, labels=labels_sorted)
tick_positions = list(range(len(labels_sorted)))

fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(cm, interpolation="nearest")

# Rows are the true class, columns the predicted class.
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels_sorted, rotation=45, ha="right")
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels_sorted)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")

fig.colorbar(heatmap, ax=ax)
fig.tight_layout()
plt.show()


In [None]:
# Pull the fitted steps back out of the pipeline; `vectorizer` and
# `feature_names` are reused by later cells.
vectorizer = clf.named_steps["tfidf"]
model = clf.named_steps["model"]

feature_names = np.array(vectorizer.get_feature_names_out())
classes = model.classes_

# For each class, list the top_k terms with the largest coefficients,
# strongest first.
top_k = 12
rows = [
    (cls, ", ".join(feature_names[np.argsort(class_coefs)[::-1][:top_k]]))
    for cls, class_coefs in zip(classes, model.coef_)
]

pd.DataFrame(rows, columns=["class", "top_terms"])


In [None]:
# Severity weight per category, in (0, 1]; it scales the model confidence
# into the final risk score.
SEVERITY = {
    "Physical Threat": 1.00,
    "Cyber": 0.85,
    "Sanctions/Compliance": 0.75,
    "Fraud": 0.70,
    "Harassment": 0.55,
    "Other": 0.30
}

# Run the pipeline once. The predicted label is taken from the same
# probability matrix as the confidence, so the two always refer to the same
# class and the test set is not vectorised and scored a second time.
proba = clf.predict_proba(X_test)
max_proba = proba.max(axis=1)
# Columns of `proba` follow the order of the classifier's classes_.
pred_labels = clf.named_steps["model"].classes_[proba.argmax(axis=1)]

# risk score = max prob * severity(label) * 100
severity = np.array([SEVERITY[label] for label in pred_labels])
risk_score = (max_proba * severity * 100).round(1)

out = pd.DataFrame({
    # X_test keeps the index of df, so this recovers the matching raw text.
    "text": df.loc[X_test.index, "text"].values,
    "clean": X_test.values,
    "predicted_category": pred_labels,
    "confidence": max_proba.round(3),
    "risk_score": risk_score
})

out.sort_values("risk_score", ascending=False).head(10)


In [None]:
def top_keywords_for_doc(text, vectorizer, top_n=8, names=None):
    """Return the highest-weighted vocabulary terms of a single document.

    Args:
        text: Document text, in the same form the vectorizer was fitted on.
        vectorizer: Fitted vectorizer; only its ``transform`` method is called.
        top_n: Maximum number of terms to return. Values <= 0 yield ``[]``.
        names: Optional array-like mapping a feature column index to its term.
            When omitted, the notebook-level ``feature_names`` array is used,
            so existing calls without this argument behave as before.

    Returns:
        list: Terms ordered from highest to lowest weight in the transformed
        document; empty when the document has no non-zero feature.
    """
    if top_n <= 0:
        # A slice such as [-0:] would select *every* term, not none.
        return []
    vec = vectorizer.transform([text])
    if vec.nnz == 0:
        return []
    lookup = feature_names if names is None else np.asarray(names)
    # vec.data holds the non-zero weights and vec.indices their feature columns.
    idx = np.argsort(vec.data)[-top_n:][::-1]
    feature_idx = vec.indices[idx]
    return lookup[feature_idx].tolist()

def analyst_summary(row):
    """Build a one-sentence, analyst-facing summary for a scored document.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``"clean"`` and ``"predicted_category"``.

    Returns:
        str: "<category> signal detected. Key terms: a, b, c." using up to six
        keywords. When the document yields no keywords the sentence ends with
        "Key terms: none found." instead of an empty list.
    """
    kws = top_keywords_for_doc(row["clean"], vectorizer, top_n=6)
    # Joining an empty list would render as "Key terms: ." -- say so explicitly.
    terms = ", ".join(kws) if kws else "none found"
    return f"{row['predicted_category']} signal detected. Key terms: {terms}."

# Attach the per-document keywords (top 8) and the one-line summary to the
# scored frame.
out["top_keywords"] = out["clean"].map(lambda doc: top_keywords_for_doc(doc, vectorizer, top_n=8))
out["analyst_summary"] = out.apply(analyst_summary, axis="columns")

preview_columns = ["predicted_category", "risk_score", "confidence", "top_keywords", "analyst_summary"]
out[preview_columns].head(8)


In [None]:
# spaCy supplies the entity annotations (doc.ents) read by extract_entities below.
import spacy
# Load the "en_core_web_sm" pipeline once at cell level so every later call
# reuses the same `nlp` object.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text, max_ents=8):
    """List the distinct named entities found near the start of a text.

    Only the first 2000 characters are passed to the module-level ``nlp``
    pipeline (a cap for speed). Entities labelled PERSON, ORG, GPE, LOC or
    DATE are kept and rendered as "<entity text> (<LABEL>)".

    Args:
        text: The text to analyse.
        max_ents: Maximum number of entries to return.

    Returns:
        list: Unique "<entity text> (<LABEL>)" strings in order of first
        appearance, truncated to ``max_ents`` entries.
    """
    wanted_labels = {"PERSON", "ORG", "GPE", "LOC", "DATE"}
    parsed = nlp(text[:2000])  # cap length for speed
    tagged = (
        f"{ent.text} ({ent.label_})"
        for ent in parsed.ents
        if ent.label_ in wanted_labels
    )
    # dict keys keep insertion order, which de-duplicates without reordering.
    return list(dict.fromkeys(tagged))[:max_ents]

# Entities are extracted from the original "text" column rather than the
# cleaned one (presumably because casing and punctuation help entity
# recognition -- confirm).
out["key_entities"] = out["text"].map(extract_entities)
out.loc[:, ["predicted_category", "risk_score", "key_entities", "analyst_summary"]].head(10)


In [None]:
# Analyst-facing table: a compact text preview plus the scoring columns,
# ordered from highest to lowest risk. Works on a copy so `out` is untouched.
analyst_view = out.copy()

# Collapse whitespace runs so the preview stays on one line, then cut it to
# 260 characters.
flattened = analyst_view["text"].str.replace(r"\s+", " ", regex=True)
preview = flattened.str.slice(0, 260)

# Append a real ellipsis (U+2026, written as an escape so it cannot be
# mis-encoded again) and only where the text was actually shortened.
analyst_view["text"] = preview.where(flattened.str.len() <= 260, preview + "\u2026")

analyst_view = analyst_view[[
    "text",
    "predicted_category",
    "risk_score",
    "confidence",
    "top_keywords",
    "key_entities",
    "analyst_summary"
]].sort_values("risk_score", ascending=False)

analyst_view.head(15)
