Reference: scikit-learn `TfidfVectorizer` documentation — https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer # information retrieval technique, similar to BoW

# Newsgroup categories used for the two-class experiment.
cats = ["alt.atheism", "sci.space"]

# Seed shared by every randomized step so runs are repeatable.
chosen_seed = 200


def size_mb(docs):
    """Return the combined UTF-8 encoded size of *docs* in megabytes (1e6 bytes)."""
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode("utf-8"))
    return total_bytes / 1e6


def load_dataset(verbose=False, remove=("headers", "footers", "quotes")):
    """Load and vectorize the 20 newsgroups dataset.

    Parameters
    ----------
    verbose : bool, default=False
        When true, print document counts, raw text sizes, vectorization
        timings/throughput and the shapes of the resulting matrices.
    remove : tuple of str, default=("headers", "footers", "quotes")
        Forwarded to ``fetch_20newsgroups`` for both subsets. The default
        matches the stripping this function previously hard-coded (the
        argument used to be ignored); pass ``()`` to keep the raw posts.

    Returns
    -------
    X_train, X_test
        TF-IDF feature matrices for the train and test subsets. The
        vectorizer is fitted on the training documents only.
    y_train, y_test
        Target arrays of the train and test subsets.
    feature_names
        Vocabulary terms as returned by ``get_feature_names_out``.
    target_names
        Category names as reported by the training subset.
    """
    # Both subsets are fetched with identical options so they stay comparable.
    fetch_options = {
        "categories": cats,
        "shuffle": True,
        "random_state": chosen_seed,
        "remove": remove,
    }
    data_train = fetch_20newsgroups(subset="train", **fetch_options)
    data_test = fetch_20newsgroups(subset="test", **fetch_options)

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    # split target in a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    # Extracting features from the training data using a sparse vectorizer
    t0 = time()
    vectorizer = TfidfVectorizer(
        sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english"
    )
    X_train = vectorizer.fit_transform(data_train.data)
    duration_train = time() - t0

    # Extracting features from the test data using the same vectorizer
    # (transform only, so the vocabulary comes from the training set).
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration_test = time() - t0

    feature_names = vectorizer.get_feature_names_out()

    if verbose:
        # compute size of loaded data
        data_train_size_mb = size_mb(data_train.data)
        data_test_size_mb = size_mb(data_test.data)

        print(
            f"{len(data_train.data)} documents - "
            f"{data_train_size_mb:.2f}MB (training set)"
        )
        print(f"{len(data_test.data)} documents - {data_test_size_mb:.2f}MB (test set)")
        print(f"{len(target_names)} categories")
        print(
            f"vectorize training done in {duration_train:.3f}s "
            f"at {data_train_size_mb / duration_train:.3f}MB/s"
        )
        print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")
        print(
            f"vectorize testing done in {duration_test:.3f}s "
            f"at {data_test_size_mb / duration_test:.3f}MB/s"
        )
        print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")

    return X_train, X_test, y_train, y_test, feature_names, target_names

In [47]:
# Build the train/test matrices once; verbose=True prints the dataset summary.
(X_train, X_test, y_train, y_test, feature_names, target_names) = load_dataset(verbose=True)

hi 713
1073 documents - 1.33MB (training set)
713 documents - 0.81MB (test set)
2 categories
vectorize training done in 0.142s at 9.337MB/s
n_samples: 1073, n_features: 3420
vectorize testing done in 0.089s at 9.148MB/s
n_samples: 713, n_features: 3420


Trying to replicate testing on an imbalanced test set (90% class 0 / 10% class 1).

In [46]:
from time import time
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Newsgroup categories used for the two-class experiment.
cats = ["alt.atheism", "sci.space"]

# Seed shared by every randomized step so runs are repeatable.
chosen_seed = 200


def size_mb(docs):
    """Return the combined UTF-8 encoded size of *docs* in megabytes (1e6 bytes)."""
    encoded_lengths = (len(text.encode("utf-8")) for text in docs)
    total_bytes = sum(encoded_lengths)
    return total_bytes / 1e6


def load_dataset(verbose=False, remove=("headers", "footers", "quotes")):
    """Load and vectorize 20 newsgroups with an imbalanced (90/10) test set.

    The training subset is used as fetched. The test subset is resampled
    with replacement to the same total size, so that 90% of the documents
    carry label 0 and the remainder label 1 (label 0 stands in for "ham"
    and label 1 for "phishing" in this experiment).

    Parameters
    ----------
    verbose : bool, default=False
        When true, print document counts, raw text sizes, vectorization
        timings/throughput and the shapes of the resulting matrices. Test
        figures describe the resampled test set.
    remove : tuple of str, default=("headers", "footers", "quotes")
        Forwarded to ``fetch_20newsgroups`` for both subsets. The default
        matches the stripping this function previously hard-coded (the
        argument used to be ignored); pass ``()`` to keep the raw posts.

    Returns
    -------
    X_train, X_test
        TF-IDF feature matrices. The vectorizer is fitted on the training
        documents only; ``X_test`` holds the resampled test documents.
    y_train, y_test
        Integer label arrays; ``y_test`` is aligned row-for-row with
        ``X_test``.
    feature_names
        Vocabulary terms as returned by ``get_feature_names_out``.
    target_names
        Category names as reported by the training subset.
    """
    # Both subsets are fetched with identical options so they stay comparable.
    fetch_options = {
        "categories": cats,
        "shuffle": True,
        "random_state": chosen_seed,
        "remove": remove,
    }
    data_train = fetch_20newsgroups(subset="train", **fetch_options)
    data_test = fetch_20newsgroups(subset="test", **fetch_options)

    # Sample document *indices* rather than (text, label) pairs: passing the
    # pairs to NumPy would coerce them into a 2-D unicode array, turning the
    # integer labels into strings and padding every text to the longest one.
    test_labels = np.asarray(data_test.target)
    majority_idx = np.flatnonzero(test_labels == 0)
    minority_idx = np.flatnonzero(test_labels == 1)

    # Keep the original test-set size, split 90% / 10% between the classes.
    n_total = len(data_test.data)
    n_majority = int(n_total * 0.90)
    n_minority = n_total - n_majority

    # A single seeded generator drives both sampling and shuffling.
    rng = np.random.default_rng(seed=chosen_seed)
    picked = np.concatenate(
        [
            rng.choice(majority_idx, size=n_majority, replace=True),
            rng.choice(minority_idx, size=n_minority, replace=True),
        ]
    )
    rng.shuffle(picked)

    # Texts and labels are taken through the same index array, so they stay
    # aligned with each other.
    X_test_raw = [data_test.data[i] for i in picked]
    y_test = test_labels[picked]

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    # Training labels are used as fetched; y_test was built above and must
    # not be replaced by data_test.target.
    y_train = data_train.target

    # Extracting features from the training data using a sparse vectorizer
    t0 = time()
    vectorizer = TfidfVectorizer(
        sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english"
    )
    X_train = vectorizer.fit_transform(data_train.data)
    duration_train = time() - t0

    # Extracting features from the test data using the same vectorizer
    t0 = time()
    X_test = vectorizer.transform(X_test_raw)
    duration_test = time() - t0

    feature_names = vectorizer.get_feature_names_out()

    if verbose:
        # Report sizes for the data that was actually vectorized.
        data_train_size_mb = size_mb(data_train.data)
        data_test_size_mb = size_mb(X_test_raw)

        print(
            f"{len(data_train.data)} documents - "
            f"{data_train_size_mb:.2f}MB (training set)"
        )
        print(f"{len(X_test_raw)} documents - {data_test_size_mb:.2f}MB (test set)")
        print(f"{len(target_names)} categories")
        print(
            f"vectorize training done in {duration_train:.3f}s "
            f"at {data_train_size_mb / duration_train:.3f}MB/s"
        )
        print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")
        print(
            f"vectorize testing done in {duration_test:.3f}s "
            f"at {data_test_size_mb / duration_test:.3f}MB/s"
        )
        print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")

    return X_train, X_test, y_train, y_test, feature_names, target_names

In [41]:
from sklearn import metrics
from sklearn.utils.extmath import density


def benchmark(clf, custom_name=False):
    """Fit *clf*, evaluate it on the test split and print a short report.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Precision and recall are computed with label 1 as the
    positive class (the "phishing" class in this notebook's wording).

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``.
    custom_name : str or False, default=False
        Display name to return; when falsy the estimator's class name is used.

    Returns
    -------
    tuple
        ``(description, accuracy, train_time, test_time)`` with both times
        in seconds.
    """
    print("_" * 80)
    print("Training: ")
    print(clf)

    fit_started = time()
    clf.fit(X_train, y_train)
    fit_seconds = time() - fit_started
    print(f"train time: {fit_seconds:.3}s")

    predict_started = time()
    predictions = clf.predict(X_test)
    predict_seconds = time() - predict_started
    print(f"test time:  {predict_seconds:.3}s")

    truth_and_pred = {"y_true": y_test, "y_pred": predictions}
    acc = metrics.accuracy_score(**truth_and_pred)
    # precision: TP / (TP + FP) -- of everything predicted positive, how much was?
    prec = metrics.precision_score(pos_label=1, **truth_and_pred)
    # recall: TP / (TP + FN) -- of all actual positives, how many were caught?
    rec = metrics.recall_score(pos_label=1, **truth_and_pred)

    for label, score in (("Accuracy", acc), ("Precision", prec), ("Recall", rec)):
        print(f"{label}: {score*100:.2f}%")

    # Only estimators exposing a coefficient matrix get the extra lines.
    if hasattr(clf, "coef_"):
        weights = clf.coef_
        print(f"dimensionality: {weights.shape[1]}")
        print(f"density: {density(weights)}")
        print()

    print()
    description = str(custom_name) if custom_name else clf.__class__.__name__
    return description, acc, fit_seconds, predict_seconds

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC

# (estimator, display name) pairs to benchmark, in run order.
classifiers = (
    (LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
    # Sparse naive Bayes classifier.
    # naive Bayes classifier outperforms logistic regression while having a shorter training time
    (ComplementNB(alpha=0.1), "Complement naive Bayes"),
)

# One (description, accuracy, train_time, test_time) tuple per classifier.
results = []
for clf, name in classifiers:
    print("=" * 80)
    print(name)
    outcome = benchmark(clf, name)
    results.append(outcome)

Logistic Regression
________________________________________________________________________________
Training: 
LogisticRegression(C=5, max_iter=1000)
train time: 0.0119s
test time:  0.000283s
Accuracy: 87.66%
Precision: 85.25%
Recall: 93.91%
dimensionality: 3420
density: 1.0


Complement naive Bayes
________________________________________________________________________________
Training: 
ComplementNB(alpha=0.1)
train time: 0.00192s
test time:  0.000338s
Accuracy: 88.64%
Precision: 91.73%
Recall: 87.31%

