In [1]:
!pip uninstall -qqy jupyterlab kfp 2>/dev/null  # Remove unused conflicting packages
!pip install -U -q "google-genai==1.7.0"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-adk 1.18.0 requires google-genai<2.0.0,>=1.45.0, but you have google-genai 1.7.0 which is incompatible.
google-cloud-aiplatform 1.125.0 requires google-genai<2.0.0,>=1.37.0, but you have google-genai 1.7.0 which is incompatible.[0m[31m
[0m

In [2]:
from google import genai
from google.genai import types

genai.__version__

'1.7.0'

In [3]:
from kaggle_secrets import UserSecretsClient

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

client = genai.Client(api_key=GOOGLE_API_KEY)

In [4]:
from sklearn.datasets import fetch_20newsgroups

train = fetch_20newsgroups(subset="train")
test = fetch_20newsgroups(subset="test")

train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
print(train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







#### Preprocess text

In [6]:
import email
import re

import pandas as pd


def preprocess_newsgroup_row(data, max_chars=5000):
    """Reduce one raw newsgroup post to its subject line and body text.

    Args:
        data: Raw message text (headers, a blank line, then the body).
        max_chars: Maximum length of the returned string; longer entries are
            truncated. Defaults to 5,000 characters.

    Returns:
        The subject and body joined by a blank line, with anything matching
        the email-address pattern removed, cut to at most ``max_chars``
        characters.
    """
    msg = email.message_from_string(data)

    # A post without a Subject header yields None; fall back to an empty
    # string so the literal text "None" never ends up in the output.
    subject = msg["Subject"] or ""

    # For multipart posts get_payload() returns a list of sub-messages, which
    # would be formatted as a list repr. Join the leaf parts instead; for an
    # ordinary single-part post this is just its payload string.
    body = "\n".join(
        str(part.get_payload()) for part in msg.walk() if not part.is_multipart()
    )

    text = f"{subject}\n\n{body}"
    # Strip any remaining email addresses
    text = re.sub(r"[\w\.-]+@[\w\.-]+", "", text)

    return text[:max_chars]


def preprocess_newsgroup_data(newsgroup_dataset):
    """Build a tidy DataFrame from a newsgroup dataset object.

    Reads ``.data`` (raw posts), ``.target`` (integer labels) and
    ``.target_names`` (label index -> class name) from ``newsgroup_dataset``.

    Returns:
        A DataFrame with the columns "Text" (cleaned post text), "Label"
        (the original integer target) and "Class Name" (the target name that
        the label indexes).
    """
    class_names = newsgroup_dataset.target_names

    # Clean every post up front, then assemble the frame in one step.
    cleaned_text = [preprocess_newsgroup_row(raw) for raw in newsgroup_dataset.data]
    frame = pd.DataFrame({"Text": cleaned_text, "Label": newsgroup_dataset.target})

    # Translate each integer label into its human-readable class name.
    frame["Class Name"] = [class_names[label] for label in frame["Label"]]

    return frame

In [7]:
df_train = preprocess_newsgroup_data(train)
df_test = preprocess_newsgroup_data(test)

df_train.head()

Unnamed: 0,Text,Label,Class Name
0,WHAT car is this!?\n\n I was wondering if anyo...,7,rec.autos
1,SI Clock Poll - Final Call\n\nA fair number of...,4,comp.sys.mac.hardware
2,"PB questions...\n\nwell folks, my mac plus fin...",4,comp.sys.mac.hardware
3,Re: Weitek P9000 ?\n\nRobert J.C. Kyanko () wr...,1,comp.graphics
4,Re: Shuttle Launch Question\n\nFrom article <>...,14,sci.space


In [8]:
def sample_data(df, num_samples, classes_to_keep, random_state=None):
    """Draw a per-label sample and keep only the matching classes.

    Args:
        df: DataFrame with at least the columns "Label" and "Class Name".
        num_samples: Number of rows to draw from every "Label" group. Each
            group must contain at least this many rows, otherwise
            ``DataFrame.sample`` raises ``ValueError``.
        classes_to_keep: Pattern passed to ``Series.str.contains`` on
            "Class Name" (so it is interpreted as a regular expression).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps the previous
            unseeded behaviour.

    Returns:
        A new DataFrame holding the sampled rows of the kept classes, with
        "Class Name" converted to a categorical and an "Encoded Label" column
        containing its category codes.
    """
    # Sample rows, selecting num_samples of each Label.
    sampled = (
        df.groupby("Label")[df.columns]
        .apply(lambda x: x.sample(num_samples, random_state=random_state))
        .reset_index(drop=True)
    )

    # Take an explicit copy: assigning columns onto a boolean-filtered slice
    # triggers pandas' SettingWithCopyWarning.
    kept = sampled[sampled["Class Name"].str.contains(classes_to_keep)].copy()

    # We have fewer categories now, so re-calibrate the label encoding.
    kept["Class Name"] = kept["Class Name"].astype("category")
    kept["Encoded Label"] = kept["Class Name"].cat.codes

    return kept

In [9]:
# Number of rows drawn from every label group for the train and test splits.
TRAIN_NUM_SAMPLES = 100
TEST_NUM_SAMPLES = 25
# Class name should contain 'sci' to keep science categories.
CLASSES_TO_KEEP = "sci"

# NOTE: these calls rebind df_train/df_test to the sampled, filtered frames,
# so re-running this cell samples from the already-reduced data rather than
# from the full preprocessed dataset.
df_train = sample_data(df_train, TRAIN_NUM_SAMPLES, CLASSES_TO_KEEP)
df_test = sample_data(df_test, TEST_NUM_SAMPLES, CLASSES_TO_KEEP)

In [10]:
df_train.value_counts("Class Name")

Class Name
sci.crypt          100
sci.electronics    100
sci.med            100
sci.space          100
Name: count, dtype: int64

#### Create embeddings

In [11]:
from google.api_core import retry
import tqdm
from tqdm.rich import tqdm as tqdmr
import warnings

tqdmr.pandas()

warnings.filterwarnings("ignore", category=tqdm.TqdmExperimentalWarning)

# Retry predicate: decides which exceptions are worth another attempt.
def is_retriable(e):
    """Return True when ``e`` is a genai APIError whose code is 429 or 503."""
    if not isinstance(e, genai.errors.APIError):
        return False
    return e.code in {429, 503}

@retry.Retry(predicate=is_retriable, timeout=300.0)
def get_embeddings(text):
    """Embed ``text`` for classification and return the embedding values.

    Sends ``text`` to the "models/text-embedding-004" model with the
    "classification" task type and returns the ``values`` of the first
    embedding in the response. The call is wrapped in a retry that uses
    ``is_retriable`` as its predicate with a 300-second timeout.
    """
    embed_config = types.EmbedContentConfig(task_type="classification")
    result = client.models.embed_content(
        model="models/text-embedding-004",
        contents=text,
        config=embed_config,
    )

    first_embedding = result.embeddings[0]
    return first_embedding.values

def create_embeddings(df):
    """Add an "Embeddings" column computed from the "Text" column.

    Each "Text" value is passed to ``get_embeddings``; ``progress_apply`` is
    the progress-bar variant of ``apply`` registered by ``tqdmr.pandas()``.
    The column is assigned onto ``df`` itself, and that same object is
    returned.
    """
    df["Embeddings"] = df["Text"].progress_apply(get_embeddings)
    return df

In [12]:
df_train = create_embeddings(df_train)
df_test = create_embeddings(df_test)

Output()

Output()

In [13]:
df_train.head()

Unnamed: 0,Text,Label,Class Name,Encoded Label,Embeddings
1100,The battle is joined\n\nIt looks like Dorothy ...,11,sci.crypt,0,"[0.010916892, 0.015981745, -0.05892634, 0.0188..."
1101,are we being hysterical? No!\n\n\nIn article <...,11,sci.crypt,0,"[0.012027563, 0.025119599, -0.058926936, -0.00..."
1102,Re: text of White House announcement and Q&As ...,11,sci.crypt,0,"[-0.015403877, 0.030044148, -0.037202857, 0.03..."
1103,"Re: Once tapped, your code is no good any more...",11,sci.crypt,0,"[0.0045050536, 0.020362409, -0.061563283, 0.00..."
1104,Re: Clipper considered harmful\n\nIn article <...,11,sci.crypt,0,"[0.004346496, 0.029194878, -0.06622962, 0.0297..."


### Build a classification model

In [14]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class ClassificationModel(nn.Module):
#     def __init__(self, input_size, hidden_dim, num_classes):
#         super(ClassificationModel, self).__init__()
#         self.hidden = nn.Linear(input_size, hidden_dim)
#         self.output = nn.Linear(hidden_dim, num_classes)

#     def forward(self, x):
#         x = F.relu(self.hidden(x))
#         x = self.output(x)
#         return x        

In [15]:
# input_size = len(df_train['Embeddings'].iloc[0])
# num_classes = df_train['Encoded Label'].nunique()

# input_size, num_classes

In [16]:
# model = ClassificationModel(input_size, input_size, num_classes)
# model

In [17]:
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"name: {name}, parama_count: {param.shape}, total_params: {param.numel()}")
#         print()

# total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# total_params

### Train the model

In [18]:
# from torch.utils.data import DataLoader, TensorDataset
# from torch.optim import Adam
# import numpy as np
# import matplotlib.pyplot as plt

# NUM_EPOCHS = 20
# BATCH_SIZE = 32
# PATIENCE = 3
# LR = 1e-3

# x_train = torch.tensor(np.stack(df_train['Embeddings'].values), dtype=torch.float32)
# y_train = torch.tensor(df_train['Encoded Label'].values, dtype=torch.long)

# x_val = torch.tensor(np.stack(df_test['Embeddings'].values), dtype=torch.float32)
# y_val = torch.tensor(df_test['Encoded Label'].values, dtype=torch.long)

# train_dataset = TensorDataset(x_train, y_train)
# test_dataset = TensorDataset(x_val, y_val)

# train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# val_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
# model.to(device)

# criterion = nn.CrossEntropyLoss()
# optimizer = Adam(model.parameters(), lr=LR)

# train_losses, train_accs = [], []
# val_losses, val_accs = [], []

# best_val_acc = 0
# patience_counter = 0
# best_state = None

# for epoch in range(1, NUM_EPOCHS + 1):
#     # training
#     model.train()
#     running_loss = 0.
#     correct = 0
#     total = 0

#     for xb, yb in train_dataloader:
#         xb, yb = xb.to(device), yb.to(device)
#         optimizer.zero_grad()
#         logits = model(xb)
#         loss = criterion(logits, yb)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item() * xb.size(0)
#         preds = logits.argmax(dim=1)
#         correct += (preds == yb).sum().item()
#         total += xb.size(0)

#     epoch_train_loss = running_loss / total
#     epoch_train_acc = correct / total
#     train_losses.append(epoch_train_loss)
#     train_accs.append(epoch_train_acc)

#     # validation
#     model.eval()
#     val_running_loss = 0.
#     val_correct = 0
#     val_total = 0

#     with torch.no_grad():
#         for xb, yb in val_dataloader:
#             xb, yb = xb.to(device), yb.to(device)
#             logits = model(xb)
#             loss = criterion(logits, yb)
#             val_running_loss += loss.item() * xb.size(0)
#             preds = logits.argmax(dim=1)
#             val_correct += (preds == yb).sum().item()
#             val_total += xb.size(0)

#         epoch_val_loss = val_running_loss / val_total
#         epoch_val_acc = val_correct / val_total
#         val_losses.append(epoch_val_loss)
#         val_accs.append(epoch_val_acc)

#     print(
#         f"Epoch {epoch}/{NUM_EPOCHS} "
#         f"Train loss: {epoch_train_loss:.4f}, Train acc: {epoch_train_acc:.4f} | "
#         f"Val loss: {epoch_val_loss:.4f}, Val acc: {epoch_val_acc:.4f}"
#     )

#     if epoch_val_acc > best_val_acc:
#         best_val_acc = epoch_val_acc
#         best_state = {k: v.cpu() for k, v in model.state_dict().items()}
#         patience_counter = 0

#     else:
#         patience_counter += 1
#         if patience_counter >= PATIENCE:
#             print(f"Early stopping at epoch {epoch} (no improvement in val acc for {PATIENCE} epochs).")
#             break

# if best_state is not None:
#     model.load_state_dict(best_state)
#     print(f"Loaded best model with val acc: {best_val_acc:.4f}")

# plt.subplot(1, 2, 1)
# plt.plot(range(1, len(train_accs) + 1), train_accs, label="Train Acc")
# plt.plot(range(1, len(val_accs) + 1), val_accs, label="Val Acc")
# plt.xlabel("Epoch")
# plt.ylabel("Accuracy")
# plt.title("Accuracy")
# plt.legend()
# plt.grid(True)

# # Loss subplot
# plt.subplot(1, 2, 2)
# plt.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss")
# plt.plot(range(1, len(val_losses) + 1), val_losses, label="Val Loss")
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.title("Loss")
# plt.legend()
# plt.grid(True)

# plt.tight_layout()
# plt.show()

# # ---------- Optional: save history and model ----------
# history = {
#     "train_losses": train_losses,
#     "train_accs": train_accs,
#     "val_losses": val_losses,
#     "val_accs": val_accs,
# }
        

In [19]:
# model.eval()
# criterion = torch.nn.CrossEntropyLoss()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# val_loss, val_correct, val_total = 0.0, 0, 0

# with torch.no_grad():
#     for xb, yb in val_dataloader:
#         xb, yb = xb.to(device), yb.to(device)
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         val_loss += loss.item() * xb.size(0)

#         _, predicted = torch.max(preds, 1)
#         val_correct += (predicted == yb).sum().item()
#         val_total += yb.size(0)

# # Compute metrics
# avg_val_loss = val_loss / val_total
# val_accuracy = val_correct / val_total

# # Return dict (like return_dict=True)
# results = {
#     "val_loss": avg_val_loss,
#     "val_accuracy": val_accuracy,
# }

# print(results)

In [20]:
# import keras
# from keras import layers


# def build_classification_model(input_size: int, num_classes: int) -> keras.Model:
#     return keras.Sequential(
#         [
#             layers.Input([input_size], name="embedding_inputs"),
#             layers.Dense(input_size, activation="relu", name="hidden"),
#             layers.Dense(num_classes, activation="softmax", name="output_probs"),
#         ]
#     )

In [21]:
# embedding_size = len(df_train["Embeddings"].iloc[0])

# classifier = build_classification_model(
#     embedding_size, len(df_train["Class Name"].unique())
# )
# classifier.summary()

# classifier.compile(
#     loss=keras.losses.SparseCategoricalCrossentropy(),
#     optimizer=keras.optimizers.Adam(learning_rate=0.001),
#     metrics=["accuracy"],
# )

In [22]:
# import numpy as np


# NUM_EPOCHS = 20
# BATCH_SIZE = 32

# # Split the x and y components of the train and validation subsets.
# y_train = df_train["Encoded Label"]
# x_train = np.stack(df_train["Embeddings"])
# y_val = df_test["Encoded Label"]
# x_val = np.stack(df_test["Embeddings"])

# # Specify that it's OK to stop early if accuracy stabilises.
# early_stop = keras.callbacks.EarlyStopping(monitor="accuracy", patience=3)

# # Train the model for the desired number of epochs.
# history = classifier.fit(
#     x=x_train,
#     y=y_train,
#     validation_data=(x_val, y_val),
#     callbacks=[early_stop],
#     batch_size=BATCH_SIZE,
#     epochs=NUM_EPOCHS,
# )

In [23]:
# classifier.evaluate(x=x_val, y=y_val, return_dict=True)

In [28]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
import time

# ============================
# === CONFIG / HYPERPARAMS ===
# ============================
NUM_EPOCHS = 20          # upper bound on epochs; early stopping may end training sooner
BATCH_SIZE = 32          # mini-batch size for both data loaders
LR = 1e-3                # Adam learning rate
PATIENCE = 3             # epochs without a val-accuracy improvement before stopping
WEIGHT_DECAY = 0.0       # passed to Adam as weight_decay; 0.0 disables it
NUM_WORKERS = 4          # adjust to your environment
PREFETCH_FACTOR = 2      # passed to the DataLoaders as prefetch_factor
PIN_MEMORY = True        # passed to the DataLoaders as pin_memory
SEED = 42                # seed shared by random, NumPy and torch below

# ===============================
# === REPRODUCIBILITY / SPEED ===
# ===============================
# Seed every RNG this cell relies on: Python's, NumPy's and torch's (CPU and,
# when present, all CUDA devices).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    # NOTE(review): the model defined below uses only Linear layers, so the
    # cuDNN autotuner probably has little to tune here; benchmark mode may
    # also trade run-to-run determinism for speed - confirm it is wanted.
    torch.backends.cudnn.benchmark = True   # enable cuDNN autotuner for best conv performance

# ============================
# === DATA PREPARATION =======
# ============================
# Expects df_train and df_test (built in the earlier cells) to contain:
#   "Embeddings"    -> one sequence of floats per row, all the same length
#   "Encoded Label" -> integer class id per row
# df_test is used as the validation split for early stopping.

# Stack the per-row embeddings into dense 2-D float32 arrays and pull the
# labels out as int64, the dtypes the model and CrossEntropyLoss are fed with.
x_train = np.stack(df_train["Embeddings"].values).astype(np.float32)
y_train = df_train["Encoded Label"].values.astype(np.int64)
x_val   = np.stack(df_test["Embeddings"].values).astype(np.float32)
y_val   = df_test["Encoded Label"].values.astype(np.int64)

train_tensor_x = torch.from_numpy(x_train)
train_tensor_y = torch.from_numpy(y_train)
val_tensor_x = torch.from_numpy(x_val)
val_tensor_y = torch.from_numpy(y_val)

train_ds = TensorDataset(train_tensor_x, train_tensor_y)
val_ds = TensorDataset(val_tensor_x, val_tensor_y)

# Options shared by both loaders.
loader_kwargs = {
    "batch_size": BATCH_SIZE,
    "num_workers": NUM_WORKERS,
    # Pinned host memory only speeds up host-to-GPU copies; requesting it on
    # a CPU-only machine just produces a warning.
    "pin_memory": PIN_MEMORY and torch.cuda.is_available(),
}
if NUM_WORKERS > 0:
    # DataLoader raises ValueError if prefetch_factor is given while loading
    # in the main process (num_workers=0), so only pass it with workers.
    loader_kwargs["prefetch_factor"] = PREFETCH_FACTOR

# Only the training data is shuffled; validation order stays fixed.
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

# ============================
# === MODEL DEFINITION =======
# ============================
class ClassificationModel(nn.Module):
    """Feed-forward classifier with one hidden layer.

    The hidden layer is as wide as the input (``input_size`` units) and uses
    a ReLU activation; the output layer produces ``num_classes`` raw logits.
    """

    def __init__(self, input_size: int, num_classes: int):
        super().__init__()
        self.hidden = nn.Linear(input_size, input_size)
        self.output = nn.Linear(input_size, num_classes)
        # Keras-like initialisation: Glorot/Xavier-uniform weights, zero biases.
        for layer in (self.hidden, self.output):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return unnormalised class scores (logits, no softmax) for ``x``."""
        activated = F.relu(self.hidden(x))
        logits = self.output(activated)
        return logits

# Size the network from the data: one input per embedding dimension and one
# output per distinct label present in the training set.
input_size = train_tensor_x.shape[1]
num_classes = int(np.unique(y_train).shape[0])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ClassificationModel(input_size, num_classes).to(device)

# ============================
# === LOSS / OPT / AMP =======
# ============================
# CrossEntropyLoss consumes the raw logits the model returns.
criterion = nn.CrossEntropyLoss()
# NOTE(review): eps=1e-7 differs from torch's Adam default; presumably chosen
# to mirror the Keras optimizer - confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=LR, eps=1e-7, weight_decay=WEIGHT_DECAY)
# Loss scaling for mixed precision, active only when a GPU is available.
# torch.amp.GradScaler("cuda", ...) is the replacement for the deprecated
# torch.cuda.amp.GradScaler, which emits a FutureWarning when constructed.
scaler = torch.amp.GradScaler("cuda", enabled=torch.cuda.is_available())

# ============================
# === TRAINING LOOP ==========
# ============================
# Trains for at most NUM_EPOCHS epochs, evaluating on val_loader after each
# one. The weights of the epoch with the best validation accuracy are kept
# and restored at the end; training stops early after PATIENCE consecutive
# epochs without an improvement.
history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}
best_val_acc = -np.inf
best_state = None
patience_counter = 0
# Mixed precision is only used when a GPU is available.
use_amp = torch.cuda.is_available()

for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start = time.time()
    # ---- train ----
    model.train()
    train_loss_sum = 0.0
    correct = 0
    total = 0

    for xb, yb in train_loader:
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        optimizer.zero_grad()
        # torch.amp.autocast replaces the deprecated torch.cuda.amp.autocast.
        with torch.amp.autocast(device_type="cuda", enabled=use_amp):
            logits = model(xb)
            loss = criterion(logits, yb)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # criterion returns the batch mean, so weight it by the batch size to
        # get a correct per-sample average over the epoch.
        train_loss_sum += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += xb.size(0)

    train_loss = train_loss_sum / total
    train_acc = correct / total

    # ---- validate ----
    model.eval()
    val_loss_sum = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)
            with torch.amp.autocast(device_type="cuda", enabled=use_amp):
                logits = model(xb)
                loss = criterion(logits, yb)
            val_loss_sum += loss.item() * xb.size(0)
            preds = logits.argmax(dim=1)
            val_correct += (preds == yb).sum().item()
            val_total += xb.size(0)

    val_loss = val_loss_sum / val_total
    val_acc = val_correct / val_total

    history["train_loss"].append(train_loss)
    history["train_acc"].append(train_acc)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)

    epoch_time = time.time() - epoch_start
    print(
        f"Epoch {epoch:02d}/{NUM_EPOCHS} "
        f"time={epoch_time:.1f}s "
        f"train_loss={train_loss:.4f} train_acc={train_acc:.4f} "
        f"val_loss={val_loss:.4f} val_acc={val_acc:.4f}"
    )

    # ---- early stopping / best model
    if val_acc > best_val_acc + 1e-12:
        best_val_acc = val_acc
        # state_dict() hands back tensors that share storage with the live
        # parameters, and .cpu() is a no-op for tensors already on the CPU.
        # Without clone() the "best" snapshot would silently follow every
        # later optimizer step when training on CPU.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"Early stopping: no improvement in val_acc for {PATIENCE} epochs.")
            break

# restore best model weights
if best_state is not None:
    model.load_state_dict(best_state)
    model.to(device)
    print(f"Restored best model with val_acc={best_val_acc:.4f}")

# Save model if desired
# torch.save(model.state_dict(), "best_model.pt")

# history is identical-ish to Keras history
print("Training finished.")

  scaler = GradScaler(enabled=torch.cuda.is_available())  # mixed precision if GPU available
  with autocast(enabled=torch.cuda.is_available()):


Epoch 01/20 time=0.5s train_loss=1.3642 train_acc=0.3925 val_loss=1.2985 val_acc=0.3800


  with autocast(enabled=torch.cuda.is_available()):


Epoch 02/20 time=0.5s train_loss=1.2179 train_acc=0.5625 val_loss=1.1782 val_acc=0.6900
Epoch 03/20 time=0.4s train_loss=1.0719 train_acc=0.7125 val_loss=1.0742 val_acc=0.6400
Epoch 04/20 time=0.4s train_loss=0.9219 train_acc=0.8100 val_loss=0.9235 val_acc=0.7800
Epoch 05/20 time=0.4s train_loss=0.7435 train_acc=0.8950 val_loss=0.7667 val_acc=0.8400
Epoch 06/20 time=0.4s train_loss=0.5949 train_acc=0.9275 val_loss=0.6634 val_acc=0.8000
Epoch 07/20 time=0.4s train_loss=0.4740 train_acc=0.9500 val_loss=0.5381 val_acc=0.8900
Epoch 08/20 time=0.4s train_loss=0.3910 train_acc=0.9500 val_loss=0.4780 val_acc=0.9000
Epoch 09/20 time=0.4s train_loss=0.3005 train_acc=0.9625 val_loss=0.4017 val_acc=0.9200
Epoch 10/20 time=0.4s train_loss=0.2522 train_acc=0.9625 val_loss=0.3856 val_acc=0.9000
Epoch 11/20 time=0.4s train_loss=0.2320 train_acc=0.9750 val_loss=0.3388 val_acc=0.9300
Epoch 12/20 time=0.9s train_loss=0.1921 train_acc=0.9775 val_loss=0.3386 val_acc=0.9300
Epoch 13/20 time=0.4s train_loss

In [29]:
def make_prediction(text: str) -> list[float]:
    """Return the trained model's class probabilities for ``text``.

    The text is embedded with ``get_embeddings``, run through ``model`` in
    evaluation mode without gradient tracking, and the logits are turned into
    probabilities with a softmax. The result is a flat list of floats.
    """
    vector = get_embeddings(text)
    if isinstance(vector, list):
        vector = np.array(vector, dtype=np.float32)
    # A lone embedding is 1-D; give it a leading batch axis of size one.
    batch = vector[np.newaxis, :] if vector.ndim == 1 else vector

    features = torch.tensor(batch, dtype=torch.float32).to(device)

    # Evaluation mode, and no autograd bookkeeping during inference.
    model.eval()
    with torch.no_grad():
        probabilities = torch.softmax(model(features), dim=1)

    return probabilities.cpu().numpy().flatten().tolist()

In [30]:
new_text = """
First-timer looking to get out of here.

Hi, I'm writing about my interest in travelling to the outer limits!

What kind of craft can I buy? What is easiest to access from this 3rd rock?

Let me know how to do that please.
"""

# Embed the new text and score it with the trained classifier.
result = make_prediction(new_text)

# Print one percentage per category, pairing position idx of the probability
# list with the idx-th category name.
# NOTE(review): the names are read from df_test; this assumes its category
# order matches the "Encoded Label" codes the model was trained on in
# df_train - confirm.
for idx, category in enumerate(df_test["Class Name"].cat.categories):
    print(f"{category}: {result[idx] * 100:0.2f}%")

sci.crypt: 0.20%
sci.electronics: 0.22%
sci.med: 0.14%
sci.space: 99.43%
