In [3]:
# Extract the dataset archive into the current working directory.
# check=True makes a failed extraction (missing archive, corrupt file) raise
# here instead of surfacing later as a missing Dataset.json; a missing unrar
# binary raises FileNotFoundError. The -o+ switch overwrites existing files so
# re-running this cell never blocks on an overwrite prompt.
import subprocess

subprocess.run(["unrar", "x", "-o+", "/content/Dataset.rar"], check=True);

In [1]:
import json
import spacy
from spacy.training import Example
from sklearn.model_selection import train_test_split
from spacy.util import minibatch
import random
import tqdm

In [2]:
# 1. Load and prepare the dataset
def load_data(file_path):
    """Read a JSON Lines news dataset into parallel text and label lists.

    Every non-blank line must be a JSON object with ``headline``,
    ``short_description`` and ``category`` keys. Blank or whitespace-only
    lines are skipped.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A ``(texts, labels, categories)`` tuple where ``texts`` holds
        ``"<headline>. <short_description>"`` strings, ``labels`` holds the
        category of each text in the same order, and ``categories`` is the
        sorted list of distinct categories.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the three required keys.
    """
    texts = []
    labels = []
    categories = set()

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # An empty line (e.g. at the end of the file) is not a record;
            # json.loads would raise on it and abort the whole load.
            if not line.strip():
                continue

            entry = json.loads(line)
            # Combine headline and short description
            text = f"{entry['headline']}. {entry['short_description']}"
            category = entry['category']

            texts.append(text)
            labels.append(category)
            categories.add(category)

    return texts, labels, sorted(categories)

In [4]:

# Read the extracted dataset from the working directory into parallel lists
# of texts and labels, plus the sorted list of distinct categories.
DATASET_PATH = 'Dataset.json'
texts, labels, all_cats = load_data(DATASET_PATH)

# 2. Convert to spaCy's training format
def create_cats_dict(label, all_categories):
    """Build a one-hot ``{category: score}`` mapping for a single label.

    Every entry of ``all_categories`` becomes a key; the key equal to
    ``label`` is scored 1.0 and all others 0.0. If ``label`` matches none of
    the categories, every score is 0.0.
    """
    scores = {}
    for candidate in all_categories:
        if candidate == label:
            scores[candidate] = 1.0
        else:
            scores[candidate] = 0.0
    return scores


In [8]:

# Build the (text, annotations) pairs under their own name. The split below
# must not overwrite its own input: if it did, re-running this cell would pass
# the already-shrunk list together with the full-length `labels` to `stratify`
# and train_test_split would fail on the length mismatch.
all_data = [(text, {'cats': create_cats_dict(label, all_cats)})
            for text, label in zip(texts, labels)]

# 3. Split data into train and evaluation sets
train_data, eval_data = train_test_split(
    all_data,
    test_size=0.2,
    random_state=42,
    stratify=labels  # Maintain class distribution
)

# 4. Create spaCy model
nlp = spacy.blank('en')
textcat = nlp.add_pipe('textcat')

# Add all categories to the model
for cat in all_cats:
    textcat.add_label(cat)

# 5. Training setup
# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); it returns the optimizer used by nlp.update().
optimizer = nlp.initialize()
n_epochs = 10
batch_size = 64  # Base batch size; the training cell derives its CPU batch size from it


In [10]:
import spacy
from spacy.training import Example
from spacy.util import minibatch
import random
from tqdm.auto import tqdm
import numpy as np

# 1. Pre-process all text upfront (HUGE speedup)
# Tokenizing once here keeps nlp.make_doc out of the per-epoch loop.
print("Pre-processing all text...")
preprocessed_docs = [
    nlp.make_doc(text)
    for text, _ in tqdm(train_data, desc="Pre-processing")
]

# 2. Pre-generate Example objects
train_examples = [
    Example.from_dict(doc, annotations)
    for doc, (_, annotations) in tqdm(
        zip(preprocessed_docs, train_data),
        desc="Creating examples",
        total=len(train_data))
]

# 3. Optimized training loop
# Both values are constant across epochs, so compute them once.
# Use larger batches for CPU (better utilization): at least 256.
cpu_batch_size = max(batch_size * 2, 256)
# minibatch() is a generator with no length; give tqdm the batch count
# (ceiling division) so the progress bar can show real progress.
n_batches = (len(train_examples) + cpu_batch_size - 1) // cpu_batch_size

print("\nStarting optimized training...")
for epoch in range(n_epochs):
    random.shuffle(train_examples)
    losses = {}
    skipped_batches = 0

    for batch in tqdm(
        minibatch(train_examples, size=cpu_batch_size),
        desc=f"Epoch {epoch+1}",
        total=n_batches,
        leave=False
    ):
        try:
            nlp.update(batch, sgd=optimizer, losses=losses)
        except Exception as e:
            # Best-effort: one bad batch should not abort a long run, but
            # count the failures so they are not lost in the progress output.
            skipped_batches += 1
            print(f"Error in batch: {str(e)}")
            continue

    # `losses` accumulates over all batches of the epoch, so this is a sum,
    # not a per-batch average.
    print(f"Epoch {epoch+1} Loss: {losses.get('textcat', 0.0):.3f}")
    if skipped_batches:
        print(f"Epoch {epoch+1}: skipped {skipped_batches}/{n_batches} batches due to errors")

Pre-processing all text...


Pre-processing:   0%|          | 0/167621 [00:00<?, ?it/s]

Creating examples:   0%|          | 0/167621 [00:00<?, ?it/s]


Starting optimized training...


Epoch 1: 0it [00:00, ?it/s]

Epoch 1 Loss: 10.819


Epoch 2: 0it [00:00, ?it/s]

Epoch 2 Loss: 8.162


Epoch 3: 0it [00:00, ?it/s]

Epoch 3 Loss: 7.207


Epoch 4: 0it [00:00, ?it/s]

Epoch 4 Loss: 6.526


Epoch 5: 0it [00:00, ?it/s]

Epoch 5 Loss: 5.955


Epoch 6: 0it [00:00, ?it/s]

Epoch 6 Loss: 5.430


Epoch 7: 0it [00:00, ?it/s]

Epoch 7 Loss: 4.970


Epoch 8: 0it [00:00, ?it/s]

Epoch 8 Loss: 4.559


Epoch 9: 0it [00:00, ?it/s]

Epoch 9 Loss: 4.168


Epoch 10: 0it [00:00, ?it/s]

Epoch 10 Loss: 3.804


In [11]:

# 7. Evaluation
def evaluate_model(model, eval_data):
    """Return the fraction of examples whose top-scoring category is correct.

    Args:
        model: Callable that takes a text and returns an object with a
            ``cats`` dict mapping category name to score (e.g. a spaCy
            pipeline with a text categorizer).
        eval_data: Sequence of ``(text, annotations)`` pairs where
            ``annotations['cats']`` maps each category to 1.0 (gold) or 0.0.

    Returns:
        Accuracy in ``[0.0, 1.0]``; ``0.0`` when ``eval_data`` is empty.

    Raises:
        IndexError: If an example has no category scored exactly 1.0.
    """
    total = len(eval_data)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = 0
    for text, annotations in eval_data:
        # Score with the model that was passed in, not a module-level global.
        doc = model(text)
        true_cat = [k for k, v in annotations['cats'].items() if v == 1.0][0]
        pred_cat = max(doc.cats, key=doc.cats.get)

        if pred_cat == true_cat:
            correct += 1

    return correct / total


In [12]:

# Score the held-out split and report the accuracy as a percentage.
accuracy = evaluate_model(nlp, eval_data)
print(f"Evaluation Accuracy: {accuracy*100:.2f}%")

# 8. Run the pipeline on one hand-written headline and list every
# category score, followed by the top-scoring category.
test_text = "New AI breakthrough in cancer research announced by scientists"
doc = nlp(test_text)
cat_scores = doc.cats
print("\nSample Prediction:")
for cat, score in cat_scores.items():
    print(f"{cat}: {score:.4f}")
print(f"Predicted Category: {max(cat_scores, key=cat_scores.get)}")

# (Optional) Persist the pipeline to the "news_classifier" directory.
nlp.to_disk("news_classifier")

Evaluation Accuracy: 59.89%

Sample Prediction:
ARTS: 0.0000
ARTS & CULTURE: 0.0000
BLACK VOICES: 0.0000
BUSINESS: 0.0147
COLLEGE: 0.0001
COMEDY: 0.0002
CRIME: 0.0001
CULTURE & ARTS: 0.0000
DIVORCE: 0.0000
EDUCATION: 0.0000
ENTERTAINMENT: 0.0000
ENVIRONMENT: 0.0000
FIFTY: 0.0000
FOOD & DRINK: 0.0000
GOOD NEWS: 0.0000
GREEN: 0.0000
HEALTHY LIVING: 0.0046
HOME & LIVING: 0.0000
IMPACT: 0.0053
LATINO VOICES: 0.0000
MEDIA: 0.0003
MONEY: 0.0002
PARENTING: 0.0000
PARENTS: 0.0000
POLITICS: 0.0005
QUEER VOICES: 0.0000
RELIGION: 0.0000
SCIENCE: 0.0008
SPORTS: 0.0000
STYLE: 0.0000
STYLE & BEAUTY: 0.0000
TASTE: 0.0000
TECH: 0.9220
THE WORLDPOST: 0.0307
TRAVEL: 0.0000
U.S. NEWS: 0.0000
WEDDINGS: 0.0000
WEIRD NEWS: 0.0004
WELLNESS: 0.0193
WOMEN: 0.0001
WORLD NEWS: 0.0005
WORLDPOST: 0.0000
Predicted Category: TECH


In [21]:
import shutil
# Zip the saved "news_classifier" directory into news_classifier.zip; the
# returned archive path is the cell's displayed output.
shutil.make_archive(base_name="news_classifier", format='zip', root_dir="news_classifier")

'/content/news_classifier.zip'

In [22]:
from google.colab import files
# Pass news_classifier.zip to google.colab's files.download helper.
# NOTE(review): presumably this triggers a browser download and only works
# inside a Colab runtime — confirm before running elsewhere.
files.download("news_classifier.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>