In [None]:
import nltk
import string
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# Fetch the NLTK data packages this notebook relies on. Each call is a no-op
# (apart from a log line) when the package is already installed locally.
# The resources are requested in a fixed order from a single tuple so the
# list is easy to audit and extend:
#   - punkt / punkt_tab: tokenizer models (presumably what word_tokenize loads;
#     which of the two is needed depends on the installed NLTK version)
#   - stopwords: the stop-word lists read via stopwords.words(...)
#   - wordnet / omw-1.4: WordNet data (presumably backing WordNetLemmatizer)
for nltk_resource in ("punkt", "stopwords", "wordnet", "omw-1.4", "punkt_tab"):
    nltk.download(nltk_resource)

# Pull four topics from the 20 Newsgroups corpus (both train and test subsets),
# asking the loader to drop headers, footers and quoted replies.
categories = [
    'sci.space',
    'rec.sport.baseball',
    'comp.graphics',
    'talk.politics.mideast',
]
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Shared resources used by the preprocessing function below: an English
# stop-word set for fast membership tests and a single lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every character in string.punctuation.
# Built once at import time instead of on every preprocess() call, since the
# function is invoked once per document.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text: str) -> str:
    """Normalize a raw document into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete the characters listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, keep only
    purely alphabetic tokens that are not in ``stop_words``, and lemmatize
    each surviving token with the module-level ``lemmatizer``.

    Args:
        text: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Punctuation is deleted (not replaced by a space) before tokenizing, so
    # a contraction such as "don't" reaches the tokenizer as "dont".
    depunctuated = text.lower().translate(_PUNCT_TABLE)
    cleaned = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(depunctuated)
        # The stop-word test is applied to the surface form, i.e. before
        # lemmatization; isalpha() drops numbers and mixed tokens.
        if word not in stop_words and word.isalpha()
    ]
    return ' '.join(cleaned)

# Clean every raw document with the preprocessing function
print("Preprocessing texts...")
cleaned_data = list(map(preprocess, newsgroups.data))

# Hold out 20% of the documents for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data,
    newsgroups.target,
    test_size=0.2,
    random_state=42,
)

# Two-step pipeline: TF-IDF features feeding a multinomial Naive Bayes
# classifier. Keeping the vectorizer inside the pipeline means it is fitted
# on the training documents only.
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
)

# Fit the vectorizer and classifier on the training portion
print("Training the model...")
model.fit(X_train, y_train)

# Score the held-out documents: per-class report followed by overall accuracy
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=newsgroups.target_names,
    )
)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessing texts...
Training the model...

Classification Report:
                       precision    recall  f1-score   support

        comp.graphics       0.93      0.87      0.90       220
   rec.sport.baseball       0.85      0.94      0.89       178
            sci.space       0.88      0.84      0.86       189
talk.politics.mideast       0.91      0.93      0.92       192

             accuracy                           0.89       779
            macro avg       0.89      0.89      0.89       779
         weighted avg       0.89      0.89      0.89       779

Accuracy: 0.8922
