In [1]:
pip install torch scikit-learn pandas numpy nltk




In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import numpy as np


In [6]:
# Load a subset of 20 newsgroups data
categories = ['sci.space', 'rec.autos', 'talk.politics.guns', 'comp.graphics']

newsgroups = fetch_20newsgroups(subset='all', categories=categories)

X = newsgroups.data  # The news articles
y = newsgroups.target  # The corresponding labels


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Transform the text data into TF-IDF feature vectors
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()


In [9]:
X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_tfidf, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


In [10]:
class NewsClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NewsClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


In [11]:
input_dim = X_train_tfidf.shape[1]  # Number of features
hidden_dim = 128  # Hidden layer size
output_dim = len(categories)  # Number of categories

model = NewsClassifier(input_dim, hidden_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [12]:
epochs = 10
batch_size = 64

for epoch in range(epochs):
    model.train()  # Set model to training mode
    for i in range(0, X_train_tensor.size(0), batch_size):
        # Get mini-batches
        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 0.8563
Epoch [2/10], Loss: 0.2216
Epoch [3/10], Loss: 0.0633
Epoch [4/10], Loss: 0.0272
Epoch [5/10], Loss: 0.0147
Epoch [6/10], Loss: 0.0091
Epoch [7/10], Loss: 0.0062
Epoch [8/10], Loss: 0.0044
Epoch [9/10], Loss: 0.0034
Epoch [10/10], Loss: 0.0026


In [13]:
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)

# Convert predictions to numpy arrays for scikit-learn metrics
y_pred = predicted.numpy()
y_true = y_test_tensor.numpy()

# Calculate and print accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

print('\nClassification Report:')
print(classification_report(y_true, y_pred, target_names=categories))


Accuracy: 98.32%

Classification Report:
                    precision    recall  f1-score   support

         sci.space       0.96      0.99      0.98       194
         rec.autos       0.99      0.99      0.99       214
talk.politics.guns       0.99      0.97      0.98       199
     comp.graphics       0.99      0.98      0.98       165

          accuracy                           0.98       772
         macro avg       0.98      0.98      0.98       772
      weighted avg       0.98      0.98      0.98       772

