In [2]:
import spacy
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Name of the spaCy English pipeline package used for text preprocessing.
SPACY_MODEL_NAME = "en_core_web_sm"

# Load the pipeline once at notebook start; the preprocessing code calls
# `nlp` on each document.
nlp = spacy.load(SPACY_MODEL_NAME)


In [5]:
# Parts of each post that scikit-learn is asked to strip before returning the
# text, so that only the message body is used for classification.
STRIPPED_PARTS = ('headers', 'footers', 'quotes')

# Fetch the 20 Newsgroups corpus (the 'all' subset) with those parts removed.
newsgroups = fetch_20newsgroups(subset='all', remove=STRIPPED_PARTS)


In [6]:
def preprocess_text(text):
    """Reduce a raw document to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words and tokens that are not purely alphabetic are
    discarded; the lemma of every remaining token is kept in document order.

    Args:
        text: Raw document string.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token passes the filter).
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything containing non-alphabetic characters.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

In [7]:
# Lemmatize every post. The cleaned texts are stored in an *object* array on
# purpose: a plain np.array of Python strings gets a fixed-width unicode dtype
# sized to the longest document (4 bytes per character, for every row), which
# can consume gigabytes when a few posts are far longer than the rest.
# dtype=object only stores references to the existing Python strings.
X = np.array([preprocess_text(text) for text in newsgroups.data], dtype=object)
y = np.array(newsgroups.target)

# Split the dataset into training and testing sets (80% / 20%); the fixed
# random_state keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Extract features using TF-IDF vectorizer, capped at 3000 features.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer so no test data leaks into
# the feature extraction step.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# Hyperparameters for the Support Vector Machine (SVM) classifier:
# a linear kernel with regularization strength C = 1.0.
svm_params = {"kernel": 'linear', "C": 1.0}

# Build the classifier and fit it on the TF-IDF training features.
# `fit` is left as the cell's last expression so the fitted estimator is
# displayed as the cell output.
classifier = SVC(**svm_params)
classifier.fit(X_train_tfidf, y_train)

In [9]:
# Predict a newsgroup label for every held-out document.
predictions = classifier.predict(X_test_tfidf)

# Overall accuracy: fraction of test documents whose predicted label matches
# the true label.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1, with rows labelled by newsgroup name
# instead of integer class id.
report = classification_report(
    y_test,
    predictions,
    target_names=newsgroups.target_names,
)
print("Classification Report:\n", report)

Accuracy: 0.65
Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.50      0.56      0.53       151
           comp.graphics       0.57      0.64      0.60       202
 comp.os.ms-windows.misc       0.60      0.60      0.60       195
comp.sys.ibm.pc.hardware       0.54      0.60      0.57       183
   comp.sys.mac.hardware       0.71      0.59      0.64       205
          comp.windows.x       0.74      0.68      0.71       215
            misc.forsale       0.74      0.66      0.70       193
               rec.autos       0.41      0.70      0.52       196
         rec.motorcycles       0.54      0.62      0.58       168
      rec.sport.baseball       0.79      0.73      0.76       211
        rec.sport.hockey       0.90      0.79      0.84       198
               sci.crypt       0.82      0.67      0.74       201
         sci.electronics       0.56      0.61      0.58       202
                 sci.med       0.80 