In [1]:
# import libraries for preprocessing
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# importing 20NG dataset
from sklearn.datasets import fetch_20newsgroups

# feature selection libraries
from sklearn.linear_model import Lasso

# importing logistic regression to perform classification task
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the complete 20 Newsgroups corpus (subset="all"), with headers,
# footers and quoted replies stripped from every post.
newsgroups = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=42,
)
X_20ng, y_20ng = newsgroups.data, newsgroups.target

# Turn the raw documents into a sparse TF-IDF matrix: English stop words are
# dropped and the vocabulary size is left uncapped (max_features=None).
# NOTE(review): the vectorizer is fit on the whole corpus, i.e. before any
# train/test split, so its IDF statistics also see documents that are later
# held out for evaluation.
vectorizer = TfidfVectorizer(stop_words="english", max_features=None)
vectors = vectorizer.fit_transform(X_20ng)

In [3]:
# Feature-scoring pipeline:
#   "scaler" - variance scaling only; with_mean=False skips centering, which
#              lets the scaler operate on sparse input.
#   "lasso"  - L1-regularised linear regression (alpha=0.1); the magnitudes of
#              its coefficients are used afterwards to rank features.
feature_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=False)),
        ("lasso", Lasso(alpha=0.1, max_iter=1000, random_state=42)),
    ]
)

In [4]:
feature_pipeline.fit(vectors, y_20ng)
lasso_model = feature_pipeline.named_steps["lasso"]
coef = np.abs(lasso_model.coef_)
top_200_indices = np.argsort(-coef)[:200]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(vectors, y_20ng, random_state=42, stratify=y_20ng, test_size=0.2)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train[:, top_200_indices], y_train)

In [9]:
# Evaluate on the held-out split, restricted to the same selected columns the
# classifier was trained on; the accuracy is the cell's displayed value.
X_test_selected = X_test[:, top_200_indices]
y_pred = classifier.predict(X_test_selected)
accuracy_score(y_test, y_pred)

0.4405835543766578