# Importing necessary libraries

In [1]:
# import libraries for preprocessing
from typing import Callable, Any
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# importing 20NG dataset
from sklearn.datasets import fetch_20newsgroups

# feature selection libraries
from sklearn.feature_selection import chi2, mutual_info_classif, SelectKBest

# importing logistic regression to perform classification task
from sklearn.linear_model import LogisticRegression

# Reading data and converting it into vectors using TFIDVectorizer

In [2]:
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, remove=("headers", "footers", "quotes"))
X_20ng = newsgroups.data
y_20ng = newsgroups.target
# Converting Text data to vectors
vectorizer = TfidfVectorizer(max_features=None, stop_words="english")
vectors = vectorizer.fit_transform(X_20ng)

In [3]:
def build_pipeline(feature_selection_object: Callable[[Any], Any], k: int):
    """
    Build a classification pipeline with feature selection object that can be selected by the user
    :param k: Best K features
    :param feature_selection_object: function to perform pairwise feature selection
    :return: pipeline object
    """
    return Pipeline([
        ("feature_selection", SelectKBest(score_func=chi2, k=k)),
        ("logistic regression", LogisticRegression(random_state=42, max_iter=1000)),
    ])

# Using Chi2 for feature selection

In [4]:
pipeline = build_pipeline(chi2, 200)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(vectors, y_20ng, random_state=42, stratify=y_20ng, test_size=0.2)

In [6]:
pipeline.fit(X_train, y_train)

In [7]:
# checking performance on test set
f1_score(y_test, pipeline.predict(X_test), average='weighted')

0.57038781804458

# Using Mutual Information for feature selection

In [8]:
pipeline = build_pipeline(mutual_info_classif, 200)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(vectors, y_20ng, random_state=42, stratify=y_20ng, test_size=0.2)
pipeline.fit(X_train, y_train)

In [10]:
f1_score(y_test, pipeline.predict(X_test), average='weighted')

0.57038781804458