In [6]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import accuracy_score

In [7]:
newsgroups = fetch_20newsgroups(subset='all')
X, y = newsgroups.data, newsgroups.target

print('Number of classes:', len(newsgroups.target_names))
print(newsgroups.target_names)
print('Number of samples:', len(X))

Number of classes: 20
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Number of samples: 18846


In [8]:
vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

print('Train size:', X_train.shape)
print('Test size:', X_test.shape)

Train size: (13192, 10000)
Test size: (5654, 10000)


In [10]:
baseline_model = LogisticRegression(max_iter=1000, solver='liblinear')
baseline_model.fit(X_train, y_train)
baseline_pred = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_pred)
print(f"Baseline Logistic Regression Accuracy (All Features): {baseline_accuracy*100:.4f}")


Baseline Logistic Regression Accuracy (All Features): 87.2480


In [12]:
chi2_selector = SelectKBest(chi2, k=200) 
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

chi2_model = LogisticRegression(max_iter=1000, solver='liblinear')
chi2_model.fit(X_train_chi2, y_train)
chi2_pred = chi2_model.predict(X_test_chi2)
chi2_accuracy = accuracy_score(y_test, chi2_pred)
print(f"Logistic Regression Accuracy (Chi-squared): {chi2_accuracy*100:.4f}")

Logistic Regression Accuracy (Chi-squared): 66.2540


In [13]:
mi_selector = SelectKBest(mutual_info_classif, k=200)
X_train_mi = mi_selector.fit_transform(X_train, y_train)
X_test_mi = mi_selector.transform(X_test)

mi_model = LogisticRegression(max_iter=1000, solver='liblinear')
mi_model.fit(X_train_mi, y_train)
mi_pred = mi_model.predict(X_test_mi)
mi_accuracy = accuracy_score(y_test, mi_pred)
print(f"Logistic Regression Accuracy (Mutual Information): {mi_accuracy:.4f}")

print("\nPerformance Comparison:")
print(f"Baseline (All Features): {baseline_accuracy:.4f}")
print(f"Chi-squared (200 Features): {chi2_accuracy:.4f}")
print(f"Mutual Information (200 Features): {mi_accuracy:.4f}")




Logistic Regression Accuracy (Mutual Information): 0.3442

Performance Comparison:
Baseline (All Features): 0.8725
Chi-squared (200 Features): 0.6625
Mutual Information (200 Features): 0.3442
