## Q3: Pairwise feature selection for text 

Use scikit-learn built in "chi2" criteria to select top 200 features, then rerun classification tasks, compare performance with 3A Q1. 

Repeat pipeline with "mutual information criteria. 


In [23]:
# import newsgroups dataset:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
ng_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
ng_train_feat = ng_train.data
ng_train_labels = ng_train.target

ng_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
ng_test_feat = ng_test.data
ng_test_labels = ng_test.target

In [25]:
# Initialize the TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=6, max_df=0.7)

# Fit and transform the training features
ng_train_features = tfidf_vectorizer.fit_transform(ng_train_feat)

# Transform the testing features
ng_test_features = tfidf_vectorizer.transform(ng_test_feat)

In [26]:
# observe dimensionality:
print(ng_train_features.shape)
print(ng_test_features.shape)

(11314, 5000)
(7532, 5000)


In [27]:
from sklearn.feature_selection import chi2


# isalpha function

# Select top 200 features
num_features = 200
chi2score = chi2(ng_train_features, ng_train_labels)[0]
indices = np.argsort(chi2score)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

# print out the top 200 features:
print(feature_names)

# remove features that have length 3 or less:
feature_names = [feature for feature in feature_names if feature.isalpha() ==]
print(feature_names)

# To remove these features
    # -> min_df=3 in the TfidfVectorizer?

SyntaxError: invalid syntax (2526200830.py, line 26)

In [None]:
# Perform the same on mutual information
from sklearn.feature_selection import mutual_info_classif

# Select top 200 features
num_features = 200
mi = mutual_info_classif(ng_train_features, ng_train_labels)
indices = np.argsort(mi)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

# print out the top 200 features:
print(feature_names)

# remove words less than 3 characters
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000, min_df=3)




['16m', '2mb', '101', '0q', '06', '0tq', '10', '100', '104', '106', '150', '05', '1eq', '1y', '3000', '00', '253', '1972', '1024x768', '132', '133', '131', '129', '147', '135', '125', '110', '11', '0qax', '180', '15', '000', '1000', '109', '1t', '450', '1z4', '2p', '0t', '1024', '040', '04', '08', '105', '09', '03', '02', '01', '149', '144', '13', '14', '120', '145', '146', '140', '07', '0m', '19', '102', '117', '128', '12', '112', '0d', '0i', '1200', '2000', '300', '46', '127', '1q', '1982', '1977', '18', '1983', '1d9', '1fpl', '22', '1968', '1z6e', '1978', '1975', '1s', '130', '220', '1920', '2400', '1981', '170', '16', '20', '212', '26', '17', '4000', '286', '39', '192', '1970', '1914', '47', '45', '3d', '408', '1986', '225', '160', '1979', '1f', '1w', '1964', '1967', '182', '21', '1mb', '1992', '1st', '1p', '3p', '1990', '255', '33', '41', '2q', '280', '1915', '3c', '199', '1989', '250', '32', '36', '256', '2b', '48', '34u', '34', '3t', '2di', '1987', '2x', '30', '2tg', '3k', '303'