## Q3: Pairwise feature selection for text 

Use scikit-learn built in "chi2" criteria to select top 200 features, then rerun classification tasks, compare performance with 3A Q1. 

Repeat pipeline with "mutual information criteria. 


In [39]:
# import newsgroups dataset:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
ng_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
ng_train_feat = ng_train.data
ng_train_labels = ng_train.target

ng_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
ng_test_feat = ng_test.data
ng_test_labels = ng_test.target

In [41]:
# Initialize the TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=15)

# Fit and transform the training features
ng_train_features = tfidf_vectorizer.fit_transform(ng_train_feat)

# Transform the testing features
ng_test_features = tfidf_vectorizer.transform(ng_test_feat)

In [42]:
# observe dimensionality:
print(ng_train_features.shape)
print(ng_test_features.shape)

(11314, 5000)
(7532, 5000)


In [43]:
from sklearn.feature_selection import chi2

# isalpha function

# Select top 200 features
num_features = 200
chi2score = chi2(ng_train_features, ng_train_labels)[0]
indices = np.argsort(chi2score)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

for feature_name in feature_names:
    if feature_name.isalpha():
        print(feature_name)
    
    
# print out the top 200 features:
#print(feature_names)



# To remove these features
    # -> min_df=3 in the TfidfVectorizer?

gq
capable
categories
depends
yd
exception
indication
tr
strictly
initially
extremely
fashion
steps
releases
ready
meetings
qq
remains
regularly
minimal
passes
improved
huge
ei
virtually
pacific
tz
bruce
princeton
gk
procedures
sections
closely
dates
special
characteristics
creative
mountain
contained
examined
martin
primarily
consists
slowly
bringing
mainly
arthur
occured
locations
prefer
ff
failing
ne
guidelines
measured
attached
brief
writers
jr
begins
repeat
santa
offered
greater
largely
similarly
pieces
turned
divided
financial
gw
dependent
mn
desired
conflicts
iron
unique
acceptance
rapidly
extend
closed
continuing
maryland
om
heavily
adequate
successful
quickly
shortly
compared
october
aspect
medium
quarter
dropped
dont
blow
substantial
perfectly
dedicated
furthermore
thu
introduction
massachusetts
benefit
oct
combination
stores
listed
half
appearance
telling
pre
covered
hoping
priority
tries
district
november
du
impressed
entering
obscure
assumes
wants
bothered
sorts
vi
utah
di

In [44]:
# Perform the same on mutual information
from sklearn.feature_selection import mutual_info_classif

# Select top 200 features
num_features = 200
mi = mutual_info_classif(ng_train_features, ng_train_labels)
indices = np.argsort(mi)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

# print out the top 200 features:
print(feature_names)

# remove words less than 3 characters
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000, min_df=3)




['12', '000', '040', '00', '175', '110', '129', '120', '128', '1024x768', '28', '1914', '199', '1964', '40', '02', '210', '415', '07', '11', '127', '2b', '1975', '312', '117', '112', '3b', '135', '54', '1t', '214', '1990', '5th', '1920', '39', '36', '101', '34', '1024', '192', '141', '286', '03', '18', '1992', '170', '182', '146', '500', '3a', '1984', '17', '1st', '1994', '408', '3rd', '113', '202', '1000', '1d', '1967', '20', '180', '50', '1mb', '125', '1915', '1980', '1972', '1988', '140', '240', '21', '1977', '200', '1989', '49', '106', '30', '38', '15', '133', '09', '225', '47', '08', '2mb', '150', '250', '26', '212', '255', '2400', '4m', '1991', '24', '144', '1974', '35', '320', '100', '360', '1200', '59', '5s', '130', '32', '10', '149', '213', '06', '23', '206', '13', '2000', '1993', '1978', '131', '33', '48', '3t', '5000', '4th', '486', '1983', '2d', '46', '230', '160', '4t', '253', '1970', '44', '105', '05', '4mb', '512', '147', '350', '22', '386', '300', '145', '53', '1986', '

