## Q3: Pairwise feature selection for text 

Use scikit-learn built in "chi2" criteria to select top 200 features, then rerun classification tasks, compare performance with 3A Q1. 

Repeat pipeline with "mutual information criteria. 


In [49]:
# import newsgroups dataset:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
ng_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
ng_train_feat = ng_train.data
ng_train_labels = ng_train.target

ng_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
ng_test_feat = ng_test.data
ng_test_labels = ng_test.target

In [51]:
# Initialize the TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=15)

# Fit and transform the training features
ng_train_features = tfidf_vectorizer.fit_transform(ng_train_feat)

# Transform the testing features
ng_test_features = tfidf_vectorizer.transform(ng_test_feat)

In [39]:
# observe dimensionality:
print(ng_train_features.shape)
print(ng_test_features.shape)

(11314, 5000)
(7532, 5000)


In [47]:
from sklearn.feature_selection import chi2

# isalpha function

# Select top 200 features
num_features = 200
chi2score = chi2(ng_train_features, ng_train_labels)[0]
indices = np.argsort(chi2score)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

for feature_name in feature_names:
    if feature_name.isalpha():
        print(feature_name)
    
    
# print out the top 200 features:
#print(feature_names)



# To remove these features
    # -> min_df=3 in the TfidfVectorizer?

gq
capable
categories
depends
yd
exception
indication
tr
strictly
initially
extremely
fashion
steps
releases
ready
meetings
qq
remains
regularly
minimal
passes
improved
huge
ei
virtually
pacific
tz
bruce
princeton
gk
procedures
sections
closely
dates
special
characteristics
creative
mountain
contained
examined
martin
primarily
consists
slowly
bringing
mainly
arthur
occured
locations
prefer
ff
failing
ne
guidelines
measured
attached
brief
writers
jr
begins
repeat
santa
offered
greater
largely
similarly
pieces
turned
divided
financial
gw
dependent
mn
desired
conflicts
iron
unique
acceptance
rapidly
extend
closed
continuing
maryland
om
heavily
adequate
successful
quickly
shortly
compared
october
aspect
medium
quarter
dropped
dont
blow
substantial
perfectly
dedicated
furthermore
thu
introduction
massachusetts
benefit
oct
combination
stores
listed
half
appearance
telling
pre
covered
hoping
priority
tries
district
november
du
impressed
entering
obscure
assumes
wants
bothered
sorts
vi
utah
di

In [48]:
# Rerun a classification task from 3A with the reduced feature set

# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize the Decision Tree Classifier with the features from the reduced feature set
clf = DecisionTreeClassifier()

# Fit the classifier
clf.fit(ng_train_features, ng_train_labels)

# Predict the labels
ng_pred = clf.predict(ng_test_features)

# Calculate the accuracy
accuracy = accuracy_score(ng_test_labels, ng_pred)
print('Accuracy:', accuracy)



Accuracy: 0.06638343069569835


In [52]:
# Perform the same on mutual information
from sklearn.feature_selection import mutual_info_classif

# Select top 200 features
num_features = 200
mi = mutual_info_classif(ng_train_features, ng_train_labels)
indices = np.argsort(mi)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

for feature_name in feature_names:
    if feature_name.isalpha():
        print(feature_name)
    




recchi
uci
cpsr
emacs
kc
handler
fj
di
tz
te
iq
lg
yd
dg
gq
siggraph
quicktime
wb
powerpc
atlas
grounded
ships
revolver
homicides
stevens
amour
cylinders
ics
wk
pedal
explorer
rk
defaults
astronomical
orchid
ci
biz
breaker
midi
gk
dv
tq
qq
sectors
lk
paradox
ds
ar
kt
thu
bishop
shouting
karabagh
gr
ankara
baptism
probes
visualization
coalition
theists
infallible
dh
soderstrom
rt
abs
catcher
xm
alias
uv
rr
fg
nm
infections
mv
cx
gamma
consortium
ff
ozone
voting
mk
clinical
yeast
ww
weaver
winners
azerbaijanis
saint
grass
mob
briefing
mat
homicide
axe
miracles
ahl
tutorial
hst
ctrl
gw
raster
soda
pex
joystick
eu
lj
maine
truetype
cincinnati
esdi
fort
wt
immune
dd
ink
zealand
swedish
equation
bare
ghost
qur
allah
ra
mormon
tue
ncsl
peak
jp
cv
clemens
ir
parity
patent
decryption
mh
toolkits
om
outlet
xdm
ga
molecular
bo
sheets
throttle
mice
steam
lo
neck
viking
dk
stones
aclu
rally
einstein
clinic
wright
prevention
royal
fruit
commentary
pistol
spanish
presentations




In [53]:
# Rerun a classification task from 3A with the reduced feature set

# Initialize the Decision Tree Classifier with the features from the reduced feature set
clf = DecisionTreeClassifier()

# Fit the classifier
clf.fit(ng_train_features, ng_train_labels)

# Predict the labels
ng_pred = clf.predict(ng_test_features)

# Calculate the accuracy
accuracy = accuracy_score(ng_test_labels, ng_pred)
print('Accuracy:', accuracy)


Accuracy: 0.10435475305363781
