## Q3: Pairwise feature selection for text 

Use scikit-learn built in "chi2" criteria to select top 200 features, then rerun classification tasks, compare performance with 3A Q1. 

Repeat pipeline with "mutual information criteria. 


In [33]:
# import newsgroups dataset:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
ng_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
ng_train_feat = ng_train.data
ng_train_labels = ng_train.target

ng_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
ng_test_feat = ng_test.data
ng_test_labels = ng_test.target

In [35]:
# Initialize the TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training features
ng_train_features = tfidf_vectorizer.fit_transform(ng_train_feat)

# Transform the testing features
ng_test_features = tfidf_vectorizer.transform(ng_test_feat)

In [36]:
# observe dimensionality:
print(ng_train_features.shape)
print(ng_test_features.shape)

(11314, 5000)
(7532, 5000)


In [37]:
from sklearn.feature_selection import chi2

# Select top 200 features
num_features = 200
chi2score = chi2(ng_train_features, ng_train_labels)[0]
indices = np.argsort(chi2score)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

# print out the top 200 features:
print(feature_names)

# remove features that have length 3 or less:
feature_names = [feature for feature in feature_names if len(feature) > 3]
print(feature_names)



['f9f9', '6ei4', '9l3', '0qax', '1f9', '9f8', 'yf9', 'v9fq', 'i0l', 'u3l', '7klj', '2tg', 'a945', '1z6e', 'm1t', 'a865', 'q30t', 'bs0t', '5g9p', 'a86r', 'u34u', 'bhjn', '0tq', 'mg9v', 'bxom', 'f9d', 'r186', '5g9v', 'g9p', 'gizw', 'ghj', '9f9', 'nuy', 'wwiz', 'p4u', '2tct', 'm9v', '1fpl', 'qtm', '7ex', 'mtm', '7ez', 'mb8f', 'b8g', 'wm4u', '7kn', '1d9l', 'r8f', 'b8e', 'nrhj', 'qax', '1eq', 'b4q', 'vmk', 'uz', 'f0', '7z', 'uy', 'd6', '7ey', '6um', 'okz', 'gq', 'bxn', '15o', '4e', 'fy', '6e', 'q5o', 'oj', '1f', 'yd', '0m', '1s', '7r', 'yj', 'capable', 'ij', 'qv', 'gy', 'kn', 'depends', '0i', 'maxbyte', 'giz', 'exception', 'indication', 'ei', 'categories', '9l', 'extremely', 'qq', 'strictly', '8n', '3o', 'fashion', 'initially', 'steps', '2tm', 'yx', 'q45', 'kj', 'releases', '0q', '2j', 'fyn', 'ready', '9d', 'qk', 'hj', 'gk', 'tz', '9f', 'dy', 'bl', 'remains', '4t', 'regularly', 'minimal', 'virtually', '133', '6c', '3c', '3p', 'z5', 'passes', 'improved', 'vm', 'huge', 'sections', 'c8v', 'bru

In [38]:
# Perform the same on mutual information
from sklearn.feature_selection import mutual_info_classif

# Select top 200 features
num_features = 200
mi = mutual_info_classif(ng_train_features, ng_train_labels)
indices = np.argsort(mi)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

# print out the top 200 features:
print(feature_names)

# remove words less than 3 characters
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000, min_df=3)




['1983', '1f', '192', '182', '30', '34u', '3b', '32', '2j', '2q', '2di', '2mb', '102', '170', '15o', '240', '0d', '04', '10', '1024', '104', '147', '0t', '0qax', '0q', '2d', '1q', '03', '1f9', '214', '1972', '130', '146', '127', '13', '132', '101', '17', '109', '106', '105', '133', '0tq', '0i', '120', '13q', '1993', '20', '25', '135', '000', '01', '02', '040', '05', '06', '07', '08', '09', '13p', '100', '1000', '1024x768', '0m', '11', '14', '140', '144', '145', '180', '128', '125', '00', '12', '117', '112', '110', '200', '2b', '3k', '1200', '1979', '1p', '1977', '1d9l', '16m', '1984', '212', '1eq', '1968', '1w', '1978', '1t', '129', '1974', '23', '1920', '253', '1982', '160', '15', '29', '22', '16', '3d', '28', '3o', '1914', '1970', '19', '39', '400', '1989', '45', '213', '1mb', '1980', '150', '1z4', '1964', '1967', '202', '1992', '1st', '18', '1z6e', '1y', '3c', '1990', '26', '36', '34l', '2tct', '27', '1915', '37', '1987', '256', '3a', '3p', '280', '2pl', '3l', '34', '2tg', '1988', '

