## Q3: Pairwise feature selection for text 

Use scikit-learn built in "chi2" criteria to select top 200 features, then rerun classification tasks, compare performance with 3A Q1. 

Repeat pipeline with "mutual information criteria. 


In [50]:
# import newsgroups dataset:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
ng_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
ng_train_feat = ng_train.data
ng_train_labels = ng_train.target

ng_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
ng_test_feat = ng_test.data
ng_test_labels = ng_test.target

In [52]:
# Initialize the TFIDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=15)

# Fit and transform the training features
ng_train_features = tfidf_vectorizer.fit_transform(ng_train_feat)

# Transform the testing features
ng_test_features = tfidf_vectorizer.transform(ng_test_feat)

In [53]:
# observe dimensionality:
print(ng_train_features.shape)
print(ng_test_features.shape)

(11314, 5000)
(7532, 5000)


In [54]:
from sklearn.feature_selection import chi2

# isalpha function

# Select top 200 features
num_features = 200
chi2score = chi2(ng_train_features, ng_train_labels)[0]
indices = np.argsort(chi2score)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

for feature_name in feature_names:
    if feature_name.isalpha():
        print(feature_name)
    
    
# print out the top 200 features:
#print(feature_names)



# To remove these features
    # -> min_df=3 in the TfidfVectorizer?

gq
capable
categories
depends
yd
exception
indication
tr
strictly
initially
extremely
fashion
steps
releases
ready
meetings
qq
remains
regularly
minimal
passes
improved
huge
ei
virtually
pacific
tz
bruce
princeton
gk
procedures
sections
closely
dates
special
characteristics
creative
mountain
contained
examined
martin
primarily
consists
slowly
bringing
mainly
arthur
occured
locations
prefer
ff
failing
ne
guidelines
measured
attached
brief
writers
jr
begins
repeat
santa
offered
greater
largely
similarly
pieces
turned
divided
financial
gw
dependent
mn
desired
conflicts
iron
unique
acceptance
rapidly
extend
closed
continuing
maryland
om
heavily
adequate
successful
quickly
shortly
compared
october
aspect
medium
quarter
dropped
dont
blow
substantial
perfectly
dedicated
furthermore
thu
introduction
massachusetts
benefit
oct
combination
stores
listed
half
appearance
telling
pre
covered
hoping
priority
tries
district
november
du
impressed
entering
obscure
assumes
wants
bothered
sorts
vi
utah
di

In [55]:
# Rerun a classification task from 3A with the reduced feature set 
# More suscipetable to overfitting

# Use logistic regression to classify the newsgroup data with the top 200 features selected by chi2.
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the model on the training data with the selected features

log_reg.fit(ng_train_features, ng_train_labels)

# Predict on the test data

ng_test_predictions = log_reg.predict(ng_test_features)

# Evaluate the model's performance using accuracy

from sklearn.metrics import accuracy_score

# Calculate the accuracy of the model on the test set
accuracy = accuracy_score(ng_test_labels, ng_test_predictions)

print(f'Accuracy of Logistic Regression on the test set with top {num_features} features: {accuracy:.4f}')




Accuracy of Logistic Regression on the test set with top 200 features: 0.0686


In [60]:
# Perform the same on mutual information
from sklearn.feature_selection import mutual_info_classif

# Select top 200 features
num_features = 200
mi = mutual_info_classif(ng_train_features, ng_train_labels)
indices = np.argsort(mi)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())[indices]

# Get the feature names
feature_names = [feature_names[i] for i in range(num_features)]

# Get the feature indices
feature_indices = [indices[i] for i in range(num_features)]

# Select the top 200 features
ng_train_features = ng_train_features[:, feature_indices]
ng_test_features = ng_test_features[:, feature_indices]

for feature_name in feature_names:
    if feature_name.isalpha():
        print(feature_name)
    




In [61]:
# Rerun a classification task from 3A with the reduced feature set

# Use logistic regression to classify the newsgroup data with the top 200 features selected by chi2.
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the model on the training data with the selected features

log_reg.fit(ng_train_features, ng_train_labels)

# Predict on the test data

ng_test_predictions = log_reg.predict(ng_test_features)

# Evaluate the model's performance using accuracy

from sklearn.metrics import accuracy_score

# Calculate the accuracy of the model on the test set
accuracy = accuracy_score(ng_test_labels, ng_test_predictions)

print(f'Accuracy of Logistic Regression on the test set with top {num_features} features: {accuracy:.4f}')

Accuracy of Logistic Regression on the test set with top 200 features: 0.0686


In [58]:
### Q4: Run a strongL1-regularized regression (library) on 20NG
#  select 200 features (words) based on regression coefficients absolute value. 
# Then reconstruct the dataset with only these features, and rerun any of the classification tasks

import torch
import torch.nn as nn
import torch.optim as optim

# Define the model
class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

# Set hyperparameters
input_dim = ng_train_features.shape[1]  # Match the number of features (200)
output_dim = 1
learning_rate = 0.01
l1_lambda = 0.01 # L1 regularization strength

# Instantiate the model, loss function, and optimizer
model = LinearRegression(input_dim=input_dim, output_dim=output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
epochs = 1000
for epoch in range(epochs):
    # Forward pass
    outputs = model(torch.tensor(ng_train_features.toarray(), dtype=torch.float))  # Convert sparse matrix to dense and float
    loss = criterion(outputs, torch.tensor(ng_train_labels).float().view(-1, 1))

    # Add L1 regularization term
    l1_penalty = 0
    for param in model.parameters():
        l1_penalty += torch.sum(torch.abs(param))
    loss += l1_lambda * l1_penalty

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')


Epoch [10/1000], Loss: 114.5364
Epoch [20/1000], Loss: 112.6901
Epoch [30/1000], Loss: 110.8723
Epoch [40/1000], Loss: 109.0854
Epoch [50/1000], Loss: 107.3296
Epoch [60/1000], Loss: 105.6041
Epoch [70/1000], Loss: 103.9090
Epoch [80/1000], Loss: 102.2440
Epoch [90/1000], Loss: 100.6086
Epoch [100/1000], Loss: 99.0027
Epoch [110/1000], Loss: 97.4257
Epoch [120/1000], Loss: 95.8775
Epoch [130/1000], Loss: 94.3574
Epoch [140/1000], Loss: 92.8655
Epoch [150/1000], Loss: 91.4009
Epoch [160/1000], Loss: 89.9638
Epoch [170/1000], Loss: 88.5536
Epoch [180/1000], Loss: 87.1699
Epoch [190/1000], Loss: 85.8125
Epoch [200/1000], Loss: 84.4809
Epoch [210/1000], Loss: 83.1753
Epoch [220/1000], Loss: 81.8946
Epoch [230/1000], Loss: 80.6389
Epoch [240/1000], Loss: 79.4081
Epoch [250/1000], Loss: 78.2010
Epoch [260/1000], Loss: 77.0182
Epoch [270/1000], Loss: 75.8591
Epoch [280/1000], Loss: 74.7234
Epoch [290/1000], Loss: 73.6102
Epoch [300/1000], Loss: 72.5201
Epoch [310/1000], Loss: 71.4518
Epoch [3

In [59]:
# Then, extract weights from the model to identify the top 200 features based on the absolute value of the coefficients.
# Extract the weights from the model

with torch.no_grad():
    
    weights = model.linear.weight.data.numpy().flatten()  # Extract the weights from the linear layer
    # Get the absolute values of the weights
    abs_weights = np.abs(weights)


# Get the indices of the top 200 features based on the absolute values of the weights

top_indices = np.argsort(abs_weights)[-num_features:]  # Get the indices of the top 200 features

# Get the feature names corresponding to the top indices

# Get the feature names from the vectorizer
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())  # Get the feature names from the vectorizer
# Select the top feature names based on the indices

top_feature_names = feature_names[top_indices]  # Get the top feature names based on the indices

# Print the top feature names
print("Top 200 features based on L1-regularized regression coefficients:")

for feature_name in top_feature_names:
    
  # Filter to only print alphabetic words
  
  if feature_name.isalpha():
    print(feature_name)
    





Top 200 features based on L1-regularized regression coefficients:
