In [1]:
%%capture
!pip install wget

In [2]:
import numpy as np
import pandas as pd

# Extra imports
from abc import abstractmethod, ABC
import os
import wget
from pathlib import Path

from nltk.tokenize import RegexpTokenizer
import re
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, _BaseNB, _BaseDiscreteNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from scipy import stats

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import random
random.seed()
from typing import Tuple, List, Optional, Union, Dict

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Please add necessary imports here
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import wget
from pathlib import Path
filename = wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/20_newsgroups.zip", "20_newsgroups.zip")
_ = wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/training_files_Q7.txt", "training_files_Q7.txt")
_ = wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/testing_files_Q7.txt", "testing_files_Q7.txt")

In [4]:
%%capture
!unzip 20_newsgroups.zip

In [5]:
DATA_DIR = "20_newsgroups"
ALL_FILES = [pth for pth in Path(DATA_DIR).glob("**/*") if pth.is_file() and not pth.name.startswith(".")]

# Q7

## Q7(a)

use the following code cell to implement your feature encoding

In [6]:
def clean_file_text(text):
    new_text = re.sub("Newsgroups:.*?\n", "", text)
    new_text = re.sub("Xref:.*?\n", "", new_text)
    new_text = re.sub("Path:.*?\n", "", new_text)
    new_text = re.sub("Date:.*?\n", "", new_text)
    new_text = re.sub("Followup-To:.*?\n", "", new_text)
    new_text = re.sub("Lines:.*?\n", "", new_text)
    new_text = re.sub("Reply-To:.*?\n", "", new_text)
    new_text = re.sub("Message-ID:.*?\n", "", new_text)
    new_text = re.sub("From:.*?\n", "", new_text)
    new_text = re.sub("NNTP-Posting-Host:.*?\n", "", new_text)
    return new_text

In [7]:
def corpus_words(file_list):
    data = []
    for file_path in file_list:
      with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        file_data = file.read()
        file_data = clean_file_text(file_data)
        data.append(file_data)
    tokenizer = RegexpTokenizer(r"\w+")
    # vectorizer = CountVectorizer(decode_error='ignore', lowercase=True, tokenizer=tokenizer.tokenize, stop_words='english', binary=True)
    vectorizer = TfidfVectorizer(sublinear_tf=True, lowercase=True, tokenizer=tokenizer.tokenize, max_df=0.8, min_df=3, stop_words="english", norm="l2", binary=False, ngram_range=(1,1))
    sparse_matrix = vectorizer.fit_transform(data)
    return sparse_matrix

In [8]:
def get_topic_name(file_path):
    return file_path.parent.name

def get_target(topic_name):
    topics = ["talk.politics.mideast", "rec.autos", "comp.sys.mac.hardware", "alt.atheism", "rec.sport.baseball",
     "comp.os.ms-windows.misc", "rec.sport.hockey", "sci.crypt", "sci.med", "talk.politics.misc",
     "rec.motorcycles", "comp.windows.x", "comp.graphics", "comp.sys.ibm.pc.hardware", "sci.electronics",
     "talk.politics.guns", "sci.space", "soc.religion.christian", "misc.forsale", "talk.religion.misc"]
    return topics.index(topic_name)

In [9]:
print(corpus_words(ALL_FILES).shape)



(19997, 56178)


In [10]:
def data_q7(file_list):
    X = None
    y = None

    #Please remember to put index for your dataframe as the file name
    #For example: pd.DataFrame(data, index=[str(f) for f in file_list], columns=[...])

    X = pd.DataFrame.sparse.from_spmatrix(corpus_words(file_list), index=list(map(str, file_list)))

    # Create a dataframe of targets (y)
    y = [get_target(get_topic_name(file_path)) for file_path in file_list]

    selector = SelectKBest(chi2, k=20000)
    selector.fit_transform(X,y)
    cols = selector.get_support(indices=True)
    X = X.iloc[:,cols]

    # validate return types
    assert isinstance(X, pd.DataFrame) and isinstance(y, list), "incorrect return types"

    return X, y

## Q7(b)

Use the following code cell to implement your model

In [11]:
# for clf, name in (
#     (LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
#     (RidgeClassifier(alpha=1.0, solver="sparse_cg"), "Ridge Classifier"),
#     (KNeighborsClassifier(n_neighbors=100), "kNN"),
#     (RandomForestClassifier(), "Random Forest"),
#     # L2 penalty Linear SVC
#     (LinearSVC(C=0.1, dual=False, max_iter=1000), "Linear SVC"),
#     # L2 penalty Linear SGD
#     (
#         SGDClassifier(
#             loss="log_loss", alpha=1e-4, n_iter_no_change=3, early_stopping=True
#         ),
#         "log-loss SGD",
#     ),
#     # NearestCentroid (aka Rocchio classifier)
#     (NearestCentroid(), "NearestCentroid"),
#     # Sparse naive Bayes classifier
#     (ComplementNB(alpha=0.1), "Complement naive Bayes"),
# ):
def build_model_q7():
    # Write your code here, define your model and return it
    MODELQ7 = MultinomialNB(alpha=0.01)
    return MODELQ7

Code for evaluating p at k

In [12]:
from pandas.core.arrays import numpy_
def calculate_average_precision_at_k(model_q7, data_func, all_files, training_files, testing_files, k=None):

    training_files = [str(f) for f in open(training_files, mode='r').read().splitlines()]
    testing_files = [str(f) for f in open(testing_files, mode='r').read().splitlines()]
    if k is None:
        k = len(testing_files)

    X, y = data_func(all_files)
    X["gt"] = y
    training = X.loc[training_files]
    X_train = training.loc[:, training.columns!="gt"]
    y_train = training["gt"].values

    testing = X.loc[testing_files]
    X_test = testing.loc[:, testing.columns!="gt"]
    y_test = testing["gt"].values

    model_q7.fit(X_train, y_train)
    y_pred = model_q7.predict(X_test)
    y_pred_prob = model_q7.predict_proba(X_test)
    confidences = np.max(y_pred_prob, axis=1)

    p_at_k = []
    rel_at_k = []
    confidence_order = np.argsort(confidences)
    for i in range(1, k+1):
        top_confidence = confidence_order[-i:]
        pred_top_i = y_pred[top_confidence]
        gt_top_i = np.array(y_test)[top_confidence]
        p_at_i = np.sum(pred_top_i == gt_top_i) / i
        rel_at_i = (pred_top_i[0] == gt_top_i[0])
        p_at_k.append(p_at_i)
        rel_at_k.append(rel_at_i)
    print(f"average precision at {k} is {np.dot(p_at_k, rel_at_k) / k}")
    return np.dot(p_at_k, rel_at_k) / k

In [13]:
# Example usage:
######This line of code must be able to run on Google Colab in under 7 minutes.#####
######Code that runs longer than 7 minutes on the autograder will receive 0 marks for Q7#####
m = calculate_average_precision_at_k(build_model_q7(), data_q7, ALL_FILES, "training_files_Q7.txt", "testing_files_Q7.txt")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["gt"] = y


average precision at 4000 is 0.8773310138389259


# Q7(c)

• A clear and concise description of your chosen feature set and feature encoding \
Feature encoding: tf-idf vector. I chose to set the arguments as sublinear_tf=True, lowercase=True, tokenizer=tokenizer.tokenize, max_df=0.8, min_df=3, stop_words="english", norm="l2", binary=False, ngram_range=(1,1). This was a result of empirically tuning hyperparameters using random search technique. \
Feature set: I used the SelectKBest function from sk-learn. I used the chi2 metric to determine the 20000 most relevant features to use as input to the classifier train_and_predict function. 20000 worked best empirically by a small margin, and allows us not to overfit to the train data. This gave superior performance to just using the most common words

• The name of the classifier you chose \
Multinomial Naive Bayes

• Why you chose the feature set, feature encoding, and classifier you used \
For feature encoding, i used tf-idf because it gives us not only the frequency of words for all documents, but also accounts for the relevance of that word to each topic. Tf-idf is well-used in information retrieval and is an efficient and widely taught method of creating sparse feature matrices, so I expect it to generalize well to different tasks and datasets. For feature set, i used SelectKBest with chi2 metric; because it was easy to implement (built-in scikit-learn method). chi2 worked best empirically over f_classif and mutual_info_classif, so I selected it. The number of features chosen (20,000) was also selected empirically. If we select all features, I fear we may overfit to words that wont appear in out-of-sample test sets.

• The final AP performance that your choices attained. We will verify this score by running
the model returned by your build model q7 function, and the data as returned by your
data Q7 function. \
0.8773310138389259. Not good better than the baseline at least...