In [78]:
%%capture
!pip install wget

In [79]:
import numpy as np
import pandas as pd
# Please add other necessary imports here
from nltk.tokenize import RegexpTokenizer
import re

# Please add necessary imports here
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

In [80]:
import wget
from pathlib import Path

# Fetch the dataset archive and the fixed train/test split listings from the
# course repository. A file that is already on disk is not downloaded again,
# so re-running this cell (e.g. Restart & Run All) is cheap and does not leave
# duplicate copies behind.
DATASET_URL_ROOT = "https://github.com/MIE451-1513-2023/course-datasets/raw/main"
for _name in ("20_newsgroups.zip", "training_files_Q7.txt", "testing_files_Q7.txt"):
    if not Path(_name).exists():
        wget.download(f"{DATASET_URL_ROOT}/{_name}", _name)

# Local name of the dataset archive (kept for code that refers to `filename`).
filename = "20_newsgroups.zip"

In [81]:
%%capture
!unzip 20_newsgroups.zip

In [82]:
# Directory that holds the extracted dataset.
DATA_DIR = "20_newsgroups"

# Every regular, non-hidden file found anywhere beneath DATA_DIR.
ALL_FILES = [
    entry
    for entry in Path(DATA_DIR).rglob("*")
    if not entry.name.startswith(".") and entry.is_file()
]

# Q7

## Q7(a)

Use the following code cell to implement your feature encoding.

In [83]:
def data_q7(file_list, num_words=1000):
    """Encode newsgroup posts as TF-IDF n-gram features with topic labels.

    Args:
        file_list: Iterable of paths to the documents to encode. The name of
            each file's parent directory must be one of the 20 topic names
            listed in ``topics`` below.
        num_words: Currently unused; kept so the signature stays compatible
            with existing callers.

    Returns:
        A tuple ``(X, y)``. ``X`` is a sparse-backed ``pd.DataFrame`` with one
        row per file in ``file_list`` (indexed by the file path written with
        forward slashes) and one column per retained n-gram. ``y`` is a list
        of integer topic ids aligned with the rows of ``X``.

    Raises:
        ValueError: If a file's parent directory is not a known topic name.
    """
    # Header fields removed from every document before vectorizing. They are
    # applied one after another, in this order.
    # NOTE(review): the patterns are not anchored to the start of a line, so
    # they also match mid-line (e.g. "Lines:" inside "Guidelines:").
    header_fields = (
        "Newsgroups", "Xref", "Path", "Date", "Followup-To",
        "Lines", "Reply-To", "Message-ID", "From", "NNTP-Posting-Host",
    )

    # The position of a topic in this list is its integer class id.
    topics = ["talk.politics.mideast", "rec.autos", "comp.sys.mac.hardware", "alt.atheism", "rec.sport.baseball",
              "comp.os.ms-windows.misc", "rec.sport.hockey", "sci.crypt", "sci.med", "talk.politics.misc",
              "rec.motorcycles", "comp.windows.x", "comp.graphics", "comp.sys.ibm.pc.hardware", "sci.electronics",
              "talk.politics.guns", "sci.space", "soc.religion.christian", "misc.forsale", "talk.religion.misc"]

    def clean_file_text(text):
        """Delete each listed header field, from its name to the end of that line."""
        for field in header_fields:
            text = re.sub(field + ":.*?\n", "", text)
        return text

    def get_topic_name(file_path):
        """Return the name of the directory that directly contains ``file_path``."""
        return Path(file_path).parent.name

    def get_target(topic_name):
        """Map a topic name to its integer class id (ValueError if unknown)."""
        return topics.index(topic_name)

    class CustomTokenizer:
        """Callable tokenizer: split on word characters, drop stop words, lemmatize."""

        def __init__(self):
            # Make sure the NLTK resources used below are available locally.
            nltk.download('wordnet')
            nltk.download('stopwords')
            self.wnl = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
            self.tokenizer = RegexpTokenizer(r"\w+")

        def __call__(self, doc):
            lowered = (word.lower() for word in self.tokenizer.tokenize(doc))
            return [self.wnl.lemmatize(word) for word in lowered if word not in self.stop_words]

    # Materialize once so any iterable works and is traversed consistently for
    # the corpus, the index and the labels.
    files = list(file_list)

    # Use the files the caller asked for (not the module-level ALL_FILES), so
    # the rows of X always correspond to `file_list`.
    corpus = []
    for file_path in files:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            corpus.append(clean_file_text(file.read()))

    # Uni-, bi- and tri-grams that occur in at least 7 documents.
    vectorizer = TfidfVectorizer(tokenizer=CustomTokenizer(), ngram_range=(1, 3), min_df=7)
    X = vectorizer.fit_transform(corpus)
    X = pd.DataFrame.sparse.from_spmatrix(
        X,
        index=[str(f).replace('\\', '/') for f in files],  # normalize Windows separators
        columns=vectorizer.get_feature_names_out(),
    )
    y = [get_target(get_topic_name(file_path)) for file_path in files]
    # validate return types
    assert isinstance(X, pd.DataFrame) and isinstance(y, list), "incorrect return types"

    return X, y

## Q7(b)

Use the following code cell to implement your model

In [84]:
def build_model_q7():
    """Build the (unfitted) Q7 classifier.

    Returns:
        A ``CalibratedClassifierCV`` (2-fold) wrapping a ``LinearSVC`` with
        ``C=0.7``. The calibration wrapper is what supplies ``predict_proba``,
        which the evaluation code calls.
    """
    # Fix the seed of LinearSVC's internal random number generator so that
    # repeated runs on the same data fit the same model.
    linear_svm = LinearSVC(C=0.7, random_state=0)
    return CalibratedClassifierCV(estimator=linear_svm, cv=2)

Code for evaluating average precision at k:

In [85]:
def calculate_average_precision_at_k(model_q7, data_func, all_files, training_files, testing_files, k=None):
    """Fit ``model_q7`` on the training split and score it with average precision at k.

    Test documents are ranked by the model's confidence (its highest class
    probability). For each cut-off ``i`` in ``1..k`` the precision over the
    ``i`` most confident predictions is computed, and the precisions at the
    ranks whose own prediction is correct are summed and divided by ``k``.

    Args:
        model_q7: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        data_func: Callable taking ``all_files`` and returning ``(X, y)``, where
            ``X`` is a DataFrame indexed by file name and ``y`` is a list of
            labels aligned with the rows of ``X``.
        all_files: Passed through to ``data_func`` unchanged.
        training_files: Path to a text file listing one training file name per line.
        testing_files: Path to a text file listing one testing file name per line.
        k: Number of top-ranked predictions to evaluate; defaults to the number
            of listed testing files.

    Returns:
        The average precision at ``k``.
    """
    # Read the split listings with context managers so the handles are closed.
    with open(training_files, mode='r') as listing:
        training_files = listing.read().splitlines()
    with open(testing_files, mode='r') as listing:
        testing_files = listing.read().splitlines()
    if k is None:
        k = len(testing_files)

    X, y = data_func(all_files)
    # Keep the labels in their own Series rather than in an extra "gt" column:
    # this leaves the returned frame untouched and cannot overwrite (and then
    # drop) a genuine feature column that happens to be named "gt".
    labels = pd.Series(y, index=X.index)

    X_train = X.loc[training_files]
    y_train = labels.loc[training_files].values

    X_test = X.loc[testing_files]
    y_test = labels.loc[testing_files].values

    model_q7.fit(X_train, y_train)
    y_pred = model_q7.predict(X_test)
    y_pred_prob = model_q7.predict_proba(X_test)
    confidences = np.max(y_pred_prob, axis=1)

    p_at_k = []
    rel_at_k = []
    # Ascending order, so the last i entries are the i most confident predictions.
    confidence_order = np.argsort(confidences)
    for i in range(1, k+1):
        top_confidence = confidence_order[-i:]
        pred_top_i = y_pred[top_confidence]
        gt_top_i = np.array(y_test)[top_confidence]
        p_at_i = np.sum(pred_top_i == gt_top_i) / i
        # Element 0 of the slice is the prediction at rank i itself.
        rel_at_i = (pred_top_i[0] == gt_top_i[0])
        p_at_k.append(p_at_i)
        rel_at_k.append(rel_at_i)

    average_precision = np.dot(p_at_k, rel_at_k) / k
    print(f"average precision at {k} is {average_precision}")
    return average_precision

In [86]:
# Example usage:
# NOTE: this call must be able to run on Google Colab in under 7 minutes.
# Code that runs longer than 7 minutes on the autograder will receive 0 marks for Q7.
m = calculate_average_precision_at_k(
    model_q7=build_model_q7(),
    data_func=data_q7,
    all_files=ALL_FILES,
    training_files="training_files_Q7.txt",
    testing_files="testing_files_Q7.txt",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kirby\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kirby\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


average precision at 4000 is 0.8600042117237919


## Q7(c)

I encoded the documents using TF-IDF, with lowercasing, stop-word filtering and lemmatization as preprocessing. I kept the unigrams, bigrams and trigrams that have a document frequency of at least 7 (a threshold chosen to reduce computation time).

I used a linear support vector classifier (LinearSVC) wrapped in a CalibratedClassifierCV, which applies Platt scaling so that class probabilities (predict_proba) are available.

I chose this feature set because it captures information from n-grams of various lengths while filtering out overly specific n-grams. I chose TF-IDF encoding to retain information about word frequency while normalizing by document frequency. I chose SVC because it demonstrated much greater accuracy than NB, LR and K-means, and because research supports it as a good choice for this task (https://arxiv.org/abs/2211.02563).

The final average precision (AP) is approximately 0.86.