In [1]:
import os
import sys
import getpass
module_path = f"/gpfs/space/home/{getpass.getuser()}/ut-mit-news-classify/NYT/"
if module_path not in sys.path:
    sys.path.append(module_path)

import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.multioutput import ClassifierChain
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score # harsh subset accuracy

In [18]:
def stats(y_true, y_pred):
    total = len(y_true)
    all_labels_correct = 0

    for y_v, y_p in zip(vec2labels(y_true), vec2labels(y_pred)):
        if set(y_v) == set(y_p):
            all_labels_correct +=1

    print("Hamming loss %f, accuracy %f" % (metrics.hamming_loss(y_true, y_pred), metrics.accuracy_score(y_true, y_pred)))
    print("Number of correctly labeled articles out of a total of %d articles : %d" % (total, all_labels_correct))
    print("Manually calculated accuracy (for sanity check): ", all_labels_correct/total)
    

## Dummy baselines

In [3]:
from utils import load_nyt_data, labels2vec, vec2labels

train_articles, train_labels_lists, \
    test_articles, test_labels_lists = load_nyt_data(min_len=500, cutoff_tags=True)

Train data loaded.
Test data loaded.
Train articles after filtering: 1195938
Test articles after filtering: 133032


#### Most frequent (single) label

In [34]:
most_common_label_classifier = DummyClassifier('most_frequent')
most_common_label_classifier.fit(train_articles, train_labels_vecs)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [35]:
most_common_label_classifier.score(test_articles, test_labels_vecs)

0.0

#### Uniformly Random

In [58]:
most_common_label_classifier = DummyClassifier('uniform')
most_common_label_classifier.fit(train_articles, train_labels_vecs)

DummyClassifier(constant=None, random_state=None, strategy='uniform')

In [96]:
most_common_label_classifier.score(test_articles, test_labels_vecs)

0.0

#### Prior probability classifier

In [5]:
base_classifier = DummyClassifier(strategy='prior')
chain = ClassifierChain(base_classifier, order='random', random_state=0)
chain.fit(train_articles, labels2vec(train_labels_lists))

y_pred = chain.predict(test_articles)

stats(mlb, y_val, y_pred)


Hamming loss 0.005647, accuracy 0.000000
Number of correctly labeled articles out of a total of 10000 articles : 0
Manually calculated accuracy (for sanity check):  0.0


#### Most frequent labelset

In [10]:
# most common *labelset*, e.g. ('music', 'dance') vs ('politics', 'US government', 'elections')
from collections import Counter

label_sets = [tuple(lset) for lset in test_labels_lists]

most_common_labels = Counter(label_sets).most_common(5)
most_common_labelset = most_common_labels[0][0]
most_common_labelset

('weddings and engagements',)

In [19]:
test_preds = labels2vec([most_common_labelset] * len(test_labels_lists))

stats(labels2vec(test_labels_lists), test_preds)

Hamming loss 0.007541, accuracy 0.032812
Number of correctly labeled articles out of a total of 133032 articles : 4365
Manually calculated accuracy (for sanity check):  0.0328116543388057


## Learning baselines

In [4]:
with open('../NYTcorpus_train.p', mode='rb') as f:
    data = pickle.load(f)

In [3]:
labels = [a[3:] for a in data]
articles = [a[2] for a in data]

n_samples = 10000

x_train, x_val, y_train, y_val = train_test_split(articles, labels, train_size=n_samples, test_size=n_samples, random_state=0)

tfidf_vectorizer = TfidfVectorizer(
    max_features=40000,
    analyzer='word', 
    stop_words='english', 
    ngram_range=(1, 1), # logit accuracy with unigrams 0.1487, unigrams + bigrams 0.1524, unigrams + bigrams + trigrams 0.1537
    strip_accents='ascii')

tfidf_vectorizer.fit(x_train)

x_train = tfidf_vectorizer.transform(x_train)
x_val = tfidf_vectorizer.transform(x_val)

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_val = mlb.transform(y_val)


#### Naive Bayes

In [6]:
base_classifier = ComplementNB()
chain = ClassifierChain(base_classifier, order='random', random_state=0)
chain.fit(x_train, y_train)

y_pred = chain.predict(x_val)

stats(mlb, y_val, y_pred)


Hamming loss 0.005167, accuracy 0.116000
Number of correctly labeled articles out of a total of 10000 articles : 1160
Manually calculated accuracy (for sanity check):  0.116


#### Logistic Regression

In [7]:
base_classifier = LogisticRegression(solver='lbfgs', random_state=0)
chain = ClassifierChain(base_classifier, order='random', random_state=0)
chain.fit(x_train, y_train)

y_pred = chain.predict(x_val)

stats(mlb, y_val, y_pred)


Hamming loss 0.004829, accuracy 0.148700
Number of correctly labeled articles out of a total of 10000 articles : 1487
Manually calculated accuracy (for sanity check):  0.1487
