In [5]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.metrics as metrics
from sklearn.multioutput import ClassifierChain
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression


In [6]:
# Load the pickled object stored in ../NYTcorpus_train.p into `data`.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# this must only ever be pointed at a corpus file from a trusted source.
with open('../NYTcorpus_train.p', mode='rb') as f:
    data = pickle.load(f)

In [3]:
# Each record in `data` is read positionally: element 2 is used as the text
# to vectorise and everything from element 3 onward as that record's labels.
# NOTE(review): the meaning of elements 0-1 is not visible here — confirm.
articles = [record[2] for record in data]
labels = [record[3:] for record in data]

# Number of records drawn for the training split and, separately, for the
# validation split (the two splits are disjoint).
n_samples = 10000

x_train, x_val, y_train, y_val = train_test_split(
    articles,
    labels,
    train_size=n_samples,
    test_size=n_samples,
    random_state=0,  # fixed seed so the split is reproducible
)

tfidf_vectorizer = TfidfVectorizer(
    max_features=40000,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),  # logit accuracy with unigrams 0.1487, unigrams + bigrams 0.1524, unigrams + bigrams + trigrams 0.1537
    strip_accents='ascii')

# Learn the vocabulary / IDF weights on the training split only and vectorise
# it in the same pass (fit_transform gives the same matrix as fit followed by
# transform, without tokenising the training corpus twice). The validation
# split is transformed with the already-fitted vectorizer so no information
# from it leaks into the features.
x_train = tfidf_vectorizer.fit_transform(x_train)
x_val = tfidf_vectorizer.transform(x_val)

# Turn each record's label collection into a binary indicator row. The label
# set is learned from the training split only.
# NOTE(review): labels that occur only in the validation split are unknown to
# `mlb`; scikit-learn is expected to warn and drop them — confirm this is the
# intended handling.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_val = mlb.transform(y_val)

# Sanity checks: features and targets must stay row-aligned in both splits.
assert x_train.shape[0] == y_train.shape[0] == n_samples
assert x_val.shape[0] == y_val.shape[0] == n_samples


In [4]:
def stats(mlb, y_val, y_pred):
    """Print evaluation figures for one set of multi-label predictions.

    Three lines are written to stdout: the Hamming loss and accuracy as
    computed by ``metrics``, the number of rows whose predicted label set
    matches the true label set exactly, and that count divided by the number
    of rows (a hand-rolled cross-check of the accuracy figure).

    Parameters
    ----------
    mlb : object with ``inverse_transform``
        Binarizer used to map indicator rows back to label collections.
    y_val : indicator rows holding the true labels.
    y_pred : indicator rows holding the predicted labels, row-aligned
        with ``y_val``.

    Returns
    -------
    None
    """
    n_articles = len(y_val)

    # Decode both matrices back to label collections and compare them as sets
    # so that label order within a row does not matter.
    true_label_sets = mlb.inverse_transform(y_val)
    pred_label_sets = mlb.inverse_transform(y_pred)
    exact_matches = sum(
        1
        for truth, guess in zip(true_label_sets, pred_label_sets)
        if set(truth) == set(guess)
    )

    hamming = metrics.hamming_loss(y_val, y_pred)
    accuracy = metrics.accuracy_score(y_val, y_pred)

    print("Hamming loss %f, accuracy %f" % (hamming, accuracy))
    print("Number of correctly labeled articles out of a total of %d articles : %d" % (n_articles, exact_matches))
    print("Manually calculated accuracy (for sanity check): ", exact_matches / n_articles)
    

In [5]:
# Baseline: a chain of DummyClassifiers, which do not learn from the input
# features, to establish the floor that the real models below must beat.
#
# The base estimator is seeded explicitly (as the LogisticRegression cell does)
# rather than relying on the chain's random_state alone: DummyClassifier's
# sampling-based strategies draw random predictions, and without a seed the
# reported baseline numbers would differ from run to run.
# NOTE(review): which strategy is the default depends on the installed
# scikit-learn version; the seed is simply unused for deterministic strategies.
base_classifier = DummyClassifier(random_state=0)
chain = ClassifierChain(base_classifier, order='random', random_state=0)
chain.fit(x_train, y_train)

y_pred = chain.predict(x_val)

stats(mlb, y_val, y_pred)


Hamming loss 0.005647, accuracy 0.000000
Number of correctly labeled articles out of a total of 10000 articles : 0
Manually calculated accuracy (for sanity check):  0.0


In [6]:
# Complement Naive Bayes as the per-label model inside a classifier chain.
# The chain order is drawn at random, with random_state=0 keeping that draw
# reproducible between runs.
base_classifier = ComplementNB()
chain = ClassifierChain(base_classifier, random_state=0, order='random').fit(x_train, y_train)

# Predict the full label matrix for the validation split and report scores.
y_pred = chain.predict(x_val)
stats(mlb, y_val, y_pred)


Hamming loss 0.005167, accuracy 0.116000
Number of correctly labeled articles out of a total of 10000 articles : 1160
Manually calculated accuracy (for sanity check):  0.116


In [7]:
# Logistic regression (lbfgs solver, fixed seed) as the per-label model inside
# a classifier chain. The chain order is drawn at random, with random_state=0
# keeping that draw reproducible between runs.
base_classifier = LogisticRegression(random_state=0, solver='lbfgs')
chain = ClassifierChain(base_classifier, random_state=0, order='random').fit(x_train, y_train)

# Predict the full label matrix for the validation split and report scores.
y_pred = chain.predict(x_val)
stats(mlb, y_val, y_pred)


Hamming loss 0.004829, accuracy 0.148700
Number of correctly labeled articles out of a total of 10000 articles : 1487
Manually calculated accuracy (for sanity check):  0.1487
