In [1]:
pip install --upgrade mit-news-classify

Collecting mit-news-classify
  Downloading mit_news_classify-0.9.2.3-py3-none-any.whl (32 kB)
Installing collected packages: mit-news-classify
  Attempting uninstall: mit-news-classify
    Found existing installation: mit-news-classify 0.9.2.2
    Uninstalling mit-news-classify-0.9.2.2:
      Successfully uninstalled mit-news-classify-0.9.2.2
Successfully installed mit-news-classify-0.9.2.3
Note: you may need to restart the kernel to use updated packages.


In [8]:
%%time
# Fetch the pre-trained model files used by the mitnewsclassify models.
# Per the log printed by this cell, files land in the installed package's
# `data` directory and already-existing model directories are overwritten.
from mitnewsclassify import download

download.download()

model_2500_500_50.h5?dl=1: 0.00B [00:00, ?B/s]Package directory: /home/mbaliesnyi/anaconda3/envs/nlp/lib/python3.8/site-packages/mitnewsclassify
/home/mbaliesnyi/anaconda3/envs/nlp/lib/python3.8/site-packages/mitnewsclassify/data directory already exists, some other models downloaded. Continuing...
/home/mbaliesnyi/anaconda3/envs/nlp/lib/python3.8/site-packages/mitnewsclassify/data/tfidf directory already exists... perhaps you already downloaded the data? Overwriting...
/home/mbaliesnyi/anaconda3/envs/nlp/lib/python3.8/site-packages/mitnewsclassify/data/tfidf_bi directory already exists... perhaps you already downloaded the data? Overwriting...
/home/mbaliesnyi/anaconda3/envs/nlp/lib/python3.8/site-packages/mitnewsclassify/data/doc2vec directory already exists... perhaps you already downloaded the data? Overwriting...
/home/mbaliesnyi/anaconda3/envs/nlp/lib/python3.8/site-packages/mitnewsclassify/data/gpt2 directory already exists... perhaps you already downloaded the data? Overwriting

In [9]:
# download NYT dataset (test and train splits) into the current working directory;
# curl's -L follows redirects and -O saves each file under its remote name
!curl https://www.dropbox.com/sh/xu9tu5hmjhuddwk/AACFtsyeBeB7mw7WW37935cYa/NYTcorpus_test.p.gz -LO
!curl https://www.dropbox.com/sh/xu9tu5hmjhuddwk/AAD31tK6oEoGlhpRZzeu3Y3Ya/NYTcorpus_train.p.gz -LO

# train and test data labels are coded in numbers,
# but the models predict human-readable labels,
# so we need to map these for model evaluation.
# Let's use one of the files downloaded by the mitnewsclassify package
# (a CSV that pairs each numeric tag id with its tag name)
!curl https://www.dropbox.com/s/omgstbndd3xl4cy/nyt-theme-tags.csv -LO

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 18902  100 18902    0     0  11813      0  0:00:01  0:00:01 --:--:-- 11813


In [10]:
import gzip
import pickle
from pathlib import Path
from random import Random, shuffle
import requests
import csv

# Fixed seed so the shuffled order (and therefore the evaluated subset)
# is the same on every run.
SHUFFLE_SEED = 42


def _find_data_file(filename, search_dirs=(Path('.'), Path('../..'))):
    """Return the first existing location of `filename` among `search_dirs`.

    The working directory is checked first because that is where the curl
    commands save their downloads; '../..' is the location this notebook
    originally read from. If the file exists in neither place, the last
    candidate is returned so that opening it raises the usual
    FileNotFoundError.
    """
    candidates = [directory / filename for directory in search_dirs]
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return candidates[-1]


# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only load corpus files obtained from a trusted source.

# open the train data given to us by Max
with gzip.open(_find_data_file('NYTcorpus_train.p.gz'), mode='rb') as f:
    train_data = pickle.load(f)

# open the test data given to us by Max
with gzip.open(_find_data_file('NYTcorpus_test.p.gz'), mode='rb') as f:
    test_data = pickle.load(f)

# shuffle just in case the test and train data were not shuffled before -
# we will only measure model's accuracy on a few thousand samples.
# A dedicated seeded generator keeps the order reproducible without touching
# the global random state.
rng = Random(SHUFFLE_SEED)
rng.shuffle(train_data)
rng.shuffle(test_data)

# train and test data labels are coded in numbers,
# but the models predict human-readable labels,
# so we need to re-map these.
# Let's use one of the files downloaded by the mitnewsclassify package
with open('nyt-theme-tags.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # csv yields strings, so the keys of tags_dict are str tag ids
    tags_dict = {row['tags_id']: row['tag'] for row in reader}


# extract actual article texts from data samples
# (each sample is indexable; position 2 holds the text, positions 3+ the tag ids)
train_articles = [sample[2] for sample in train_data]
test_articles = [sample[2] for sample in test_data]

# map the number-coded labels to human-readable labels;
# an id that is missing from tags_dict maps to None.
# NOTE(review): presumably the ids in the samples are stored as strings like
# the CSV keys — confirm, otherwise every lookup would return None.
train_labels_lists = [[tags_dict.get(tag_id) for tag_id in sample[3:]] for sample in train_data]
test_labels_lists = [[tags_dict.get(tag_id) for tag_id in sample[3:]] for sample in test_data]

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def multi_label_scores(correct_labels, predicted_labels):
    """Compute the four evaluation metrics for a set of predictions.

    Args:
        correct_labels: ground-truth labels, in a form accepted by the
            scikit-learn metric functions.
        predicted_labels: predicted labels, same form as `correct_labels`.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`. Precision, recall and F1
        use support-weighted averaging, and a metric with a zero denominator
        is reported as 0 instead of raising a warning.
    """
    # Options shared by the three averaged metrics.
    averaging = dict(average='weighted', zero_division=0)

    return (
        accuracy_score(correct_labels, predicted_labels),
        precision_score(correct_labels, predicted_labels, **averaging),
        recall_score(correct_labels, predicted_labels, **averaging),
        f1_score(correct_labels, predicted_labels, **averaging),
    )


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

# Build the label vocabulary for MultiLabelBinarizer from every source we have:
# the complete tag list read from nyt-theme-tags.csv plus every label seen in
# either split. Fitting on the test labels alone makes mlb.transform() drop
# (with only a warning) any train or predicted label that never occurs in the
# test split, which can make a wrong prediction look like an exact match.
labels_flattened = []
for labels_lists in (train_labels_lists, test_labels_lists):
    for label_list in labels_lists:
        labels_flattened.extend(label_list)

# Tag ids that were missing from the CSV were mapped to None; those are not
# real labels, and None cannot be sorted together with strings.
label_names = list((set(labels_flattened) | set(tags_dict.values())) - {None})

mlb = MultiLabelBinarizer()
mlb.fit([sorted(label_names)])


MultiLabelBinarizer()

In [13]:
import warnings
from tqdm.autonotebook import tqdm

# Silence FutureWarning messages (that category only) from here on,
# to keep the evaluation output readable.
warnings.simplefilter(action='ignore', category=FutureWarning)


def evaluate(model, articles, labels_lists, n_of_articles = 100):
    """Score one classifier on the first `n_of_articles` samples.

    Relies on two notebook-level names: the fitted `mlb` binarizer and the
    `multi_label_scores` helper.

    Args:
        model: a model module exposing `gettags(text)` (returns the predicted
            label names for one article) and a dotted `__name__`.
        articles: sequence of article texts.
        labels_lists: sequence of true-label lists, aligned with `articles`.
        n_of_articles: number of leading samples to evaluate. It is used as a
            slice bound, so `None` evaluates everything.

    Returns:
        `(model_name, scores)` where `scores` is a dict with the keys
        `accuracy`, `precision`, `recall` and `f_score`.
    """
    # 'mitnewsclassify.gpt2' -> 'gpt2'. Fall back to the bare name when the
    # module name contains no dot instead of raising IndexError.
    name_parts = model.__name__.split('.')
    model_name = name_parts[1] if len(name_parts) > 1 else name_parts[0]

    articles_subset = articles[:n_of_articles]
    labels_subset = labels_lists[:n_of_articles]

    print('predicting for', model_name)

    preds, correct = [], []

    # zip() has no length, so tqdm needs the total passed explicitly;
    # without it the bar can only display "0it".
    n_samples = min(len(articles_subset), len(labels_subset))
    for article, labels in tqdm(zip(articles_subset, labels_subset), total=n_samples):
        preds.append(model.gettags(article))
        correct.append(labels)

    # Convert both label lists to binary indicator matrices over mlb's classes.
    correct, preds = mlb.transform(correct), mlb.transform(preds)
    accuracy, precision, recall, f_score = multi_label_scores(correct, preds)
    print(f'accuracy {round(accuracy,4)}, precision {round(precision,4)}, recall {round(recall,4)}, f-1 {round(f_score,4)}')
    print()
    return model_name, dict(accuracy=accuracy, precision=precision, recall=recall, f_score = f_score)


  from tqdm.autonotebook import tqdm


In [14]:
from mitnewsclassify import gpt2
from mitnewsclassify import tfidf
from mitnewsclassify import tfidf_bi
from mitnewsclassify import doc2vec
from mitnewsclassify import ensemble
from mitnewsclassify import trisemble
from mitnewsclassify import quadsemble
from mitnewsclassify import pentasemble

# Models to benchmark, in the order they are evaluated.
models = [
    gpt2,
    tfidf_bi,
    doc2vec,
    quadsemble,
    pentasemble,
    ensemble,
    trisemble,
    tfidf,
]

# Per-split results, keyed by model name.
train_set_scores = {}
test_set_scores = {}

# One row per split: banner to print, inputs, and the dict collecting scores.
evaluation_splits = (
    ('Train: ', train_articles, train_labels_lists, train_set_scores),
    ('Test: ', test_articles, test_labels_lists, test_set_scores),
)

for model in models:
    for banner, split_articles, split_labels, split_scores in evaluation_splits:
        print(banner)
        model_name, model_scores = evaluate(model, split_articles, split_labels, n_of_articles=500)
        split_scores[model_name] = model_scores


Train: 
predicting for gpt2


0it [00:00, ?it/s]

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Initializing...
Model...
Miscellaneous...
accuracy 0.0, precision 0.0591, recall 0.0277, f-1 0.0312

Test: 
predicting for gpt2


0it [00:00, ?it/s]

accuracy 0.0, precision 0.0477, recall 0.0297, f-1 0.025

Train: 
predicting for tfidf_bi


0it [00:00, ?it/s]

Initializing...
Model...
Count Vectorizer...
TF-IDF Transformer...
Miscellaneous...
accuracy 0.286, precision 0.7064, recall 0.474, f-1 0.5481

Test: 
predicting for tfidf_bi


0it [00:00, ?it/s]

accuracy 0.272, precision 0.6666, recall 0.4542, f-1 0.5192

Train: 
predicting for doc2vec


0it [00:00, ?it/s]

Initializing...
Model...
Doc2Vec Model...
Miscellaneous...
accuracy 0.332, precision 0.7461, recall 0.5827, f-1 0.638

Test: 
predicting for doc2vec


0it [00:00, ?it/s]

accuracy 0.274, precision 0.7117, recall 0.5426, f-1 0.5929

Train: 
predicting for quadsemble


0it [00:00, ?it/s]

Initializing...
Model...
Miscellaneous...
Initializing...
Model...
Count Vectorizer...
TF-IDF Transformer...
Miscellaneous...
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Initializing...
Miscellaneous...
accuracy 0.398, precision 0.8096, recall 0.69, f-1 0.7281

Test: 
predicting for quadsemble


0it [00:00, ?it/s]

accuracy 0.338, precision 0.7611, recall 0.6626, f-1 0.6917

Train: 
predicting for pentasemble


0it [00:00, ?it/s]

Initializing...
Model...
Miscellaneous...
accuracy 0.434, precision 0.8139, recall 0.681, f-1 0.7253

Test: 
predicting for pentasemble


0it [00:00, ?it/s]

accuracy 0.346, precision 0.7649, recall 0.64, f-1 0.6792

Train: 
predicting for ensemble


0it [00:00, ?it/s]

Initializing...
Model...
Miscellaneous...
accuracy 0.408, precision 0.8108, recall 0.6388, f-1 0.6943

Test: 
predicting for ensemble


0it [00:00, ?it/s]

accuracy 0.364, precision 0.7643, recall 0.6284, f-1 0.6736

Train: 
predicting for trisemble


0it [00:00, ?it/s]

Initializing...
Model...
Miscellaneous...
accuracy 0.422, precision 0.8084, recall 0.681, f-1 0.7233

Test: 
predicting for trisemble


0it [00:00, ?it/s]

accuracy 0.348, precision 0.755, recall 0.6387, f-1 0.6756

Train: 
predicting for tfidf


0it [00:00, ?it/s]

accuracy 0.378, precision 0.7877, recall 0.6131, f-1 0.6702

Test: 
predicting for tfidf


0it [00:00, ?it/s]

accuracy 0.34, precision 0.745, recall 0.6071, f-1 0.6545



In [15]:
def _f_score_of(name_and_scores):
    """Sort key: the 'f_score' entry of a (model_name, scores) pair."""
    return name_and_scores[1]['f_score']


# Rank the models from lowest to highest F1 on each split.
all_train_results = sorted(train_set_scores.items(), key=_f_score_of)
all_test_results = sorted(test_set_scores.items(), key=_f_score_of)

# best model on each split (last entry of the ascending ranking)
(all_train_results[-1], all_test_results[-1])

(('quadsemble',
  {'accuracy': 0.398,
   'precision': 0.809600022443656,
   'recall': 0.6899653979238755,
   'f_score': 0.7280642619169311}),
 ('quadsemble',
  {'accuracy': 0.338,
   'precision': 0.7610686918660271,
   'recall': 0.6625806451612903,
   'f_score': 0.6916691487740084}))

In [None]:
from pprint import pprint
# Pick one training article for a manual spot check. Index 10 is arbitrary;
# which article it selects depends on the order left by the earlier shuffle.
art = train_articles[10]
# pprint wraps the long text into readable lines
pprint(art)


('LEAD: Farm ministers of the European Community opened their first '
 'substantial debate today on a proposed freeze of guaranteed farm product '
 "prices for 1987. The bloc's executive commission is seeking to hold down "
 "prices to reduce the community's large farm surpluses. Farm ministers of the "
 'European Community opened their first substantial debate today on a proposed '
 "freeze of guaranteed farm product prices for 1987. The bloc's executive "
 "commission is seeking to hold down prices to reduce the community's large "
 'farm surpluses.')


In [None]:
# Spot check: tags predicted by the gpt2 model for the article shown above.
# The returned tags look unrelated to the farm-price text, in line with the
# near-zero scores this model obtained in the evaluation.
gpt2.gettags(art)

['office buildings and commercial properties',
 'marriages',
 'restaurants',
 'income tax',
 'insects',
 'abortion',
 'building (construction)']