In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/notebook388161bf19/__results__.html
/kaggle/input/notebook388161bf19/headline_classifier.pt
/kaggle/input/notebook388161bf19/__notebook__.ipynb
/kaggle/input/notebook388161bf19/__output__.json
/kaggle/input/notebook388161bf19/custom.css
/kaggle/input/upworthy-packages/packages.csv
/kaggle/input/bert-base-uncased/config.json
/kaggle/input/bert-base-uncased/pytorch_model.bin
/kaggle/input/bert-base-uncased/vocab.txt


In [2]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [3]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("/kaggle/input/bert-base-uncased",
                                                      num_labels = 41,
                                                      output_attentions = False, # Whether the model returns attentions weights.
                                                      output_hidden_states = False)
model.load_state_dict(torch.load('/kaggle/input/notebook388161bf19/headline_classifier.pt', map_location=torch.device('cpu')))

Some weights of the model checkpoint at /kaggle/input/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

<All keys matched successfully>

In [4]:
df = pd.read_csv('/kaggle/input/upworthy-packages/packages.csv')
headlines = df.headline.values

In [5]:
from transformers import BertTokenizer

#Use lower case, as in our use case capitalisation style may differ from Huff Post
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/bert-base-uncased/vocab.txt', do_lower_case=True)



In [6]:
#Clean data: remove 129000:130000
headlines = list(headlines[:129000]) + list(headlines[130000:])

In [7]:
input_tokens = [] #tokenized input
attention_masks = [] #indicates padded tokens
encoded_dict = tokenizer(headlines,
                             add_special_tokens = True, #For classification markers
                             padding='longest',
                             return_attention_mask = True, #To indicate useful data
                             return_tensors = 'pt') #Pytorch
input_tokens = encoded_dict['input_ids']
attention_masks = encoded_dict['attention_mask']
input_tokens, attention_masks

(tensor([[ 101, 2027, 1005,  ...,    0,    0,    0],
         [ 101, 2027, 1005,  ...,    0,    0,    0],
         [ 101, 2027, 1005,  ...,    0,    0,    0],
         ...,
         [ 101, 2027, 1005,  ...,    0,    0,    0],
         [ 101, 2027, 1005,  ...,    0,    0,    0],
         [ 101, 2027, 1005,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]))

In [8]:
input_tokens.shape, attention_masks.shape

(torch.Size([149817, 48]), torch.Size([149817, 48]))

In [9]:
#Reduce size
input_tokens = input_tokens[:10000, 0:48]
attention_masks = attention_masks[:10000, 0:48]

In [10]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
test_dataset = TensorDataset(input_tokens, attention_masks)
test_dl = DataLoader(test_dataset,
                      sampler = SequentialSampler(test_dataset),
                      batch_size = 64)

In [11]:
results = []

In [12]:
count = 0
for batch in test_dl:
    batch_input_tokens = batch[0].to(device)
    batch_input_mask = batch[1].to(device)
        
    with torch.no_grad():
        result = model(batch_input_tokens,
                           token_type_ids=None, 
                           attention_mask=batch_input_mask, 
                           return_dict=True)
        
        logits = result.logits.detach().numpy()
        preds = np.argmax(logits, axis=1).flatten()
        print(preds)
        results.extend(preds)

[11 11 11  7  7  7  7  7  7  7  7  1  4  1  1  1  1  1  1  1  1  1  5 26
 12  9  7 32  7 12 20  8  1 17 31 11 16 16 16 16 12 12 12 12 12 20  4 20
  4 18  1 16  1  1  1 16 16 16 16  7 14  7  8 17]
[17 24  1  4  4  4  4  9  9  9  9  1  1  8  8  7  7  7  8  9  7  9  9  9
  9 25  9  9  9  9  8  8  8  8  9  9  9 20  4  4  4  4  1  9  1 30 20  1
 21 20  9  7  6 21 12 26  6 25  5  3  9 20 25 38]
[ 7  4  4 20  4 25  4 25 20 25  5 28 28 28 28 32 32  4  6 20 32 20  6 38
  5  5 20  5  5 25 25 25 25  4  8  8  7  7 32 10 10 10  4 11 10 25 34 20
  4 32  6 20 20 20  4  7  4 23  9 12  8  8  7  7]
[24 11 11  1  1 12 25  7 32  9  7 20 20 20 11  1 20 10 11 11 11 11 24 24
 24 25 25 25 25  7  7  7  7  7  7  7 31 25 25 25 20 20  3  6  8  3  6  1
  1  8  6  6  6  8  8 38 38 20 25 25 25  4  4 27]
[ 7 23 22  1  7 13  7  1  1 25  9  6  6  6  8  1  4  7  7  4  1  7  4 25
 20  9  9 12  7 16 16  6 16  7 20  6  3  9  4 21  9  7 26 10 21  7  7  7
  7  7  7  7  7  7 20 32 20  7 20  7  8  8  8  8]
[ 8  7  8 34  1  7  

In [13]:
print(len(results))

10000


In [14]:
short_df = df.loc[:9999, ['created_at', 'test_week', 'test_id', 'headline', 'image_id', 'excerpt', 'lede', 'slug', 'share_text', 'share_image', 'impressions', 'clicks', 'first_place', 'winner']]
short_df.head()

Unnamed: 0,created_at,test_week,test_id,headline,image_id,excerpt,lede,slug,share_text,share_image,impressions,clicks,first_place,winner
0,2014-11-20,2014-11-17,546d88fb84ad38b2ce000024,They're Being Called 'Walmart's Worst Nightmar...,546d6fa19ad54eec8d00002d,Things that matter. Pass 'em on.,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,Anyone who's ever felt guilty about shopping a...,,3052,150,True,True
1,2014-11-20,2014-11-17,546d88fb84ad38b2ce000024,They're Being Called 'Walmart's Worst Nightmar...,546d6fa19ad54eec8d00002d,Things that matter. Pass 'em on.,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,Walmart is getting schooled by another retaile...,,3033,122,False,False
2,2014-11-20,2014-11-17,546d88fb84ad38b2ce000024,They're Being Called 'Walmart's Worst Nightmar...,546d6fa19ad54eec8d00002d,Things that matter. Pass 'em on.,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,Walmart may not be crapping their pants over t...,,3092,110,False,False
3,2014-11-20,2014-11-17,546d902c26714c6c44000039,This Is What Sexism Against Men Sounds Like,546bc55335992b86c8000043,Things that matter. Pass 'em on.,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,"If you ever wondered, ""but what about the men?...",,3526,90,False,False
4,2014-11-20,2014-11-17,546d902c26714c6c44000039,This Is What Sexism Against Men Sounds Like,546d900426714cd2dd00002e,Things that matter. Pass 'em on.,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,"If you ever wondered, ""but what about the men?...",,3506,120,True,False


In [15]:
print(short_df.shape)

(10000, 14)


In [16]:
preds_to_labels = {0: 'CRIME', 1: 'ENTERTAINMENT', 2: 'WORLD NEWS', 3: 'IMPACT', 4: 'POLITICS', 5: 'WEIRD NEWS', 6: 'BLACK VOICES', 7: 'WOMEN', 8: 'COMEDY', 9: 'QUEER VOICES', 
                   10: 'SPORTS', 11: 'BUSINESS', 12: 'TRAVEL', 13: 'MEDIA', 14: 'TECH', 15: 'RELIGION', 16: 'SCIENCE', 17: 'LATINO VOICES', 18: 'EDUCATION', 19: 'COLLEGE', 
                   20: 'PARENTS', 21: 'ARTS & CULTURE', 22: 'STYLE', 23: 'GREEN', 24: 'TASTE', 25: 'HEALTHY LIVING', 26: 'THE WORLDPOST', 27: 'GOOD NEWS', 28: 'WORLDPOST', 29: 'FIFTY', 
                   30: 'ARTS', 31: 'WELLNESS', 32: 'PARENTING', 33: 'HOME & LIVING', 34: 'STYLE & BEAUTY', 35: 'DIVORCE', 36: 'WEDDINGS', 37: 'FOOD & DRINK', 38: 'MONEY', 39: 'ENVIRONMENT', 
                   40: 'CULTURE & ARTS'}
cats = []
for i in range(10000):
    cats.append(preds_to_labels[results[i]])
short_df['cat'] = cats
print(short_df['cat'].value_counts())

POLITICS          1430
WOMEN             1069
COMEDY             721
ENTERTAINMENT      692
PARENTS            616
QUEER VOICES       597
HEALTHY LIVING     589
PARENTING          436
BUSINESS           421
WELLNESS           380
IMPACT             287
BLACK VOICES       272
WEIRD NEWS         247
SCIENCE            238
GREEN              229
TRAVEL             166
TASTE              144
SPORTS             141
MEDIA              125
RELIGION           114
ARTS & CULTURE     110
TECH                94
STYLE               89
CRIME               85
THE WORLDPOST       85
COLLEGE             77
MONEY               73
WEDDINGS            70
FOOD & DRINK        67
EDUCATION           62
WORLDPOST           61
DIVORCE             50
GOOD NEWS           40
STYLE & BEAUTY      40
ARTS                38
HOME & LIVING       23
LATINO VOICES       17
FIFTY                4
ENVIRONMENT          1
Name: cat, dtype: int64


In [17]:
short_df.to_csv('out.csv')
from IPython.display import FileLink
FileLink('out.csv')