In [1]:
from google.cloud import bigquery

# Open a BigQuery client and submit a query that reads every column of the
# `hackernews` table; the job handle is kept so results can be re-fetched later.
client = bigquery.Client()

QUERY = 'SELECT * FROM `article-source.article_views.hackernews`'
query_job = client.query(QUERY)

In [2]:
# Fetch the query results; `rows` is consumed as an iterator below (`next(rows)`).
rows = query_job.result()  

In [3]:
# Display the job handle (its repr shows project, location and job id).
query_job

QueryJob<project=article-source, location=US, id=e006ba67-00dd-45a7-940c-5a814a34fc67>

In [4]:
from transformers import pipeline
import torch

# High-level text-classification pipeline for the go_emotions checkpoint;
# used further down as a reference for the manual tokenizer + model forward pass.
pipe = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions")

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same
# checkpoint so the manual forward pass uses matching vocabulary and weights.
GO_EMOTIONS_CHECKPOINT = "SamLowe/roberta-base-go_emotions"

tokenizer = AutoTokenizer.from_pretrained(GO_EMOTIONS_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(GO_EMOTIONS_CHECKPOINT)

In [6]:
# Pull one row for interactive inspection. This advances the `rows` iterator,
# so later full passes re-fetch the results from `query_job`.
first_row = next(rows)

In [7]:
# Inspect the row's comments (a list of comment strings).
first_row.comments

['I think these sites look very attractive, but this is a very subjective and specific definition of "good". Many of these pages do not finish loading in 3 seconds, for example. I\'m not sure it would be correct to frame these as "astronomically good" web design.']

In [8]:
# Inspect the URL of the discussion page this row was taken from.
first_row.detail_url

'https://news.ycombinator.com/item?id=37226805'

In [9]:
# Use the first comment of the first row as the running example.
comment = first_row.comments[0]

In [10]:
# Tokenize the example comment into PyTorch tensors (a batch of one sequence).
comment_tokens = tokenizer(comment, return_tensors='pt')

In [11]:
# Show the encoded batch: `input_ids` plus an all-ones `attention_mask` (no padding).
comment_tokens

{'input_ids': tensor([[    0,   100,   206,   209,  3091,   356,   182,  6043,     6,    53,
            42,    16,    10,   182, 22262,     8,  2167,  8515,     9,    22,
          8396,   845,  1876,     9,   209,  6052,   109,    45,  2073, 16761,
            11,   155,  2397,     6,    13,  1246,     4,    38,   437,    45,
           686,    24,    74,    28,  4577,     7,  5120,   209,    25,    22,
          1988,  2839,  1075,  3435,   205,   113,  3748,  1521,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [61]:
# Manual forward pass; gradients are disabled because this is inference only.
with torch.no_grad():
    output = model(**comment_tokens)

In [62]:
# Inspect the raw model output: one row of 28 logits, one per emotion label.
output

SequenceClassifierOutput(loss=None, logits=tensor([[-0.5851, -6.7813, -5.6964, -4.2389, -1.5744, -5.4951,  0.5477, -4.2243,
         -6.1095, -3.7572, -1.6259, -5.5178, -6.2968, -5.6325, -5.9487, -4.7712,
         -7.2741, -5.5293, -3.9624, -6.3900, -4.0808, -6.0886, -3.5761, -6.4716,
         -6.1779, -6.1516, -5.6621, -2.7154]]), hidden_states=None, attentions=None)

In [63]:
# Run the same comment through the high-level pipeline for comparison.
outputpipe = pipe(comment)

In [64]:
# The pipeline reports only its top label and that label's score.
outputpipe

[{'label': 'confusion', 'score': 0.633594274520874}]

In [65]:
import numpy as np

In [66]:
# Softmax over the single 1-D logit vector (hence dim=0), then the index of the
# most probable class. `sm` is reused by later cells.
sm = torch.nn.Softmax(dim=0)
np.argmax(sm(output.logits[0]).detach().numpy())

6

In [68]:
# Softmax probability of class 6 (the argmax found above).
# NOTE(review): this value (0.59183) does not match the pipeline's score for the
# same label (0.6336). sigmoid(0.5477) is ~0.6336, which suggests the pipeline
# applies a per-label sigmoid rather than a softmax -- TODO confirm.
sm(output.logits[0]).detach().numpy()[6]

0.59183

In [42]:
# Re-display the example comment being classified.
comment

'I think these sites look very attractive, but this is a very subjective and specific definition of "good". Many of these pages do not finish loading in 3 seconds, for example. I\'m not sure it would be correct to frame these as "astronomically good" web design.'

In [60]:
# argmax over the flattened logits; with a batch of one this is the class index.
output.logits.argmax()

6

---

In [327]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Swap in DistilBERT for a two-class experiment. This rebinds the global
# `tokenizer` and `model`, replacing the go_emotions ones loaded earlier.
# NOTE(review): the cells below print generic 'LABEL_0' labels and probabilities
# close to 0.5, which is consistent with a classification head that has not been
# fine-tuned for sentiment -- confirm, and consider a sentiment-tuned checkpoint
# before trusting the scores derived from this model.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

In [74]:
# Smoke test: classify a short sentence and map the winning class id to its label.
inputs = tokenizer("my dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# The batch holds one sequence, so the flat argmax is the predicted class id.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'LABEL_0'

In [75]:
# Class probabilities for the smoke-test sentence (softmax over the two logits).
sm(logits[0])

tensor([0.5388, 0.4612])

In [76]:
# Same two-class prediction, now for the example Hacker News comment.
inputs = tokenizer(comment, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# The batch holds one sequence, so the flat argmax is the predicted class id.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'LABEL_0'

In [82]:
# URL of the discussion the example comment came from.
first_row.detail_url

'https://news.ycombinator.com/item?id=37226805'

In [77]:
# Class probabilities for the example comment (softmax over the two logits).
sm(logits[0])

tensor([0.5268, 0.4732])

In [91]:
# Re-fetch the results so the scoring loop below starts from the first row
# (the earlier iterator was advanced by `next(rows)`).
rows = query_job.result()  

In [92]:
scored_rows = []

# Test: average of the classification scores of the comments. Score of 1 = all positive.
# The score is the fraction of comments whose predicted class id is 0.
# NOTE(review): this treats class 0 as "positive"; the model only exposes generic
# LABEL_0 / LABEL_1 names, so confirm that mapping before reading the ranking.

for row in rows:
    comments = row.comments
    if not comments:
        # No comments means no average to take; skip the row instead of
        # dividing by zero.
        continue

    class_one_votes = 0
    for comment in comments:
        inputs = tokenizer(comment, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # argmax is 0 or 1 here, so the running sum counts class-1 predictions.
        class_one_votes += logits.argmax().item()

    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        'score': (len(comments) - class_one_votes) / len(comments),
    })
    


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [94]:
# Sanity check: number of rows that received a score.
len(scored_rows)

68

In [100]:
# Rank the scored rows from highest to lowest score (stable for ties).
sorted_rows = list(scored_rows)
sorted_rows.sort(key=lambda scored: scored['score'], reverse=True)

In [328]:
# Print each ranked row as URL, score, then a run of blank lines as a separator.
for row in sorted_rows:
    print(row['url'], row['score'], '\n\n', sep='\n')

# Output removed

In [162]:
rows = query_job.result()

scored_rows = []

# Test: average ratio of pos:neg odds (non-normalized)
# NOTE(review): this divides the two raw logits, not probabilities, so the
# "ratio" can be negative or very large when logits[0, 1] is close to zero.
for row in rows:
    comments = row.comments
    if not comments:
        # Nothing to average; skip the row instead of dividing by zero.
        continue

    total = 0.0
    for comment in comments:
        inputs = tokenizer(comment, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # .item() keeps the stored score a plain float rather than a 0-dim tensor.
        total += (logits[0, 0] / logits[0, 1]).item()

    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        'score': total / len(comments),
    })

In [163]:
# Rank the raw-logit-ratio scores from highest to lowest (stable for ties).
sorted_rows_2 = list(scored_rows)
sorted_rows_2.sort(key=lambda scored: scored['score'], reverse=True)

In [329]:
# Print each ranked row as URL, score, then a run of blank lines as a separator.
for row in sorted_rows_2:
    print(row['url'], row['score'], '\n\n', sep='\n')

# Output removed

In [118]:
# Check which Softmax is currently bound to `sm`; at this point it is still the
# dim=0 instance, and the next cell rebinds it with dim=1.
sm

Softmax(dim=0)

In [145]:
rows = query_job.result()
# Model logits are 2-D here (one row per input), so normalise along the class axis.
sm = torch.nn.Softmax(dim=1)
scored_rows = []

# Test: average ratio of pos:neg odds (normalized)
# Each comment contributes p(class 0) / p(class 1) after the softmax.

for row in rows:
    comments = row.comments
    if not comments:
        # Nothing to average; skip the row instead of dividing by zero.
        continue

    total = 0.0
    for comment in comments:
        inputs = tokenizer(comment, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            probs = sm(model(**inputs).logits)
        # .item() keeps the stored score a plain float rather than a 0-dim tensor.
        total += (probs[0, 0] / probs[0, 1]).item()

    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        'score': total / len(comments),
    })

In [146]:
# Rank the softmax-ratio scores from highest to lowest (stable for ties).
sorted_rows_3 = list(scored_rows)
sorted_rows_3.sort(key=lambda scored: scored['score'], reverse=True)

In [330]:
# Print each ranked row as URL, score, then a run of blank lines as a separator.
for row in sorted_rows_3:
    print(row['url'], row['score'], '\n\n', sep='\n')
# Output removed

---

In [166]:
# Switch back to the go_emotions checkpoint; `tokenizer` and `model` were
# rebound to DistilBERT for the two-class experiments above.
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")

In [167]:
# Materialise the results as a list so rows can be indexed (e.g. rows[0]).
rows = list(query_job.result())

In [168]:
# Forward pass on the first comment of the first row with the go_emotions model.
comment_tokens = tokenizer(rows[0]['comments'][0], return_tensors='pt')
with torch.no_grad():
    output = model(**comment_tokens)

In [177]:
# Show the class-id -> emotion-name mapping, used to choose `target_emotion`.
model.config.id2label

{0: 'admiration',
 1: 'amusement',
 2: 'anger',
 3: 'annoyance',
 4: 'approval',
 5: 'caring',
 6: 'confusion',
 7: 'curiosity',
 8: 'desire',
 9: 'disappointment',
 10: 'disapproval',
 11: 'disgust',
 12: 'embarrassment',
 13: 'excitement',
 14: 'fear',
 15: 'gratitude',
 16: 'grief',
 17: 'joy',
 18: 'love',
 19: 'nervousness',
 20: 'optimism',
 21: 'pride',
 22: 'realization',
 23: 'relief',
 24: 'remorse',
 25: 'sadness',
 26: 'surprise',
 27: 'neutral'}

In [182]:
# Class id 7 is 'curiosity' in the id2label mapping shown above.
target_emotion = 7
# Raw (pre-activation) logit of the target emotion for the example comment.
output.logits[0][target_emotion]

tensor(-4.2243)

In [183]:
rows = query_job.result()
target_emotion = 7  # 'curiosity' in model.config.id2label
scored_rows = []

# Test: roberta emotions - rank by average total curiosity
# Each comment contributes exp(logit) of the target emotion. If the logit is read
# through a sigmoid, exp(logit) equals the odds p / (1 - p).

for row in rows:
    comments = row.comments
    if not comments:
        # Nothing to average; skip the row instead of dividing by zero.
        continue

    total = 0.0
    for comment in comments:
        inputs = tokenizer(comment, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # torch.exp (not np.exp) keeps the maths in torch, so this also works
        # when the model lives on a non-CPU device; .item() yields a float.
        total += torch.exp(logits[0][target_emotion]).item()

    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        'score': total / len(comments),
    })

In [None]:
# Rank rows by average curiosity score, highest first, and print URL + score
# followed by a run of blank lines as a separator.
sorted_rows_4 = list(scored_rows)
sorted_rows_4.sort(key=lambda scored: scored['score'], reverse=True)

for row in sorted_rows_4:
    print(row['url'], row['score'], '\n\n', sep='\n')


Ranking articles by the average curiosity score of their comments is pretty cool. I'll go with this approach for now.


---

In [266]:
rows = query_job.result()
target_emotion = 7  # 'curiosity' in model.config.id2label
scored_rows = []

# Implement batching: all comments of a row go through the model as one padded batch.

for row in rows:
    comments = row.comments
    if not comments:
        # An empty comment list would give an empty batch and a NaN mean; skip it.
        continue

    inputs = tokenizer(
        comments,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    with torch.no_grad():
        # One logit per comment for the target emotion.
        logits = model(**inputs).logits[:, target_emotion]

    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        # Mean of exp(logit) over the row's comments, stored as a plain float.
        'score': torch.exp(logits).mean().item(),
    })

In [311]:
# Materialise the results as a list so a single row can be indexed for timing tests.
rows = list(query_job.result())


In [295]:
%%time
# Time tokenization alone: encode every comment of row 6 as one padded batch.
inputs = tokenizer(
    rows[6]['comments'], 
    truncation=True,
    padding=True,
    max_length=512, 
    return_tensors="pt"
)

CPU times: user 37.1 ms, sys: 7.5 ms, total: 44.6 ms
Wall time: 26.4 ms


In [296]:
# Batch shape is (number of comments, padded sequence length).
inputs.input_ids.shape

torch.Size([70, 465])

In [297]:
%%time
# Time the forward pass alone on the full padded batch for row 6.
with torch.no_grad():
    logits = model(**inputs).logits[:,target_emotion]

CPU times: user 32.4 s, sys: 10.8 s, total: 43.2 s
Wall time: 12.7 s


In [299]:
%%time
# Baseline for comparison: tokenize and score the same comments one at a time.
# Each iteration overwrites `inputs` and `logits`; only the elapsed time matters.
for comment in rows[6]['comments']:
    inputs = tokenizer(
        comment, 
        truncation=True,
        padding=True,
        max_length=512, 
        return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(**inputs).logits[:,target_emotion]

CPU times: user 6.81 s, sys: 964 ms, total: 7.77 s
Wall time: 3.79 s


In [300]:
%%time
# Time tokenization + forward pass together for the whole row as a single batch.
inputs = tokenizer(
    rows[6]['comments'], 
    truncation=True,
    padding=True,
    max_length=512, 
    return_tensors="pt"
)
with torch.no_grad():
    logits = model(**inputs).logits[:,target_emotion]

CPU times: user 32.1 s, sys: 10.6 s, total: 42.7 s
Wall time: 13.1 s


In [312]:
%%time
import math

# Score row 6's comments in fixed-size chunks and collect the target-emotion
# logits of every chunk into one 1-D tensor.
comments_list = rows[6]['comments']
comments_list_size = len(comments_list)

batch_size = 16
batches = math.ceil(comments_list_size / batch_size)

all_logits = torch.empty((0))
for i in range(batches):
    start, stop = i * batch_size, (i + 1) * batch_size
    comments = comments_list[start:stop]
    inputs = tokenizer(
        comments,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(**inputs).logits[:, target_emotion]
    # Append this chunk's logits to the running result.
    all_logits = torch.cat((all_logits, logits), dim=0)
        

CPU times: user 18.1 s, sys: 4.58 s, total: 22.7 s
Wall time: 6.63 s


In [314]:
%%time
import math

# Batching script
# For every row, score its comments in chunks of `batch_size` and store the mean
# of exp(target-emotion logit) as the row's score.

rows = query_job.result()
target_emotion = 7  # 'curiosity' in model.config.id2label
batch_size = 16
scored_rows = []

for row in rows:
    comments = row.comments
    if not comments:
        # An empty comment list would give an empty batch and a NaN mean; skip it.
        continue

    # ceil(n / batch_size) is 1 whenever n <= batch_size, so a single loop
    # covers both the "fits in one batch" and the "needs chunking" cases.
    batches = math.ceil(len(comments) / batch_size)
    batch_logits = []
    for i in range(batches):
        inputs = tokenizer(
            comments[i * batch_size:(i + 1) * batch_size],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )
        with torch.no_grad():
            batch_logits.append(model(**inputs).logits[:, target_emotion])

    # Concatenate once instead of growing a tensor inside the loop.
    all_logits = torch.cat(batch_logits, dim=0)
    scores = torch.exp(all_logits)
    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        # Plain float rather than a 0-dim tensor.
        'score': torch.mean(scores).item(),
    })

CPU times: user 9min 56s, sys: 2min 7s, total: 12min 3s
Wall time: 3min 36s


In [331]:
# Rank the batched-run scores, highest first, and print URL + score with a
# blank line between entries.
sorted_rows_6 = list(scored_rows)
sorted_rows_6.sort(key=lambda scored: scored['score'], reverse=True)

for row in sorted_rows_6:
    print(row['url'], row['score'], '', sep='\n')

# Output removed

In [317]:
%%time 
rows = query_job.result()
target_emotion = 7  # 'curiosity' in model.config.id2label
scored_rows = []

# Test: roberta emotions - rank by average total curiosity - speedtest
# Serial baseline: one forward pass per comment, timed against the batched script.

for row in rows:
    comments = row.comments
    if not comments:
        # Nothing to average; skip the row instead of dividing by zero.
        continue

    total = 0.0
    for comment in comments:
        inputs = tokenizer(comment, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # torch.exp (not np.exp) keeps the maths in torch, matching the
        # device-aware variant of this loop; .item() yields a float.
        total += torch.exp(logits[0][target_emotion]).item()

    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        'score': total / len(comments),
    })

CPU times: user 3min 51s, sys: 28.6 s, total: 4min 20s
Wall time: 1min 58s


In [332]:
# Rank the serial-run scores, highest first, and print URL + score with a
# blank line between entries.
sorted_rows_7 = list(scored_rows)
sorted_rows_7.sort(key=lambda scored: scored['score'], reverse=True)

for row in sorted_rows_7:
    print(row['url'], row['score'], '', sep='\n')

# Output removed

In [319]:
# Seems like batching isn't effective unless MPS is active
# (on CPU the batched run above took longer than the serial one).

# Diagnose why MPS cannot be used, if that is the case. This only prints a
# message; it does not stop later cells from requesting the "mps" device.
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

In [320]:
# Use the MPS device when it is actually available and fall back to the CPU
# otherwise, so the cells that move the model and inputs to `mps_device` still
# run on machines without MPS.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [321]:
# Move the model to the selected device; inputs must be moved to the same
# device before each forward pass (see the `.to(mps_device)` calls below).
model.to(mps_device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [324]:
%%time
import math

# Batching script
# Same chunked scoring as the CPU run, with every batch moved to `mps_device`
# (the device the model was moved to above).

rows = query_job.result()
target_emotion = 7  # 'curiosity' in model.config.id2label
batch_size = 16
scored_rows = []

for row in rows:
    comments = row.comments
    if not comments:
        # An empty comment list would give an empty batch and a NaN mean; skip it.
        continue

    # ceil(n / batch_size) is 1 whenever n <= batch_size, so a single loop
    # covers both the "fits in one batch" and the "needs chunking" cases.
    batches = math.ceil(len(comments) / batch_size)
    batch_logits = []
    for i in range(batches):
        inputs = tokenizer(
            comments[i * batch_size:(i + 1) * batch_size],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        ).to(mps_device)
        with torch.no_grad():
            batch_logits.append(model(**inputs).logits[:, target_emotion])

    # Concatenate once instead of growing a tensor inside the loop.
    all_logits = torch.cat(batch_logits, dim=0)
    scores = torch.exp(all_logits)
    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        # .item() copies the result back as a plain float, so the stored score
        # is not a tensor living on the accelerator.
        'score': torch.mean(scores).item(),
    })

CPU times: user 34.5 s, sys: 6.61 s, total: 41.1 s
Wall time: 1min 28s


In [326]:
%%time 
rows = query_job.result()
target_emotion = 7  # 'curiosity' in model.config.id2label
scored_rows = []

# Test: roberta emotions - rank by average total curiosity - speedtest
# Serial baseline on `mps_device`: one forward pass per comment.

for row in rows:
    comments = row.comments
    if not comments:
        # Nothing to average; skip the row instead of dividing by zero.
        continue

    total = 0.0
    for comment in comments:
        inputs = tokenizer(comment, truncation=True, max_length=512, return_tensors="pt").to(mps_device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # .item() keeps the running total (and the stored score) a plain float.
        total += torch.exp(logits[0][target_emotion]).item()

    scored_rows.append({
        'url': row.detail_url,
        'comments': comments,
        'score': total / len(comments),
    })

CPU times: user 4min 21s, sys: 25 s, total: 4min 46s
Wall time: 4min 40s


---

This makes a lot more sense. On plain CPU, the serial script ran in about 2 minutes of wall time, while the batched script took about 3.5 minutes. With MPS enabled, the batched script drops to about 1.5 minutes, while the serial script takes more than 4.5 minutes.

---

Collecting the useful code here:

In [None]:
# BigQuery load
from google.cloud import bigquery

# Create the client, submit the query and fetch its results.
client = bigquery.Client()

QUERY = (
    'SELECT * FROM `article-source.article_views.hackernews`')
query_job = client.query(QUERY)  
# `rows` is iterated once by the batching script below.
rows = query_job.result()  

In [None]:
# Torch and Transformers
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification
)

In [None]:
# Model and Tokenizer load
# Tokenizer and classifier are loaded from one checkpoint name so they stay in sync.
GO_EMOTIONS_CHECKPOINT = "SamLowe/roberta-base-go_emotions"

tokenizer = AutoTokenizer.from_pretrained(GO_EMOTIONS_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(GO_EMOTIONS_CHECKPOINT)

In [None]:
# Label and ID mappings
# Class-id -> emotion-name mapping; `target_emotion` below indexes into it.
model.config.id2label

In [None]:
# Batching script
import math


def score_comments(comments, target_emotion, batch_size, device):
    """Return the mean of exp(target-emotion logit) over `comments`.

    The comments are tokenized and scored in chunks of `batch_size`, with each
    batch moved to `device` before the forward pass.

    Args:
        comments: list of comment strings for one article.
        target_emotion: class id whose logit is scored.
        batch_size: maximum number of comments per forward pass.
        device: torch device the model lives on.

    Returns:
        The score as a float, or None when `comments` is empty.
    """
    if not comments:
        return None

    # ceil(n / batch_size) is 1 whenever n <= batch_size, so one loop covers
    # both the single-batch and the chunked case.
    batches = math.ceil(len(comments) / batch_size)
    batch_logits = []
    for i in range(batches):
        inputs = tokenizer(
            comments[i * batch_size:(i + 1) * batch_size],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            batch_logits.append(model(**inputs).logits[:, target_emotion])

    return torch.exp(torch.cat(batch_logits, dim=0)).mean().item()


target_emotion = 7  # 'curiosity' in model.config.id2label
batch_size = 16
scored_rows = []

# The timings above show batching only beats the serial loop when MPS is in use,
# so run on MPS when it is available and fall back to the CPU otherwise.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

for row in rows:
    score = score_comments(row.comments, target_emotion, batch_size, device)
    if score is None:
        # Rows without comments cannot be scored; leave them out of the ranking.
        continue
    scored_rows.append({
        'url': row.detail_url,
        'comments': row.comments,
        'score': score,
    })