In [None]:
#default_exp gpt

# GPT Zero-Shot Classification

> Attempting zero-shot solutions confined to just GPT-2 with no fine-tuning

The other solutions perform well, and it's unlikely that GPT-2 alone will do better than BART, but it's worth a shot, and there are other interesting things we can try with the GPT architecture.

In [None]:
#export
from ought.starter import *
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import json
import pandas as pd
import numpy as np

Since all of these experiments will use GPT-2 in some way, it's useful to have a top-level base class that provides shared functionality to all the GPT variants. The base class itself does *not* implement the `predict` method. Note that this is mostly a cleaned-up and packaged version of the starter code.

In [None]:
#export
class GPTBase:
    """Shared GPT-2 plumbing for the zero-shot classifiers in this module.

    Loads the pretrained ``gpt2`` tokenizer and LM-head model once and offers
    greedy text generation plus raw logit access. No ``predict`` method is
    defined here; subclasses supply their own.
    """

    def __init__(self, device=None):
        """Load GPT-2 and move it to ``device``.

        Args:
            device: Optional torch device (string or ``torch.device``). When
                omitted, CUDA is used if it is available and the CPU otherwise.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # Reuse the end-of-sequence token as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.model.eval().to(self.device)

    def generate(self, prompt, max_length=5, stop_token=None):
        """Greedily extend ``prompt`` and return the prompt plus its continuation.

        Args:
            prompt: Text to condition on.
            max_length: Maximum number of tokens to generate beyond the prompt.
            stop_token: Optional string; when it occurs in the continuation,
                the continuation is cut just before its first occurrence.

        Returns:
            ``prompt`` followed by the generated text, truncated at
            ``stop_token`` only if that string was actually generated.
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
        prompt_len = len(input_ids[0])
        generated_ids = self.model.generate(
            input_ids=input_ids.to(self.device),
            max_length=max_length + prompt_len,
            do_sample=False,
            # Passed explicitly to silence the "Setting `pad_token_id`..." warning.
            pad_token_id=self.tokenizer.eos_token_id,
        )
        generated_text = self.tokenizer.decode(generated_ids[0], clean_up_tokenization_spaces=True)
        # Measure the prompt as the tokenizer renders it, since decoding may not
        # reproduce the input string character for character.
        decoded_prompt = self.tokenizer.decode(input_ids[0], clean_up_tokenization_spaces=True)
        continuation = generated_text[len(decoded_prompt):]
        if stop_token:
            cut = continuation.find(stop_token)
            # find() returns -1 when the stop token is absent; slicing with -1
            # would silently drop the last generated character.
            if cut != -1:
                continuation = continuation[:cut]
        return prompt + continuation

    def get_logits_and_tokens(self, text):
        """Run ``text`` through the model and return its logits and decoded tokens.

        Returns:
            A ``(logits, tokens)`` pair. ``logits`` is the first batch entry of
            the model's logits with the final position dropped; ``tokens`` holds
            every input token id decoded individually.

        Note:
            Autograd is not disabled here, so the returned logits carry a graph;
            callers doing pure inference may wrap the call in ``torch.no_grad()``.
        """
        input_ids = self.tokenizer.encode(text, return_tensors="pt")
        tokens = [self.tokenizer.decode([input_id]) for input_id in input_ids[0]]
        output = self.model(input_ids.to(self.device))
        return output.logits[0][:-1], tokens

## Raw Language Model Token Classifier 

This is the simplest possible solution - a replica of the starter code refactored into a single class to provide an interface that is consistent with the other models.

In [None]:
#export
class GPTLMClassifier(GPTBase):
    """Few-shot prompt classifier: GPT-2 completes the label of the last example.

    The prompt is assembled by ``make_prompt`` from the instructions, a handful
    of labelled context examples and the text to classify; the generated
    continuation of the final line is returned as the prediction.
    """

    def __init__(self, instructions='Label each of the following examples as "AI" or "NOT AI"', json='data/train.jsonl', samples=4):
        """Load GPT-2 and the few-shot context.

        Args:
            instructions: Task description passed to ``make_prompt``.
            json: Path to a JSONL file of labelled examples. The name shadows
                the ``json`` module inside this method; it is kept so existing
                keyword callers keep working.
            samples: Number of leading records from the file used as context.
        """
        super().__init__()
        self.instructions = instructions
        self.context = load_jsonl(json)[:samples]

    @staticmethod
    def _strip_label_prefix(line, prefix="LABEL:"):
        """Remove a leading ``prefix`` and surrounding whitespace from ``line``."""
        line = line.strip()
        # str.strip("LABEL: ") would treat its argument as a character set and
        # eat label letters too ("LABEL: AI" -> "I"), so drop the prefix explicitly.
        if line.startswith(prefix):
            line = line[len(prefix):]
        return line.strip()

    def predict(self, prompt):
        """Return the label GPT-2 generates for the text ``prompt``."""
        full_prompt = make_prompt(self.instructions, self.context, {'text': prompt})
        out = self.generate(full_prompt, stop_token="\n")
        # The prediction is whatever follows "LABEL:" on the last line.
        return self._strip_label_prefix(out.split('\n')[-1])

> Note: you might have to restart the notebook to clear GPU memory at this point

In [None]:
# Load the unlabeled test split and use the text of its first record as the
# example to classify.
test = load_jsonl("data/test_no_labels.jsonl")
example = test[0]
prompt = example['text']
# Bare expression so the notebook displays the text.
prompt

'out of plane effect on the superconductivity of sr2 xbaxcuo3+d with tc up to 98k. we comment on the paper published by w.b. gao q.q. liu l.x. yang y.yu f.y. li c.q. jin and s. uchida in phys. rev. b and give alternate explanations for the enhanced superconductivity. the enhanced onset tc of 98k observed upon substituting ba for sr is attributed to optimal oxygen ordering rather than to the increase in volume. comparison with la2cuo +x samples suggest that the effect of disorder is overestimated.'

In [None]:
%%time
# Build the few-shot LM classifier (this loads GPT-2) and label the example;
# the timing therefore covers model loading as well as prediction.
clas = GPTLMClassifier()
pred = clas.predict(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


CPU times: user 10.2 s, sys: 333 ms, total: 10.6 s
Wall time: 6.78 s


In [None]:
# Display the label returned by GPTLMClassifier.predict.
pred

'NOT AI'

## Embedding Similarity Score

Another approach to classification would be to compare the final-layer embeddings of the unknown sample to those of known samples.

The general hypothesis here is that you should know most of what you need to know about a paper from the first part of its abstract. In other words, the marginal information provided by the next word in the abstract decreases across the sequence. So, we collect the maximum number of hidden states possible (truncating both sequences to the length of the shorter one) and perform a matrix multiplication. The norm of the resulting matrix is the similarity score.

In [None]:
# Bare GPT-2 wrapper (tokenizer + model) used by the exploratory cells below.
gpt = GPTBase()

In [None]:
# Run every reference sample through GPT-2 once, keeping all hidden states.
# This is inference only, so autograd is disabled; otherwise each stored output
# would keep its computation graph alive.
outs = []
with torch.no_grad():
    for sample in uniform_samples():
        input_ids = gpt.tokenizer.encode(sample['text'], return_tensors="pt")
        # Send the ids to whichever device the model's weights live on.
        input_ids = input_ids.to(next(gpt.model.parameters()).device)
        outs.append(gpt.model(input_ids, output_hidden_states=True))

In [None]:
# Compare final-layer hidden states, matching the exported classifiers:
# index -1 is the last transformer block, whereas index 0 is the output of the
# embedding layer.
source = outs[0].hidden_states[-1]
targ_1 = outs[1].hidden_states[-1]
targ_2 = outs[2].hidden_states[-1]

In [None]:
# Truncate both sequences to the shorter length so their position axes line up,
# then use the norm of the matrix product as the similarity score.
min_idx = min(source.shape[1], targ_1.shape[1])
torch.matmul(source[0, :min_idx].T, targ_1[0, :min_idx]).norm()

tensor(867.2198, device='cuda:0', grad_fn=<CopyBackwards>)

In [None]:
# Same score as above, this time against the second reference sample.
min_idx = min(source.shape[1], targ_2.shape[1])
torch.matmul(source[0, :min_idx].T, targ_2[0, :min_idx]).norm()

tensor(1994.7556, device='cuda:0', grad_fn=<CopyBackwards>)

In [None]:
# Score the source against every reference in one loop. `targs` is defined here
# so the cell runs on a fresh kernel, and the scores are collected rather than
# overwritten on each iteration.
targs = [targ_1, targ_2]
scores = []
for targ in targs:
    min_idx = min(source.size()[1], targ.size()[1])
    score = (source[0,:min_idx,:].T@targ[0,:min_idx,:]).norm()
    scores.append(score)
scores

In [None]:
#export
class GPTMatmulClassifier(GPTBase):
    """Nearest-reference classifier scored by a hidden-state matrix product.

    A few labelled reference samples are embedded once at construction time.
    ``predict`` embeds the query, scores it against every reference and returns
    the label of the best-scoring one.
    """

    def __init__(self, json='data/train.jsonl', samples=2):
        """Load GPT-2 and embed the reference samples.

        Args:
            json: Path handed to ``uniform_samples``. The name shadows the
                ``json`` module inside this method; it is kept so existing
                keyword callers keep working.
            samples: Sample count handed to ``uniform_samples``.
        """
        super().__init__()
        self.samples = uniform_samples(json, samples)
        # Full model outputs are kept on the instance, as before.
        self.outs = [self._forward(sample['text']) for sample in self.samples]
        self.targs = [out.hidden_states[-1] for out in self.outs]

    def _forward(self, text):
        """Run ``text`` through GPT-2 with hidden states enabled and no autograd."""
        input_ids = self.tokenizer.encode(text, return_tensors="pt")
        # Follow the model's own device instead of assuming CUDA.
        device = next(self.model.parameters()).device
        # Inference only: skip graph construction so stored outputs stay small.
        with torch.no_grad():
            return self.model(input_ids.to(device), output_hidden_states=True)

    @staticmethod
    def _score(source, targ):
        """Norm of ``source.T @ targ`` over the shared prefix, divided by its length."""
        # Sequences differ in length; compare only the positions both have.
        min_idx = min(source.size()[1], targ.size()[1])
        product = source[0, :min_idx, :].T @ targ[0, :min_idx, :]
        return (product.norm() / min_idx).item()

    def predict(self, prompt):
        """Return ``'AI'`` or ``'NOT AI'`` for the text ``prompt``."""
        source = self._forward(prompt).hidden_states[-1]
        scores = [self._score(source, targ) for targ in self.targs]

        pred = self.samples[scores.index(max(scores))]['label']
        # this portion is specific to binary classification
        # NOTE(review): assumes labels are the strings 'True'/'False' - confirm
        # against the training data.
        return 'NOT AI' if pred == 'False' else 'AI'
        

We can now test this new classifier in the usual way.

In [None]:
# Reload the same test example as earlier so this section also runs on its own
# (e.g. after a kernel restart to free GPU memory).
test = load_jsonl("data/test_no_labels.jsonl")
example = test[0]
prompt = example['text']
# Bare expression so the notebook displays the text.
prompt

'out of plane effect on the superconductivity of sr2 xbaxcuo3+d with tc up to 98k. we comment on the paper published by w.b. gao q.q. liu l.x. yang y.yu f.y. li c.q. jin and s. uchida in phys. rev. b and give alternate explanations for the enhanced superconductivity. the enhanced onset tc of 98k observed upon substituting ba for sr is attributed to optimal oxygen ordering rather than to the increase in volume. comparison with la2cuo +x samples suggest that the effect of disorder is overestimated.'

In [None]:
%%time
# Build the matmul classifier (loads GPT-2 and embeds the reference samples)
# and label the example; the timing covers both steps.
clas = GPTMatmulClassifier()
pred = clas.predict(prompt)

[tensor(53382.4570, device='cuda:0', grad_fn=<DivBackward0>), tensor(52856.9922, device='cuda:0', grad_fn=<DivBackward0>), tensor(56165.2070, device='cuda:0', grad_fn=<DivBackward0>), tensor(53237.0820, device='cuda:0', grad_fn=<DivBackward0>)]
CPU times: user 9.31 s, sys: 304 ms, total: 9.62 s
Wall time: 6.59 s


In [None]:
# Display the label returned by GPTMatmulClassifier.predict.
pred

'AI'

One issue with this is that matrix multiplications do not accurately capture similarity between sets of vectors. They are also more computationally expensive. An alternative is using a dot product between the vectors at each position, which *does* measure similarity more directly. One concern with dot products might be that they'll give too much importance to the positions, but self-attention should mitigate that concern. All hidden vectors at all positions should have *some* information about the sequence as a whole.

In [None]:
#export
class GPTSimilarityClassifier(GPTBase):
    """Nearest-reference classifier scored by position-wise dot products.

    A few labelled reference samples are embedded once at construction time.
    ``predict`` embeds the query, scores it against every reference and returns
    the label of the best-scoring one.
    """

    def __init__(self, json='data/train.jsonl', samples=2):
        """Load GPT-2 and embed the reference samples.

        Args:
            json: Path handed to ``uniform_samples``. The name shadows the
                ``json`` module inside this method; it is kept so existing
                keyword callers keep working.
            samples: Sample count handed to ``uniform_samples``.
        """
        super().__init__()
        self.samples = uniform_samples(json, samples)
        # Full model outputs are kept on the instance, as before.
        self.outs = [self._forward(sample['text']) for sample in self.samples]
        self.targs = [out.hidden_states[-1] for out in self.outs]

    def _forward(self, text):
        """Run ``text`` through GPT-2 with hidden states enabled and no autograd."""
        input_ids = self.tokenizer.encode(text, return_tensors="pt")
        # Follow the model's own device instead of assuming CUDA.
        device = next(self.model.parameters()).device
        # Inference only: skip graph construction so stored outputs stay small.
        with torch.no_grad():
            return self.model(input_ids.to(device), output_hidden_states=True)

    @staticmethod
    def _score(source, targ):
        """Sum of element-wise products over the shared prefix, divided by its length."""
        # Sequences differ in length; compare only the positions both have.
        min_idx = min(source.size()[1], targ.size()[1])
        overlap = source[0, :min_idx, :] * targ[0, :min_idx, :]
        return (overlap.sum() / min_idx).item()

    def predict(self, prompt):
        """Return ``'AI'`` or ``'NOT AI'`` for the text ``prompt``."""
        source = self._forward(prompt).hidden_states[-1]
        scores = [self._score(source, targ) for targ in self.targs]

        pred = self.samples[scores.index(max(scores))]['label']
        # this portion is specific to binary classification
        # NOTE(review): assumes labels are the strings 'True'/'False' - confirm
        # against the training data.
        return 'NOT AI' if pred == 'False' else 'AI'

In [None]:
# Construct the classifier in its own cell so the timed cell that follows
# measures prediction only, not model loading.
clas = GPTSimilarityClassifier()

In [None]:
%%time
# Classify the example text with the already-constructed similarity classifier.
pred = clas.predict(prompt)

[tensor(51208.2266, device='cuda:0', grad_fn=<DivBackward0>), tensor(49627.3359, device='cuda:0', grad_fn=<DivBackward0>), tensor(55836.4375, device='cuda:0', grad_fn=<DivBackward0>), tensor(55042.0508, device='cuda:0', grad_fn=<DivBackward0>)]
CPU times: user 26.5 ms, sys: 94 µs, total: 26.6 ms
Wall time: 26 ms


In [None]:
# Display the label returned by GPTSimilarityClassifier.predict.
pred

'AI'