In [None]:
from transformers import pipeline, AutoConfig, GPT2LMHeadModel, GPT2Tokenizer
import pickle
import numpy as np

### Load Dataset

In [None]:
# Unpickle the training split from 'data_train.p'.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('data_train.p', 'rb') as f:
    data_train = pickle.load(f)
    
# Unpickle the test split from 'data_test.p' (same trust caveat as above).
with open('data_test.p', 'rb') as f:
    data_test = pickle.load(f)
    
# The class identifiers are the keys of the training data; later cells index
# data_train[cls] by position, so each value is presumably a sequence of
# phrases — TODO confirm.
classes = list(data_train.keys())

### Instantiate Model

In [None]:
# Load the pretrained 'gpt2' checkpoint with its language-modelling head.
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
# Override the max_length stored in the model config with 1024
# (presumably so long few-shot prompts are not cut off by a shorter
# default generation length — TODO confirm).
gpt2.config.max_length = 1024
# Tokenizer matching the same 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Text-generation pipeline wrapping the model and tokenizer above; the
# evaluation functions below call it through the global name `generator`.
generator = pipeline("text-generation", model=gpt2, tokenizer=tokenizer)

### Load Class names

In [None]:
# Human-readable label for each class: the part of the class key that follows
# its last underscore (the whole key when it contains no underscore).
# Kept index-aligned with `classes`.
classes_text = [class_key.rsplit('_', 1)[-1] for class_key in classes]

### Evaluation Functions

In [None]:
def generate_prompt(phrase, n_examples=15):
    """Build a few-shot classification prompt that ends with `phrase`.

    Each of the `n_examples` shots consists of an instruction line listing
    every label in `classes_text`, a quoted training phrase drawn at random
    from `data_train`, and that phrase's label. The prompt ends with the
    quoted `phrase` followed by an open ``Type:`` for the model to complete.

    Reads the notebook globals `classes`, `classes_text` and `data_train`.

    Args:
        phrase: The text to classify; it is inserted verbatim.
        n_examples: Number of random labelled examples placed before
            `phrase`. Defaults to 15, the value that was previously
            hard-coded.

    Returns:
        The assembled prompt string.

    Raises:
        ValueError: Raised by `np.random.randint` when `classes` is empty or
            the selected class has no training samples.
    """
    # The instruction line is the same for every shot, so build it once.
    # NOTE(review): the wording says "five types" regardless of how many
    # labels `classes_text` actually holds.
    header = f'Assign this phrase one of the five types {", ".join(classes_text)}:\n'

    prompt = ''
    for _ in range(n_examples):
        # Pick a class uniformly, then a sample uniformly within that class
        # (so classes are balanced in expectation, whatever their size).
        cls_idx = np.random.randint(0, len(classes))
        cls_samples = data_train[classes[cls_idx]]
        sample = cls_samples[np.random.randint(0, len(cls_samples))]
        prompt += header
        prompt += f'"{sample}"\nType: {classes_text[cls_idx]}\n\n'

    # NOTE(review): the final query is emitted without the instruction line
    # that precedes every example; kept as-is to preserve existing prompts.
    prompt += f'"{phrase}"\nType:'
    return prompt

In [None]:
def evaluate_prompt(prompt, n_samples=5):
    """Sample completions of `prompt` and return the answer each one gives.

    Uses the notebook globals `tokenizer` and `generator`. Generation is
    capped at the prompt's token count plus 5. For every returned sequence
    the text after the prompt is cut at the first newline and stripped of
    all spaces.

    Args:
        prompt: Prompt text passed to the generator.
        n_samples: Number of sequences requested from the generator.

    Returns:
        A list with one answer string per generated sequence.
    """
    prompt_chars = len(prompt)
    # Allow a handful of tokens beyond the prompt itself.
    token_budget = len(tokenizer(prompt).input_ids) + 5
    generations = generator(prompt, max_length=token_budget, num_return_sequences=n_samples)

    answers = []
    for generation in generations:
        # Drop the echoed prompt, keep only the first line of the continuation.
        continuation = generation['generated_text'][prompt_chars:]
        first_line = continuation.split('\n')[0]
        answers.append(first_line.replace(' ', ''))
    return answers

In [None]:
def predict_claim(claim, n_prompts=10, n_samples=5):
    """Tally the labels the model assigns to `claim` across random prompts.

    Builds `n_prompts` prompts with `generate_prompt`, collects `n_samples`
    answers for each via `evaluate_prompt`, and counts how often every label
    in the global `classes_text` appears among the answers.

    Args:
        claim: The phrase to classify.
        n_prompts: Number of independently generated prompts.
        n_samples: Number of answers sampled per prompt.

    Returns:
        A dict holding one count per label, plus 'other_text' (the answers
        that match no label) and 'other' (how many such answers there are).
    """
    answers = []
    for _ in range(n_prompts):
        answers.extend(evaluate_prompt(generate_prompt(claim), n_samples=n_samples))

    tally = {label: answers.count(label) for label in classes_text}
    unmatched = [answer for answer in answers if answer not in classes_text]
    tally['other_text'] = unmatched
    tally['other'] = len(unmatched)
    return tally

In [None]:
def predict_probas(claim, n_prompts=10, n_samples=5):
    """Estimate a probability for each label from the model's votes on `claim`.

    Runs `predict_claim` and normalises the per-label counts so they sum
    to 1. Answers that match no label are ignored.

    Args:
        claim: The phrase to classify.
        n_prompts: Number of prompts, forwarded to `predict_claim`.
        n_samples: Number of answers per prompt, forwarded to `predict_claim`.

    Returns:
        A float array with one entry per label, ordered like the global
        `classes_text`. When no answer matched any label, a uniform
        distribution is returned instead of dividing by zero.
    """
    res = predict_claim(claim, n_prompts=n_prompts, n_samples=n_samples)
    probas = np.array([res[cls] for cls in classes_text], dtype=float)

    total = probas.sum()
    if total > 0:
        return probas / total

    # No usable vote: fall back to "no information" rather than NaNs.
    n_classes = len(probas)
    return np.full(n_classes, 1.0 / n_classes) if n_classes else probas

### Usage Example

In [None]:
# Use the first entry stored under the 'agreement' key of the test data
# as the example phrase for the cells below.
test_phrase = data_test['agreement'][0]

In [None]:
# Display the selected example phrase as the cell output.
test_phrase

In [None]:
# Display one randomly generated few-shot prompt for the example phrase
# (the examples are drawn at random, so the output differs between runs).
generate_prompt(test_phrase)

In [None]:
# Tally the model's answers for the example phrase using the defaults of
# predict_claim (10 prompts, 5 samples each), then display the tally.
res = predict_claim(test_phrase)
res

### Put everything into a class

In [None]:
class SemanticTypeAnalyzer:
    """Few-shot phrase classifier driven by a GPT-2 text-generation pipeline.

    A phrase is classified by building several prompts made of randomly
    chosen labelled training phrases, sampling short completions for each
    prompt, and counting which label the completions name.

    Attributes are set in `__init__`:
        gpt2, tokenizer, generator: the model, its tokenizer and the
            text-generation pipeline built from them.
        data_train: the training data passed in; indexed as
            ``data_train[class_key][i]``.
        classes: the keys of `data_train`.
        classes_text: for each class key, the part after its last
            underscore; index-aligned with `classes`.
    """

    def __init__(self, data_train, model_name='gpt2'):
        """Load the model and derive the label list from `data_train`.

        Args:
            data_train: Mapping from class key to an indexable collection of
                example phrases.
            model_name: Checkpoint handed to `from_pretrained` for both the
                model and the tokenizer. Defaults to 'gpt2'.
        """
        self.gpt2 = GPT2LMHeadModel.from_pretrained(model_name)
        self.gpt2.config.max_length = 1024
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.generator = pipeline("text-generation", model=self.gpt2, tokenizer=self.tokenizer)

        self.data_train = data_train
        self.classes = list(data_train.keys())
        self.classes_text = [name.split('_')[-1] for name in self.classes]

    def generate_prompt(self, phrase, n_examples=15):
        """Build a few-shot prompt of `n_examples` random shots ending with `phrase`.

        Args:
            phrase: The text to classify; it is inserted verbatim.
            n_examples: Number of random labelled examples placed before it.

        Returns:
            The assembled prompt string, ending with an open ``Type:``.

        Raises:
            ValueError: Raised by `np.random.randint` when there are no
                classes or the selected class has no samples.
        """
        # The instruction line is identical for every shot; build it once.
        header = f'Assign this phrase one of the five types {", ".join(self.classes_text)}:\n'

        prompt = ''
        for _ in range(n_examples):
            # Pick a class uniformly, then a sample uniformly within it.
            cls_idx = np.random.randint(0, len(self.classes))
            cls_samples = self.data_train[self.classes[cls_idx]]
            sample = cls_samples[np.random.randint(0, len(cls_samples))]
            prompt += header
            prompt += f'"{sample}"\nType: {self.classes_text[cls_idx]}\n\n'
        prompt += f'"{phrase}"\nType:'
        return prompt

    def evaluate_prompt(self, prompt, n_samples=5):
        """Sample `n_samples` completions of `prompt` and extract their answers.

        Generation is capped at the prompt's token count plus 5. The answer
        is the text after the prompt, cut at the first newline, with all
        spaces removed.

        Returns:
            A list with one answer string per generated sequence.
        """
        len_prompt = len(prompt)
        # NOTE(review): there is no guard for prompts whose token count
        # approaches the model's maximum input length.
        len_answer = len(self.tokenizer(prompt).input_ids) + 5
        res = self.generator(prompt, max_length=len_answer, num_return_sequences=n_samples)
        return [
            out['generated_text'][len_prompt:].split('\n')[0].replace(' ', '')
            for out in res
        ]

    def predict_claim(self, claim, n_prompts=10, n_samples=5, n_examples=15):
        """Tally the labels generated for `claim` across random prompts.

        Args:
            claim: The phrase to classify.
            n_prompts: Number of independently generated prompts.
            n_samples: Number of answers sampled per prompt.
            n_examples: Number of shots per prompt (see `generate_prompt`).

        Returns:
            A dict with one count per label in `classes_text`, plus
            'other_text' (answers matching no label) and 'other' (their
            number).
        """
        preds = []
        for _ in range(n_prompts):
            prompt = self.generate_prompt(claim, n_examples=n_examples)
            preds += self.evaluate_prompt(prompt, n_samples=n_samples)

        results = {cls: preds.count(cls) for cls in self.classes_text}
        results['other_text'] = [p for p in preds if p not in self.classes_text]
        results['other'] = len(results['other_text'])
        return results

    def predict_probas(self, claim, n_prompts=10, n_samples=5, n_examples=15):
        """Turn the label tally for `claim` into a probability vector.

        Returns:
            A float array ordered like `classes_text` that sums to 1.
            When no answer matched any label, a uniform distribution is
            returned instead of the NaNs a division by zero would produce.
        """
        res = self.predict_claim(
            claim, n_prompts=n_prompts, n_samples=n_samples, n_examples=n_examples
        )
        probas = np.array([res[cls] for cls in self.classes_text], dtype=float)

        total = probas.sum()
        if total > 0:
            return probas / total

        # No usable vote: fall back to "no information" rather than NaNs.
        n_classes = len(probas)
        return np.full(n_classes, 1.0 / n_classes) if n_classes else probas

    def predict(self, X, n_prompts=10, n_samples=5, n_examples=15):
        """Predict the most likely class index for every phrase in `X`.

        Args:
            X: Iterable of phrases.
            n_prompts, n_samples, n_examples: Forwarded to `predict_probas`.

        Returns:
            An integer array with one index per phrase; each index refers
            to a position in `classes` / `classes_text`. An empty `X`
            yields an empty array.
        """
        rows = [
            self.predict_probas(
                x, n_prompts=n_prompts, n_samples=n_samples, n_examples=n_examples
            )
            for x in X
        ]
        if not rows:
            # argmax over axis 1 is undefined for an empty 1-D array.
            return np.empty(0, dtype=np.intp)
        return np.argmax(np.array(rows), axis=1)

In [None]:
# Build the class-based analyzer from the training data; the constructor
# loads its own GPT-2 model, tokenizer and generation pipeline.
analyzer = SemanticTypeAnalyzer(data_train)

In [None]:
# Predict a class index for each of two phrases using 3 prompts with
# 5 samples each; the indices refer to positions in analyzer.classes_text.
# Note that this rebinds `res`, which an earlier cell used for a tally dict.
res = analyzer.predict(np.array(['Yes.', 'No.']), n_prompts=3, n_samples=5)
res