In [2]:
import pandas as pd
from openprompt.data_utils.utils import InputExample
from openprompt.data_utils.data_processor import DataProcessor
from openprompt import PromptDataLoader
from tqdm import tqdm
import torch



In [3]:
from openprompt.data_utils import InputExample
# Names of the five target classes, in the order used throughout the notebook.
classes = ["no_influencer", "nano", "micro", "macro", "mega"]

In [4]:
FEW_SHOT_DIR = '../data/few_shot_train_val_test'


def load_user_level_frame(csv_path):
    """Read one split CSV and collapse it to a single row per Twitter user.

    Args:
        csv_path: Path to a CSV containing the columns 'twitter user id',
            'texts', 'class' and 'count_mention'.

    Returns:
        A DataFrame with one row per 'twitter user id': all 'texts' joined
        with a single space, the first 'class' value of the group, and the
        summed 'count_mention'.
    """
    raw = pd.read_csv(csv_path)
    return (
        raw.groupby('twitter user id')
        # The string 'sum' replaces the builtin `sum`: pandas >= 2.1 deprecates
        # builtin callables in agg, and the string keeps the current result.
        .agg({'texts': ' '.join, 'class': 'first', 'count_mention': 'sum'})
        .reset_index()
    )


# One helper call per split instead of three copy-pasted read/groupby pairs.
test_df = load_user_level_frame(f'{FEW_SHOT_DIR}/test.csv')
train_df = load_user_level_frame(f'{FEW_SHOT_DIR}/train.csv')
validate_df = load_user_level_frame(f'{FEW_SHOT_DIR}/validate.csv')

In [5]:
# Append the validation rows to the training rows and renumber the index.
# NOTE: `train_df` is rebuilt from itself, so re-running this cell without
# re-running the loading cell appends `validate_df` a second time.
train_df = pd.concat(
    (train_df, validate_df),
    axis=0,
    ignore_index=True,
)

In [9]:
class TweetProcessor(DataProcessor):
    """Converts a tweet DataFrame into OpenPrompt ``InputExample`` objects.

    Label ids are positions in ``self.labels``, so every frame passed to
    ``get_examples`` (train, test, ...) is encoded with the same mapping.
    """

    def __init__(self):
        super().__init__()
        self.labels = ['no_influencer', 'nano', 'micro', 'macro', 'mega']

    def get_examples(self, df):
        """Build one example per Twitter user.

        Args:
            df: DataFrame with the columns 'twitter user id', 'texts',
                'class' and 'count_mention'. Rows sharing a user id are
                merged: texts are joined with a space and the first class
                of the group is kept.

        Returns:
            A list of ``InputExample`` with ``guid`` set to the user id,
            ``text_a`` to the joined texts and ``label`` to the index of the
            user's class in ``self.labels``.

        Raises:
            ValueError: If a user's class is not one of ``self.labels``.
        """
        # Fixed mapping taken from self.labels. Deriving it from
        # df['class'].unique() would number classes by order of first
        # appearance, giving different ids for different frames.
        label2id = {label: idx for idx, label in enumerate(self.labels)}

        per_user = (
            df.groupby('twitter user id')
            .agg({'texts': ' '.join, 'class': 'first', 'count_mention': 'sum'})
            .reset_index()
        )

        unknown = [cls for cls in per_user['class'].unique() if cls not in label2id]
        if unknown:
            raise ValueError(
                f"Unexpected class value(s) {unknown}; expected one of {self.labels}"
            )

        return [
            InputExample(guid=user_id, text_a=text, label=label2id[cls])
            for user_id, text, cls in zip(
                per_user['twitter user id'], per_user['texts'], per_user['class']
            )
        ]

In [10]:
# Map each split name to its list of examples; a fresh TweetProcessor is
# created for every split, as before.
dataset = {
    split_name: TweetProcessor().get_examples(split_frame)
    for split_name, split_frame in (('train', train_df), ('test', test_df))
}

In [14]:
from openprompt import plms
from openprompt.plms.mlm import MLMTokenizerWrapper
from collections import namedtuple
from transformers import ElectraModel, ElectraConfig, ElectraTokenizer

FINETUNED_DIR = '../finetuning_v2/single/google/electra_large_discriminator'

# Build a bare ElectraModel from the fine-tuned config and fill it with the
# fine-tuned checkpoint's weights.
config = ElectraConfig.from_pretrained(f'{FINETUNED_DIR}/config.json')
model = ElectraModel(config)

# map_location='cpu' keeps the checkpoint tensors off the GPU regardless of the
# device they were saved from, so loading does not consume GPU memory.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load(f'{FINETUNED_DIR}/pytorch_model.bin', map_location='cpu')

# Drop the 'electra.' component from the checkpoint keys so they line up with
# the parameter names of a bare ElectraModel.
adapted_state_dict = {k.replace('electra.', ''): v for k, v in state_dict.items()}

# strict=False tolerates extra keys in the checkpoint, but it also hides
# backbone weights that failed to load, so report those explicitly.
load_result = model.load_state_dict(adapted_state_dict, strict=False)
if load_result.missing_keys:
    print('WARNING: weights missing from the fine-tuned checkpoint:', load_result.missing_keys)

# Register the *classes*: OpenPrompt's load_plm calls `.from_pretrained(...)`
# on the registered config/model/tokenizer entries. Registering instances
# resolved to the same classmethods and hid that load_plm builds a new model.
# NOTE(review): ElectraModel has no LM head, while the MLM wrapper/verbalizer
# presumably expects vocabulary logits; confirm this pairing is intended.
ModelClass = namedtuple("ModelClass", ('config', 'tokenizer', 'model','wrapper'))
plms._MODEL_CLASSES['Electra_Finetuned'] = ModelClass(
    config=ElectraConfig,
    tokenizer=ElectraTokenizer,
    model=ElectraModel,
    wrapper=MLMTokenizerWrapper,
)

model.eval()

In [18]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("Electra_Finetuned", "google/electra-large-discriminator")

# load_plm instantiates a new model from the hub checkpoint named above (the
# download log below shows pytorch_model.bin being fetched), so the fine-tuned
# weights previously loaded into `model` would otherwise never be used.
# Copy them into `plm`; the default strict=True raises if the two
# architectures do not match instead of silently keeping the hub weights.
plm.load_state_dict(model.state_dict())

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

In [20]:
from openprompt.prompts import ManualTemplate

# Create a manual template bound to the tokenizer, then populate it from the
# template file, passing choice=2.
promptTemplate = ManualTemplate(tokenizer=tokenizer)
promptTemplate = promptTemplate.from_file(
    "../abhinav/openprompt/mt.txt",
    choice=2,
)

In [21]:
from openprompt.prompts import ManualVerbalizer

# Label words the verbalizer associates with each class. The dict's key order
# is also used as the class order handed to the verbalizer.
verbalizer_label_words = {
    "no_influencer": ["zero", "none", "nothing"],
    "nano": ["smallest", "least"],
    "micro": ["medium", "small", "few"],
    "macro": ["large", "big", "many"],
    "mega": ["largest", "biggest", "most"],
}

promptVerbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=list(verbalizer_label_words),
    label_words=verbalizer_label_words,
)

In [22]:
from openprompt.data_utils.data_sampler import FewShotSampler

# Draw a 100-example support set from the training examples with a fixed seed;
# no dev split is sampled.
support_sampler = FewShotSampler(
    also_sample_dev=False,
    num_examples_total=100,
)
dataset.update(support=support_sampler(dataset['train'], seed=1))

In [23]:
import copy

# Strip the labels from *copies* of the support examples. The sampled examples
# may be the very objects stored in dataset['train'], so writing label = -1 in
# place could also erase the training labels. Copying makes this cell safe to
# re-run and leaves the training split untouched.
dataset['support'] = [copy.copy(example) for example in dataset['support']]
for example in dataset['support']:
    example.label = -1 # remove the labels of support set for classification

support_dataloader = PromptDataLoader(dataset=dataset["support"], template=promptTemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=5,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="tail")

tokenizing: 100it [00:00, 128.70it/s]


In [24]:
from openprompt import PromptForClassification

# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines instead of failing on .cuda().
use_cuda = torch.cuda.is_available()

# Combine the language model, template and verbalizer into one classifier.
prompt_model = PromptForClassification(plm=plm,template=promptTemplate, verbalizer=promptVerbalizer, freeze_plm=False)
if use_cuda:
    prompt_model = prompt_model.cuda()

In [26]:
# zero-shot test
test_dataloader = PromptDataLoader(dataset=dataset["test"], template=promptTemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=5,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="tail")

# Evaluation only: eval() disables dropout, and no_grad() stops autograd from
# keeping activations for a backward pass that never happens. Keeping them
# multiplies GPU memory use; this cell previously ended in CUDA out-of-memory.
# NOTE(review): memory held by models from earlier cell runs can also cause
# the OOM; restart the kernel if it persists.
prompt_model.eval()

allpreds = []
alllabels = []
pbar = tqdm(test_dataloader)  # the progress bar replaces the per-step print
with torch.no_grad():
    for inputs in pbar:
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        alllabels.extend(inputs['label'].cpu().tolist())
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

# Accuracy = share of predictions equal to the gold label; NaN when the test
# set produced no predictions, instead of a ZeroDivisionError.
n_correct = sum(int(pred == gold) for pred, gold in zip(allpreds, alllabels))
acc = n_correct / len(allpreds) if allpreds else float('nan')
print("test:", acc)

tokenizing: 15it [00:00, 178.30it/s]
  0%|          | 0/3 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 10.00 MiB (GPU 0; 14.58 GiB total capacity; 13.76 GiB already allocated; 7.31 MiB free; 13.77 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF