In [1]:
import pandas as pd
from openprompt.data_utils.utils import InputExample
from openprompt.data_utils.data_processor import DataProcessor
from openprompt import PromptDataLoader
from tqdm import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from openprompt.data_utils import InputExample
classes = [ # There are two classes in Sentiment Analysis, one for negative and one for positive
    "no_influencer",
    "nano",
    "micro",
    "macro",
    "mega"
]
dataset = [ # For simplicity, there's only two examples
    # text_a is the input text of the data, some other datasets may have multiple input sentences in one example.
    InputExample(
        guid = 0,
        text_a = "Albert Einstein was one of the greatest intellects of his time.",
    ),
    InputExample(
        guid = 1,
        text_a = "The film was badly made.",
    ),
]

In [3]:
test_df = pd.read_csv('/mnt/home/abhinavkumar2/Profiling-Cryptocurrency-Influencers-with-FSL/data/few_shot_train_val_test/test.csv')
test_df = test_df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()

In [4]:
train_df = pd.read_csv('/mnt/home/abhinavkumar2/Profiling-Cryptocurrency-Influencers-with-FSL/data/few_shot_train_val_test/train.csv')
train_df = train_df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()

In [5]:
validate_df = pd.read_csv('/mnt/home/abhinavkumar2/Profiling-Cryptocurrency-Influencers-with-FSL/data/few_shot_train_val_test/validate.csv')
validate_df = validate_df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()

In [6]:
train_df = pd.concat([train_df, validate_df], ignore_index=True)

In [7]:
class TweetProcessor(DataProcessor):
    def __init__(self):
        super().__init__()
        self.labels = ['no_influencer', 'nano', 'micro', 'macro', 'mega']

    def get_examples(self, path):
        df = pd.read_csv(path)
        labels = df['class'].unique().tolist()
        id2label = {idx:label for idx, label in enumerate(labels)}
        label2id = {label:idx for idx, label in enumerate(labels)}
        df = df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()
        examples = []
        for i, row in df.iterrows():
            examples.append(InputExample(guid=row['twitter user id'], text_a=row['texts'], label= label2id[row['class']]))
        return examples

In [8]:
dataset = {}
dataset['train'] =  TweetProcessor().get_examples('/mnt/home/abhinavkumar2/Profiling-Cryptocurrency-Influencers-with-FSL/data/few_shot_train_val_test/train.csv')
dataset['test'] = TweetProcessor().get_examples('/mnt/home/abhinavkumar2/Profiling-Cryptocurrency-Influencers-with-FSL/data/few_shot_train_val_test/test.csv')

In [9]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("roberta", "cardiffnlp/twitter-roberta-large-2022-154m")

In [10]:
from openprompt.prompts import ManualTemplate
promptTemplate = ManualTemplate(tokenizer = tokenizer).from_file("./mt.txt", choice=2)

In [11]:
from openprompt.prompts import ManualVerbalizer
promptVerbalizer = ManualVerbalizer(
    classes= ["no_influencer", "nano", "micro", "macro", "mega"],
    label_words = {
        "no_influencer": ["zero", "none", "nothing"],
        "nano": ["smallest", "least"],
        "micro": ["medium", "small", "few"],
        "macro": ["large", "big", "many"],
        "mega": ["largest", "biggest", "most"],
    },
    tokenizer = tokenizer,
)

In [12]:
from openprompt.data_utils.data_sampler import FewShotSampler
support_sampler = FewShotSampler(num_examples_total=100, also_sample_dev=False)
dataset['support'] = support_sampler(dataset['train'], seed=1)

In [13]:
for example in dataset['support']:
    example.label = -1 # remove the labels of support set for classification
support_dataloader = PromptDataLoader(dataset=dataset["support"], template=promptTemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=5,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="tail")

tokenizing: 100it [00:00, 284.96it/s]


In [14]:
from openprompt import PromptForClassification
use_cuda = True
prompt_model = PromptForClassification(plm=plm,template=promptTemplate, verbalizer=promptVerbalizer, freeze_plm=False)
if use_cuda:
    prompt_model=  prompt_model.cuda()

In [15]:
org_label_words_num = [len(prompt_model.verbalizer.label_words[i]) for i in range(4)]
from openprompt.utils.calibrate import calibrate
# calculate the calibration logits
cc_logits = calibrate(prompt_model, support_dataloader)
print("the calibration logits is", cc_logits)

ContextCali: 100%|██████████| 20/20 [00:09<00:00,  2.03it/s]

the calibration logits is tensor([45.1223, -5.0615, 46.4884,  ...,  0.4226,  1.7090, 32.2724],
       device='cuda:0')





In [16]:
# register the logits to the verbalizer so that the verbalizer will divide the calibration probability in producing label logits
# currently, only ManualVerbalizer and KnowledgeableVerbalizer support calibration.
prompt_model.verbalizer.register_calibrate_logits(cc_logits)
new_label_words_num = [len(prompt_model.verbalizer.label_words[i]) for i in range(4)]
print("Original number of label words per class: {} \n After filtering, number of label words per class: {}".format(org_label_words_num, new_label_words_num))

Original number of label words per class: [3, 2, 3, 3] 
 After filtering, number of label words per class: [3, 2, 3, 3]


In [17]:
# zero-shot test
test_dataloader = PromptDataLoader(dataset=dataset["test"], template=promptTemplate, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=512, decoder_max_length=3,
    batch_size=5,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="tail")
allpreds = []
alllabels = []
pbar = tqdm(test_dataloader)
for step, inputs in enumerate(pbar):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    print('steps', step, labels)
    alllabels.extend(labels.cpu().tolist())
    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
print("test:", acc)  # roughly ~0.853 when using template 0

tokenizing: 15it [00:00, 374.15it/s]
 33%|███▎      | 1/3 [00:00<00:00,  2.18it/s]

steps 0 tensor([0, 1, 2, 3, 4], device='cuda:0')


 67%|██████▋   | 2/3 [00:00<00:00,  2.13it/s]

steps 1 tensor([4, 1, 2, 0, 3], device='cuda:0')


100%|██████████| 3/3 [00:01<00:00,  2.14it/s]

steps 2 tensor([3, 2, 1, 0, 4], device='cuda:0')
test: 0.13333333333333333



