In [68]:
!pip install openprompt
import pandas as pd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [69]:
from openprompt.data_utils import InputExample
classes = [
    0,
    1
]

In [70]:
train_df = pd.read_csv("train.csv")
validate_df = pd.read_csv("validate.csv")
test_df = pd.read_csv("test.csv")

In [71]:
train_df = train_df[["twitter user id","texts","class"]]
validate_df = validate_df[["twitter user id","texts","class"]]
test_df = test_df[["twitter user id","texts","class"]]

In [72]:
import numpy as np
def split_rows(dataframe):
  for index, row in dataframe.iterrows():
    if row['no.of_words']>470:
      base = 0
      for i in range(len(row['texts'])):
        if np.sum([len(y) for y in row['texts'][base:i]])<=470 and (np.sum([len(y) for y in row['texts'][base:i+1]]))>470:
          dataframe.loc[len(dataframe)] = {'twitter user id': row['twitter user id'],
                                          'texts': row['texts'][base:i], 'class': row['class'],
                                          'no.of_tweets': i-base, 'no.of_words': np.sum([len(y) for y in row['texts'][base:i]])}
          base = i
      if base!=len(row['texts'])-1:
        dataframe.loc[len(dataframe)] = {'twitter user id': row['twitter user id'],
                                              'texts': row['texts'][base:], 'class': row['class'],
                                              'no.of_tweets': row['no.of_tweets']-base, 
                                              'no.of_words': np.sum([len(y) for y in row['texts'][base:]])}

  dataframe = dataframe[dataframe['no.of_words']<=470]
  return dataframe

def preprocess_for_prompt(dataframe):
  dataframe = dataframe.groupby("twitter user id").agg(lambda x: list(x))
  dataframe['class'] = dataframe['class'].apply(lambda x: x[0])
  dataframe['no.of_tweets'] = dataframe['texts'].apply(lambda x: len(x))
  dataframe['no.of_words'] = dataframe['texts'].apply(lambda x: np.sum([len(y) for y in x]))
  dataframe.reset_index(inplace=True)
  dataframe = split_rows(dataframe)
  dataframe = dataframe[['twitter user id','texts','class']]
  dataframe.reset_index(inplace=True)
  return dataframe

train_df = preprocess_for_prompt(train_df)
validate_df = preprocess_for_prompt(validate_df)
test_df = preprocess_for_prompt(test_df)

In [73]:
def prompt_template_1(text_dataframe, labels, user_ids_list):
  '''
  Q&A form of template
  Template: Given a set of user tweets [x1] [SEP] [x2] [SEP] [x3] [SEP] [x4] [SEP] [x5] [SEP] 
  the user profile is labelled as [z].
  Label: True/False
  * Binary Classification Prompt
  '''
  label_list = ['micro', 'no influencer', 'mega', 'macro', 'nano']
  text = []
  binary_labels = []
  given_label = []
  user_id = []
  for tweets, label, user in zip(text_dataframe, labels, user_ids_list):
    tweet_text = "Given a set of user tweets-"
    for tweet in tweets:
      tweet_text+=", "+str(tweet)+" [SEP] "
    positive_text = tweet_text+",-the user profile is labelled as "+label+". This statement is True. "
    positive_label = 0
    text.append(positive_text)
    binary_labels.append(positive_label)
    given_label.append(label)
    user_id.append(user)

    for neg_label in label_list:
      if neg_label!=label:
        negative_text = tweet_text+",-the user profile is labelled as "+neg_label+". This statement is False. "
        text.append(negative_text)
        binary_labels.append(1)
        given_label.append(neg_label)
        user_id.append(user)
  
  return text, binary_labels, given_label, user_id

train = pd.DataFrame()
validate = pd.DataFrame()
test = pd.DataFrame()
train['prompt'], train['binary_label'], train['given_label'], train['user_id'] = prompt_template_1(train_df['texts'], train_df['class'], train_df['twitter user id'])
validate['prompt'], validate['binary_label'], validate['given_label'], validate['user_id'] = prompt_template_1(validate_df['texts'], validate_df['class'], validate_df['twitter user id'])
test['prompt'], test['binary_label'], test['given_label'], test['user_id'] = prompt_template_1(test_df['texts'], test_df['class'], test_df['twitter user id'])

In [74]:
train = pd.concat([train, validate], ignore_index=True)

In [75]:
from openprompt.data_utils.data_processor import DataProcessor
class TweetProcessor(DataProcessor):
    def __init__(self):
        super().__init__()
        self.labels = ['no_influencer', 'nano', 'micro', 'macro', 'mega']

    def get_examples(self, df):
        examples = []
        for i, row in df.iterrows():
            examples.append(InputExample(guid=row['user_id'], text_a=row['prompt'], label= row['binary_label']))
        return examples


In [76]:
dataset = {}

dataset['train'] =  TweetProcessor().get_examples(train)
dataset['test'] = TweetProcessor().get_examples(test)

In [77]:
from openprompt import plms
from openprompt.plms.mlm import MLMTokenizerWrapper
from openprompt.plms.lm import LMTokenizerWrapper
from collections import namedtuple
from transformers import XLNetConfig, XLNetTokenizer, XLNetModel, GPTJConfig, GPTJForCausalLM, GPT2Tokenizer

ModelClass = namedtuple("ModelClass", ('config', 'tokenizer', 'model','wrapper'))
plms._MODEL_CLASSES['xlnet'] = ModelClass(**{
        'config': XLNetConfig,
        'tokenizer': XLNetTokenizer,
        'model': XLNetModel,
        'wrapper': MLMTokenizerWrapper
    })
plms._MODEL_CLASSES['gptj'] = ModelClass(**{
        'config': GPTJConfig,
        'tokenizer': GPT2Tokenizer,
        'model': GPTJForCausalLM,
        'wrapper': LMTokenizerWrapper
    })

In [78]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, WrapperClass = load_plm("roberta", "tner/roberta-large-tweetner7-all")

Downloading (…)lve/main/config.json:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at tner/roberta-large-tweetner7-all were not used when initializing RobertaForMaskedLM: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at tner/roberta-large-tweetner7-all and are newly initialized: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

In [79]:
from openprompt.prompts import ManualTemplate
promptTemplate = ManualTemplate(
    text = '{"placeholder":"text_a"} Label: {"mask"}',
    tokenizer = tokenizer,
)

In [80]:
from openprompt.prompts import ManualVerbalizer
promptVerbalizer = ManualVerbalizer(
    classes = classes,
    label_words = {
        0: ["0"],
        1: ["1"],
    },
    tokenizer = tokenizer,
)

In [81]:
from openprompt import PromptForClassification
use_cuda = True
promptModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer
)
if use_cuda:
    prompt_model=  promptModel.cuda()

In [82]:
from openprompt import PromptDataLoader
data_loader_train = PromptDataLoader(
    dataset = dataset['train'],
    tokenizer = tokenizer,
    template = promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
)
data_loader_test = PromptDataLoader(
    dataset = dataset['test'],
    tokenizer = tokenizer,
    template = promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
)

tokenizing: 1400it [00:01, 710.88it/s]
tokenizing: 115it [00:00, 459.37it/s]


In [83]:
import torch

all_preds = []
all_labels = []
# making zero-shot inference using pretrained MLM with prompt
promptModel.eval()
with torch.no_grad():
    for batch in data_loader_train:
      if use_cuda:
        batch = batch.cuda()
      logits = prompt_model(batch)
      labels = batch['label']
      preds = torch.argmax(logits, dim = -1)
      all_labels.extend(labels.cpu().tolist())
      all_preds.extend(preds.cpu().tolist())

acc = sum([int(i==j) for i,j in zip(all_preds, all_labels)])/len(all_preds)
print("train:", acc)

with torch.no_grad():
    for batch in data_loader_test:
      if use_cuda:
        batch = batch.cuda()
      logits = prompt_model(batch)
      labels = batch['label']
      preds = torch.argmax(logits, dim = -1)
      all_labels.extend(labels.cpu().tolist())
      all_preds.extend(preds.cpu().tolist())

acc = sum([int(i==j) for i,j in zip(all_preds, all_labels)])/len(all_preds)
print("test:", acc)

train: 0.8
test: 0.8
