# (Prompt base + Image caption) model


In [None]:
!pip install -q openprompt

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus and keep the English stop-word list in `sww`.
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
sww = sw.words('english')
# Download the 'punkt' resource and import the word tokenizer.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Snowball stemmer configured for English.
# NOTE(review): `sww`, `word_tokenize` and `stemmer` are not referenced in the
# visible part of this notebook - confirm whether this cell is still needed.
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer("english")

In [None]:
import numpy as np
from sklearn.model_selection import KFold

#import tensorflow as tf
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from torch.utils.data import Dataset, DataLoader,SubsetRandomSampler
from sklearn.metrics import precision_recall_fscore_support
from transformers import  AdamW, get_linear_schedule_with_warmup
from openprompt import PromptDataLoader
from torch import nn
import os

In [None]:
import random
import time

def seed_torch(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with `seed`.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic,
    non-benchmark mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# The seed is derived from the current wall-clock time (whole seconds) and
# printed for reference before it is applied.
rand_seed = int(time.time())
print("seed:", rand_seed)
seed_torch(rand_seed)

# Load data

In [None]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import csv
import torch
import pandas as pd
# Authenticate
drive = None
def authenticate():
    """Authenticate the Colab user and bind a PyDrive client to the global `drive`.

    Rebinds the module-level name `drive` to a `GoogleDrive` instance built
    from the application-default Google credentials.
    """
    global drive
    auth.authenticate_user()
    google_auth = GoogleAuth()
    google_auth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(google_auth)
#Download files
def downloadFiles(fileIds):
    """Download files from Google Drive to local paths.

    Args:
        fileIds: iterable of indexable pairs; item 0 is the local file name
            to write, item 1 is the Drive file id to fetch.

    Calls `authenticate()` once per invocation before downloading.
    """
    authenticate()
    for entry in fileIds:
        drive_id = entry[1]
        remote_file = drive.CreateFile({"id": drive_id})
        local_name = entry[0]
        remote_file.GetContentFile(local_name)
#Download each required file only if it is not already present locally.
# The previous version probed with open() (leaking the handle), used a bare
# `except:`, and only checked val_data.csv before deciding whether to fetch
# all three splits; every file is now checked individually.
import os

required_files = [
    ["val_data.csv", "1rRiYQwe4pfwS8uT8Jwh9fPfLk2PZfCwW"],
    ["train_data.csv", "1N5RSWBIO2FteeZn2oPhiAcgakWSNV-ot"],
    ["test_data.csv", "1m4JtSSkwHwkBZM6rIZQ4znBfGBQlSvkp"],
    ["text_b.csv", "1-Ost-ExFIubQJYBQDxmMUd1F0DiXh5QN"],
]
# https://drive.google.com/file/d/1rRiYQwe4pfwS8uT8Jwh9fPfLk2PZfCwW/view?usp=share_link
# https://drive.google.com/file/d/1N5RSWBIO2FteeZn2oPhiAcgakWSNV-ot/view?usp=share_link
# https://drive.google.com/file/d/1m4JtSSkwHwkBZM6rIZQ4znBfGBQlSvkp/view?usp=share_link
# https://drive.google.com/file/d/1-Ost-ExFIubQJYBQDxmMUd1F0DiXh5QN/view?usp=share_link

missing_files = [entry for entry in required_files if not os.path.isfile(entry[0])]
if missing_files:
    # One call (and therefore one authentication) covers all missing files.
    downloadFiles(missing_files)

# Read the three dataset splits.
val_dataset = pd.read_csv("/content/val_data.csv")
train_dataset = pd.read_csv("/content/train_data.csv")
test_dataset = pd.read_csv("/content/test_data.csv")

# text_b holds the vision_in_text results; rows containing missing values are dropped.
text_b_dataset = pd.read_csv("text_b.csv").dropna()
text_b_dataset.head()

In [None]:
# Distinct label strings found in the training split's 'labels' column.
label_texts = train_dataset['labels'].unique()
print(label_texts)

# Data processing

In [None]:
# Map each label string to an integer id (its position in label_texts).
# Built with a comprehension so the builtin `id` is no longer shadowed at
# module level.
# NOTE(review): calculate_mmae later uses |expected - predicted| on these ids,
# which only makes sense if the order of label_texts is ordinal - confirm.
label_dict = {label: index for index, label in enumerate(label_texts)}

In [None]:
# Number of comma-separated fields (after the first) kept in the 'text_z' column.
top_k = 1

datasets = [val_dataset, train_dataset, test_dataset]

for i, dataset in enumerate(datasets):
    # Replace label strings by their integer ids (writes into the original frame).
    dataset['labels'] = [label_dict[l] for l in dataset['labels']]
    # Join with text_b on 'id'; the split below reads the merged 'text_y' column.
    merged = pd.merge(dataset, text_b_dataset, on='id')
    fields = [text.split(sep=',') for text in merged['text_y']]
    # Fields 1..top_k become text_z; field 0 stays in text_y.
    merged['text_z'] = [','.join(parts[1:top_k+1]) for parts in fields]
    merged['text_y'] = [parts[0] for parts in fields]
    datasets[i] = merged

val_dataset, train_dataset, test_dataset = datasets

# Open prompt structure

##1.Define a task

In [None]:
# Step 1: Define a task

from openprompt.data_utils import InputExample

def get_prompt_dataset(text_a, text_b, label_digit):
    """Build a list of `InputExample`s from two text columns and a label column.

    Each text value is converted with `str()` and lower-cased; each label is
    converted with `int()`. The three iterables are consumed in lockstep.
    """
    return [
        InputExample(text_a=str(first).lower(), text_b=str(second).lower(), label=int(target))
        for first, second, target in zip(text_a, text_b, label_digit)
    ]

def get_new_prompt_dataset(text_a, text_b, text_c, label_digit):
    """Build `InputExample`s carrying a third text under `meta["text_c"]`.

    `text_a` and `text_b` values are stringified, lower-cased and stripped of
    leading/trailing newline characters; `text_c` values are passed through
    unchanged; labels are converted with `int()`.
    """
    def _normalise(value):
        # Same normalisation for both main text fields.
        return str(value).lower().strip('\n')

    return [
        InputExample(
            text_a=_normalise(first),
            text_b=_normalise(second),
            meta={"text_c": third},
            label=int(target),
        )
        for first, second, third, target in zip(text_a, text_b, text_c, label_digit)
    ]

# Build the prompt datasets for the validation, training and test splits
# from the text_x / text_y / text_z / labels columns of each frame.
val_prompt_dataset, train_prompt_dataset, test_prompt_dataset = (
    get_new_prompt_dataset(frame['text_x'], frame['text_y'], frame['text_z'], frame['labels'])
    for frame in (val_dataset, train_dataset, test_dataset)
)

##2.Define a Pre-trained Language Model (PLM) as backbone.

In [None]:
# Step 2: Define a Pre-trained Language Model (PLM) as backbone.
# Step 2: choose the pre-trained model.
from openprompt.plms import load_plm

# The two entries are passed positionally to load_plm below
# (presumably model family and checkpoint name - confirm against OpenPrompt docs).
model_name = ["t5","t5-base"]

# NOTE(review): the training cell calls load_plm again and rebinds these four names.
plm, tokenizer, model_config, WrapperClass = load_plm(model_name[0], model_name[1])

##3.Define a **Template**

In [None]:
# Step 3: Define a Template.
# Step 3: define the template.
from openprompt.prompts import ManualTemplate, MixedTemplate

# Template over text_a and text_b only.
# Not referenced elsewhere in the visible part of this notebook.
promptTemplate = ManualTemplate(
    text='The tweet is {"placeholder":"text_a"}, {"placeholder":"text_b"}. So the meme is {"mask"}.',
    tokenizer=tokenizer,
)

# Variant with a different closing phrase, built against `plm`.
# Not referenced elsewhere in the visible part of this notebook.
prompt_2_template = MixedTemplate(
    text='The tweet is {"placeholder":"text_a"}, {"placeholder":"text_b"}. So the effect range is {"mask"}.',
    tokenizer=tokenizer,
    model=plm
)

# Template used by the data loaders and the prompt model in the training cell.
# Besides text_a and text_b it reads the "text_c" entry of each example's meta dict.
prompt_3_template = MixedTemplate(
    text='Tweet text: {"placeholder":"text_a"} Caption: {"placeholder":"text_b"} Keywords: {"meta":"text_c"} Is it harmful? Answer: {"mask"}.',
    tokenizer=tokenizer,
    model=plm
)

##4.Define a Verbalizer

In [None]:
# Step 4: Define a Verbalizer
# Step 4: define the mapping from label words to classes (Verbalizer).
from openprompt.prompts import ManualVerbalizer

# One class per entry of label_texts; each class has a single label word,
# namely the label string itself.
promptVerbalizer = ManualVerbalizer(
  num_classes=len(label_texts),
  label_words=[[l] for l in label_texts],
  tokenizer=tokenizer)

# Train

## Train function

In [None]:
from openprompt import PromptForClassification

# Hyper Parameters

train_batch_size = 30   # batch size of the training loader
val_batch_size = 50     # batch size of both the validation and the test loader
max_seq_length = 256    # passed as max_seq_length to every PromptDataLoader
learning_rate = 1e-4    # learning rate given to AdamW
epoch_num = 5           # number of training epochs
# Echo the run configuration.
print("top_k:", top_k)
print("seed:", rand_seed)
print("model name:", model_name)

In [None]:
from sklearn.metrics import classification_report
from openprompt import PromptForClassification
from tqdm import tqdm


def _evaluate(model, loader):
    """Run `model` over `loader` in eval mode without gradient tracking.

    Returns:
        (mean_loss, labels, preds): the loss averaged over the batches of
        `loader`, the gold label ids and the argmax-predicted label ids.
    """
    model.eval()
    total_loss = 0.0
    gold = []
    predicted = []
    with torch.no_grad():
        for inputs in loader:
            inputs = inputs.to(device)
            logits = model(inputs)
            labels = inputs['label']
            total_loss += loss_func(logits, labels).item()
            gold.extend(labels.cpu().tolist())
            predicted.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    # max(..., 1) avoids a ZeroDivisionError on an empty loader.
    return total_loss / max(len(loader), 1), gold, predicted


# Never assigned in this cell; kept so the name remains defined for later cells.
best_model = None

loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']

# NOTE(review): prompt_3_template was constructed with the `plm` object loaded
# in the earlier cell (model=plm); this line rebinds `plm` to a fresh instance,
# so the template may still reference the earlier model - confirm this is intended.
plm, tokenizer, model_config, WrapperClass = load_plm(model_name[0], model_name[1])
prompt_model = PromptForClassification(plm=plm, template=prompt_3_template, verbalizer=promptVerbalizer, freeze_plm=False)
prompt_model = prompt_model.to(device)
#load dataset
train_loader = PromptDataLoader(dataset=train_prompt_dataset, template=prompt_3_template, tokenizer=tokenizer,
  tokenizer_wrapper_class=WrapperClass, max_seq_length=max_seq_length, decoder_max_length=3,
  batch_size=train_batch_size,shuffle=True, teacher_forcing=False, predict_eos_token=False)

val_loader = PromptDataLoader(dataset=val_prompt_dataset, template=prompt_3_template, tokenizer=tokenizer,
  tokenizer_wrapper_class=WrapperClass, max_seq_length=max_seq_length, decoder_max_length=3,
  batch_size=val_batch_size,shuffle=False, teacher_forcing=False, predict_eos_token=False)
test_loader = PromptDataLoader(dataset=test_prompt_dataset, template=prompt_3_template, tokenizer=tokenizer,
  tokenizer_wrapper_class=WrapperClass, max_seq_length=max_seq_length, decoder_max_length=3,
  batch_size=val_batch_size,shuffle=False, teacher_forcing=False, predict_eos_token=False)

# No weight decay for bias and LayerNorm parameters; 0.01 for everything else.
optimizer_grouped_parameters = [
      {'params': [p for n, p in prompt_model.plm.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
      {'params': [p for n, p in prompt_model.plm.named_parameters() if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)


# Train process
fold_preds = []   # validation predictions of the best epoch
fold_labels = []  # validation labels matching fold_preds
best_f1 = 0
best_checkpoint_path = './harmp_label_checkpoint_k={}_{}.pth'.format(top_k,rand_seed)
checkpoint_saved = False

for epoch in range(epoch_num):
    prompt_model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []
    for inputs in tqdm(train_loader):
        # Use the selected device instead of an unconditional .cuda(),
        # so the loop also runs when only a CPU is available.
        inputs = inputs.to(device)
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()
        train_labels.extend(labels.cpu().tolist())
        train_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    epoch_loss = train_loss / max(len(train_loader), 1)
    # Accuracy over the whole epoch, computed once. The previous code averaged
    # running cumulative accuracies and recomputed an unused F1 every step.
    epoch_acc = accuracy_score(train_labels, train_preds)
    print('Epoch: %d, train_loss: %.5f, train_acc: %.5f'%(epoch+1,epoch_loss,epoch_acc))

    # validation process
    # The loss is averaged over the validation loader's own batch count
    # (it was previously divided by len(test_loader)).
    vloss, val_labels, val_preds = _evaluate(prompt_model, val_loader)

    # scikit-learn expects (y_true, y_pred); the swapped order exchanged
    # the reported precision and recall.
    val_acc = accuracy_score(val_labels, val_preds)
    val_pre, val_rec, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='macro')

    print('val_loss: %.5f, val_acc: %.4f, val_f1: %.4f, val_precision: %.4f, val_recall: %.4f '%(vloss,val_acc,val_f1,val_pre,val_rec))
    if val_f1 > best_f1:
        best_f1 = val_f1
        print("best_f1 updated:", best_f1)
        fold_preds = val_preds
        fold_labels = val_labels
        torch.save(prompt_model.state_dict(), best_checkpoint_path)
        checkpoint_saved = True

# Test with the weights of the best validation epoch rather than whatever the
# last epoch left behind. If no epoch improved on best_f1, nothing was saved
# and the current weights are used.
if checkpoint_saved:
    prompt_model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))

_, test_labels, test_preds = _evaluate(prompt_model, test_loader)

In [None]:
def calculate_mmae(expected, predicted, classes):
    """Return the macro-averaged mean absolute error (MMAE).

    For each class id, the absolute differences ``|expected - predicted|`` of
    the samples whose expected label is that class are averaged; the
    per-class averages are then averaged.

    Args:
        expected: sequence of integer class ids in ``range(len(classes))``.
        predicted: sequence of integer class ids, indexed in parallel with
            ``expected``.
        classes: collection of class labels; only its length is used.

    Returns:
        The MMAE as a float. Classes that never occur in ``expected`` are
        left out of the average (previously they caused a ZeroDivisionError).

    Raises:
        ValueError: if no sample of any class is present in ``expected``.
    """
    num_classes = len(classes)
    count_dict = {class_id: 0 for class_id in range(num_classes)}
    dist_dict = {class_id: 0.0 for class_id in range(num_classes)}

    for index, true_label in enumerate(expected):
        dist_dict[true_label] += abs(true_label - predicted[index])
        count_dict[true_label] += 1

    # Per-class mean error, only for classes that actually have samples.
    class_errors = [
        1.0 * dist_dict[class_id] / count_dict[class_id]
        for class_id in range(num_classes)
        if count_dict[class_id]
    ]
    if not class_errors:
        raise ValueError("calculate_mmae: 'expected' contains no samples")
    return sum(class_errors) / len(class_errors)

In [None]:
# Score the test-set predictions against the test-set labels.
# (fold_preds holds validation predictions and does not line up with test_labels.)
mmae = calculate_mmae(test_labels, test_preds, label_texts)
print("mmae:", mmae)

In [None]:
print("mmae:", mmae)
# Report on the test set: test_labels must be paired with test_preds
# (fold_preds holds validation predictions).
print("model XX \n harmp_label:\n", classification_report(test_labels, test_preds, digits=4))
