In [19]:
############## AUTORELOAD MAGIC ###################
%load_ext autoreload
%autoreload 2
###################################################

############## FUNDAMENTAL MODULES ################
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import json
from PIL import Image

 ##################################################

############## TASK-SPECIFIC MODULES ##############
sys.path.append("..")
sys.path.append("../..")
from ml.datasets import VanillaDataset
from ml.models import VanillaNN
from ml.trainer import Trainer
from ml.eval import get_output_scores
from data import utils
from analysis.utils import extract_datapoints
###################################################


####################### CONSTANTS ########################
SPLITS = ["train", "dev", "test", "gold"]
TRAIN, DEV, TEST, TXT, IMG = "train", "dev", "test", "txt", "img"
FE_METHODS = ["txt_embeddings", "img_embeddings", "concat", "sum", "mean", "hadamard", "paraphrase"]
TRAIN, DEV, TEST, GOLD = "train", "dev", "test", "gold"
#FE_METHODS += ["concat_cos", "sum_cos", "mean_cos", "hadamard_cos"]
##########################################################

############## DATA SCIENCE & ML MODULES #################
from torch.utils.data import DataLoader
import torch
from sklearn.metrics import classification_report
import pandas as pd
##########################################################

####################### SELECT ###########################
users = ["patriziopalmisano", "onurdenizguler", "jockl"]
user = users[1] # SELECT USER
version = "v2" # SELECT DATASET VERSION
dataset_version = version
##########################################################

if user in users[:2]:
    data_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english_{version}"
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive"

else:
    data_dir = f"/home/jockl/Insync/check.worthiness@gmail.com/Google Drive/data/CT23_1A_checkworthy_multimodal_english_{dataset_version}"
    cw_dir = "/home/jockl/Insync/check.worthiness@gmail.com/Google Drive"

features_dir = f"{data_dir}/features"
labels_dir = f"{data_dir}/labels"
models_dir = f"{cw_dir}/models/vanillann_hyperparam_search"
evals_dir = f"{models_dir}/gold_evals"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
dataset_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english"
raw_dataset, texts, imgs, tweet_ids, ocr_texts, tweet_concat_ocr = utils.load_data_splits_with_gold_dataset(dataset_dir, version)

Sizes of txt, img, ocr, txt+ocr arrays in train, test, dev, gold:
2356 2356 2356 2356
271 271 2356 2356
548 548 2356 2356
736 736 2356 2356


In [6]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

In [17]:
image_captions = {
    "gold": []
}

for raw_image in imgs["gold"]:
    # unconditional image captioning
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=20)
    caption= processor.decode(out
    [0], skip_special_tokens=True)
    image_captions["gold"].append(caption)

In [18]:
image_captions["gold"]

['a woman in a striped shirt is dancing',
 'energypi - energypig - energypig - energypig - energypig - energy',
 'a digital painting of a man with a beard',
 'a woman walks past a shop in the city of vienna',
 'a graph of the bitcoin market',
 'a man and woman are on a news set',
 'a large room with many blue blankets on the floor',
 'people wearing masks walk through an airport terminal',
 'a map of the united with the states highlighted in red',
 'a bar chart showing the number of extreme weather events',
 'a plane is parked at an airport at night',
 'bitcoin bitcoin bitcoin bitcoin bitcoin bitcoin bitco',
 'a blue face mask on a white background',
 'a map showing the temperature of the earth',
 "a map showing the percentage of uk's population",
 'table showing the number of people who have been diagnosed',
 "volvo's new volvo car is the first to be equipped with a new led",
 'bitcoin crypt trading platform',
 'a man is dumping a pile of palm leaves',
 'the logo for the competition c

In [25]:
captions_file = "/Users/onurdenizguler/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english_test_gold/captions_gold.txt"
captions = []
with open(captions_file, "r") as f:
    for line in f:
        captions.append("".join(line.split(',')[:-1]))

In [197]:
prompt_template = """
You are a social media rules and regulations editor whose job is to read tweets and look at captions that describe the accompanying image to understand whether the tweet requires any fact checking. If fact checking is required, you print out “final_decision” = ;;;Yes;;;, if it is not required, you print out “final_decision” =  ;;;No;;;. 

Your guidelines:

“step_1”: First look at the “text” of a tweet. It contains a verifiable claim if and only if: it states a definition OR mentions a quantity in the present or in the past OR makes a verifiable prediction about the future OR references laws, procedures, and rules of operation OR states correlation or causation.

“step_2”: If the tweet contains a verifiable claim, then you check whether it requires fact checking. The claim requires fact checking if and only if: it appears to be false OR is of public interest or appears to be harmful: can negatively affect society as a whole, but also specific persons, companies, products.
If both “step_1” and “step_2” hold, then you label the tweet as fact checkable. So you say “final_decision” = ;;;Yes;;;. If not, you then look at the image caption and the OCR text:

“step_3”: Look at the image. Does the image contribute to establish if a tweet has a factually-verifiable claim? It does if and only if: there exists a piece of evidence (e.g. an event, action, situation or a person’s identity) OR illustration of certain aspects in the claim text OR the OCR text contains a claim. If the OCR text contains a claim you apply “step_1” and “step_2” on it to finalize your decision.

your_current_tweet = [
“text”: ‘{text}’,
“image_caption”: {caption},
"ocr_text": ‘{ocr_text}’,
]

“step_1”: You look at the text of the tweet. Does it contain a verifiable claim?
“step_2”: If it contains a verifiable claim, does it require fact checking?
“step_3”: Do the image caption or the OCR make you think that this tweet requires fact checking?
“final_decision”=
"""


In [205]:
import re
gold_prompts = []
for text, ocr_text, caption in zip(texts["gold"], ocr_texts["gold"], captions):
    ocr_text = re.sub('\n', '', ocr_text)
    ocr_text = re.sub('"', '', ocr_text)
    ocr_text = re.sub("'", '', ocr_text)
    text = re.sub('\n', '', text)
    text = re.sub('"', '', text)
    text = re.sub("'", '', text)
    gold_prompts.append(prompt_template.format(text=text, ocr_text = ocr_text, caption=caption))


prompts_gold_file = "prompts_gold.txt"
for idx, prompt in enumerate(gold_prompts):
    filename = "prompts_gold/" + str(idx) + ".txt"
    with open(filename, "w") as f:
        f.write(prompt)

In [218]:
import openai
openai.api_key = "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"

full_responses = []
responses = []
last_processed_prompt = -1

In [246]:
prompt

'\nYou are a social media rules and regulations editor whose job is to read tweets and look at captions that describe the accompanying image to understand whether the tweet requires any fact checking. If fact checking is required, you print out “final_decision” = ;;;Yes;;;, if it is not required, you print out “final_decision” =  ;;;No;;;. \n\nYour guidelines:\n\n“step_1”: First look at the “text” of a tweet. It contains a verifiable claim if and only if: it states a definition OR mentions a quantity in the present or in the past OR makes a verifiable prediction about the future OR references laws, procedures, and rules of operation OR states correlation or causation.\n\n“step_2”: If the tweet contains a verifiable claim, then you check whether it requires fact checking. The claim requires fact checking if and only if: it appears to be false OR is of public interest or appears to be harmful: can negatively affect society as a whole, but also specific persons, companies, products.\nIf b

In [265]:
with open("prompts_gold/276.txt", "r") as f:
    gold_prompts[713] = "".join(f.readlines())

In [263]:
gold_prompts.index(prompt)

713

In [266]:
counter = 0
for prompt in gold_prompts[last_processed_prompt+1:]:
  counter += 1
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
          #{"role": "system", "content": "You are a social media rules and regulations editor."},
          {"role": "user", "content": prompt},
      ]
  )
  full_responses.append(response)
  responses.append(response['choices'][0]['message']["content"])
  if gold_prompts.index(prompt)%10 == 0:
    print(gold_prompts.index(prompt))
  last_processed_prompt = gold_prompts.index(prompt)
  if counter ==200:
    break

720
730


In [259]:
responses[580]

'“step_1”: The text of the tweet contains several verifiable claims: \n1. “£1m of cocaine that was hidden in a box of face masks was seized by experienced officers at the Channel Tunnel yesterday.”\n2. “Criminal networks are trying to exploit the #coronavirus outbreak for their own benefit.”\n3. “Border Force and @NCA_UK are working together to stop them.”\n\n“step_2”: These verifiable claims may require fact checking as they involve illegal activities and the actions taken by law enforcement agencies. Additionally, the claim about criminal networks exploiting the coronavirus outbreak is of public interest.\n\n“step_3”: The image caption, "a pile of boxes with a cell in it," does not directly contribute to establishing the veracity of the verifiable claims.\n\n“final_decision” = ;;;Yes;;;'

In [267]:
import random
import numpy as np
preds = []
for response in responses:
    if "no" in response.lower().split(";;;"):
        preds.append(0)
    elif "yes" in response.lower().split(";;;"):
        preds.append(1)
    else:
        preds.append(1) if "yes" in response.lower() else preds.append(0) if "no" in response.lower() else preds.append(random.choice([0, 1]))

preds = np.array(preds)

focus_labels = split_to_labels["gold"][:len(preds)]

arg_pos = np.argwhere(focus_labels == 1).flatten()
arg_neg = np.argwhere(focus_labels == 0).flatten()
TP = sum(preds[arg_pos] == 1)
FP = sum(preds[arg_neg] == 1)
FN = sum(preds[arg_pos] == 0)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1 = 2*(precision*recall)/(precision+recall)
print(precision, recall, f1)

0.75 0.5740072202166066 0.6503067484662576


In [300]:
with open("explains_gold.pickle", 'rb') as handle:
    mypreds = pickle.load(handle)

In [302]:
import pickle
with open("captions_gold.pickle", 'wb') as handle:
    pickle.dump(captions, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [283]:
import random
import numpy as np
preds = []
explains = [] 
for response in responses:
    if "no" in response.lower().split(";;;"):
        preds.append(0)
    elif "yes" in response.lower().split(";;;"):
        preds.append(1)
    else:
        preds.append(1) if "yes" in response.lower() else preds.append(0) if "no" in response.lower() else preds.append(random.choice([0, 1]))
    explains.append(response.split("final_decision")[0])
preds = np.array(preds)

focus_labels = split_to_labels["gold"][:len(preds)]

arg_pos = np.argwhere(focus_labels == 1).flatten()
arg_neg = np.argwhere(focus_labels == 0).flatten()
TP = sum(preds[arg_neg] == 0)
FP = sum(preds[arg_pos] == 0)
FN = sum(preds[arg_neg] == 1)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1 = 2*(precision*recall)/(precision+recall)
print(precision, recall, f1)

0.7748091603053435 0.8845315904139434 0.8260427263479145


In [268]:
len(preds)

736

In [131]:
if user in users[:2]:
    data_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive/data/CT23_1A_checkworthy_multimodal_english_{version}"
    cw_dir = f"/Users/{user}/Library/CloudStorage/GoogleDrive-check.worthiness@gmail.com/My Drive"

else:
    data_dir = f"/home/jockl/Insync/check.worthiness@gmail.com/Google Drive/data/CT23_1A_checkworthy_multimodal_english_{dataset_version}"
    cw_dir = "/home/jockl/Insync/check.worthiness@gmail.com/Google Drive"

features_dir = f"{data_dir}/features"
labels_dir = f"{data_dir}/labels"
models_dir = f"{cw_dir}/models/vanillann_hyperparam_search"
split_to_labels = {split: 
                   np.load(f"{labels_dir}/{split}_labels_{dataset_version}.pickle", allow_pickle=True) if split != "gold" else np.load(f"{labels_dir}/{split}_labels.pickle", allow_pickle=True)
                   for split in SPLITS}

In [185]:
responses[210]

'1) The text of the tweet does not contain a verifiable claim. It includes a quote from @novogratz but does not make any specific verifiable statements.\n2) Since there is no verifiable claim in the tweet, fact checking is not required.\n3) The image caption and the OCR text do not provide any additional information that suggests fact checking is needed.\nFinal Decision: ;;;No;;;'

In [179]:
for idx, prompt in enumerate(gold_prompts[210:]):
  if idx%20 == 0:
    print(idx)
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
          #{"role": "system", "content": "You are a social media rules and regulations editor."},
          {"role": "user", "content": prompt},
      ]
  )
  full_responses.append(response)
  responses.append(response['choices'][0]['message']["content"])

0
20
40
60


InvalidRequestError: This model's maximum context length is 4097 tokens. However, your messages resulted in 5039 tokens. Please reduce the length of the messages.