In [20]:
import os
import gc
import random
import pandas as pd
import pickle
import torch
import numpy as np
import json
import math
import argparse
from collections import defaultdict, Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.utils.data.dataset import Subset
import torch.nn.functional as F
from torcheval.metrics.functional import multiclass_f1_score
import logging
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
import wandb
import matplotlib.pyplot as plt
import time

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
random.seed(420)

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [4]:
def get_memory_usage(idx=""):
    """Print the free and total memory of the current CUDA device in decimal GB.

    Args:
        idx: Optional label printed in front of the numbers (e.g. a step name).

    Returns:
        None. The figures are only printed.

    Notes:
        ``torch.cuda.mem_get_info()`` returns ``(free, total)``, so the second
        number is labelled "Total". When CUDA is unavailable a short notice is
        printed instead of raising, matching the CPU fallback used for ``device``.
    """
    if not torch.cuda.is_available():
        print(f"{idx} CUDA is not available; no GPU memory to report")
        return
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"{idx} Free:{free_bytes/1000000000:.3f}GB\tTotal:{total_bytes/1000000000:.3f}GB")
get_memory_usage()

 Free:47.445GB	Available:47.726GB


## Loading Model

In [None]:
def initialize_trained_model(modelname, bool_tokenizer=True, bool_model=True):
    """Load the tokenizer and/or the fine-tuned sequence classifier.

    Args:
        modelname: Name or path handed to ``AutoTokenizer.from_pretrained``.
        bool_tokenizer: When False, no tokenizer is loaded and None is returned
            in its place.
        bool_model: When False, no model is loaded and None is returned in its
            place.

    Returns:
        ``(model, tokenizer)``; either element is None when its flag is False.

    Notes:
        Reads the notebook-level names ``MAX_LEN``, ``PATH_MODEL_SAVE`` and
        ``device``. The classifier weights are loaded from ``PATH_MODEL_SAVE``,
        not from ``modelname``.
    """
    hf_cache_dir = "/gscratch/argon/hjung10/transformers"

    tokenizer = None
    if bool_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(
            modelname,
            use_fast=False,
            truncation=True,
            max_len=MAX_LEN,
            padding='max_length',
            cache_dir=hf_cache_dir,
        )

    model = None
    if bool_model:
        classifier = AutoModelForSequenceClassification.from_pretrained(
            PATH_MODEL_SAVE,
            num_labels=3,
            cache_dir=hf_cache_dir,
        )
        model = classifier.to(device)

    # Fall back to the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

In [27]:
# Which myth's fine-tuned checkpoint to load; the path is derived from the myth id.
MYTH = 'M3'
PATH_MODEL_SAVE = f'/mmfs1/home/hjung10/models/deberta-v3-base-oud-{MYTH}'

# Tokenizer source and the sequence length used for padding/truncation.
MODEL_NAME = "microsoft/deberta-v3-base"
MAX_LEN = 1024

model, tokenizer = initialize_trained_model(MODEL_NAME, bool_tokenizer=True, bool_model=True)

## Loading Data

In [15]:
def load_training_data(TRAINING_DIR, drop_columns):
    """Read a video CSV, keep one row per ``video_id`` and drop unwanted columns.

    Args:
        TRAINING_DIR: Path of the CSV file passed to ``pd.read_csv``.
        drop_columns: Column labels to remove from the frame.

    Returns:
        The cleaned DataFrame. ``reset_index()`` is called without
        ``drop=True``, so the pre-reset row labels survive as an ``index`` column.
    """
    deduplicated = pd.read_csv(TRAINING_DIR).drop_duplicates(subset='video_id')
    df_data = deduplicated.drop(columns=drop_columns).reset_index()
    print(f"Shape of dataframe: {df_data.shape}")
    return df_data

def load_data_for_labeling(TRAINING_DIR, drop_columns, exclude_video_id):
    """Read a video CSV for labeling, leaving out videos that must be excluded.

    Args:
        TRAINING_DIR: Path of the CSV file passed to ``pd.read_csv``.
        drop_columns: Column labels to remove from the frame.
        exclude_video_id: Collection of ``video_id`` values whose rows are dropped.

    Returns:
        The filtered DataFrame. The index is reset (keeping the old labels in
        an ``index`` column) *before* the exclusion filter, so the returned
        frame's index may contain gaps.
    """
    deduplicated = pd.read_csv(TRAINING_DIR).drop_duplicates(subset='video_id')
    df_data = deduplicated.drop(columns=drop_columns).reset_index()
    print(f"Shape of dataframe before excluding: {df_data.shape}")

    is_excluded = df_data['video_id'].isin(exclude_video_id)
    df_data = df_data[~is_excluded]
    print(f"Shape of dataframe after excluding: {df_data.shape}")
    return df_data

def _text_or_empty(value):
    """Return ``value`` as a string, or "" when it is missing (None or NaN)."""
    # NaN is the only value that compares unequal to itself.
    if value is None or value != value:
        return ""
    return value if isinstance(value, str) else str(value)


def process_input_text(row):
    """Build the model's input text from one video's metadata.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``video_title``, ``video_description``, ``transcript`` and ``tags``.

    Returns:
        A single string with the title, description, transcript and tags under
        fixed headers, or None when the title is missing or empty. Missing
        fields (None or NaN) become empty strings, non-string values are
        converted with ``str``, and a transcript containing the
        "Could not retrieve a transcript for" marker is treated as empty.
    """
    title = _text_or_empty(row['video_title'])
    if title == "":
        return None
    description = _text_or_empty(row['video_description'])

    transcript = _text_or_empty(row['transcript'])
    if "Could not retrieve a transcript for" in transcript:
        transcript = ""

    tags = _text_or_empty(row['tags'])
    input_text = 'VIDEO TITLE: ' + title + '\nVIDEO DESCRIPTION: ' + description + '\nVIDEO TRANSCRIPT: ' + transcript + '\nVIDEO TAGS: ' + tags
    return input_text

In [17]:
### Labels are incremented by 1 because prediction indices start from 0:
### 0 -> opposing
### 1 -> neutral
### 2 -> supporting
def process_data_for_prediction(data):
    """Tokenize every usable row of ``data`` for inference.

    Args:
        data: DataFrame whose rows provide ``video_id`` plus the fields read by
            ``process_input_text``.

    Returns:
        ``(processed, url_not_accessible)`` where ``processed`` is a list of
        ``[features, video_id]`` pairs (``features`` is a dict of tensors from
        the notebook-level ``tokenizer``) and ``url_not_accessible`` lists the
        ``video_id`` of every row for which no input text could be built.
    """
    processed = []
    url_not_accessible = []
    for i, row in data.iterrows():
        # Lightweight progress trace keyed on the DataFrame index label.
        if i % 200 == 1:
            print(i)

        text = process_input_text(row)
        if text is None:   # deleted video
            url_not_accessible.append(row['video_id'])
            continue
        video_id = row['video_id']

        encoded = tokenizer(text, max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt')
        # token_type_ids is only kept when the tokenizer actually returns it.
        feature_names = ['input_ids', 'attention_mask']
        if 'token_type_ids' in encoded:
            feature_names.insert(1, 'token_type_ids')
        # [0] selects the first row of each returned tensor (one text was encoded).
        features = {name: encoded[name][0] for name in feature_names}

        processed.append([features, video_id])
    return processed, url_not_accessible

def save_list_to_json(data, filename):
    """Save the third field of every example in ``data`` as a JSON list.

    Args:
        data: Iterable of indexable examples; ``example[2]`` of each one is
            collected, in order.
        filename: Destination path; an existing file is overwritten.
    """
    # Iterate the argument itself (previously an unrelated global was read here).
    list_example = [example[2] for example in data]

    with open(filename, 'w') as file:
        json.dump(list_example, file, indent=4)

def read_list_from_json(filename):
    """Load and return the JSON content stored in ``filename``."""
    with open(filename, 'r') as handle:
        contents = json.load(handle)
    return contents

## Reading, processing, and creating dataloader
- Note: the URLs of 274 of the 164,085 unique videos were not accessible; these videos were automatically assigned 0 (neutral), following past work

In [18]:
class CustomData(Dataset):
    """Thin ``Dataset`` wrapper that serves items straight from an indexable collection."""

    def __init__(self, data):
        # Only a reference is stored; nothing is copied or transformed.
        self.data = data

    def __len__(self):
        """Number of stored items."""
        n_items = len(self.data)
        return n_items

    def __getitem__(self, idx):
        """Item stored at position ``idx``."""
        item = self.data[idx]
        return item

In [11]:
# Batch size used by the DataLoader that feeds the model at prediction time
VALID_BATCH_SIZE = 8

In [12]:
# Search results that were already annotated; their video ids are collected for exclusion.
columns_to_drop = ['Unnamed: 0.1', 'Unnamed: 0']
SEARCH_DATA_DIR = os.sep.join([os.getcwd(), 'training_data', 'final-search-results-annotations.csv'])
df_data = load_training_data(SEARCH_DATA_DIR, columns_to_drop)
video_id_exclude = set(df_data['video_id'].tolist())

Shape of dataframe: (1776, 27)


In [13]:
# Recommendation metadata to label, minus every video already labeled in the search results.
columns_to_drop = ['Unnamed: 0.1', 'Unnamed: 0']
TRAINING_DATA_DIR = os.sep.join([os.getcwd(), 'recommendation-data', 'recommendation-video-metadata.csv'])
df_data = load_data_for_labeling(TRAINING_DATA_DIR, columns_to_drop, video_id_exclude)
print(df_data.shape)

Shape of dataframe before excluding: (164085, 16)
Shape of dataframe after excluding: (163744, 16)
(163744, 16)


In [19]:
# Tokenize every row, then wrap the result in a DataLoader for batched inference.
data_for_prediction, url_not_accessible = process_data_for_prediction(df_data)
print("Number of url not accessible: " + str(len(url_not_accessible)))
# shuffle=False: this loader is only used for prediction, so batches are served in
# dataset order and the saved predictions come out in a reproducible order.
dataloader_pred = DataLoader(CustomData(data_for_prediction), batch_size=VALID_BATCH_SIZE, shuffle=False)

1
201
401
601
801
1001
1201
1401
1601
1801
2001
2201
2401
2601
2801
3001
3201
3401
3601
3801
4001
4201
4401
4601
4801
5001
5201
5401
5601
5801
6001
6201
6401
6601
6801
7001
7201
7401
7601
7801
8001
8201
8401
8601
8801
9001
9201
9401
9601
9801
10001
10201
10401
10601
10801
11001
11201
11401
11601
11801
12001
12201
12401
12601
12801
13001
13201
13401
13601
13801
14001
14201
14401
14601
14801
15001
15201
15401
15601
15801
16001
16201
16401
16601
16801
17001
17201
17401
17601
17801
18001
18201
18401
18601
18801
19001
19201
19401
19601
19801
20001
20201
20401
20601
20801
21001
21201
21401
21601
21801
22001
22201
22401
22601
22801
23001
23201
23401
23601
23801
24001
24201
24401
24601
24801
25001
25201
25401
25601
25801
26001
26201
26401
26601
26801
27001
27201
27401
27601
27801
28001
28201
28401
28601
28801
29001
29201
29401
29601
29801
30001
30201
30601
30801
31001
31201
31401
31601
31801
32001
32201
32401
32601
32801
33001
33201
33401
33601
33801
34001
34201
34401
34601
34801
35001
35201
3

## Labeling Recommendation Data

In [28]:
# Run the fine-tuned classifier over every batch of the provided dataloader
def predict(dataloder):
    """Classify every example served by ``dataloder`` with the notebook-level ``model``.

    Args:
        dataloder: Iterable of ``(inputs, video_ids)`` batches, where ``inputs``
            is a dict of tensors passed to the model as keyword arguments and
            ``video_ids`` holds one id per example in the batch.

    Returns:
        A list of ``(video_id, predicted_class_index, confidence)`` tuples in
        the order the batches were served; ``confidence`` is the maximum
        softmax probability of the example.

    Notes:
        Reads the notebook-level names ``model`` and ``device``. Every 100
        batches a short progress trace is printed (batch number, one earlier
        prediction and the running count of predicted classes).
    """
    model.eval()
    model_prediction = []

    raw_predictions = []
    with torch.no_grad():
        for batch_idx, (inputs, video_ids) in enumerate(dataloder):
            if batch_idx % 100 == 1:
                print(batch_idx)
                print(model_prediction[batch_idx-1])
                print(Counter(raw_predictions))
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=1)
            max_probs, pred_idx = probs.max(dim=1)

            # Convert each result tensor to Python values once per batch instead
            # of calling .item() on the device tensor for every example.
            batch_labels = pred_idx.tolist()
            batch_confidences = max_probs.tolist()
            raw_predictions.extend(batch_labels)
            model_prediction.extend(zip(video_ids, batch_labels, batch_confidences))
    return model_prediction

In [30]:
# Display the current checkpoint path (the value initialize_trained_model reads when loading the model)
PATH_MODEL_SAVE

'/mmfs1/home/hjung10/models/deberta-v3-base-oud-M3'

In [None]:
# Time the full inference pass over the recommendation data and report it.
start = time.perf_counter()
model_prediction = predict(dataloader_pred)
end = time.perf_counter()
print(f"Prediction took {end - start:.1f}s for {len(model_prediction)} videos")

In [None]:
# Persist the model predictions; they are later used to decide which videos to propagate up to GPT-4o.
predictions_path = os.sep.join([os.getcwd(), 'recommendation-data', MYTH + '-deberta-predictions.json'])
with open(predictions_path, 'w') as out_file:
    json.dump(model_prediction, out_file)

## Parse the DeBERTa predictions & determine which videos to propagate to the LLM

In [36]:
def read_predictions_json(filepath):
    """Load saved predictions and turn each entry into a labelled dict.

    Args:
        filepath: Path of a JSON file holding a list of
            ``[video_id, label, confidence]`` entries.

    Returns:
        A list of dicts with the keys ``video_id``, ``label`` and
        ``confidence``, in file order.
    """
    with open(filepath, 'r') as handle:
        raw_entries = json.load(handle)

    parsed_data = []
    for entry in raw_entries:
        parsed_data.append({"video_id": entry[0], "label": entry[1], "confidence": entry[2]})
    return parsed_data

In [37]:
# Select the predictions that should be handed over ("cascaded") to the LLM
def return_cascaded_videos(data, class_to_filter, threshold):
    """Return the ids of videos whose prediction must be deferred to the LLM.

    Args:
        data: Iterable of dicts with the keys ``video_id``, ``label`` and
            ``confidence``.
        class_to_filter: Labels that are always deferred, whatever the confidence.
        threshold: Predictions with a confidence strictly below this value are
            deferred as well.

    Returns:
        The ``video_id`` values of the deferred predictions, in input order.
    """
    video_id_cascaded = []
    for record in data:
        video_id = record['video_id']
        label = record['label']
        confidence = float(record['confidence'])

        always_deferred = label in class_to_filter
        confident_enough = not (confidence < threshold)
        if confident_enough and not always_deferred:
            continue
        video_id_cascaded.append(video_id)
    return video_id_cascaded

In [None]:
# Directory (with trailing separator) that holds the per-myth prediction files
PREDICTION_DATA_DIR = os.sep.join([os.getcwd(), 'recommendation-data', ''])

# Maximum Softmax Probability (MSP) threshold for each myth
MYTH_TO_THRESHOLD = dict(
    M1=0.99,
    M2=0.76,
    M3=0.99,
    M4=0.55,
    M5=0.78,
    M6=0.95,
    M7=0.99,
    M8=0.48,
)

# Validation Error Tendencies (VET): predicted classes that are deferred to GPT-4o
class_to_filter = [0, 2]

In [None]:
# For every myth, collect the video ids whose DeBERTa prediction is deferred to the LLM.
MYTH_TO_ID_LIST = defaultdict(list)
total_videos = 0
# The loop variable is lowercase on purpose: looping with the name MYTH would
# overwrite the notebook-level MYTH constant that selects the checkpoint path
# and the predictions filename.
for myth, threshold in MYTH_TO_THRESHOLD.items():
    # reading through the predictions
    model_predictions = read_predictions_json(PREDICTION_DATA_DIR + myth + '-deberta-predictions.json')
    video_id_for_cascading = return_cascaded_videos(model_predictions, class_to_filter, threshold)
    MYTH_TO_ID_LIST[myth] = video_id_for_cascading
    total_videos += len(video_id_for_cascading)
    print(myth + " list: " + str(len(video_id_for_cascading)))

print(total_videos)

In [47]:
# Write the per-myth lists of video ids selected for the LLM to disk.
filename = 'myth_to_video_id_for_llm-redo.json'
with open(filename, mode='w', encoding='utf-8') as out_file:
    json.dump(MYTH_TO_ID_LIST, out_file, indent=4, ensure_ascii=False)
