# POC Model

This model only uses captured screenshots + text to detect if the video is a specific person (Joe Biden, Trump, etc)


## Data Import

In [None]:
!pip install torch

In [None]:
!pip install evaluate

In [None]:
!pip install transformers

In [None]:
!pip install librosa

In [None]:
!pip install accelerate -U

In [None]:
!gdown "https://drive.google.com/uc?id=1JZjiwnL6xIWJy6O__6tLROm_7RhCROMp"

Downloading...
From: https://drive.google.com/uc?id=1JZjiwnL6xIWJy6O__6tLROm_7RhCROMp
To: /content/presidential-deepfake.zip
100% 25.3M/25.3M [00:00<00:00, 29.8MB/s]


In [None]:
!unzip "presidential-deepfake.zip" -d .

In [None]:
# imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import os
import torch
import librosa
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from transformers import BertModel, BertTokenizer, AutoTokenizer
from transformers import Trainer, TrainingArguments, AdamW
from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoImageProcessor, ResNetForImageClassification, AutoProcessor
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [None]:
data_dir_real = "/content/presidential-deepfake/real"
data_dir_fake = "/content/presidential-deepfake/fake"

## Pre-processing

In [None]:
# create dataframe containing text, and screenshot images for each data sample
is_trump = []
is_not_trump = []

is_biden = []
is_not_biden = []

# real data
presidents = ['biden', 'trump']
for name in presidents:
  for i in range(1,16):
    text_file = name + '-real-' + str(i) + '.txt'
    text_path = os.path.join(data_dir_real, 'text-files')
    screenshot_folder_name = name + '-real-' + str(i)
    screenshot_folder_path = os.path.join(data_dir_real, 'screenshots', screenshot_folder_name)
    image_files = os.listdir(screenshot_folder_path)
    sample = {'text': text_file, 'images': image_files, 'text_path': text_path, 'image_path': screenshot_folder_path}
    if name == "biden":
      is_biden.append(sample)
      is_not_trump.append(sample)
    else:
      is_trump.append(sample)
      is_not_biden.append(sample)

# deepfake data
for name in presidents:
  for i in range(1,9):
    text_file = name + '-fake-' + str(i) + '.txt'
    text_path = os.path.join(data_dir_fake, 'text-files')
    screenshot_folder_name = name + '-fake-' + str(i)
    screenshot_folder_path = os.path.join(data_dir_fake, 'screenshots', screenshot_folder_name)
    image_files = os.listdir(screenshot_folder_path)
    sample = {'text': text_file, 'images': image_files, 'text_path': text_path, 'image_path': screenshot_folder_path}
    if name == "biden":
      # add to not trump and biden
      is_not_biden.append(sample)
    else:
      is_not_trump.append(sample)

In [None]:
import pandas as pd

# Create dataframes for Biden and Trump
# Concatenate 'is_biden' and 'is_not_biden' for the biden_df
biden_samples = is_biden + is_not_biden
biden_labels = [1] * len(is_biden) + [0] * len(is_not_biden)

biden_df = pd.DataFrame(biden_samples)
biden_df['label'] = biden_labels  # Add the label column

trump_samples = is_trump + is_not_trump
trump_labels = [1] * len(is_trump) + [0] * len(is_not_trump)

trump_df = pd.DataFrame(trump_samples)
trump_df['label'] = trump_labels  # Add the label column

In [None]:
biden_df

Unnamed: 0,text,images,text_path,image_path,label
0,biden-real-1.txt,"[biden-real-1-044.jpg, biden-real-1-012.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
1,biden-real-2.txt,"[biden-real-2-016.jpg, biden-real-2-042.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
2,biden-real-3.txt,"[biden-real-3-041.jpg, biden-real-3-006.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
3,biden-real-4.txt,"[biden-real-4-051.jpg, biden-real-4-003.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
4,biden-real-5.txt,"[biden-real-5-047.jpg, biden-real-5-055.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
5,biden-real-6.txt,"[biden-real-6-044.jpg, biden-real-6-009.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
6,biden-real-7.txt,"[biden-real-7-025.jpg, biden-real-7-015.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
7,biden-real-8.txt,"[biden-real-8-032.jpg, biden-real-8-006.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
8,biden-real-9.txt,"[biden-real-9-061.jpg, biden-real-9-011.jpg, b...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
9,biden-real-10.txt,"[biden-real-10-055.jpg, biden-real-10-001.jpg,...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1


In [None]:
trump_df

Unnamed: 0,text,images,text_path,image_path,label
0,trump-real-1.txt,"[trump-real-1-043.jpg, trump-real-1-047.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
1,trump-real-2.txt,"[trump-real-2-008.jpg, trump-real-2-027.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
2,trump-real-3.txt,"[trump-real-3-045.jpg, trump-real-3-005.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
3,trump-real-4.txt,"[trump-real-4-043.jpg, trump-real-4-005.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
4,trump-real-5.txt,"[trump-real-5-012.jpg, trump-real-5-054.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
5,trump-real-6.txt,"[trump-real-6-009.jpg, trump-real-6-004.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
6,trump-real-7.txt,"[trump-real-7-044.jpg, trump-real-7-034.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
7,trump-real-8.txt,"[trump-real-8-003.jpg, trump-real-8-043.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
8,trump-real-9.txt,"[trump-real-9-055.jpg, trump-real-9-056.jpg, t...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1
9,trump-real-10.txt,"[trump-real-10-014.jpg, trump-real-10-025.jpg,...",/content/presidential-deepfake/real/text-files,/content/presidential-deepfake/real/screenshot...,1


In [None]:
# count how many real and fake screenshots are there
biden_label_count = biden_df["label"].value_counts()
biden_label_count

label
0    23
1    15
Name: count, dtype: int64

In [None]:
# count how many real and fake screenshots are there
trump_label_count = trump_df["label"].value_counts()
trump_label_count

label
0    23
1    15
Name: count, dtype: int64

Note how this is a more balanced dataset.

In [None]:
# create training and testing dataframes
biden_train_df, biden_test_df = train_test_split(biden_df, test_size=0.2, random_state=42)
trump_train_df, trump_test_df = train_test_split(trump_df, test_size=0.2, random_state=42)

In [None]:
# "unpack" list of images onto inidivdual rows themselves (we needed to collate them before to prevent cross-contamination of test / train datasets)
def unpack_dataframe(df):
    df_expanded = df.explode('images')

    # Creating full paths
    df_expanded['full_image_path'] = df_expanded['image_path'] + "/" + df_expanded['images']
    df_expanded['full_text_path'] = df_expanded['text_path'] + "/" + df_expanded['text']
    df_expanded.reset_index(drop=True, inplace=True)

    # Final DataFrame with desired columns
    new_df = df_expanded[['full_text_path', 'full_image_path', 'label']]
    return new_df

In [None]:
biden_train, biden_test = unpack_dataframe(biden_train_df), unpack_dataframe(biden_test_df)
trump_train, trump_test = unpack_dataframe(trump_train_df), unpack_dataframe(trump_test_df)

In [None]:
biden_train

Unnamed: 0,full_text_path,full_image_path,label
0,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
1,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
2,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
3,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
4,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
...,...,...,...
1721,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
1722,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
1723,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0
1724,/content/presidential-deepfake/real/text-files...,/content/presidential-deepfake/real/screenshot...,0


In [None]:
dataset_size = {
    "train": len(biden_train),
    "test": len(biden_test)
}
dataset_size

{'train': 1726, 'test': 414}

In [None]:
label_count = biden_train["label"].value_counts()
label_count

label
0    1007
1     719
Name: count, dtype: int64

Compared to the previous baseline model, there is now a signficant balance between two labels.

In [None]:
# custom dataset
class CustomDataset(Dataset):
  def __init__(self, data_df):
    self.data_df = data_df
    self.image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

  def __len__(self):
    return len(self.data_df)

  def __getitem__(self, idx):
    text_path = self.data_df.loc[idx, 'full_text_path']
    image_path = self.data_df.loc[idx, 'full_image_path']
    label = self.data_df.loc[idx, 'label']

    with open(text_path, 'r') as file:
        opened_text = file.read()

    image = Image.open(image_path).convert("RGB")
    processed_image = self.image_processor(images=image, return_tensors="pt")
    processed_image = processed_image["pixel_values"].squeeze()

    item = {
        "text": opened_text,
        "image": processed_image,
        "label": label
    }

    return item

In [None]:
# create training and testing datasets using custom dataset class
biden_train_dataset = CustomDataset(biden_train)
biden_test_dataset = CustomDataset(biden_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

In [None]:
trump_train_dataset = CustomDataset(trump_train)
trump_test_dataset = CustomDataset(trump_test)

In [None]:
# sanity check
batch = next(iter(biden_train_dataset))

In [None]:
batch['image'].shape

torch.Size([3, 224, 224])

## Create Text and Image Embedding

In [None]:
text_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
image_tokenizer = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# custom collate function
def collate_fn(batch):
    images = torch.stack([sample["image"] for sample in batch])
    # image_tokens = image_tokenizer(images, return_tensors="pt")

    texts = [sample["text"] for sample in batch]
    text_tokens = text_tokenizer(texts, padding="longest", return_tensors="pt")

    label = torch.LongTensor([sample["label"] for sample in batch])
    return {
        "pixel_values": images,
        "input_ids": text_tokens["input_ids"],
        "attention_mask": text_tokens["attention_mask"],
        "labels": label
    }

In [None]:
for k, v in collate_fn([batch]).items():
  print(k)
  print(v.shape)

pixel_values
torch.Size([1, 3, 224, 224])
input_ids
torch.Size([1, 173])
attention_mask
torch.Size([1, 173])
labels
torch.Size([1])


In [None]:
def get_image_encoder():
    image_encoder = ResNetForImageClassification.from_pretrained("microsoft/resnet-50").to(device)

    # # freeze everything
    for param in image_encoder.parameters():
        param.requires_grad = False
    image_encoder.classifier = nn.Identity()
    image_encoder.resnet.pooler = torch.nn.Identity()

    return image_encoder

In [None]:
def image_encode(image_tokens, image_encoder):
    with torch.no_grad():
        logits = image_encoder(**image_tokens).logits
    # return logits
    return torch.transpose(torch.flatten(logits, start_dim=2), 1 ,2)

In [None]:
def get_text_encoder():
    encoder = BertModel.from_pretrained('bert-base-uncased').to(device)
    # # freeze everything
    for param in encoder.parameters():
        param.requires_grad = False
    return encoder

In [None]:
def text_encode(text_tokens, text_encoder):
    with torch.no_grad():
        outputs = text_encoder(**text_tokens)
    return outputs.last_hidden_state

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# metadata
num_epochs = 10
lr = 3e-4
batch_size = 8

In [None]:
text_embedding_size = 768
image_embedding_size = 2048
num_classes = 2
max_tokens = 512

class BaselineModel(nn.Module):
    def __init__(self, num_labels=2, print_dim=False, hidden_dim=50, num_heads=2, num_layers=2, dropout=0.1):
        super().__init__()
        self.print_dim = print_dim
        self.text_encoder = get_text_encoder()
        self.image_encoder = get_image_encoder()
        self.hidden_dim = hidden_dim

        self.modality_embeddings = nn.Embedding(num_labels, hidden_dim)
        self.position_embeddings = nn.Embedding(max_tokens, hidden_dim)

        self.image_to_hidden_dim_embedding = nn.Linear(image_embedding_size, hidden_dim)
        self.text_to_hidden_dim_embedding = nn.Linear(text_embedding_size, hidden_dim)

        # Transformer layers
        encoder_layer = TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=dropout)
        self.transformer = TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_head = nn.Linear(hidden_dim, num_labels)  # For classification

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        if self.print_dim:
            print("Pixel_values: " + str(pixel_values.shape))
            print("input_ids: " + str(input_ids.shape))
            print("attention_mask: " + str(attention_mask.shape))

        text_tokens = {
            "input_ids": input_ids.to(device),
            "attention_mask": attention_mask.to(device)
        }
        text_embedding = text_encode(text_tokens, self.text_encoder) # 8 x T x 768
        text_embedding_resize = self.text_to_hidden_dim_embedding(text_embedding)

        image_tokens = {
            "pixel_values": pixel_values.to(device)
        }
        image_embedding = image_encode(image_tokens, self.image_encoder) # 8 x 49 x 2048
        image_embedding_resize = self.image_to_hidden_dim_embedding(image_embedding)

        fusion_embedding = torch.cat((text_embedding_resize, image_embedding_resize), dim=1)

        # modality embedding
        text_modality_embedding = torch.zeros(input_ids.size()).long().to(device) # 8 x T
        image_modality_embedding = torch.ones(image_embedding_resize.size()[:-1]).long().to(device) # 8 x 49
        fusion_modality_embedding = torch.cat((text_modality_embedding, image_modality_embedding), dim=1).to(device) # 8 x (T + 49)
        fusion_modality_embedding = self.modality_embeddings(fusion_modality_embedding).to(device) # 8 x (T + 49) x 768
        if self.print_dim:
            print("Modality Embedding size: " + str(fusion_modality_embedding.shape))

        # positional embedding
        text_position_embedding = self.get_positional_embedding(tuple(input_ids.size())).to(device) # B x T

        image_position_embedding = self.get_positional_embedding(tuple(image_embedding_resize.size()[:-1])).to(device) # B x 49

        fusion_positional_embedding = torch.cat((text_position_embedding, image_position_embedding), dim=1).to(device) # 8 x (T + 49)
        fusion_positional_embedding = self.position_embeddings(fusion_positional_embedding).to(device) # 8 x (T + 49) x 768
        if self.print_dim:
            print("Positional Embedding size: " + str(fusion_positional_embedding.shape))

        # add three together
        fusion_embedding += fusion_modality_embedding + fusion_positional_embedding # 8 x (T + 49) x 768
        if self.print_dim:
            print("Final Embedding size: " + str(fusion_embedding.shape))

        # Prepare mask for transformer
        image_mask = torch.ones(image_embedding_resize.size()[:-1]).to(device) # 8 x 49
        fusion_mask = torch.cat((attention_mask.to(device), image_mask), dim=1).to(device)  # 8 x (T + 49)

        # Check dimensions
        if self.print_dim:
            print("Fusion Mask Shape: ", fusion_mask.shape)

        # Transformer input
        transformer_output = self.transformer(fusion_embedding, src_key_padding_mask=(fusion_mask == 0).T)
        pooled_output = transformer_output.mean(dim=1)
        logits = self.output_head(pooled_output).to(device)

        # Compute loss if labels are provided
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels.to(device))
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}

    def get_positional_embedding(self, shape):
        position_tensor = torch.arange(shape[1], dtype=torch.long).unsqueeze(0)
        position_tensor = position_tensor.repeat(shape[0], 1)

        return position_tensor

# Training for Biden Detector



In [None]:
biden_training_args = TrainingArguments(
    output_dir='./biden_detector_results',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    lr_scheduler_type='cosine',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=False,
    push_to_hub=False,
    load_best_model_at_end=True,
    dataloader_num_workers=0,
    use_cpu=False,
    seed=123
)

In [None]:
import evaluate

def compute_metrics(eval_pred):
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")
    metric4 = evaluate.load("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = metric1.compute(predictions=predictions, references=labels,
                                average="weighted")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels,
                             average="weighted")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels,
                         average="weighted")["f1"]
    accuracy = metric4.compute(predictions=predictions, references=labels)[
        "accuracy"]

    return {"precision": precision, "recall": recall, "f1": f1,
            "accuracy": accuracy}

In [None]:
model_print = BaselineModel(print_dim=True).to(device)



In [None]:
# sanity check
model_print.forward(**collate_fn([batch]))

Pixel_values: torch.Size([1, 3, 224, 224])
input_ids: torch.Size([1, 210])
attention_mask: torch.Size([1, 210])
Modality Embedding size: torch.Size([1, 259, 50])
Positional Embedding size: torch.Size([1, 259, 50])
Final Embedding size: torch.Size([1, 259, 50])
Fusion Mask Shape:  torch.Size([1, 259])


{'loss': tensor(0.6951, device='cuda:0', grad_fn=<NllLossBackward0>),
 'logits': tensor([[0.0031, 0.0070]], device='cuda:0', grad_fn=<AddmmBackward0>)}

In [None]:
biden_model = BaselineModel().to(device)



In [None]:
biden_trainer = Trainer(
    model=biden_model,
    args=biden_training_args,
    train_dataset=biden_train_dataset,
    eval_dataset=biden_test_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

In [None]:
biden_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.535206,0.715375,0.714976,0.715159,0.714976
2,No log,0.419369,0.81268,0.799517,0.800379,0.799517
3,0.269600,0.37313,0.785936,0.777778,0.778794,0.777778
4,0.269600,0.35016,0.717584,0.717391,0.717484,0.717391
5,0.019800,0.369697,0.742819,0.743961,0.74295,0.743961
6,0.019800,0.366053,0.717584,0.717391,0.717484,0.717391
7,0.009300,0.35037,0.717584,0.717391,0.717484,0.717391
8,0.009300,0.358383,0.717584,0.717391,0.717484,0.717391
9,0.009300,0.364143,0.717584,0.717391,0.717484,0.717391
10,0.006900,0.359401,0.717584,0.717391,0.717484,0.717391


TrainOutput(global_step=2160, training_loss=0.07121559370447088, metrics={'train_runtime': 539.3037, 'train_samples_per_second': 32.004, 'train_steps_per_second': 4.005, 'total_flos': 0.0, 'train_loss': 0.07121559370447088, 'epoch': 10.0})

In [None]:
biden_trainer.evaluate()

{'eval_loss': 0.35016024112701416,
 'eval_precision': 0.71758433461332,
 'eval_recall': 0.717391304347826,
 'eval_f1': 0.7174836211854102,
 'eval_accuracy': 0.717391304347826,
 'eval_runtime': 11.3102,
 'eval_samples_per_second': 36.604,
 'eval_steps_per_second': 4.598,
 'epoch': 10.0}

# Training for Trump Detector



In [None]:
trump_training_args = TrainingArguments(
    output_dir='./trump_detector_results',
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    lr_scheduler_type='cosine',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=False,
    push_to_hub=False,
    load_best_model_at_end=True,
    dataloader_num_workers=0,
    use_cpu=False,
    seed=123
)

In [None]:
trump_model = BaselineModel().to(device)



In [None]:
trump_trainer = Trainer(
    model=trump_model,
    args=biden_training_args,
    train_dataset=trump_train_dataset,
    eval_dataset=trump_test_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

In [None]:
trump_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.494157,0.885913,0.857143,0.851049,0.857143
2,No log,0.385284,0.884347,0.85468,0.848309,0.85468
3,0.315800,0.394769,0.87668,0.842365,0.834476,0.842365
4,0.315800,0.378974,0.878192,0.844828,0.83726,0.844828
5,0.030900,0.641211,0.786602,0.657635,0.585648,0.657635


In [None]:
trump_trainer.evaluate()