In [233]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch

from subtask_1_2a import check_format, evaluate_h

# Preparing embeddings and data for classification

In [223]:
# Default directory scanned by get_embeddings() for the pickled text/image
# embedding files (presumably CLIP ViT-B/32 features, going by the folder name).
# The path is relative to the notebook's working directory.
CLIP_MODEL_FEATURES = "../extracted_features/ClipVIT_B32"

def read_data(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in binary mode and closed again before the function
    returns.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only point this at files you trust.
    """
    with open(path, mode="rb") as pickle_file:
        return pickle.load(pickle_file)

def get_embeddings(task, set, embeddings_dir=CLIP_MODEL_FEATURES):
    """Load the text and image embeddings of one task/split.

    ``embeddings_dir`` is scanned for files whose name contains
    ``f"{task}_{set}"``. Among those, a name containing ``"image"`` provides
    the image embeddings; otherwise a name containing ``"text"`` provides the
    text embeddings. Each file is unpickled with ``read_data`` and iterated as
    a mapping; the first element of every value is converted with ``.numpy()``
    (presumably a torch tensor with a leading batch axis - TODO confirm).

    Parameters
    ----------
    task : identifier of the subtask, e.g. ``"2b"``.
    set : name of the split as it appears in the file names, e.g. ``"train"``.
        (The name shadows the builtin but is kept because callers pass it by
        keyword.)
    embeddings_dir : directory containing the pickled embedding files.

    Returns
    -------
    tuple ``(text_embeddings, image_embeddings)``; each is a list of numpy
    arrays in the iteration order of the pickled mapping.

    Raises
    ------
    FileNotFoundError
        If no matching text file or no matching image file exists in
        ``embeddings_dir`` (previously this surfaced as an opaque
        ``UnboundLocalError`` at the ``return``).

    NOTE(review): ``get_data`` assigns these lists to the annotation frame
    positionally, so the mapping order must match the annotation row order -
    confirm against the feature-extraction code.
    """
    def load_arrays(file_name):
        # Keys are not needed here; only the first element of each value is kept.
        stored = read_data(os.path.join(embeddings_dir, file_name))
        return [embedding[0].numpy() for embedding in stored.values()]

    split_tag = f"{task}_{set}"
    text_embeddings = None
    image_embeddings = None

    # os.listdir() returns entries in arbitrary order; sorting makes the choice
    # deterministic if several files match the same split and modality.
    for name in sorted(os.listdir(embeddings_dir)):
        if split_tag not in name:
            continue
        if "image" in name:
            image_embeddings = load_arrays(name)
        elif "text" in name:
            text_embeddings = load_arrays(name)

    missing = [
        modality
        for modality, loaded in (("text", text_embeddings), ("image", image_embeddings))
        if loaded is None
    ]
    if missing:
        raise FileNotFoundError(
            f"No {' or '.join(missing)} embeddings file matching "
            f"'{split_tag}' found in '{embeddings_dir}'"
        )

    return text_embeddings, image_embeddings

def get_raw_data(task, set):
    """Read the annotation JSON of one task/split into a DataFrame.

    The split name ``"validation"`` is mapped to ``"val"`` (the name used on
    disk); every other value is used unchanged. The file is looked up at
    ``../annotations/subtask{task}/{split}.json`` relative to the current
    working directory and parsed with ``pandas.read_json``.
    """
    split_name = set
    if split_name == "validation":
        split_name = "val"
    return pd.read_json(f"../annotations/subtask{task}/{split_name}.json")

def get_data(task, set):
    """Return the annotations of one task/split with embedding columns attached.

    The embeddings are loaded first via ``get_embeddings`` and the annotation
    frame afterwards via ``get_raw_data``. The two embedding lists are then
    stored positionally in the columns ``text_embeddings`` and
    ``image_embeddings``, so each list must have one entry per annotation row.
    """
    text_vectors, image_vectors = get_embeddings(task, set)
    annotations = get_raw_data(task, set)
    return annotations.assign(
        text_embeddings=text_vectors,
        image_embeddings=image_vectors,
    )

In [225]:
# Load the annotated train / validation splits of subtask 2b together with
# their text and image embeddings.
data_train = get_data(task="2b", set="train")
data_validation = get_data(task="2b", set="validation")

# Build one feature vector per row: text embedding followed by image embedding.
for split_frame in (data_train, data_validation):
    split_frame["concatenated_embeddings"] = [
        np.concatenate((text_vector, image_vector))
        for text_vector, image_vector in zip(
            split_frame.text_embeddings, split_frame.image_embeddings
        )
    ]

In [226]:
data_train

Unnamed: 0,id,text,image,label,text_embeddings,image_embeddings,concatenated_embeddings
0,35807,DONALD TRUMP: BARACK\nOBAMA AND JOE BIDEN\nWIL...,prop_meme_6570.png,propagandistic,"[-0.3242, 0.04156, 0.1288, -0.04965, -0.313, -...","[-0.5376, 0.428, -0.404, 0.2576, 0.03836, 0.15...","[-0.3242, 0.04156, 0.1288, -0.04965, -0.313, -..."
1,30562,00\n10% FOR\nTHE BIG GUY\nNANCY'S\nCUT\n@ImMem...,prop_meme_8346.png,propagandistic,"[0.1453, 0.279, -0.07104, 0.2039, -0.1882, 0.3...","[0.4866, 0.3816, 0.4597, 0.1007, -0.2932, -0.5...","[0.1453, 0.279, -0.07104, 0.2039, -0.1882, 0.3..."
2,44163,"To much political posting online\nthese days, ...",prop_meme_24378.png,non_propagandistic,"[-0.0949, 0.01037, -0.3408, -0.01467, -0.3308,...","[-0.1317, -0.2247, -0.2047, 0.02065, 0.1738, -...","[-0.0949, 0.01037, -0.3408, -0.01467, -0.3308,..."
3,24224,I DON'T THINK\nYOU UNDERSTOOD\nWHAT I SAID.\nY...,prop_meme_2594.png,propagandistic,"[0.02841, -0.0127, 0.1814, -0.0631, -0.3118, 0...","[0.1572, -0.2983, 0.2915, -0.249, -0.642, 0.29...","[0.02841, -0.0127, 0.1814, -0.0631, -0.3118, 0..."
4,31611,ⒸSergey Mihailicenko/Anadolu Agency via Getty ...,prop_meme_7654.png,propagandistic,"[-0.01251, -0.3154, -0.1598, -0.362, -0.2141, ...","[-0.4382, 0.4343, -0.1145, 0.2101, 0.2932, -0....","[-0.01251, -0.3154, -0.1598, -0.362, -0.2141, ..."
...,...,...,...,...,...,...,...
1195,24014,Joe Biden\nis transplanting\nHundreds of thous...,prop_meme_4276.png,propagandistic,"[-0.08105, -0.1439, 0.2069, 0.1754, -0.0593, -...","[0.002172, 0.3079, 0.01727, 0.4546, 0.04306, -...","[-0.08105, -0.1439, 0.2069, 0.1754, -0.0593, -..."
1196,31283,THE INDEPENDENT REVIEW BROUGHT ON BY\nREPUBLIC...,prop_meme_7494.png,propagandistic,"[0.1885, 0.1377, 0.2491, -0.05048, -0.2524, 0....","[0.07825, 0.2041, 0.1857, 0.03842, -0.2029, 0....","[0.1885, 0.1377, 0.2491, -0.05048, -0.2524, 0...."
1197,34833,Longtime White House Photographer Pete\nSouza ...,prop_meme_6217.png,propagandistic,"[-0.1749, -0.02106, -0.1366, 0.2104, -0.4033, ...","[-0.3796, 0.7524, 0.1749, 0.05066, 0.3706, 0.0...","[-0.1749, -0.02106, -0.1366, 0.2104, -0.4033, ..."
1198,23378,✔\nThe last time I was here in 2017\nCapitol p...,prop_meme_3748.png,propagandistic,"[0.08234, -0.3438, -0.1277, 0.308, -0.678, -0....","[-0.2086, -0.631, 0.3274, 0.382, -0.6704, 0.13...","[0.08234, -0.3438, -0.1277, 0.308, -0.678, -0...."


In [217]:
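# NOTE: the hold-out split of the training data below is disabled; the cells
# that follow evaluate on the separate validation set (data_validation) instead.
# X_train, X_test, Y_train, Y_test, = train_test_split(data_train.concatenated_embeddings, data_train.label, test_size=0.2, random_state=42)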

# Classification methods

## Logistic Regression

In [227]:
# Training features: one concatenated text+image vector per row; targets: labels.
X_train = list(data_train["concatenated_embeddings"])
Y_train = data_train["label"]

# fit() returns the estimator itself, so the fitted model is bound directly.
logistic_regression_classifier = LogisticRegression(
    max_iter=5000,
    random_state=0,
).fit(X_train, Y_train)
logistic_regression_classifier

In [232]:
# Evaluation features / targets come from the validation split.
X_test = list(data_validation["concatenated_embeddings"])
Y_test = data_validation["label"]

res = logistic_regression_classifier.predict(X_test)
# NOTE(review): each row appears to carry a single label, in which case the
# micro-averaged F1 is numerically the same as accuracy - confirm this is the
# metric intended for reporting.
f1_score(y_true=Y_test, y_pred=res, average="micro")

0.82

## Linear Discriminant Analysis

In [230]:
# Same training data as the logistic regression; fit() returns the estimator.
linear_discriminant_classifier = LinearDiscriminantAnalysis().fit(X_train, Y_train)
linear_discriminant_classifier

In [231]:
# Score the linear discriminant model on the validation split (micro-averaged F1).
res = linear_discriminant_classifier.predict(X_test)
f1_score(y_true=Y_test, y_pred=res, average="micro")

0.6066666666666667

## K-Nearest Neighbors

In [234]:
# k-nearest-neighbours with scikit-learn's default settings, fitted on the
# same training data as the other classifiers; fit() returns the estimator.
kneighbors_classifier = KNeighborsClassifier().fit(X_train, Y_train)
kneighbors_classifier

In [235]:
# Score the k-nearest-neighbours model on the validation split (micro-averaged F1).
res = kneighbors_classifier.predict(X_test)
f1_score(y_true=Y_test, y_pred=res, average="micro")

0.7866666666666666