# Setup

In [20]:
# The usuals
import numpy as np
from numpy import quantile, where, random

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import multiprocessing

# Scientific
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import preprocess_string

from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPRegressor

from scipy.spatial.distance import cosine

# Supporting

from tqdm import tqdm
import joblib

from os import listdir
from os.path import isfile, join

from collections import Counter

In [21]:
# Paths
DATA_PATH = '../data/'
OUTPUT = '../output_data/'
MODEL_PATH = '../data/models/'

In [22]:
# Check files in data folder
datafiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]

print('Index, Filename')
print(list(zip([index for index, value in enumerate(datafiles)], datafiles)))

Index, Filename
[(0, 'arxiv_disinformation.csv'), (1, 'arxiv_deepfake.csv'), (2, 'deepfake_txt.csv')]


In [23]:
# Get a file name, can use
filename = datafiles[1]
filename

'arxiv_deepfake.csv'

In [24]:
# Load dataframe
CONVERTERS = {'tokens': eval, 'published_parsed': eval, 'tags': eval, 'arxiv_primary_category': eval}

df = pd.read_csv(DATA_PATH + filename, converters=CONVERTERS)

In [25]:
# Check data frame
df.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,arxiv_primary_category,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category
0,http://arxiv.org/abs/2203.14315v1,True,http://arxiv.org/abs/2203.14315v1,2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",Adaptive Frequency Learning in Two-branch Face...,"{'type': 'text/plain', 'language': None, 'base...",Face forgery has attracted increasing attentio...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,face forgery has attracted increasing attentio...,"[face, forgery, attract, increase, attention, ...",2022,"[2022, 3]",cs.CV
1,http://arxiv.org/abs/2203.13964v1,True,http://arxiv.org/abs/2203.13964v1,2022-03-26T01:55:37Z,"[2022, 3, 26, 1, 55, 37, 5, 85, 0]",2022-03-26T01:55:37Z,"[2022, 3, 26, 1, 55, 37, 5, 85, 0]",Fusing Global and Local Features for Generaliz...,"{'type': 'text/plain', 'language': None, 'base...",With the development of the Generative Adversa...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,with the development of the generative adversa...,"[development, generative, adversarial, network...",2022,"[2022, 3]",cs.CV
2,http://arxiv.org/abs/2203.12208v2,True,http://arxiv.org/abs/2203.12208v2,2022-03-25T16:00:07Z,"[2022, 3, 25, 16, 0, 7, 4, 84, 0]",2022-03-23T05:52:23Z,"[2022, 3, 23, 5, 52, 23, 2, 82, 0]",Self-supervised Learning of Adversarial Exampl...,"{'type': 'text/plain', 'language': None, 'base...",Recent studies in deepfake detection have yiel...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,recent studies in deepfake detection have yiel...,"[recent, study, deepfake, detection, yield, pr...",2022,"[2022, 3]",cs.CV


In [26]:
# Here for tests we will load a second df, not do a traditional train test split, as we want some sort of bias in the second set - to ensure outliers.

# Prep work

If "cleaning" column kept from Preprocssing, can use that column

In [27]:
def join_tokens(txt):
        x = ' '.join(txt)
        #x = [token.split('/')[0] for token in x] # use when we need lists with just these!
        return x

In [28]:
df['tokens_merged'] = df['tokens'].dropna().apply(lambda x: join_tokens(x))


In [32]:
df['tokens_merged'][1]

'development generative adversarial network gan deepfake aisynthesize image high quality human hardly distinguish real image imperative medium forensic develop detector expose accurately exist detection method show high performance generate image detection tend generalize poorly scenario synthetic image usually generate unseen model use unknown source datum work emphasize importance combine information whole image informative patch improve generalization ability aisynthesize image detection specifically design twobranch model combine global spatial information whole image local informative feature multiple patch select novel patch selection module multihead attention mechanism far utilize fuse global local feature collect highly diverse dataset synthesize model various object resolution evaluate model experimental result demonstrate high accuracy good generalization ability method detect generated image'

In [31]:
df['cleaning'][1]

'with the development of the generative adversarial networks gans and deepfakes aisynthesized images are now of such high quality that humans can hardly distinguish them from real images it is imperative for media forensics to develop detectors to expose them accurately existing detection methods have shown high performance in generated images detection but they tend to generalize poorly in the realworld scenarios where the synthetic images are usually generated with unseen models using unknown source data in this work we emphasize the importance of combining information from the whole image and informative patches in improving the generalization ability of aisynthesized image detection specifically we design a twobranch model to combine global spatial information from the whole image and local informative features from multiple patches selected by a novel patch selection module multihead attention mechanism is further utilized to fuse the global and local features we collect a highly 

# Doc2 Vec

Creates doc2vec vectors for each document in the dataframe

In [30]:

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, action_column, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1
        self.action_column = action_column

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(str(row[self.action_column]).split(), [index]) for index, row in df_x.iterrows()] # edit this: will not work on Chinese

        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers) # maybe want to try Word2Vec

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(str(row[self.action_column]).split())
                                     for index, row in df_x.iterrows()]))

In [56]:
# Initializing model
doc2vec_tr = Doc2VecTransformer('tokens_merged', 
                              vector_size=300,#normally imo 150
                              epochs= 50,
                              )

# Fitting
#doc2vec_tr.fit(df)
fitted = doc2vec_tr.fit(df)

#Transforming
doc2vec_vectors = fitted.transform(df)

100%|██████████| 438/438 [00:00<00:00, 2116480.59it/s]
100%|██████████| 438/438 [00:00<00:00, 2282118.20it/s]
100%|██████████| 438/438 [00:00<00:00, 2128742.93it/s]
100%|██████████| 438/438 [00:00<00:00, 2376591.40it/s]
100%|██████████| 438/438 [00:00<00:00, 1971142.87it/s]
100%|██████████| 438/438 [00:00<00:00, 2168955.32it/s]
100%|██████████| 438/438 [00:00<00:00, 2027709.88it/s]
100%|██████████| 438/438 [00:00<00:00, 2200125.93it/s]
100%|██████████| 438/438 [00:00<00:00, 4073403.88it/s]
100%|██████████| 438/438 [00:00<00:00, 2163845.88it/s]
100%|██████████| 438/438 [00:00<00:00, 3453205.17it/s]
100%|██████████| 438/438 [00:00<00:00, 3993706.85it/s]
100%|██████████| 438/438 [00:00<00:00, 2420428.40it/s]
100%|██████████| 438/438 [00:00<00:00, 2439714.68it/s]
100%|██████████| 438/438 [00:00<00:00, 2414067.22it/s]
100%|██████████| 438/438 [00:00<00:00, 2635731.93it/s]
100%|██████████| 438/438 [00:00<00:00, 3950763.77it/s]
100%|██████████| 438/438 [00:00<00:00, 3950763.77it/s]
100%|█████

In [34]:
len(doc2vec_vectors)

438

In [35]:
doc2vec_vectors[1].shape

(1, 300)

In [36]:
filename

'arxiv_deepfake.csv'

In [57]:
# ADD SAVE KV
# ADD LOAD KV
m = MODEL_PATH + filename.split('.')[0] + '_doc_vectors.pkl'
print('Saving as: ' + m)

joblib.dump(fitted, m) 

Saving as: ../data/models/arxiv_deepfake_doc_vectors.pkl


['../data/models/arxiv_deepfake_doc_vectors.pkl']

# MODELS

In [54]:
# Check files in models folder
models = [f for f in listdir(MODEL_PATH) if isfile(join(MODEL_PATH, f))]

print('Index, Model Name')
print(list(zip([index for index, value in enumerate(models)], models)))


Index, Model Name
[(0, 'arxiv_disinformation_doc_vectors.pkl'), (1, 'arxiv_deepfake_doc_vectors.pkl'), (2, 'train_arxiv_deepfake_doc_vectors.pkl'), (3, 'arxiv_deepfake_svm_model.pkl'), (4, 'arxiv_deepfake_iso_model.pkl')]


In [None]:
# to load a tfidf model / just for testing here
model_name = datafiles[0]

doc2vec_vectors = joblib.load(MODEL_PATH + model_name)


In [None]:
# then transform again

## SVM Method

In [58]:
# Initialize and fit model
'''
Adjust nu hyperparameter to, simplifing, 
increase/decrease "novelty" sensitivity. 
It is very high now = less outliers
'''

model = OneClassSVM(kernel = 'rbf', 
                  gamma = 'scale', 
                  nu = 0.001).fit(doc2vec_vectors)

In [59]:
joblib.dump(model, MODEL_PATH + filename.split('.')[0] + "_svm_model.pkl")

['../data/models/arxiv_deepfake_svm_model.pkl']

In [60]:
# Prediction
y_pred = model.predict(doc2vec_vectors)

In [61]:
# Filter outlier index
outlier_index = where(y_pred == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

# Un-used, for inspection
#outlier_values = doc2vec_vectors.iloc[outlier_index]
#outlier_values

Outliers: 5


In [62]:
# Create a df with just outliers

df_misclass = df[df.index.isin(indexes)]

In [63]:
# Inspect dataframe

df_misclass.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category,tokens_merged
8,http://arxiv.org/abs/2203.06825v1,True,http://arxiv.org/abs/2203.06825v1,2022-03-14T02:44:56Z,"[2022, 3, 14, 2, 44, 56, 0, 73, 0]",2022-03-14T02:44:56Z,"[2022, 3, 14, 2, 44, 56, 0, 73, 0]",Fairness Evaluation in Deepfake Detection Mode...,"{'type': 'text/plain', 'language': None, 'base...",Fairness of deepfake detectors in the presence...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,fairness of deepfake detectors in the presence...,"[fairness, deepfake, detector, presence, anoma...",2022,"[2022, 3]",cs.CV,fairness deepfake detector presence anomaly we...
46,http://arxiv.org/abs/2110.01640v1,True,http://arxiv.org/abs/2110.01640v1,2021-10-04T18:02:56Z,"[2021, 10, 4, 18, 2, 56, 0, 277, 0]",2021-10-04T18:02:56Z,"[2021, 10, 4, 18, 2, 56, 0, 277, 0]",An Experimental Evaluation on Deepfake Detecti...,"{'type': 'text/plain', 'language': None, 'base...",Significant advances in deep learning have obt...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,significant advances in deep learning have obt...,"[significant, advance, deep, learning, obtain,...",2021,"[2021, 10]",cs.CV,significant advance deep learning obtain hallm...
128,http://arxiv.org/abs/2012.10580v1,True,http://arxiv.org/abs/2012.10580v1,2020-12-19T03:02:15Z,"[2020, 12, 19, 3, 2, 15, 5, 354, 0]",2020-12-19T03:02:15Z,"[2020, 12, 19, 3, 2, 15, 5, 354, 0]",Identifying Invariant Texture Violation for Ro...,"{'type': 'text/plain', 'language': None, 'base...",Existing deepfake detection methods have repor...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,existing deepfake detection methods have repor...,"[exist, deepfake, detection, method, report, p...",2020,"[2020, 12]",cs.CV,exist deepfake detection method report promisi...


In [64]:
# Add compare function

Another approach to getting outliers using SVM model but diffferent criteria (more of a % than a absolute value)

In [65]:
scores = model.score_samples(doc2vec_vectors)

In [66]:
# Change treshhold as needed

thresh = quantile(scores, 0.03)
print(thresh)

0.10631984670769612


In [67]:
# getting indexes

index = where(scores<=thresh)
index = list(index[0])
print(len(index))


14


In [68]:
# Creating second df
df_misclass_2 = df[df.index.isin(index)]

# And viewing it
df_misclass_2.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category,tokens_merged
8,http://arxiv.org/abs/2203.06825v1,True,http://arxiv.org/abs/2203.06825v1,2022-03-14T02:44:56Z,"[2022, 3, 14, 2, 44, 56, 0, 73, 0]",2022-03-14T02:44:56Z,"[2022, 3, 14, 2, 44, 56, 0, 73, 0]",Fairness Evaluation in Deepfake Detection Mode...,"{'type': 'text/plain', 'language': None, 'base...",Fairness of deepfake detectors in the presence...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,fairness of deepfake detectors in the presence...,"[fairness, deepfake, detector, presence, anoma...",2022,"[2022, 3]",cs.CV,fairness deepfake detector presence anomaly we...
46,http://arxiv.org/abs/2110.01640v1,True,http://arxiv.org/abs/2110.01640v1,2021-10-04T18:02:56Z,"[2021, 10, 4, 18, 2, 56, 0, 277, 0]",2021-10-04T18:02:56Z,"[2021, 10, 4, 18, 2, 56, 0, 277, 0]",An Experimental Evaluation on Deepfake Detecti...,"{'type': 'text/plain', 'language': None, 'base...",Significant advances in deep learning have obt...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,significant advances in deep learning have obt...,"[significant, advance, deep, learning, obtain,...",2021,"[2021, 10]",cs.CV,significant advance deep learning obtain hallm...
106,http://arxiv.org/abs/2103.09396v3,True,http://arxiv.org/abs/2103.09396v3,2021-10-03T01:05:56Z,"[2021, 10, 3, 1, 5, 56, 6, 276, 0]",2021-03-17T01:48:34Z,"[2021, 3, 17, 1, 48, 34, 2, 76, 0]",Pros and Cons of GAN Evaluation Measures: New ...,"{'type': 'text/plain', 'language': None, 'base...",This work is an update of a previous paper on ...,...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",,,,this work is an update of a previous paper on ...,"[work, update, previous, paper, topic, publish...",2021,"[2021, 3]",cs.LG,work update previous paper topic publish year ...


## Isolation Forest Method

In [48]:
# Initialize model
iso_model = IsolationForest(n_estimators=100,
                  max_samples='auto',
                  contamination=float(0.01),
                  random_state=42
                  )

# Fitting model
iso_model.fit(doc2vec_vectors)

print(iso_model.get_params())

{'bootstrap': False, 'contamination': 0.01, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [49]:
joblib.dump(iso_model, MODEL_PATH + filename.split('.')[0] + "_iso_model.pkl")

['../data/models/arxiv_deepfake_iso_model.pkl']

In [50]:
data = pd.DataFrame()

data['scores'] = iso_model.decision_function(doc2vec_vectors)

data['anomaly_score'] = iso_model.predict(doc2vec_vectors) 



In [51]:
data[data['anomaly_score']==-1]

Unnamed: 0,scores,anomaly_score
89,-0.021069,-1
184,-0.001319,-1
283,-0.040489,-1
295,-0.140542,-1
322,-0.131213,-1


In [52]:
outlier_index = where(data['anomaly_score'] == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

isolation_misclass = df[df.index.isin(indexes)]

Outliers: 5


In [53]:
isolation_misclass.title

89     TAR: Generalized Forensic Framework to Detect ...
184             Detecting Deepfakes with Metric Learning
283    FakeAVCeleb: A Novel Audio-Video Multimodal De...
295    FFR_FD: Effective and Fast Detection of DeepFa...
322    MagDR: Mask-guided Detection and Reconstructio...
Name: title, dtype: object

## Cosine Simularity Method

In [None]:
# Initialize, fit and predict
auto_encoder = MLPRegressor(hidden_layer_sizes=(
                                                 600,
                                                 150, 
                                                 600,
                                               ))

auto_encoder.fit(doc2vec_vectors, doc2vec_vectors)

predicted_vectors = auto_encoder.predict(doc2vec_vectors)

In [None]:
# Visual loss
pd.DataFrame(auto_encoder.loss_curve_).plot()

In [None]:
df.columns

In [None]:
def key_consine_similarity(tupple):
    return tupple[1]

def get_computed_similarities(vectors, predicted_vectors, reverse=False):
    data_size = len(df)
    cosine_similarities = []
    for i in range(data_size):
        cosine_sim_val = (1 - cosine(vectors[i], predicted_vectors[i]))
        cosine_similarities.append((i, cosine_sim_val))

    return sorted(cosine_similarities, key=key_consine_similarity, reverse=reverse)

def display_top_n(sorted_cosine_similarities, n=5):
    for i in range(n):
        index, consine_sim_val = sorted_cosine_similarities[i]
        print('Title: ', df.iloc[index, 7])
        print('ID: ', df.iloc[index, 0])  
        print('Cosine Sim Val :', consine_sim_val)
        print('---------------------------------')

# add function to sort by percentage


In [None]:
# Specify how many 'outliers' you want to see
N = 20

In [None]:
print('Top n unique')

sorted_cosine_similarities = get_computed_similarities(vectors=doc2vec_vectors, predicted_vectors=predicted_vectors)

display_top_n(sorted_cosine_similarities=sorted_cosine_similarities, n = N)

### Visualizing the cosines - will revise during first test

In [None]:
# Seaborn histogram
# Can use to adjust the N above (or percent, once we have that function) to see the low cluster

sns.distplot(losses, hist=True, kde=False, 
             bins=int(180/5), color = 'blue',
             hist_kws={'edgecolor':'black'})

# # Add labels
# plt.title('Title')
# plt.xlabel('Label x')
# plt.ylabel('Label y')

### IIRC not fully functional yet - for more Cosine work

In [None]:
len(list(df.columns))

In [None]:
df.columns

In [None]:
most_unique_index, cosine_sim_val = sorted_cosine_similarities[0]
print(most_unique_index)
most_unique_plot =df.iloc[most_unique_index, 9] # index here matters!
most_unique_words_counter = Counter(preprocess_string(most_unique_plot))
print(most_unique_words_counter)

# intersected_common_word_counter = common_word_counter & most_unique_words_counter

# intersected_common_words = [word[0] for word in intersected_common_word_counter.items()]
# intersected_common_word_counts = [word[1] for word in intersected_common_word_counter.items()]

# intersected_common_word_counter