# Setup

In [1]:
# The usuals
import numpy as np
from numpy import quantile, where, random

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import multiprocessing

# Scientific
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import preprocess_string

from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPRegressor

from scipy.spatial.distance import cosine

# Supporting

from tqdm import tqdm
import joblib

from os import listdir
from os.path import isfile, join

from collections import Counter

In [2]:
# Paths
DATA_PATH = '../data/'
OUTPUT = '../output_data/'
MODEL_PATH = '../data/models/'

In [3]:
# Check files in data folder
datafiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]

print('Index, Filename')
print(list(zip([index for index, value in enumerate(datafiles)], datafiles)))

Index, Filename
[(0, 'deepfake_txt.csv'), (1, 'arxiv_disinformation.csv'), (2, 'arxiv_deepfake.csv'), (3, 'results.csv'), (4, 'reddit_machinelearning.csv')]


In [5]:
# Get a file name, can use
filename = datafiles[3]
filename

'results.csv'

In [6]:
# Load dataframe
CONVERTERS = {'tokens': eval, 'published_parsed': eval, 'tags': eval, 'arxiv_primary_category': eval}

df = pd.read_csv(DATA_PATH + filename, converters=CONVERTERS)

In [7]:
# Check data frame
df.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,arxiv_comment,arxiv_doi,arxiv_journal_ref,arxiv_affiliation,search_term,cleaning,tokens,category,year,month_year
0,http://arxiv.org/abs/2204.02960v1,True,http://arxiv.org/abs/2204.02960v1,2022-04-06T17:54:46Z,"[2022, 4, 6, 17, 54, 46, 2, 96, 0]",2022-04-06T17:54:46Z,"[2022, 4, 6, 17, 54, 46, 2, 96, 0]",Simple and Effective Synthesis of Indoor 3D Sc...,"{'type': 'text/plain', 'language': None, 'base...",We study the problem of synthesizing immersive...,...,,,,,GAN,we study the problem of synthesizing immersive...,"[study, problem, synthesize, immersive, indoor...",cs.CV,2022,2022-4
1,http://arxiv.org/abs/2204.02591v1,True,http://arxiv.org/abs/2204.02591v1,2022-04-06T05:51:04Z,"[2022, 4, 6, 5, 51, 4, 2, 96, 0]",2022-04-06T05:51:04Z,"[2022, 4, 6, 5, 51, 4, 2, 96, 0]","Contextual Attention Mechanism, SRGAN Based In...","{'type': 'text/plain', 'language': None, 'base...",The new alternative is to use deep learning to...,...,,,,,"GAN, fake news",the new alternative is to use deep learning to...,"[new, alternative, use, deep, learning, inpain...",cs.CV,2022,2022-4
2,http://arxiv.org/abs/2204.02411v1,True,http://arxiv.org/abs/2204.02411v1,2022-04-05T18:00:04Z,"[2022, 4, 5, 18, 0, 4, 1, 95, 0]",2022-04-05T18:00:04Z,"[2022, 4, 5, 18, 0, 4, 1, 95, 0]",Texturify: Generating Textures on 3D Shape Sur...,"{'type': 'text/plain', 'language': None, 'base...",Texture cues on 3D objects are key to compelli...,...,Project Page: https://nihalsid.github.io/textu...,,,,GAN,texture cues on 3d objects are key to compelli...,"[texture, cue, object, key, compelling, visual...",cs.CV,2022,2022-4


In [8]:
# Here for tests we will load a second df, not do a traditional train test split, as we want some sort of bias in the second set - to ensure outliers.

# Create a sub df with select values

In [9]:
df.search_term.value_counts()

fake news                986
GAN                      984
disinformation           222
GPT-3                    134
GAN, fake news            14
GAN, disinformation        1
GAN, GPT-3                 1
GPT-3, disinformation      1
Name: search_term, dtype: int64

In [29]:
keep_list = ['disinformation', 'GAN']
df = df.loc[df['search_term'].isin(keep_list)]


In [30]:
df.search_term.value_counts()

GAN               984
disinformation    222
Name: search_term, dtype: int64

In [50]:
df.to_csv(DATA_PATH + 'test_df.csv', index=False)

# Prep work

If "cleaning" column kept from Preprocssing, can use that column

In [31]:
def join_tokens(txt):
        x = ' '.join(txt)
        #x = [token.split('/')[0] for token in x] # use when we need lists with just these!
        return x

In [32]:
df['tokens_merged'] = df['tokens'].dropna().apply(lambda x: join_tokens(x))


# Doc2 Vec

Creates doc2vec vectors for each document in the dataframe

In [34]:

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, action_column, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1
        self.action_column = action_column

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(str(row[self.action_column]).split(), [index]) for index, row in df_x.iterrows()] # edit this: will not work on Chinese

        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers) # maybe want to try Word2Vec

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(str(row[self.action_column]).split())
                                     for index, row in df_x.iterrows()]))

In [35]:
# Initializing model
doc2vec_tr = Doc2VecTransformer('tokens_merged', 
                              vector_size=150,#normally imo 150
                              epochs= 50,
                              )

# Fitting
#doc2vec_tr.fit(df)
fitted = doc2vec_tr.fit(df)

#Transforming
doc2vec_vectors = fitted.transform(df)

100%|██████████| 1206/1206 [00:00<00:00, 4949442.88it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2511584.22it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2852978.36it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2491788.48it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2622255.38it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2652506.88it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2580780.93it/s]
100%|██████████| 1206/1206 [00:00<00:00, 3760840.61it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2918828.98it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2913784.92it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2928969.67it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2908758.27it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2861046.73it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2970246.99it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2963286.83it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2984265.85it/s]
100%|██████████| 1206/1206 [00:00<00:00, 2788495.38it/s]
100%|██████████| 1206/1206 [00:

In [18]:
len(doc2vec_vectors)

2343

In [19]:
doc2vec_vectors[1].shape

(1, 150)

In [20]:
filename

'results.csv'

In [57]:
# ADD SAVE KV
# ADD LOAD KV
m = MODEL_PATH + filename.split('.')[0] + '_doc_vectors.pkl'
print('Saving as: ' + m)

joblib.dump(fitted, m) 

Saving as: ../data/models/arxiv_deepfake_doc_vectors.pkl


['../data/models/arxiv_deepfake_doc_vectors.pkl']

# MODELS

In [54]:
# Check files in models folder
models = [f for f in listdir(MODEL_PATH) if isfile(join(MODEL_PATH, f))]

print('Index, Model Name')
print(list(zip([index for index, value in enumerate(models)], models)))


Index, Model Name
[(0, 'arxiv_disinformation_doc_vectors.pkl'), (1, 'arxiv_deepfake_doc_vectors.pkl'), (2, 'train_arxiv_deepfake_doc_vectors.pkl'), (3, 'arxiv_deepfake_svm_model.pkl'), (4, 'arxiv_deepfake_iso_model.pkl')]


In [None]:
# to load a tfidf model / just for testing here
model_name = datafiles[0]

doc2vec_vectors = joblib.load(MODEL_PATH + model_name)


In [None]:
# then transform again

## SVM Method

In [36]:
# Initialize and fit model
'''
Adjust nu hyperparameter to, simplifing, 
increase/decrease "novelty" sensitivity. 
It is very high now = less outliers
'''

model = OneClassSVM(kernel = 'rbf', 
                  gamma = 'scale', 
                  nu = 0.001).fit(doc2vec_vectors)



In [59]:
joblib.dump(model, MODEL_PATH + filename.split('.')[0] + "_svm_model.pkl")

['../data/models/arxiv_deepfake_svm_model.pkl']

In [37]:
# Prediction
y_pred = model.predict(doc2vec_vectors)



In [38]:
# Filter outlier index
outlier_index = where(y_pred == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

# Un-used, for inspection
#outlier_values = doc2vec_vectors.iloc[outlier_index]
#outlier_values

Outliers: 18


In [39]:
# Create a df with just outliers

df_misclass = df[df.index.isin(indexes)]

In [40]:
# Inspect dataframe

df_misclass.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,arxiv_doi,arxiv_journal_ref,arxiv_affiliation,search_term,cleaning,tokens,category,year,month_year,tokens_merged
47,http://arxiv.org/abs/2203.14814v1,True,http://arxiv.org/abs/2203.14814v1,2022-03-28T14:51:42Z,"[2022, 3, 28, 14, 51, 42, 0, 87, 0]",2022-03-28T14:51:42Z,"[2022, 3, 28, 14, 51, 42, 0, 87, 0]",Stochastic Parameterizations: Better Modelling...,"{'type': 'text/plain', 'language': None, 'base...",The modelling of small-scale processes is a ma...,...,,,,GAN,the modelling of smallscale processes is a maj...,"[modelling, smallscale, process, major, source...",cs.LG,2022,2022-3,modelling smallscale process major source erro...
240,http://arxiv.org/abs/2202.08143v1,True,http://arxiv.org/abs/2202.08143v1,2022-02-16T15:34:09Z,"[2022, 2, 16, 15, 34, 9, 2, 47, 0]",2022-02-16T15:34:09Z,"[2022, 2, 16, 15, 34, 9, 2, 47, 0]",Bias in Automated Image Colorization: Metrics ...,"{'type': 'text/plain', 'language': None, 'base...",We measure the color shifts present in coloriz...,...,,,,GAN,we measure the color shifts present in coloriz...,"[measure, color, shift, present, colorized, im...",cs.CV,2022,2022-2,measure color shift present colorized image da...
331,http://arxiv.org/abs/2201.10130v1,True,http://arxiv.org/abs/2201.10130v1,2022-01-25T07:06:43Z,"[2022, 1, 25, 7, 6, 43, 1, 25, 0]",2022-01-25T07:06:43Z,"[2022, 1, 25, 7, 6, 43, 1, 25, 0]",Improving Adversarial Waveform Generation base...,"{'type': 'text/plain', 'language': None, 'base...",Adversarial waveform generation has been a pop...,...,,,,GAN,adversarial waveform generation has been a pop...,"[adversarial, waveform, generation, popular, a...",cs.SD,2022,2022-1,adversarial waveform generation popular approa...


In [41]:
df_misclass.columns

Index(['id', 'guidislink', 'link', 'updated', 'updated_parsed', 'published',
       'published_parsed', 'title', 'title_detail', 'summary',
       'summary_detail', 'authors', 'author_detail', 'author', 'links',
       'arxiv_primary_category', 'tags', 'arxiv_comment', 'arxiv_doi',
       'arxiv_journal_ref', 'arxiv_affiliation', 'search_term', 'cleaning',
       'tokens', 'category', 'year', 'month_year', 'tokens_merged'],
      dtype='object')

In [42]:
df_misclass.search_term.value_counts()

GAN    15
Name: search_term, dtype: int64

In [64]:
# Add compare function

Another approach to getting outliers using SVM model but diffferent criteria (more of a % than a absolute value)

In [65]:
scores = model.score_samples(doc2vec_vectors)

In [66]:
# Change treshhold as needed

thresh = quantile(scores, 0.03)
print(thresh)

0.10631984670769612


In [67]:
# getting indexes

index = where(scores<=thresh)
index = list(index[0])
print(len(index))


14


In [68]:
# Creating second df
df_misclass_2 = df[df.index.isin(index)]

# And viewing it
df_misclass_2.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category,tokens_merged
8,http://arxiv.org/abs/2203.06825v1,True,http://arxiv.org/abs/2203.06825v1,2022-03-14T02:44:56Z,"[2022, 3, 14, 2, 44, 56, 0, 73, 0]",2022-03-14T02:44:56Z,"[2022, 3, 14, 2, 44, 56, 0, 73, 0]",Fairness Evaluation in Deepfake Detection Mode...,"{'type': 'text/plain', 'language': None, 'base...",Fairness of deepfake detectors in the presence...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,fairness of deepfake detectors in the presence...,"[fairness, deepfake, detector, presence, anoma...",2022,"[2022, 3]",cs.CV,fairness deepfake detector presence anomaly we...
46,http://arxiv.org/abs/2110.01640v1,True,http://arxiv.org/abs/2110.01640v1,2021-10-04T18:02:56Z,"[2021, 10, 4, 18, 2, 56, 0, 277, 0]",2021-10-04T18:02:56Z,"[2021, 10, 4, 18, 2, 56, 0, 277, 0]",An Experimental Evaluation on Deepfake Detecti...,"{'type': 'text/plain', 'language': None, 'base...",Significant advances in deep learning have obt...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,significant advances in deep learning have obt...,"[significant, advance, deep, learning, obtain,...",2021,"[2021, 10]",cs.CV,significant advance deep learning obtain hallm...
106,http://arxiv.org/abs/2103.09396v3,True,http://arxiv.org/abs/2103.09396v3,2021-10-03T01:05:56Z,"[2021, 10, 3, 1, 5, 56, 6, 276, 0]",2021-03-17T01:48:34Z,"[2021, 3, 17, 1, 48, 34, 2, 76, 0]",Pros and Cons of GAN Evaluation Measures: New ...,"{'type': 'text/plain', 'language': None, 'base...",This work is an update of a previous paper on ...,...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",,,,this work is an update of a previous paper on ...,"[work, update, previous, paper, topic, publish...",2021,"[2021, 3]",cs.LG,work update previous paper topic publish year ...


## Isolation Forest Method

In [43]:
# Initialize model
iso_model = IsolationForest(n_estimators=100,
                  max_samples='auto',
                  contamination=float(0.01),
                  random_state=42
                  )

# Fitting model
iso_model.fit(doc2vec_vectors)

print(iso_model.get_params())

{'bootstrap': False, 'contamination': 0.01, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': 42, 'verbose': 0, 'warm_start': False}




In [49]:
joblib.dump(iso_model, MODEL_PATH + filename.split('.')[0] + "_iso_model.pkl")

['../data/models/arxiv_deepfake_iso_model.pkl']

In [44]:
data = pd.DataFrame()

data['scores'] = iso_model.decision_function(doc2vec_vectors)

data['anomaly_score'] = iso_model.predict(doc2vec_vectors) 





In [45]:
data[data['anomaly_score']==-1]

Unnamed: 0,scores,anomaly_score
47,-0.108561,-1
216,-0.064188,-1
235,-0.080612,-1
293,-0.089985,-1
344,-0.079779,-1
478,-9e-06,-1
771,-0.062221,-1
817,-0.03555,-1
918,-0.002202,-1
982,-0.099722,-1


In [46]:
outlier_index = where(data['anomaly_score'] == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

isolation_misclass = df[df.index.isin(indexes)]

Outliers: 13


In [47]:
isolation_misclass.search_term.value_counts()

GAN    10
Name: search_term, dtype: int64

In [49]:
for c in isolation_misclass.summary:
      print(c)
      print('-------------------------------')

The modelling of small-scale processes is a major source of error in climate
models, hindering the accuracy of low-cost models which must approximate such
processes through parameterization. Using stochasticity and machine learning
have led to better models but there is a lack of work on combining the benefits
from both. We show that by using a physically-informed recurrent neural network
within a probabilistic framework, our resulting model for the Lorenz 96
atmospheric simulation is competitive and often superior to both a bespoke
baseline and an existing probabilistic machine-learning (GAN) one. This is due
to a superior ability to model temporal correlations compared to standard
first-order autoregressive schemes. The model also generalises to unseen
regimes. We evaluate across a number of metrics from the literature, but also
discuss how the probabilistic metric of likelihood may be a unifying choice for
future probabilistic climate models.
-------------------------------
The incr

## Cosine Simularity Method

In [None]:
# Initialize, fit and predict
auto_encoder = MLPRegressor(hidden_layer_sizes=(
                                                 600,
                                                 150, 
                                                 600,
                                               ))

auto_encoder.fit(doc2vec_vectors, doc2vec_vectors)

predicted_vectors = auto_encoder.predict(doc2vec_vectors)

In [None]:
# Visual loss
pd.DataFrame(auto_encoder.loss_curve_).plot()

In [None]:
df.columns

In [None]:
def key_consine_similarity(tupple):
    return tupple[1]

def get_computed_similarities(vectors, predicted_vectors, reverse=False):
    data_size = len(df)
    cosine_similarities = []
    for i in range(data_size):
        cosine_sim_val = (1 - cosine(vectors[i], predicted_vectors[i]))
        cosine_similarities.append((i, cosine_sim_val))

    return sorted(cosine_similarities, key=key_consine_similarity, reverse=reverse)

def display_top_n(sorted_cosine_similarities, n=5):
    for i in range(n):
        index, consine_sim_val = sorted_cosine_similarities[i]
        print('Title: ', df.iloc[index, 7])
        print('ID: ', df.iloc[index, 0])  
        print('Cosine Sim Val :', consine_sim_val)
        print('---------------------------------')

# add function to sort by percentage


In [None]:
# Specify how many 'outliers' you want to see
N = 20

In [None]:
print('Top n unique')

sorted_cosine_similarities = get_computed_similarities(vectors=doc2vec_vectors, predicted_vectors=predicted_vectors)

display_top_n(sorted_cosine_similarities=sorted_cosine_similarities, n = N)

### Visualizing the cosines - will revise during first test

In [None]:
# Seaborn histogram
# Can use to adjust the N above (or percent, once we have that function) to see the low cluster

sns.distplot(losses, hist=True, kde=False, 
             bins=int(180/5), color = 'blue',
             hist_kws={'edgecolor':'black'})

# # Add labels
# plt.title('Title')
# plt.xlabel('Label x')
# plt.ylabel('Label y')

### IIRC not fully functional yet - for more Cosine work

In [None]:
len(list(df.columns))

In [None]:
df.columns

In [None]:
most_unique_index, cosine_sim_val = sorted_cosine_similarities[0]
print(most_unique_index)
most_unique_plot =df.iloc[most_unique_index, 9] # index here matters!
most_unique_words_counter = Counter(preprocess_string(most_unique_plot))
print(most_unique_words_counter)

# intersected_common_word_counter = common_word_counter & most_unique_words_counter

# intersected_common_words = [word[0] for word in intersected_common_word_counter.items()]
# intersected_common_word_counts = [word[1] for word in intersected_common_word_counter.items()]

# intersected_common_word_counter