# Setup

In [59]:
# The usuals
import numpy as np
from numpy import quantile, where, random

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import multiprocessing

# Scientific
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import preprocess_string

from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPRegressor

from scipy.spatial.distance import cosine

# Supporting

from tqdm import tqdm
import joblib
from datetime import datetime
from os import listdir
from os.path import isfile, join

from collections import Counter

In [40]:
# Paths
DATA_PATH = '../data/'
OUTPUT = '../output_data/'
MODEL_PATH = '../data/models/'

In [41]:
# Check files in data folder
datafiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]

print('Index, Filename')
print(list(zip([index for index, value in enumerate(datafiles)], datafiles)))

Index, Filename
[(0, 'arxiv_disinformation.csv'), (1, 'arxiv_deepfake.csv'), (2, 'deepfake_txt.csv')]


In [42]:
# Get a file name, can use
filename = datafiles[1]
filename

'arxiv_deepfake.csv'

In [43]:
# Load dataframe
CONVERTERS = {'tokens': eval, 'published_parsed': eval, 'tags': eval, 'arxiv_primary_category': eval}

df = pd.read_csv(DATA_PATH + filename, converters=CONVERTERS)

In [44]:
# Check data frame
df.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,arxiv_primary_category,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category
0,http://arxiv.org/abs/2203.14315v1,True,http://arxiv.org/abs/2203.14315v1,2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",Adaptive Frequency Learning in Two-branch Face...,"{'type': 'text/plain', 'language': None, 'base...",Face forgery has attracted increasing attentio...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,face forgery has attracted increasing attentio...,"[face, forgery, attract, increase, attention, ...",2022,"[2022, 3]",cs.CV
1,http://arxiv.org/abs/2203.13964v1,True,http://arxiv.org/abs/2203.13964v1,2022-03-26T01:55:37Z,"[2022, 3, 26, 1, 55, 37, 5, 85, 0]",2022-03-26T01:55:37Z,"[2022, 3, 26, 1, 55, 37, 5, 85, 0]",Fusing Global and Local Features for Generaliz...,"{'type': 'text/plain', 'language': None, 'base...",With the development of the Generative Adversa...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,with the development of the generative adversa...,"[development, generative, adversarial, network...",2022,"[2022, 3]",cs.CV
2,http://arxiv.org/abs/2203.12208v2,True,http://arxiv.org/abs/2203.12208v2,2022-03-25T16:00:07Z,"[2022, 3, 25, 16, 0, 7, 4, 84, 0]",2022-03-23T05:52:23Z,"[2022, 3, 23, 5, 52, 23, 2, 82, 0]",Self-supervised Learning of Adversarial Exampl...,"{'type': 'text/plain', 'language': None, 'base...",Recent studies in deepfake detection have yiel...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,recent studies in deepfake detection have yiel...,"[recent, study, deepfake, detection, yield, pr...",2022,"[2022, 3]",cs.CV


In [45]:
# Here for tests we will load a second df, not do a traditional train test split, as we want some sort of bias in the second set - to ensure outliers.

# Split DF

In [46]:
df.columns

Index(['id', 'guidislink', 'link', 'updated', 'updated_parsed', 'published',
       'published_parsed', 'title', 'title_detail', 'summary',
       'summary_detail', 'authors', 'author_detail', 'author', 'arxiv_comment',
       'links', 'arxiv_primary_category', 'tags', 'arxiv_affiliation',
       'arxiv_journal_ref', 'arxiv_doi', 'cleaning', 'tokens', 'year',
       'month_year', 'category'],
      dtype='object')

In [47]:
print(df.published.max())
print(df.published.min())

2022-03-27T14:25:52Z
2018-06-07T19:36:09Z


In [167]:
df_test = df[df['published'] > '2022-01-01T14:25:52Z']
df_train = df[df['published'] < '2022-01-01T14:25:52Z']


In [181]:
# reverse trainin
df_test = df[df['published'] < '2020-01-01T14:25:52Z']
df_train = df[df['published'] > '2020-01-01T14:25:52Z']


In [182]:
print(len(df))

print(len(df_test))
print(len(df_train))

438
44
394


# Prep work

If "cleaning" column kept from Preprocssing, can use that column (but has ALL words, eg also stopwords and all POSs)

In [183]:
def join_tokens(txt):
        x = ' '.join(txt)
        #x = [token.split('/')[0] for token in x] # use when we need lists with just these!
        return x

In [184]:
df_test['tokens_merged'] = df_test['tokens'].dropna().apply(lambda x: join_tokens(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['tokens_merged'] = df_test['tokens'].dropna().apply(lambda x: join_tokens(x))


In [185]:
df_train['tokens_merged'] = df_train['tokens'].dropna().apply(lambda x: join_tokens(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['tokens_merged'] = df_train['tokens'].dropna().apply(lambda x: join_tokens(x))


# Apply Doc2 Vec and SVM to df_train first

## Doc2 Vec for training

Creates doc2vec vectors for each document in the dataframe

In [186]:
class Doc2VecTransformer(BaseEstimator):

    def __init__(self, action_column, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1
        self.action_column = action_column

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(str(row[self.action_column]).split(), [index]) for index, row in df_x.iterrows()] # edit this: will not work on Chinese

        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers) # maybe want to try Word2Vec

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(str(row[self.action_column]).split())
                                     for index, row in df_x.iterrows()]))

In [187]:
# Initializing model
doc2vec_tr = Doc2VecTransformer('tokens_merged', 
                              vector_size=150,#normally imo 150
                              epochs= 50,
                              )

# Fitting
#doc2vec_tr.fit(df_train)
fitted = doc2vec_tr.fit(df_train)

#Transforming
doc2vec_vectors = fitted.transform(df_train)

100%|██████████| 394/394 [00:00<00:00, 1735877.92it/s]
100%|██████████| 394/394 [00:00<00:00, 4040478.67it/s]
100%|██████████| 394/394 [00:00<00:00, 3906751.24it/s]
100%|██████████| 394/394 [00:00<00:00, 3982062.11it/s]
100%|██████████| 394/394 [00:00<00:00, 3508610.99it/s]
100%|██████████| 394/394 [00:00<00:00, 2408973.43it/s]
100%|██████████| 394/394 [00:00<00:00, 2183032.73it/s]
100%|██████████| 394/394 [00:00<00:00, 2631458.24it/s]
100%|██████████| 394/394 [00:00<00:00, 3631990.72it/s]
100%|██████████| 394/394 [00:00<00:00, 2295216.36it/s]
100%|██████████| 394/394 [00:00<00:00, 1782692.31it/s]
100%|██████████| 394/394 [00:00<00:00, 977265.39it/s]
100%|██████████| 394/394 [00:00<00:00, 2266880.35it/s]
100%|██████████| 394/394 [00:00<00:00, 2239235.47it/s]
100%|██████████| 394/394 [00:00<00:00, 1850566.38it/s]
100%|██████████| 394/394 [00:00<00:00, 2282535.60it/s]
100%|██████████| 394/394 [00:00<00:00, 2221177.12it/s]
100%|██████████| 394/394 [00:00<00:00, 1167059.16it/s]
100%|██████

In [188]:
fitted

Doc2VecTransformer(action_column='tokens_merged', epochs=50, vector_size=150)

In [189]:
len(doc2vec_vectors)

394

In [190]:
doc2vec_vectors[1].shape

(1, 150)

In [191]:
filename

'arxiv_deepfake.csv'

In [192]:
# ADD SAVE KV
# ADD LOAD KV
m = MODEL_PATH + 'train_' + filename.split('.')[0] + '_doc_vectors.pkl'
print('Saving as: ' + m)

joblib.dump(doc2vec_tr, m) 

Saving as: ../data/models/train_arxiv_deepfake_doc_vectors.pkl


['../data/models/train_arxiv_deepfake_doc_vectors.pkl']

## SVM Model for training

In [193]:
# Initialize and fit model
'''
Adjust nu hyperparameter to, simplifing, 
increase/decrease "novelty" sensitivity. 
It is very high now = less outliers
'''

model = OneClassSVM(kernel = 'rbf', 
                  gamma = 'scale', 
                  nu = 0.001).fit(doc2vec_vectors)



In [194]:
joblib.dump(model, MODEL_PATH + 'train_' + filename.split('.')[0] + "_svm_model.pkl")

['../data/models/train_arxiv_deepfake_svm_model.pkl']

In [195]:
# Prediction
y_pred = model.predict(doc2vec_vectors)



In [196]:
# Filter outlier index
outlier_index = where(y_pred == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

# Un-used, for inspection
#outlier_values = doc2vec_vectors.iloc[outlier_index]
#outlier_values

Outliers: 6


# TESTING

In [214]:
# Check files in data folder
models = [f for f in listdir(MODEL_PATH) if isfile(join(MODEL_PATH, f))]

print('Index, Model Name')
print(list(zip([index for index, value in enumerate(models)], models)))


Index, Model Name
[(0, 'arxiv_disinformation_doc_vectors.pkl'), (1, 'arxiv_deepfake_doc_vectors.pkl'), (2, 'train_arxiv_deepfake_doc_vectors.pkl'), (3, 'arxiv_deepfake_svm_model.pkl'), (4, 'arxiv_deepfake_iso_model.pkl'), (5, 'train_arxiv_deepfake_svm_model.pkl'), (6, 'train_arxiv_deepfake_iso_model.pkl')]


In [216]:
# to load a model
vector_name = models[2]
print(vector_name)
model_name = models[5]
print(model_name)
iso_model_name = models[6]
print(iso_model_name)

vectorizer = joblib.load(MODEL_PATH + vector_name)
svm_model = joblib.load(MODEL_PATH + model_name)
iso_model = joblib.load(MODEL_PATH + iso_model_name)



train_arxiv_deepfake_doc_vectors.pkl
train_arxiv_deepfake_svm_model.pkl
train_arxiv_deepfake_iso_model.pkl


In [199]:
df_train.head(1)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category,tokens_merged
0,http://arxiv.org/abs/2203.14315v1,True,http://arxiv.org/abs/2203.14315v1,2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",2022-03-27T14:25:52Z,"[2022, 3, 27, 14, 25, 52, 6, 86, 0]",Adaptive Frequency Learning in Two-branch Face...,"{'type': 'text/plain', 'language': None, 'base...",Face forgery has attracted increasing attentio...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,face forgery has attracted increasing attentio...,"[face, forgery, attract, increase, attention, ...",2022,"[2022, 3]",cs.CV,face forgery attract increase attention recent...


In [200]:
# Fit testing data
fitted = vectorizer.fit(df_test)#.toarray()


100%|██████████| 44/44 [00:00<00:00, 623477.62it/s]
100%|██████████| 44/44 [00:00<00:00, 723723.04it/s]
100%|██████████| 44/44 [00:00<00:00, 762600.73it/s]
100%|██████████| 44/44 [00:00<00:00, 373581.73it/s]
100%|██████████| 44/44 [00:00<00:00, 726572.35it/s]
100%|██████████| 44/44 [00:00<00:00, 464859.89it/s]
100%|██████████| 44/44 [00:00<00:00, 533379.70it/s]
100%|██████████| 44/44 [00:00<00:00, 750200.72it/s]
100%|██████████| 44/44 [00:00<00:00, 683516.21it/s]
100%|██████████| 44/44 [00:00<00:00, 473203.53it/s]
100%|██████████| 44/44 [00:00<00:00, 428188.81it/s]
100%|██████████| 44/44 [00:00<00:00, 441505.68it/s]
100%|██████████| 44/44 [00:00<00:00, 492131.67it/s]
100%|██████████| 44/44 [00:00<00:00, 507003.78it/s]
100%|██████████| 44/44 [00:00<00:00, 1246955.24it/s]
100%|██████████| 44/44 [00:00<00:00, 1139193.68it/s]
100%|██████████| 44/44 [00:00<00:00, 1255437.93it/s]
100%|██████████| 44/44 [00:00<00:00, 1160687.90it/s]
100%|██████████| 44/44 [00:00<00:00, 1175473.73it/s]
100%|██

In [201]:
test_vectors = fitted.transform(df_test)

In [202]:
len(test_vectors)

44

In [203]:
y_pred = svm_model.predict(test_vectors)



In [204]:
# Approach 1: absolute y_pred wrong

outlier_index = where(y_pred == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

Outliers: 33


In [205]:
# Approach 2: most difference

In [206]:
scores = model.score_samples(test_vectors)



In [207]:
# Change treshhold as needed

thresh = quantile(scores, .1)
print(thresh)

0.00043108544561837027


In [208]:
# getting indexes

index = where(scores<=thresh)
index = list(index[0])
print('Outliers: ' + str(len(index)))


Outliers: 5


In [217]:
# Isolation on train_df

In [218]:
data = pd.DataFrame()

data['scores'] = iso_model.decision_function(test_vectors)

data['anomaly_score'] = iso_model.predict(test_vectors) 





In [219]:
data[data['anomaly_score']==-1]

Unnamed: 0,scores,anomaly_score
0,-0.043646,-1
1,-0.001691,-1
5,-0.001691,-1
7,-0.043646,-1
8,-0.047555,-1
17,-0.067425,-1
18,-0.028537,-1
26,-0.078648,-1
30,-0.074062,-1
32,-0.007405,-1


In [221]:
outlier_index = where(data['anomaly_score'] == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)) + ' of ' + str(len(test_vectors)))

isolation_misclass = df[df.index.isin(indexes)]

# 15 of 44

Outliers: 15 of 44


# SVM Method

In [131]:
# Initialize and fit model
'''
Adjust nu hyperparameter to, simplifing, 
increase/decrease "novelty" sensitivity. 
It is very high now = less outliers
'''

model = OneClassSVM(kernel = 'rbf', 
                  gamma = 'scale', 
                  nu = 0.001).fit(doc2vec_vectors)



In [134]:
joblib.dump(model, MODEL_PATH + 'train_' + filename.split('.')[0] + "_svm_model.pkl")

['../data/models/train_arxiv_deepfake_svm_model.pkl']

In [132]:
# Prediction
y_pred = model.predict(doc2vec_vectors)



In [133]:
# Filter outlier index
outlier_index = where(y_pred == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

# Un-used, for inspection
#outlier_values = doc2vec_vectors.iloc[outlier_index]
#outlier_values

Outliers: 5


In [135]:
# Create a df with just outliers

df_misclass = df_train[df_train.index.isin(indexes)]

In [136]:
# Inspect dataframe

df_misclass.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category,tokens_merged
130,http://arxiv.org/abs/2012.07989v1,True,http://arxiv.org/abs/2012.07989v1,2020-12-14T22:40:49Z,"[2020, 12, 14, 22, 40, 49, 0, 349, 0]",2020-12-14T22:40:49Z,"[2020, 12, 14, 22, 40, 49, 0, 349, 0]",The Emerging Threats of Deepfake Attacks and C...,"{'type': 'text/plain', 'language': None, 'base...",Deepfake technology (DT) has taken a new level...,...,"[{'term': 'cs.CR', 'scheme': 'http://arxiv.org...",,,10.13140/RG.2.2.23089.81762,deepfake technology dt has taken a new level o...,"[deepfake, technology, take, new, level, sophi...",2020,"[2020, 12]",cs.CR,deepfake technology take new level sophisticat...
140,http://arxiv.org/abs/2011.02674v1,True,http://arxiv.org/abs/2011.02674v1,2020-11-05T06:17:04Z,"[2020, 11, 5, 6, 17, 4, 3, 310, 0]",2020-11-05T06:17:04Z,"[2020, 11, 5, 6, 17, 4, 3, 310, 0]",AOT: Appearance Optimal Transport Based Identi...,"{'type': 'text/plain', 'language': None, 'base...",Recent studies have shown that the performance...,...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,recent studies have shown that the performance...,"[recent, study, show, performance, forgery, de...",2020,"[2020, 11]",cs.CV,recent study show performance forgery detectio...
144,http://arxiv.org/abs/2009.09869v3,True,http://arxiv.org/abs/2009.09869v3,2021-09-26T11:45:47Z,"[2021, 9, 26, 11, 45, 47, 6, 269, 0]",2020-09-21T13:41:24Z,"[2020, 9, 21, 13, 41, 24, 0, 265, 0]",FakeTagger: Robust Safeguards against DeepFake...,"{'type': 'text/plain', 'language': None, 'base...","In recent years, DeepFake is becoming a common...",...,"[{'term': 'cs.CR', 'scheme': 'http://arxiv.org...",,,,in recent years deepfake is becoming a common ...,"[recent, year, deepfake, become, common, threa...",2020,"[2020, 9]",cs.CR,recent year deepfake become common threat soci...


In [137]:
# Add compare function

Another approach to getting outliers using SVM model but diffferent criteria (more of a % than a absolute value)

In [138]:
scores = model.score_samples(doc2vec_vectors)



In [139]:
# Change treshhold as needed

thresh = quantile(scores, 0.03)
print(thresh)

0.10677992749110751


In [140]:
# getting indexes

index = where(scores<=thresh)
index = list(index[0])
print(len(index))


12


In [141]:
# Creating second df
df_misclass_2 = df[df.index.isin(index)]

# And viewing it
df_misclass_2.head(3)

Unnamed: 0,id,guidislink,link,updated,updated_parsed,published,published_parsed,title,title_detail,summary,...,arxiv_primary_category,tags,arxiv_affiliation,arxiv_journal_ref,arxiv_doi,cleaning,tokens,year,month_year,category
16,http://arxiv.org/abs/2202.13843v2,True,http://arxiv.org/abs/2202.13843v2,2022-03-14T11:24:41Z,"[2022, 3, 14, 11, 24, 41, 0, 73, 0]",2022-02-28T14:54:30Z,"[2022, 2, 28, 14, 54, 30, 0, 59, 0]",Deepfake Network Architecture Attribution,"{'type': 'text/plain', 'language': None, 'base...",With the rapid progress of generation technolo...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,with the rapid progress of generation technolo...,"[rapid, progress, generation, technology, beco...",2022,"[2022, 2]",cs.CV
62,http://arxiv.org/abs/2108.06702v1,True,http://arxiv.org/abs/2108.06702v1,2021-08-15T09:37:38Z,"[2021, 8, 15, 9, 37, 38, 6, 227, 0]",2021-08-15T09:37:38Z,"[2021, 8, 15, 9, 37, 38, 6, 227, 0]",Deepfake Representation with Multilinear Regre...,"{'type': 'text/plain', 'language': None, 'base...",Generative neural network architectures such a...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,generative neural network architectures such a...,"[generative, neural, network, architecture, ga...",2021,"[2021, 8]",cs.CV
76,http://arxiv.org/abs/2107.02016v2,True,http://arxiv.org/abs/2107.02016v2,2021-08-26T07:42:41Z,"[2021, 8, 26, 7, 42, 41, 3, 238, 0]",2021-07-05T13:35:39Z,"[2021, 7, 5, 13, 35, 39, 0, 186, 0]",FFR_FD: Effective and Fast Detection of DeepFa...,"{'type': 'text/plain', 'language': None, 'base...",The internet is filled with fake face images a...,...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",,,,the internet is filled with fake face images a...,"[internet, fill, fake, face, image, video, syn...",2021,"[2021, 7]",cs.CV


# Isolation Forest Method

In [209]:
# Initialize model
iso_model = IsolationForest(n_estimators=100,
                  max_samples='auto',
                  contamination=float(0.01),
                  random_state=42
                  )

# Fitting model
iso_model.fit(doc2vec_vectors)

print(iso_model.get_params())

{'bootstrap': False, 'contamination': 0.01, 'max_features': 1.0, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': 42, 'verbose': 0, 'warm_start': False}




In [210]:
joblib.dump(iso_model, MODEL_PATH + 'train_' + filename.split('.')[0] + "_iso_model.pkl")

['../data/models/train_arxiv_deepfake_iso_model.pkl']

In [211]:
data = pd.DataFrame()

data['scores'] = iso_model.decision_function(doc2vec_vectors)

data['anomaly_score'] = iso_model.predict(doc2vec_vectors) 





In [212]:
data[data['anomaly_score']==-1]

Unnamed: 0,scores,anomaly_score
98,-0.014786,-1
122,-0.142862,-1
165,-0.107598,-1
381,-0.126937,-1


In [213]:
outlier_index = where(data['anomaly_score'] == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

isolation_misclass = df[df.index.isin(indexes)]

Outliers: 4


In [None]:
isolation_misclass.title

## Cosine Simularity Method

In [None]:
# Initialize, fit and predict
auto_encoder = MLPRegressor(hidden_layer_sizes=(
                                                 600,
                                                 150, 
                                                 600,
                                               ))

auto_encoder.fit(doc2vec_vectors, doc2vec_vectors)

predicted_vectors = auto_encoder.predict(doc2vec_vectors)

In [None]:
# Visual loss
pd.DataFrame(auto_encoder.loss_curve_).plot()

In [None]:
df.columns

In [None]:
def key_consine_similarity(tupple):
    return tupple[1]

def get_computed_similarities(vectors, predicted_vectors, reverse=False):
    data_size = len(df)
    cosine_similarities = []
    for i in range(data_size):
        cosine_sim_val = (1 - cosine(vectors[i], predicted_vectors[i]))
        cosine_similarities.append((i, cosine_sim_val))

    return sorted(cosine_similarities, key=key_consine_similarity, reverse=reverse)

def display_top_n(sorted_cosine_similarities, n=5):
    for i in range(n):
        index, consine_sim_val = sorted_cosine_similarities[i]
        print('Title: ', df.iloc[index, 7])
        print('ID: ', df.iloc[index, 0])  
        print('Cosine Sim Val :', consine_sim_val)
        print('---------------------------------')

# add function to sort by percentage


In [None]:
# Specify how many 'outliers' you want to see
N = 20

In [None]:
print('Top n unique')

sorted_cosine_similarities = get_computed_similarities(vectors=doc2vec_vectors, predicted_vectors=predicted_vectors)

display_top_n(sorted_cosine_similarities=sorted_cosine_similarities, n = N)

### Visualizing the cosines - will revise during first test

In [None]:
# Seaborn histogram
# Can use to adjust the N above (or percent, once we have that function) to see the low cluster

sns.distplot(losses, hist=True, kde=False, 
             bins=int(180/5), color = 'blue',
             hist_kws={'edgecolor':'black'})

# # Add labels
# plt.title('Title')
# plt.xlabel('Label x')
# plt.ylabel('Label y')

### IIRC not fully functional yet - for more Cosine work

In [None]:
len(list(df.columns))

In [None]:
df.columns

In [None]:
most_unique_index, cosine_sim_val = sorted_cosine_similarities[0]
print(most_unique_index)
most_unique_plot =df.iloc[most_unique_index, 9] # index here matters!
most_unique_words_counter = Counter(preprocess_string(most_unique_plot))
print(most_unique_words_counter)

# intersected_common_word_counter = common_word_counter & most_unique_words_counter

# intersected_common_words = [word[0] for word in intersected_common_word_counter.items()]
# intersected_common_word_counts = [word[1] for word in intersected_common_word_counter.items()]

# intersected_common_word_counter