# Setup

In [None]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import preprocess_string
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

import pandas as pd

from os import listdir
from os.path import isfile, join

In [None]:
# Directory locations used by the notebook (relative to the notebook's folder).
PATH, OUTPUT = '../data/', '../output_data/'

In [None]:
# Names of the regular files (sub-directories excluded) directly under PATH.
onlyfiles = list(filter(lambda name: isfile(join(PATH, name)), listdir(PATH)))
print(onlyfiles)

In [None]:
# Name of the CSV file (inside PATH) to load; pick one of the names printed above.
filename = 'file.csv'

In [None]:
import ast


def _parse_literal(cell):
    """Parse a CSV cell that stores a Python literal (e.g. a list of tokens).

    SECURITY: the original code used the built-in ``eval`` here, which executes
    arbitrary expressions found in the CSV. ``ast.literal_eval`` only accepts
    literals (lists, strings, numbers, ...), so a crafted file cannot run code.

    Empty cells are returned as ``None`` instead of raising, so rows without
    tokens can be dropped later with ``dropna()``.
    """
    cell = cell.strip()
    return ast.literal_eval(cell) if cell else None


# Columns whose cells hold stringified Python lists and must be parsed on load.
CONVERTERS = {'tokens': _parse_literal, 'pos_tokens': _parse_literal}

df = pd.read_csv(join(PATH, filename), converters=CONVERTERS)

In [None]:
# Number of rows loaded from the CSV.
len(df)

In [None]:
# Show the first row to check the columns and the parsed token lists.
df.head(1)

# Prep work

This is currently optimized for Chinese, so it may need some tweaking for other languages.

In [None]:
def join_tokens(txt):
    """Return the tokens in ``txt`` joined into one space-separated string."""
    separator = ' '
    merged = separator.join(txt)
    return merged

In [None]:
# Space-joined version of each token list; rows without tokens are skipped by
# dropna() and therefore end up as NaN in the new column.
present_tokens = df['tokens'].dropna()
df['tokens_merged'] = present_tokens.apply(join_tokens)


In [None]:
# Peek at the merged tokens of the row labelled 1 as a sanity check.
df.loc[1, 'tokens_merged']

# Doc2Vec

Creates doc2vec vectors for each document in the dataframe

In [None]:

class Doc2VecTransformer(BaseEstimator):
    """Scikit-learn style wrapper that trains a gensim Doc2Vec model on a DataFrame.

    The input frame must have a ``tokens_merged`` column holding one
    whitespace-delimited string of tokens per document (text that is not
    naturally space-separated, e.g. Chinese, has to be segmented beforehand).

    Parameters
    ----------
    vector_size : int
        Dimensionality of the document vectors.
    learning_rate : float
        Initial learning rate (gensim's ``alpha``). gensim decays it linearly
        towards its ``min_alpha`` over the training epochs.
    epochs : int
        Number of passes over the corpus during training.
    """

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        # Leave one core free, but never request fewer than one worker
        # (cpu_count() - 1 is 0 on a single-core machine).
        self.workers = max(1, multiprocessing.cpu_count() - 1)

    @staticmethod
    def _tokenize(text):
        """Split one ``tokens_merged`` cell into a list of tokens.

        ``str()`` is applied first, so a missing value (NaN) becomes the single
        token ``'nan'`` rather than raising.
        """
        return str(text).split()

    def fit(self, df_x, df_y=None):
        """Train the Doc2Vec model on ``df_x['tokens_merged']``.

        Each document is tagged with its index label in ``df_x``.
        ``df_y`` is ignored; it exists for scikit-learn API compatibility.
        Returns ``self``.
        """
        tagged_x = [
            TaggedDocument(self._tokenize(text), [index])
            for index, text in zip(df_x.index, df_x['tokens_merged'])
        ]

        # Train once and let gensim manage the learning-rate decay. The previous
        # hand-written loop subtracted `learning_rate` from `alpha` on every
        # pass, which drove alpha negative after the second pass, and it ran on
        # top of the training already done by passing `documents=` to the
        # constructor.
        model = Doc2Vec(
            vector_size=self.vector_size,
            alpha=self.learning_rate,
            epochs=self.epochs,
            workers=self.workers,
        )
        model.build_vocab(tagged_x)
        model.train(tagged_x, total_examples=model.corpus_count, epochs=model.epochs)

        self._model = model
        return self

    def transform(self, df_x):
        """Infer one vector per row of ``df_x``.

        Returns a 2-D ``numpy.ndarray`` with one row per document and
        ``vector_size`` columns. A plain ndarray is returned instead of
        ``numpy.matrix`` because current scikit-learn estimators reject
        ``numpy.matrix`` input.
        """
        vectors = [
            self._model.infer_vector(self._tokenize(text))
            for text in df_x['tokens_merged']
        ]
        if not vectors:
            # Keep the result 2-D even when there are no rows to embed.
            return np.empty((0, self.vector_size), dtype=np.float32)
        return np.array(vectors)

In [None]:
# Train a 300-dimensional Doc2Vec model on the corpus, then embed every document.
# (Normally 150 dimensions is okay; increasing the epochs is another option.)
doc2vec_tr = Doc2VecTransformer(vector_size=300)
doc2vec_vectors = doc2vec_tr.fit(df).transform(df)

# SVM Method

In [None]:
from sklearn.svm import OneClassSVM
from numpy import quantile, where, random

In [None]:
# One-class SVM fitted on all document vectors; `nu` is set very low (0.001).
model = OneClassSVM(kernel='sigmoid', gamma='scale', nu=0.001)
model = model.fit(doc2vec_vectors)

In [None]:
# Predict a label for every document vector; the next cell treats -1 as "outlier".
y_pred = model.predict(doc2vec_vectors)

In [None]:
# Row positions of the documents the model labelled as outliers (-1).
outlier_index = where(y_pred == -1)
indexes = list(outlier_index[0])

print(f'Outliers: {len(indexes)}')

In [None]:
# Keep only the rows of df whose index label is among the outlier positions.
is_outlier = df.index.isin(indexes)
df_misclass = df[is_outlier]

In [None]:
# Inspect the first three outlier documents.
df_misclass.head(3)

### Another approach to getting outliers

In [None]:
# Raw per-document score from the fitted model; the cells below flag the
# lowest-scoring documents as outliers.
scores = model.score_samples(doc2vec_vectors)

In [None]:
# Threshold: the 3% quantile of the scores. Change 0.03 to flag more or fewer documents.
thresh = quantile(scores, 0.03)
print(thresh)

In [None]:
# Row positions whose score is at or below the threshold.
index = list(where(scores <= thresh)[0])
print(len(index))


In [None]:
# Rows of df whose index label is among the low-score positions.
below_threshold = df.index.isin(index)
df_misclass_2 = df[below_threshold]

In [None]:
# Inspect the first two documents flagged by the quantile threshold.
df_misclass_2.head(2)

# Isolation Forest Method

In [None]:
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

In [None]:
# Isolation forest over the document vectors; `contamination` is set to 0.01 and
# the random seed is fixed at 42.
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.01,
    random_state=42,
).fit(doc2vec_vectors)

print(model.get_params())

In [None]:
# `data` was never defined in this notebook (NameError). The vectors were built
# from the rows of `df`, so attach the results to a copy of it; copying keeps
# `df` and its column positions unchanged for the later cells.
data = df.copy()

# Decision-function value per document.
data['scores'] = model.decision_function(doc2vec_vectors)

# Predicted label per document; -1 marks an anomaly.
data['anomaly_score'] = model.predict(doc2vec_vectors)

data[data['anomaly_score'] == -1].head()



# Cosine similarity method

In [None]:
from sklearn.neural_network import MLPRegressor

# Auto-encoder: an MLP trained to reproduce its own input through hidden layers
# of 600, 150 and 600 units. `predicted_vectors` are the reconstructions.
auto_encoder = MLPRegressor(hidden_layer_sizes=(600, 150, 600))
auto_encoder = auto_encoder.fit(doc2vec_vectors, doc2vec_vectors)
predicted_vectors = auto_encoder.predict(doc2vec_vectors)

In [None]:
# Plot the auto-encoder's recorded training loss curve.
pd.DataFrame(auto_encoder.loss_curve_).plot()

In [None]:
from scipy.spatial.distance import cosine

def key_consine_similarity(tupple):
    """Sort key: return the similarity stored at position 1 of an (index, similarity) pair."""
    return tupple[1]


def get_computed_similarities(vectors, predicted_vectors, reverse=False):
    """Compute the cosine similarity between each vector and its prediction.

    Parameters
    ----------
    vectors, predicted_vectors : 2-D array-like
        Row ``i`` of ``vectors`` is compared with row ``i`` of
        ``predicted_vectors``.
    reverse : bool
        ``False`` (default) sorts from least to most similar; ``True`` sorts
        from most to least similar.

    Returns
    -------
    list of (int, float)
        ``(row position, cosine similarity)`` pairs sorted by similarity.
    """
    cosine_similarities = []
    # Iterate over the arguments themselves: the previous version sized the loop
    # from the global `df`, which silently tied the function to one frame.
    for i, (vector, predicted) in enumerate(zip(vectors, predicted_vectors)):
        # Flatten each row: rows of a numpy.matrix are 2-D (1 x n), while
        # scipy's `cosine` works on 1-D vectors.
        vector = np.asarray(vector).ravel()
        predicted = np.asarray(predicted).ravel()
        cosine_similarities.append((i, 1 - cosine(vector, predicted)))

    return sorted(cosine_similarities, key=key_consine_similarity, reverse=reverse)

def display_top_n(sorted_cosine_similarities, n=5, data=None):
    """Print the first ``n`` entries of a sorted similarity list.

    Parameters
    ----------
    sorted_cosine_similarities : list of (int, float)
        ``(row position, cosine similarity)`` pairs, already sorted.
    n : int
        Maximum number of entries to print. If the list is shorter, all of its
        entries are printed (previously this raised IndexError).
    data : pandas.DataFrame, optional
        Frame the row positions refer to; defaults to the notebook-level ``df``.
        Column 0 is printed as the title and column 1 as the URL.
    """
    frame = df if data is None else data
    # max(n, 0) keeps a negative n printing nothing, as range(n) did before.
    for index, consine_sim_val in sorted_cosine_similarities[:max(n, 0)]:
        print('Title: ', frame.iloc[index, 0])
        print('URL: ', frame.iloc[index, 1])
        print('Cosine Sim Val :', consine_sim_val)
        print('---------------------------------')


In [None]:
# How many of the least-similar (most "unique") documents to list.
N = 10

print('Top n unique')

# Ascending order: the documents the auto-encoder reconstructs worst come first.
sorted_cosine_similarities = get_computed_similarities(doc2vec_vectors, predicted_vectors)

display_top_n(sorted_cosine_similarities, n=N)

In [None]:
# Just the similarity values (named `losses` for historical reasons) and their range.
losses = [similarity for _, similarity in sorted_cosine_similarities]
print(min(losses))
print(max(losses))

### Visualizing the cosine similarities

In [None]:
import seaborn as sns

# Histogram of the cosine similarities (36 bins, no density curve).
# `sns.distplot` is deprecated in seaborn; `sns.histplot` is its replacement for
# a plain histogram, and it takes the bar edge colour directly as a keyword.
sns.histplot(losses, bins=int(180/5), color='blue', edgecolor='black')

# # Add labels
# plt.title('Title')
# plt.xlabel('Label x')
# plt.ylabel('Label y')

In [None]:
# Show the five most similar (best reconstructed) documents: the list is sorted
# in ascending order, so they are its last five entries.
# Fixes: `df_cut` was never defined (the positions refer to rows of `df`), and
# the stray `sorted_cosine_similarities[::-5]` expression had no effect.
most_similar = sorted_cosine_similarities[-5:]
display_top_n(most_similar, n=len(most_similar))

### Not fully functional yet (IIRC) - for more cosine work

In [None]:
from collections import Counter

In [None]:
# Number of columns in df (useful when picking a positional column index below).
len(df.columns)

In [None]:
# Word counts of the least-similar ("most unique") document.
most_unique_index, cosine_sim_val = sorted_cosine_similarities[0]
# `df_cut` was never defined in this notebook; the positions refer to rows of `df`.
# Column 18 is a hard-coded position - it must point at the text column.
most_unique_plot = df.iloc[most_unique_index, 18]
most_unique_words_counter = Counter(preprocess_string(most_unique_plot))
print(most_unique_words_counter)

# NOTE(review): `common_word_counter` is not defined anywhere in this notebook,
# so this line raises NameError until it is built - presumably a Counter of
# word frequencies over the whole corpus; confirm before relying on this cell.
intersected_common_word_counter = common_word_counter & most_unique_words_counter

# Words shared by both counters, and their counts, as parallel lists.
intersected_common_words = list(intersected_common_word_counter.keys())
intersected_common_word_counts = list(intersected_common_word_counter.values())
intersected_common_word_counter