# Setup

In [None]:
# The usuals
import numpy as np
from numpy import quantile, where, random

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import multiprocessing

# Scientific
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import preprocess_string

from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPRegressor

from scipy.spatial.distance import cosine

# Supporting

from tqdm import tqdm
import joblib

from os import listdir
from os.path import isfile, join

from collections import Counter

In [None]:
# Paths
# Directory scanned for the input CSV files
DATA_PATH = '../data/'
# Output directory (not referenced in the visible part of this notebook)
OUTPUT = '../output_data/'
# Directory where the pickled document vectors are written and read back
MODEL_PATH = '../data/models/'

In [None]:
# Check files in data folder
# listdir() returns entries in arbitrary order; sort them so that picking a
# file by position gives the same file on every run and every machine.
datafiles = sorted(f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f)))
print(datafiles)

In [None]:
# Pick a file name from the listing above (change the index to select another file)
filename = datafiles[1]
filename

In [None]:
# Load dataframe
from ast import literal_eval

# The token columns are read as text and parsed back into Python objects.
# SECURITY: the original used eval() here, which executes whatever expression a
# CSV cell contains. literal_eval only accepts Python literals (lists, strings,
# numbers, ...), so a malicious or corrupted cell cannot run code.
CONVERTERS = {'tokens': literal_eval, 'pos_tokens': literal_eval}

df = pd.read_csv(DATA_PATH + filename, converters=CONVERTERS)

In [None]:
# Check data frame
df.head(3)

In [None]:
# For testing we will load a second DataFrame instead of doing a traditional train/test split, because we want some bias in the second set to ensure it contains outliers.

# Prep work

This is currently optimized for Chinese and may not be needed with DR data.

In [None]:
def join_tokens(txt):
    """Return the items of ``txt`` joined into one string, separated by single spaces.

    ``txt`` must be an iterable of strings (e.g. a token list).
    """
    # To keep only the part before '/', split each token on '/' and take
    # element 0 before joining (not enabled here).
    separator = ' '
    return separator.join(txt)

In [None]:
# Join each row's token list into one space-separated string.
# Rows whose 'tokens' value is missing are dropped before the apply, so they
# end up as NaN in the new column after index alignment.
non_null_tokens = df['tokens'].dropna()
df['tokens_merged'] = non_null_tokens.apply(join_tokens)


In [None]:
df['tokens_merged'][1]

# Doc2Vec

Creates doc2vec vectors for each document in the dataframe

In [None]:

class Doc2VecTransformer(BaseEstimator):
    """Scikit-learn style wrapper that trains a gensim Doc2Vec model on one text column.

    Parameters
    ----------
    action_column : str
        Name of the DataFrame column holding the text of each document. The
        text is split on whitespace, so it must already be tokenised.
    vector_size : int, default=100
        Dimensionality of the document vectors.
    learning_rate : float, default=0.02
        Initial learning rate, passed to gensim as ``alpha``. gensim decays it
        linearly to its own ``min_alpha`` over the training run.
    epochs : int, default=20
        Number of training passes over the corpus.
    """

    def __init__(self, action_column, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        # Leave one core free, but never ask gensim for zero worker threads
        # (cpu_count() - 1 is 0 on a single-core machine).
        self.workers = max(1, multiprocessing.cpu_count() - 1)
        self.action_column = action_column

    def _tokens(self, row):
        """Return the whitespace-separated tokens of ``row[self.action_column]``."""
        # NOTE: str.split() only works on pre-segmented text; raw Chinese text
        # would need a real tokenizer here.
        return str(row[self.action_column]).split()

    def fit(self, df_x, df_y=None):
        """Train a Doc2Vec model on every row of ``df_x`` and return ``self``.

        Each row becomes one TaggedDocument tagged with its DataFrame index.
        ``df_y`` is accepted for scikit-learn API compatibility and ignored.
        """
        tagged_x = [TaggedDocument(self._tokens(row), [index])
                    for index, row in df_x.iterrows()]

        # Build the model without a corpus so that it is trained exactly once.
        # The previous version trained in the constructor and then looped over
        # train() while subtracting learning_rate from alpha each epoch; with
        # the defaults alpha went negative after two epochs, which drives the
        # vectors away from the optimum instead of towards it.
        model = Doc2Vec(vector_size=self.vector_size,
                        alpha=self.learning_rate,
                        workers=self.workers)  # maybe want to try Word2Vec
        model.build_vocab(tagged_x)
        model.train(tagged_x, total_examples=model.corpus_count, epochs=self.epochs)

        self._model = model
        return self

    def transform(self, df_x):
        """Infer one vector per row of ``df_x`` and return them as a 2-D matrix.

        Returns an ``np.matrix`` with one row per document.
        """
        # NOTE(review): np.matrix is kept for backward compatibility with
        # existing callers; NumPy discourages it, so consider returning a plain
        # ndarray once callers are checked.
        return np.asmatrix(np.array([self._model.infer_vector(self._tokens(row))
                                     for index, row in df_x.iterrows()]))

In [None]:
# Build the transformer on the merged-token column
# (300 dimensions here; 150 is the usual choice)
doc2vec_tr = Doc2VecTransformer('tokens_merged', vector_size=300, epochs=50)

# Train on the full dataframe ...
doc2vec_tr.fit(df)

# ... then embed those same documents
doc2vec_vectors = doc2vec_tr.transform(df)

In [None]:
len(doc2vec_vectors)

In [None]:
doc2vec_vectors[1].shape

In [None]:
filename

In [None]:
# TODO: add saving of the keyed vectors (KV)
# TODO: add loading of the keyed vectors (KV)

# Strip only the final extension: split('.')[0] would turn both 'data.v1.csv'
# and 'data.v2.csv' into 'data' and the second save would overwrite the first.
stem = filename.rsplit('.', 1)[0]
m = join(MODEL_PATH, stem + '_doc_vectors.pkl')
print('Saving as: ' + m)

joblib.dump(doc2vec_vectors, m)

# MODELS

In [None]:
# Check files in model folder
# listdir() returns entries in arbitrary order; sort them so that picking a
# file by position gives the same file on every run and every machine.
datafiles = sorted(f for f in listdir(MODEL_PATH) if isfile(join(MODEL_PATH, f)))
print(datafiles)

In [None]:
# Load previously saved document vectors from the model folder
# NOTE(review): datafiles[0] is simply the first entry of the folder listing;
# confirm it is the file you intend to load.
model_name = datafiles[0]

# NOTE: joblib.load unpickles the file, so only load files you created yourself.
doc2vec_vectors = joblib.load(MODEL_PATH + model_name)


## SVM Method

In [None]:
# Initialize and fit model
#
# The nu hyperparameter controls the "novelty" sensitivity: it is an upper
# bound on the fraction of training samples treated as errors (outliers).
# Raise nu to flag more outliers, lower it to flag fewer.
# It is set very low here (0.001), so few outliers are reported.
model = OneClassSVM(kernel='rbf',
                    gamma='scale',
                    nu=0.001).fit(doc2vec_vectors)

In [None]:
# Prediction
y_pred = model.predict(doc2vec_vectors)

In [None]:
# Row positions of the samples the SVM labelled as outliers (-1)
is_outlier = y_pred == -1
outlier_index = np.nonzero(is_outlier)
indexes = [*outlier_index[0]]

print(f'Outliers: {len(indexes)}')

# Un-used, for inspection
#outlier_values = doc2vec_vectors.iloc[outlier_index]
#outlier_values

In [None]:
# Create a df with just outliers
# `indexes` holds row positions in the vector matrix, so select by position.
# Matching them against df.index labels only works by coincidence when the
# index is the default 0..n-1 range.
df_misclass = df.iloc[indexes]

In [None]:
# Inspect dataframe

df_misclass.head(3)

In [None]:
# Add compare function

Another approach to getting outliers using the SVM model but with a different criterion (more of a % than an absolute value)

In [None]:
scores = model.score_samples(doc2vec_vectors)

In [None]:
# Change threshold as needed

thresh = quantile(scores, 0.03)
print(thresh)

In [None]:
# Row positions whose score is at or below the threshold
below_threshold = scores <= thresh
index = list(np.nonzero(below_threshold)[0])
print(len(index))


In [None]:
# Creating second df
# `index` holds row positions in the score array, so select by position.
# Matching them against df.index labels only works by coincidence when the
# index is the default 0..n-1 range.
df_misclass_2 = df.iloc[index]

# And viewing it
df_misclass_2.head(3)

## Isolation Forest Method

In [None]:
# Build the isolation forest and fit it on the document vectors in one step
# (fit() returns the fitted estimator itself)
iso_model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.01,
    random_state=42,
).fit(doc2vec_vectors)

# Show the full parameter set the model was built with
print(iso_model.get_params())

In [None]:
# Decision-function value and predicted label for every document
decision_values = iso_model.decision_function(doc2vec_vectors)
predicted_labels = iso_model.predict(doc2vec_vectors)

data = pd.DataFrame()
data['scores'] = decision_values
data['anomaly_score'] = predicted_labels



In [None]:
data[data['anomaly_score']==-1]

In [None]:
# Row positions of the samples the isolation forest labelled as outliers (-1)
outlier_index = where(data['anomaly_score'] == -1)
indexes = list(outlier_index[0])

print('Outliers: ' + str(len(indexes)))

# `indexes` are positions, not index labels, so select by position.
# Matching them against df.index labels only works by coincidence when the
# index is the default 0..n-1 range.
isolation_misclass = df.iloc[indexes]

In [None]:
isolation_misclass.title

## Cosine Similarity Method

In [None]:
# Autoencoder-style regressor: the network is trained to reproduce its own
# input through hidden layers of 600, 150 and 600 units.
hidden_layers = (600, 150, 600)
auto_encoder = MLPRegressor(hidden_layer_sizes=hidden_layers)

# Fit input -> input, then reconstruct the same vectors
predicted_vectors = auto_encoder.fit(doc2vec_vectors, doc2vec_vectors).predict(doc2vec_vectors)

In [None]:
# Visual loss
pd.DataFrame(auto_encoder.loss_curve_).plot()

In [None]:
df.columns

In [None]:
def key_consine_similarity(tupple):
    """Sort key: return the second element of an ``(index, similarity)`` pair."""
    similarity = tupple[1]
    return similarity

def get_computed_similarities(vectors, predicted_vectors, reverse=False):
    """Return ``(position, cosine_similarity)`` pairs sorted by similarity.

    Parameters
    ----------
    vectors : 2-D array-like
        Original vectors, one per row (``np.ndarray`` or ``np.matrix``).
    predicted_vectors : 2-D array-like
        Reconstructed vectors, row-aligned with ``vectors``.
    reverse : bool, default=False
        ``False`` sorts ascending (least similar first); ``True`` descending.

    Returns
    -------
    list of (int, float)
        Row position and ``1 - cosine distance`` between the two rows.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of rows.
    """
    # Size the loop from the inputs themselves rather than from a global
    # DataFrame, so the function works on any pair of matching matrices.
    data_size = len(vectors)
    if len(predicted_vectors) != data_size:
        raise ValueError(
            'vectors and predicted_vectors must have the same number of rows '
            f'({data_size} != {len(predicted_vectors)})'
        )

    cosine_similarities = []
    for i in range(data_size):
        # A row of an np.matrix is still 2-D; scipy's cosine() needs 1-D input.
        original = np.asarray(vectors[i]).ravel()
        reconstructed = np.asarray(predicted_vectors[i]).ravel()
        cosine_similarities.append((i, 1 - cosine(original, reconstructed)))

    return sorted(cosine_similarities, key=lambda pair: pair[1], reverse=reverse)

def display_top_n(sorted_cosine_similarities, n=5, frame=None, title_col=7, id_col=0):
    """Print title, ID and similarity for the first ``n`` entries of a sorted list.

    Parameters
    ----------
    sorted_cosine_similarities : list of (int, float)
        ``(row position, cosine similarity)`` pairs, already sorted.
    n : int, default=5
        Maximum number of entries to print. If the list is shorter, every
        entry is printed instead of raising IndexError.
    frame : pandas.DataFrame, optional
        DataFrame to look the rows up in. Defaults to the notebook-level ``df``.
    title_col : int, default=7
        Column position of the title in ``frame``.
    id_col : int, default=0
        Column position of the ID in ``frame``.
    """
    source = df if frame is None else frame

    # Slicing caps n at the list length; max() keeps a negative n printing nothing.
    for index, consine_sim_val in sorted_cosine_similarities[:max(n, 0)]:
        print('Title: ', source.iloc[index, title_col])
        print('ID: ', source.iloc[index, id_col])
        print('Cosine Sim Val :', consine_sim_val)
        print('---------------------------------')

# add function to sort by percentage


In [None]:
# Specify how many 'outliers' you want to see
N = 20

In [None]:
print('Top n unique')

# Rank every document by how well the autoencoder reconstructed its vector
# (ascending, so the least similar documents come first)
sorted_cosine_similarities = get_computed_similarities(doc2vec_vectors, predicted_vectors)

# Show the N least similar documents
display_top_n(sorted_cosine_similarities, N)

### Visualizing the cosines - will revise during first test

In [None]:
# Seaborn histogram
# Can use to adjust the N above (or percent, once we have that function) to see the low cluster

# The original plotted `losses`, which is not defined anywhere in the visible
# notebook. Plot the cosine similarity of each document to its reconstruction.
cosine_values = [similarity for _, similarity in sorted_cosine_similarities]

# NOTE: sns.distplot is deprecated in recent seaborn releases; sns.histplot is
# its replacement once the installed seaborn version supports it.
sns.distplot(cosine_values, hist=True, kde=False,
             bins=int(180/5), color='blue',
             hist_kws={'edgecolor': 'black'})

# # Add labels
# plt.title('Title')
# plt.xlabel('Label x')
# plt.ylabel('Label y')

### IIRC not fully functional yet - for more Cosine work

In [None]:
len(list(df.columns))

In [None]:
df.columns

In [None]:
# Take the first entry of the sorted similarity list (the least similar
# document when the list was sorted ascending)
most_unique_index, cosine_sim_val = sorted_cosine_similarities[0]
print(most_unique_index)
# Assumes column position 9 holds the document text — TODO confirm against df.columns
most_unique_plot =df.iloc[most_unique_index, 9] # index here matters!
# Count the tokens produced by gensim's preprocess_string for that text
most_unique_words_counter = Counter(preprocess_string(most_unique_plot))
print(most_unique_words_counter)

# intersected_common_word_counter = common_word_counter & most_unique_words_counter

# intersected_common_words = [word[0] for word in intersected_common_word_counter.items()]
# intersected_common_word_counts = [word[1] for word in intersected_common_word_counter.items()]

# intersected_common_word_counter