In [None]:
import json
import random
import shutil
from pathlib import Path
from itertools import combinations

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from scipy.spatial.distance import cosine
import pickle

#from pan20_verif_evaluator import evaluate_all


def cosine_sim(a, b):
    """
    Cosine similarity between two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    ----------
    similarity : float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero
        norm the similarity is undefined; 0.0 is returned in that case
        instead of NaN so that downstream score correction and
        averaging keep working on finite numbers.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # An all-zero vector shares no features with anything.
        return 0.0
    return np.dot(a, b) / denominator


def rescale(value, orig_min, orig_max, new_min, new_max):
    """
    Linearly map `value` from the interval [`orig_min`, `orig_max`]
    onto the interval [`new_min`, `new_max`]. The caller is expected
    to pass `orig_min` <= value <= `orig_max`; this is not checked.
    Parameters
    ----------
    value: float
        The value to be rescaled.
    orig_min: float
        The minimum of the original range.
    orig_max: float
        The maximum of the original range.
    new_min: float
        The minimum of the new range.
    new_max: float
        The maximum of the new range.
    Returns
    ----------
    new_value: float
        The rescaled value.
    """

    source_width = orig_max - orig_min
    target_width = new_max - new_min

    offset = float(value - orig_min)
    divisor = float(source_width)
    if divisor == 0.0:
        # Degenerate source interval: widen it by a tiny epsilon so the
        # division below is defined.
        divisor = float(source_width + 1e-6)

    fraction = offset / divisor
    return new_min + (fraction * target_width)


def correct_scores(scores, p1, p2):
    """
    Lazily remap raw scores around the band between `p1` and `p2`.

    For each score, in order:
    * score <= p1      -> rescaled from [0, p1] to [0, 0.49]
    * p1 < score < p2  -> exactly 0.5
    * otherwise        -> rescaled from [p2, 1] to [0.51, 1]

    This is a generator: nothing is computed until it is iterated.
    """

    def _remap(raw):
        if raw <= p1:
            return rescale(raw, 0, p1, 0, 0.49)
        if p1 < raw < p2:
            return 0.5
        return rescale(raw, p2, 1, 0.51, 1)

    for raw_score in scores:
        yield _remap(raw_score)

def test(test_pairs, output_dir, model_directory, num_iterations):
    """
    Score every text pair of a JSON-lines file and write the answers.

    Parameters
    ----------
    test_pairs : str or path-like
        JSON-lines file; every non-blank line is an object with an
        ``'id'`` and a ``'pair'`` entry. ``'pair'`` is handed to
        ``vectorizer.transform`` and must produce exactly two rows.
    output_dir : str or path-like
        Directory in which ``answers.jsonl`` is (over)written.
    model_directory : str or path-like
        Directory holding ``vectorizer.pickle``, ``opt_p1.pickle``,
        ``opt_p2.pickle`` and, when `num_iterations` is truthy,
        ``rnd_feature_idxs.pickle``.
    num_iterations : int
        If truthy, the similarity is the mean cosine similarity over
        that many feature subsets taken from the rows of
        ``rnd_feature_idxs``; otherwise it is the cosine similarity
        of the full vectors.

    Each output line is ``{"id": <id>, "value": <corrected score>}``.
    """
    # Accept plain strings as well as Path objects.
    model_directory = Path(model_directory)
    output_dir = Path(output_dir)

    def load_artifact(filename):
        # NOTE: pickle.load can execute arbitrary code; only point
        # `model_directory` at trusted model files.
        with open(model_directory / filename, 'rb') as handle:
            return pickle.load(handle)

    vectorizer = load_artifact('vectorizer.pickle')
    opt_p1 = load_artifact('opt_p1.pickle')
    opt_p2 = load_artifact('opt_p2.pickle')
    # Presumably a 2-D index array with at least `num_iterations` rows
    # -- TODO confirm against the training code.
    rnd_feature_idxs = load_artifact('rnd_feature_idxs.pickle') if num_iterations else None

    print('-> calculating test similarities')
    # Output is opened first, as before, so it is truncated even when
    # the input cannot be read. Both handles are closed on exit.
    with open(output_dir / 'answers.jsonl', 'w') as outf, open(test_pairs) as pairs_file:
        for line in pairs_file:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines in the input.
                continue
            d = json.loads(line)
            problem_id = d['id']
            x1, x2 = vectorizer.transform(d['pair']).toarray()
            if num_iterations:
                subset_similarities = [
                    cosine_sim(x1[rnd_feature_idxs[i, :]],
                               x2[rnd_feature_idxs[i, :]])
                    for i in range(num_iterations)
                ]
                # Average once, after all subsets have been scored.
                similarity = np.mean(subset_similarities)
            else:
                similarity = cosine_sim(x1, x2)

            similarity = np.array(list(correct_scores([similarity], p1=opt_p1, p2=opt_p2)))[0]
            r = {'id': problem_id, 'value': float(similarity)}
            outf.write(json.dumps(r) + '\n')

    

In [None]:
def pipeline_base(text, num_iterations=0, model_directory="C:\\Users\\ivank\\OneDrive\\clef21\\authorship-verification\\mod"):
    """
    Turn one or many combined text pairs into two-class probabilities.

    Parameters
    ----------
    text : str or iterable of str
        A single string holding two documents joined by the literal
        separator ``$&*&*&$``, or an iterable of such strings (the form
        used when this function is passed as ``classifier_fn`` to the
        LIME explainer further down in this notebook).
    num_iterations : int, default=0
        If truthy, the similarity is the mean cosine similarity over
        that many feature subsets taken from the rows of
        ``rnd_feature_idxs``; otherwise the cosine similarity of the
        full vectors is used.
    model_directory : str or path-like
        Directory holding ``vectorizer.pickle``, ``opt_p1.pickle``,
        ``opt_p2.pickle`` and, when `num_iterations` is truthy,
        ``rnd_feature_idxs.pickle``.

    Returns
    ----------
    probabilities : numpy.ndarray
        For a single string: ``[1 - score, score]`` as float32.
        For an iterable: one such row per input string, stacked with
        ``np.array``.

    Raises
    ----------
    ValueError
        If an input string does not contain the separator exactly once.
    """
    separator = "$&*&*&$"
    model_path = Path(model_directory)

    def load_artifact(filename):
        # NOTE: pickle.load can execute arbitrary code; only point
        # `model_directory` at trusted model files.
        with open(model_path / filename, 'rb') as handle:
            return pickle.load(handle)

    vectorizer = load_artifact('vectorizer.pickle')
    opt_p1 = load_artifact('opt_p1.pickle')
    opt_p2 = load_artifact('opt_p2.pickle')
    # Presumably a 2-D index array with at least `num_iterations` rows
    # -- TODO confirm against the training code.
    rnd_feature_idxs = load_artifact('rnd_feature_idxs.pickle') if num_iterations else None

    def score_pair(combined):
        """Return [1 - score, score] (float32) for one combined string."""
        parts = combined.split(separator)
        if len(parts) != 2:
            raise ValueError(
                "expected exactly one %r separator in the input text, found %d"
                % (separator, len(parts) - 1)
            )
        x1, x2 = vectorizer.transform(parts).toarray()
        if num_iterations:
            subset_similarities = [
                cosine_sim(x1[rnd_feature_idxs[i, :]],
                           x2[rnd_feature_idxs[i, :]])
                for i in range(num_iterations)
            ]
            # Average once, after all subsets have been scored.
            similarity = np.mean(subset_similarities)
        else:
            similarity = cosine_sim(x1, x2)
        similarity = np.array(list(correct_scores([similarity], p1=opt_p1, p2=opt_p2)))[0]
        similarity = similarity.astype('float32')
        return np.array([1 - similarity, similarity])

    print('-> calculating test similarities')

    if isinstance(text, str):
        return score_pair(text)
    return np.array([score_pair(text_variant) for text_variant in text])


In [None]:
# Read the combined text pair first; the file handle gets its own name
# (it used to be rebound to the file contents) and is closed before the
# model is run.
with open("textcomb4.txt", 'r') as textcomb_file:
    textcomb = textcomb_file.read()

p = pipeline_base(textcomb, num_iterations=4)
print(p)

In [None]:
# Text pair that the LIME cells below explain; `with` closes the handle.
with open("textcomb3.txt", 'r') as textcomb_file:
    textcomb = textcomb_file.read()

In [None]:
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
# LIME draws random perturbations of the text; seed the explainer so the
# explanations are reproducible after Restart & Run All.
explainer = LimeTextExplainer(random_state=42)

In [None]:
def _explain_textcomb():
    """Run one LIME explanation of `textcomb` with the shared settings."""
    return explainer.explain_instance(
        text_instance=textcomb,
        classifier_fn=pipeline_base,
        num_features=6,
    )


# Five independent runs with identical arguments, created in the same
# order as before so every name keeps its meaning for the cells below.
# NOTE(review): `exp_iter` uses exactly the same call as the others; if
# it was meant to explain the num_iterations > 0 variant, pipeline_base
# needs to be wrapped accordingly -- confirm intent.
exp, exp_iter, exp1, exp2, exp3 = (_explain_textcomb() for _ in range(5))

In [None]:
# Render all explanations in creation order. show_in_notebook displays
# its own output, so it also works when called from a loop.
for explanation in (exp, exp_iter, exp1, exp2, exp3):
    explanation.show_in_notebook(text=True)