In [1]:
import csv
import os
import re
from collections import Counter
import pickle

import numpy as np
import xlrd
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
from nltk import sent_tokenize, word_tokenize
from stanfordcorenlp import StanfordCoreNLP
from log import logger
from decorators import logging

class Document:
    def __init__(self, doc_id, title, description, permissions, tags=None):
        self.id = doc_id
        self.title = title
        self.permissions = permissions
        self.description = description 
        self.tags = tags

    def __str__(self):
        print(self.title)
        print(self.permissions)


class Permission:
    def __init__(self, permission_type, permission_phrase):
        self.ptype = permission_type
        self.pphrase = permission_phrase
    
    def __str__(self):
        print(self.ptype)
        print(self.pphrase)


class Utils:
    CORE_NLP_DIR = r'/home/huseyin/LIB/stanford-corenlp-full-2018-10-05'
    
    @staticmethod
    @logging
    def preprocess(text):
        paragrahps = text.split("\n")
        sentences = []
        for p in paragrahps:
            for s in sent_tokenize(p):
                sentences.append(s)
        return sentences

    @staticmethod
    @logging
    def remove_hyperlinks(text):
        regex = r"((https?:\/\/)?[^\s]+\.[^\s]+)"
        text = re.sub(regex, '', text)
        return text

    @staticmethod
    def to_lower(w, lower):
        return w.lower() if lower else w
    
    @staticmethod
    @logging
    def vocab(file_path, file_type="csv", lower=True):
        nlp = StanfordCoreNLP(Utils.CORE_NLP_DIR)
        wordsCount = Counter()
        permissions = []
        distincts_permissions = set()
        if file_type == "csv":
            with open(file_path) as f:
                reader = csv.reader(f)
                next(reader) # skip header
                for row in reader:
                    text = row[1]
                    for sentence in Utils.preprocess(text):
                        sentence = Utils.remove_hyperlinks(sentence)
                        for w in nlp.word_tokenize(sentence):
                            wordsCount.update([Utils.to_lower(w, lower)])
                        for p in  row[2].strip().split(","):
                            ptype = Utils.to_lower(p, lower)
                            if ptype not in distincts_permissions:
                                pphrase = [Utils.to_lower(t, lower) for t in p.split("_")]
                                perm = Permission(ptype, pphrase)
                                permissions.append(perm)
                                distincts_permissions.add(ptype)
                            for token in p.split("_"):
                                wordsCount.update([Utils.to_lower(token, lower)])
        elif file_type == "excel":
            handtagged_permissions = ["READ_CALENDAR", "READ_CONTACTS", "RECORD_AUDIO"]
            loc = (file_path)
            wb = xlrd.open_workbook(loc) 
            sheet = wb.sheet_by_index(0)
            sharp_count = 0
            apk_title = ""
            for i in range(sheet.nrows):
                sentence = sheet.cell_value(i, 0)
                if sentence.startswith("##"):
                    sharp_count += 1
                    if sharp_count % 2 == 1:
                        apk_title = sentence.split("##")[1] 
                else:
                    if sharp_count != 0 and sharp_count % 2 == 0:
                        sentence = sentence.strip()
                        for w in nlp.word_tokenize(sentence):
                            wordsCount.update([Utils.to_lower(w, lower)])
                        for p in handtagged_permissions:
                            ptype = Utils.to_lower(p, lower)
                            if ptype not in distincts_permissions:
                                pphrase = [Utils.to_lower(t, lower) for t in p.split("_")]
                                perm = Permission(ptype, pphrase)
                                permissions.append(perm)
                                distincts_permissions.add(ptype)
                                for token in p.split("_"):
                                    wordsCount.update([Utils.to_lower(token, lower)])            
        else:
            raise Exception("Unsupported file type.")
        nlp.close()
        return wordsCount.keys(), {w: i for i, w in enumerate(list(wordsCount.keys()))}, permissions

    @staticmethod
    @logging
    def read_file(file_path, w2i, file_type="csv", lower=True):
        data = []
        doc_id = 0
        if file_type == "csv":
            with open(file_path) as f:
                reader = csv.reader(f)
                next(reader) # skip header
                for row in reader:
                    doc_id += 1
                    title = row[0]
                    description = row[1]
                    permissions = []
                    for p in  row[2].strip().split(","):
                        ptype = Utils.to_lower(p, lower)
                        pphrase = [Utils.to_lower(t, lower) for t in p.split("_")]
                        perm = Permission(ptype, pphrase)
                        permissions.append(perm)

                    sentences = []
                    for sentence in Utils.preprocess(description):
                        sentence = Utils.remove_hyperlinks(sentence)
                        sentences.append(sentence.strip())
                        ###sentences.append([Utils.to_lower(w, lower) for w in word_tokenize(sentence)])           
                    yield Document(doc_id, title, sentences, permissions)
                    
        elif file_type == "excel":
            permission_title = file_path.split("/")[-1].split(".")[0]
            loc = (file_path)
            wb = xlrd.open_workbook(loc) 
            sheet = wb.sheet_by_index(0)
            sharp_count = 0
            title = ""
            permissions = []
            sentences = []
            tags = []
            for i in range(sheet.nrows):
                sentence = sheet.cell_value(i, 0)
                if sentence.startswith("##"):
                    sharp_count += 1
                    if sharp_count % 2 == 1:
                        if doc_id > 0:
                            yield Document(doc_id, title, sentences, permissions, tags)
                        
                        #Document init values
                        title = sentence.split("##")[1]
                        permissions = []
                        sentences = []
                        tags = []
                        doc_id += 1
                        
                        # Permissions for apk
                        ptype = Utils.to_lower(permission_title, lower)
                        pphrase = [Utils.to_lower(t, lower) for t in permission_title.split("_")]
                        perm = Permission(ptype, pphrase)
                        permissions.append(perm)
                else:
                    if sharp_count != 0 and sharp_count % 2 == 0:
                        sentences.append(sentence.strip())
                        ###sentences.append([Utils.to_lower(w, lower) for w in word_tokenize(sentence.strip())]) 
                        tags.append(int(sheet.cell_value(i, 1)))
                        
            yield Document(doc_id, title, sentences, permissions, tags)
            wb.release_resources()
            del wb
        else:
            raise Exception("Unsupported file type.")
    
    @staticmethod
    @logging
    def get_data(file_path, w2i, sequence_type="dependency", file_type="csv", window_size=2, lower=True):
        nlp = StanfordCoreNLP(Utils.CORE_NLP_DIR)
        if sequence_type == "raw":
            return Utils.read_file_raw(file_path, w2i, nlp, file_type, lower)
        elif sequence_type == "dependency":
            return Utils.read_file_dependency(file_path, w2i, nlp, file_type, lower)
        elif sequence_type == "windowed":
            return Utils.read_file_window(file_path, w2i, nlp, file_type, window_size, lower)
        else:
            nlp.close()
            raise Exception("Unknown sequence type")
        
    @staticmethod
    def read_file_raw(file_path, w2i, nlp, file_type="csv", lower=True):
        for doc in Utils.read_file(file_path, w2i, file_type, lower):
            doc.description = [[Utils.to_lower(w, lower) for w in nlp.word_tokenize(sentence)] for sentence in doc.description]
            yield doc
        
    @staticmethod
    def read_file_window(file_path, w2i, nlp, file_type="csv", window_size=2, lower=True):
        for doc in Utils.read_file(file_path, w2i, file_type, lower):
            doc.description = [[Utils.to_lower(w, lower) for w in nlp.word_tokenize(sentence)] for sentence in doc.description]
            doc.description = Utils.split_into_windows(doc.description, window_size)
            yield doc
            
    @staticmethod
    def read_file_dependency(file_path, w2i, nlp, file_type="csv", lower=True):
        for doc in Utils.read_file(file_path, w2i, file_type, lower):
            doc.description = Utils.split_into_dependencies(doc.description, nlp)
            yield doc
            
    @staticmethod
    def split_into_dependencies(sentences, nlp):
        splitted_sentences = []
        for sentence in sentences:
            tokens = nlp.word_tokenize(sentence)
            s = [[tokens[rel[1]-1], tokens[rel[2]-1]] for rel in nlp.dependency_parse(sentence) if rel[0] != 'ROOT']
            splitted_sentences.append(s)
        return splitted_sentences
    
    @staticmethod
    def split_into_windows(sentences, window_size=2):
        splitted_sentences = []
        for sentence in sentences:
            splitted_sentences.append([])
            if len(sentence) < window_size:
                splitted_sentences[-1].append(sentence)
            else:
                for start in range(len(sentence) - window_size + 1):
                    splitted_sentences[-1].append([sentence[i+start] for i in range(window_size)])
        return splitted_sentences
    
    @staticmethod     
    def load_embeddings_file(file_name, embedding_type, lower=True):
        if not os.path.isfile(file_name):
            print(file_name, "does not exist")
            return {}, 0

        if embedding_type == "word2vec":
            model = KeyedVectors.load_word2vec_format(file_name, binary=True, unicode_errors="ignore")
            words = model.index2entity
        elif embedding_type == "fasttext":
            model = FastText.load_fasttext_format(file_name)
            words = [w for w in model.wv.vocab]
        elif embedding_type == "pickle":
            with open(file_name,'rb') as fp:
                model = pickle.load(fp)
                words = model.keys()
        else:
            raise Exception("Unknown Type")
            return {}, 0

        if lower:
            vectors = {word.lower(): model[word] for word in words}
        else:
            vectors = {word: model[word] for word in words}

        if "UNK" not in vectors:
            unk = np.mean([vectors[word] for word in vectors.keys()], axis=0)
            vectors["UNK"] = unk

        return vectors, len(vectors["UNK"])



In [2]:
class OptionParser:
    def __init__(self, train, train_file_type, external_embedding, external_embedding_type, wembedding_dims, lstm_dims):
        self.train = train
        self.train_file_type = train_file_type
        self.external_embedding = external_embedding
        self.external_embedding_type = external_embedding_type
        self.wembedding_dims = wembedding_dims
        self.lstm_dims = lstm_dims
    
options = OptionParser("/home/huseyin/Desktop/Security/data/whyper/Read_Calendar.xls",
                       "excel",
                       "/home/huseyin/Desktop/Security/data/subset_fasttext.bin",
                       "pickle",
                       300,
                       128)


ext_embeddings_arg, ext_emb_dim_arg = Utils.load_embeddings_file(options.external_embedding, options.external_embedding_type)

In [3]:
import dynet_config
# Declare GPU as the default device type
dynet_config.set_gpu()
# Set some parameters manualy
dynet_config.set(mem=400,random_seed=9)
# Initialize dynet import using above configuration in the current scope

import dynet as dy

from numpy import inf
import random
random.seed(33)

class SimpleModel:
    def __init__(self, vocab, w2i, permissions, options, ext_embeddings_arg, ext_emb_dim_arg):
        self.model = dy.ParameterCollection()
        self.trainer = dy.AdamTrainer(self.model)
        
        self.w2i = w2i
        self.wdims = options.wembedding_dims
        self.ldims = options.lstm_dims
        self.all_permissions = permissions
        self.train_file_type = options.train_file_type
        
        self.wlookup = self.model.add_lookup_parameters((len(w2i), self.wdims)) #PAD, and INITIAL tokens?
        if options.external_embedding is not None:
            ext_embeddings, ext_emb_dim =  ext_embeddings_arg, ext_emb_dim_arg #Utils.load_embeddings_file(options.external_embedding, options.external_embedding_type)
            assert (ext_emb_dim == self.wdims)
            print("Initializing word embeddings by pre-trained vectors")
            count = 0
            for word in self.w2i:
                if word in ext_embeddings:
                    count += 1
                    self.wlookup.init_row(self.w2i[word], ext_embeddings[word])
            self.ext_embeddings = ext_embeddings
            print("Vocab size: %d; #words having pretrained vectors: %d" % (len(self.w2i), count))
            
        self.sentence_rnn = [dy.SimpleRNNBuilder(1, self.wdims, self.ldims, self.model)] # Try bi-rnn and lstm
        self.permission_rrn = [dy.SimpleRNNBuilder(1, self.wdims, self.ldims, self.model)] # Try bi-rnn and lstm
        
        self.mlp_w = self.model.add_parameters((128, self.ldims))
        self.mlp_b = self.model.add_parameters(128)
        self.mlp_v = self.model.add_parameters((1, 128))
    
    def cos_similiariy(self, v1, v2):
        from numpy import dot
        from numpy.linalg import norm
        return dot(v1, v2)/(norm(v1)*norm(v2))
    
    def cosine_proximity(self, pred, gold):
        def l2_normalize(x):
            square_sum = dy.sqrt(dy.bmax(dy.sum_elems(dy.square(x)), np.finfo(float).eps * dy.ones((1))[0]))
            return dy.cdiv(x, square_sum)

        y_true = l2_normalize(pred)
        y_pred = l2_normalize(gold)

        return -dy.sum_elems(dy.cmult(y_true, y_pred))
    
    def cosine_loss(self, pred, gold):
        return dy.cdiv(dy.dot_product(pred,gold), dy.cmult(dy.squared_norm(pred), dy.squared_norm(gold)))
    
    def description_permission_sim_w_max(self, sentences, perm):
        max_sim = -inf
        for sentence_enc in sentences:
            sim = self.cos_similiariy(sentence_enc, perm)
            if max_sim < sim: max_sim = sim
        return max_sim

    def statistics(self, similarities):
        statistics = {}
        for app_id in similarities.keys():
            statistics[app_id] = {"related": {"max" : None, "avg" : None, "all" : []},
                                  "unrelated": {"max" : None, "avg" : None, "all" : []}}

            max_related, max_unrelated = -inf, -inf
            avg_related, avg_unrelated = 0, 0
            for related_p in similarities[app_id]["related"]:
                if max_related < related_p[1]: 
                    max_related = related_p[1]
                avg_related += related_p[1]
                statistics[app_id]["related"]["all"].append(related_p[1])

            for unrelated_p in similarities[app_id]["unrelated"]:
                if max_unrelated < unrelated_p[1]: 
                    max_unrelated = unrelated_p[1]
                avg_unrelated += unrelated_p[1]
                statistics[app_id]["unrelated"]["all"].append(unrelated_p[1])

            statistics[app_id]["related"]["max"] = max_related
            statistics[app_id]["unrelated"]["max"] = max_unrelated
            statistics[app_id]["related"]["avg"] = avg_related / len(similarities[app_id]["related"])
            statistics[app_id]["unrelated"]["avg"] = avg_unrelated / len(similarities[app_id]["unrelated"])
        return statistics

    def statistics_gold(self, similarities):
        statistics = {}
        for app_id in similarities.keys():
            statistics[app_id] = {"related": { "all" : []},
                                  "unrelated": {"all" : []}}
            max_related, max_unrelated = -inf, -inf
            avg_related, avg_unrelated = 0, 0
            for related_p in similarities[app_id]["related"]:
                statistics[app_id]["related"]["all"].append(related_p[1])

            for unrelated_p in similarities[app_id]["unrelated"]:
                statistics[app_id]["unrelated"]["all"].append(unrelated_p[1])
        return statistics

    def train(self, file_path):
        document_permission_similiarities = {}
        permission_vecs = {}
        # gather all permission encoding of permissions
        for perm  in self.all_permissions:
            rnn_forward = self.permission_rrn[0].initial_state()
            for entry in perm.pphrase:
                vec = self.wlookup[int(self.w2i.get(entry, 0))]
                rnn_forward = rnn_forward.add_input(vec)
            permission_vecs[perm.ptype] = rnn_forward.output().npvalue()
            dy.renew_cg()

        for doc in Utils.read_file(file_path, self.w2i, file_type=self.train_file_type):
            if doc.description:                
                #Sentence encoding
                sentence_enc_s = []
                for sentence in doc.descriptions:
                    rnn_forward = self.sentence_rnn[0].initial_state()
                    for entry in sentence:
                        vec = self.wlookup[int(self.w2i.get(entry, 0))]
                        rnn_forward = rnn_forward.add_input(vec)
                    if rnn_forward.output() is not None:
                        sentence_enc_s.append(rnn_forward.output().npvalue())
                    dy.renew_cg()
                    
                document_permission_similiarities[doc.id] = {"related": [], "unrelated" : []}
                app_permissions = set()
                for related_p in doc.permissions:
                    sim = self.description_permission_sim_w_max(sentence_enc_s, permission_vecs[related_p.ptype])
                    document_permission_similiarities[doc.id]["related"].append((related_p.ptype, sim))
                    app_permissions.add(related_p.ptype)
                for unrelated_p in self.all_permissions:
                    if unrelated_p.ptype not in app_permissions:
                        sim = self.description_permission_sim_w_max(sentence_enc_s, permission_vecs[unrelated_p.ptype])
                        document_permission_similiarities[doc.id]["unrelated"].append((unrelated_p.ptype, sim))
                
        return document_permission_similiarities

    def train_gold(self, file_path):
        document_permission_similiarities = {}
        permission_vecs = {}
        # gather all permission encoding of permissions
        for perm  in self.all_permissions:
            rnn_forward = self.permission_rrn[0].initial_state()
            for entry in perm.pphrase:
                vec = self.wlookup[int(self.w2i.get(entry, 0))]
                rnn_forward = rnn_forward.add_input(vec)
            permission_vecs[perm.ptype] = rnn_forward.output().npvalue()
            dy.renew_cg()

        for doc in Utils.read_file(file_path, self.w2i, file_type=self.train_file_type):
            if doc.description:                
                #Sentence encoding
                sentence_enc_s = []
                for sentence,tag in zip(doc.description, doc.tags):
                    if tag != 0 and tag != 4:
                        rnn_forward = self.sentence_rnn[0].initial_state()
                        for entry in sentence:
                            vec = self.wlookup[int(self.w2i.get(entry, 0))]
                            rnn_forward = rnn_forward.add_input(vec)
                        if rnn_forward.output() is not None:
                            sentence_enc_s.append(rnn_forward.output().npvalue())
                        dy.renew_cg()
                    
                document_permission_similiarities[doc.id] = {"related": [], "unrelated" : []}


                app_permissions = set()
                for related_p in doc.permissions:
                    document_permission_similiarities[doc.id]["related"].extend([(related_p.ptype, self.cos_similiariy(sentence_enc, permission_vecs[related_p.ptype])) for sentence_enc in sentence_enc_s])
                    app_permissions.add(related_p.ptype)
                for unrelated_p in self.all_permissions:
                    if unrelated_p.ptype not in app_permissions:
                        document_permission_similiarities[doc.id]["unrelated"].extend([(unrelated_p.ptype, self.cos_similiariy(sentence_enc, permission_vecs[unrelated_p.ptype])) for sentence_enc in sentence_enc_s])
                
        return document_permission_similiarities
    
    @logging
    def train_gold_splitted(self, documents): 
        for doc in documents:
            if doc.description:                
                #Sentence encoding
                sentence_enc_s = []
                for sentence,tag in zip(doc.description, doc.tags):
                    sentence_enc_s.append([])
                    if tag == 1 or tag == 2 or tag == 3:
                        for window in sentence:
                            permission_vecs = {}
                            # gather all permission encoding of permissions
                            for perm  in self.all_permissions:
                                rnn_forward = self.permission_rrn[0].initial_state()
                                for entry in perm.pphrase:
                                    vec = self.wlookup[int(self.w2i.get(entry, 0))]
                                    rnn_forward = rnn_forward.add_input(vec)
                                permission_vecs[perm.ptype] = rnn_forward.output()
                                
                            rnn_forward = self.sentence_rnn[0].initial_state()
                            for entry in window:
                                vec = self.wlookup[int(self.w2i.get(entry, 0))]
                                rnn_forward = rnn_forward.add_input(vec)
                            
                            loss = 1-self.cosine_loss(rnn_forward.output(), permission_vecs[doc.permissions[0].ptype])
                            loss.backward()
                            self.trainer.update()
                            dy.renew_cg()
                    
                    elif tag == 0:
                        for window in sentence:
                            permission_vecs = {}
                            # gather all permission encoding of permissions
                            for perm  in self.all_permissions:
                                rnn_forward = self.permission_rrn[0].initial_state()
                                for entry in perm.pphrase:
                                    vec = self.wlookup[int(self.w2i.get(entry, 0))]
                                    rnn_forward = rnn_forward.add_input(vec)
                                permission_vecs[perm.ptype] = rnn_forward.output()
                                
                            rnn_forward = self.sentence_rnn[0].initial_state()
                            for entry in window:
                                vec = self.wlookup[int(self.w2i.get(entry, 0))]
                                rnn_forward = rnn_forward.add_input(vec)
                           
                            loss = self.cosine_loss(rnn_forward.output(), permission_vecs[Utils.to_lower("READ_CONTACTS", True)])
                            loss += self.cosine_loss(rnn_forward.output(), permission_vecs[Utils.to_lower("RECORD_AUDIO", True)])
                            
                            loss.backward()
                            self.trainer.update()
                            dy.renew_cg()
                            
    @logging
    def test_gold_splitted(self, documents):
        document_permission_similiarities = {}
        permission_vecs = {}
        # gather all permission encoding of permissions
        for perm  in self.all_permissions:
            rnn_forward = self.permission_rrn[0].initial_state()
            for entry in perm.pphrase:
                vec = self.wlookup[int(self.w2i.get(entry, 0))]
                rnn_forward = rnn_forward.add_input(vec)
            permission_vecs[perm.ptype] = rnn_forward.output().npvalue()
        for doc in documents:
            if doc.description:                
                #Sentence encoding
                sentence_enc_s = []
                for sentence,tag in zip(doc.description, doc.tags):
                    sentence_enc_s.append([])
                    if tag == 1 or tag == 2 or tag == 3:
                        for window in sentence:
                            rnn_forward = self.sentence_rnn[0].initial_state()
                            for entry in window:
                                vec = self.wlookup[int(self.w2i.get(entry, 0))]
                                rnn_forward = rnn_forward.add_input(vec)
                            if rnn_forward.output() is not None:
                                rnn_forward.output().npvalue()
                                sentence_enc_s[-1].append(rnn_forward.output().npvalue())
                            dy.renew_cg()

                document_permission_similiarities[doc.id] = {"related": [], "unrelated" : []}
                app_permissions = set()
                for related_p in doc.permissions:
                    document_permission_similiarities[doc.id]["related"].extend([(related_p.ptype, self.description_permission_sim_w_max(sentence_enc, permission_vecs[related_p.ptype])) for sentence_enc in sentence_enc_s])
                    app_permissions.add(related_p.ptype)
                for unrelated_p in self.all_permissions:
                    if unrelated_p.ptype not in app_permissions:
                        document_permission_similiarities[doc.id]["unrelated"].extend([(unrelated_p.ptype, self.description_permission_sim_w_max(sentence_enc, permission_vecs[unrelated_p.ptype])) for sentence_enc in sentence_enc_s])
        return document_permission_similiarities
    
    @logging
    def train_test_splitted(self, file_path, window_size=2):
        documents = []
        for doc in Utils.get_data(file_path, self.w2i, sequence_type="windowed", file_type=self.train_file_type, window_size=window_size, lower=True):
            documents.append(doc)
        random.shuffle(documents)
        split_point = (3*len(documents))//4
        return documents[:split_point], documents[split_point:]


In [4]:
%%time
words, w2i, permissions = Utils.vocab(options.train, file_type=options.train_file_type)


CPU times: user 4.42 s, sys: 511 ms, total: 4.93 s
Wall time: 8.38 s


In [5]:
%%time
model = SimpleModel(words, w2i, permissions, options, ext_embeddings_arg, ext_emb_dim_arg)

Initializing word embeddings by pre-trained vectors
Vocab size: 4834; #words having pretrained vectors: 4289
CPU times: user 86.6 ms, sys: 3.75 ms, total: 90.3 ms
Wall time: 90.5 ms


In [6]:
%%time
@logging
def draw_histogram(data, img_name):
    stats = model.statistics_gold(data)   
    related_all = []
    unrelated_all = []
    for doc_id in stats:
        related_all.extend([i for i in stats[doc_id]["related"]["all"] if i > -inf])
        unrelated_all.extend([i for i in stats[doc_id]["unrelated"]["all"] if i > -inf])
    print("Related all", len(related_all), sum(related_all))
    print("Unrelated all", len(unrelated_all), sum(unrelated_all))

    from matplotlib import pyplot

    pyplot.title("All similarity")
    pyplot.hist(related_all, bins='auto', alpha=0.5, label='related')
    pyplot.hist(unrelated_all, bins='auto', alpha=0.5, label='unrelated')
    pyplot.legend(loc='upper right')
    pyplot.savefig(img_name)
    pyplot.clf()

train_data, test_data = model.train_test_splitted(options.train)
similarities = model.test_gold_splitted(test_data)
draw_histogram(similarities, "trained_epoch_{}.png".format(0))
for i in range(10):
    print("Epoch {}".format(i+1))
    model.train_gold_splitted(train_data)
    similarities = model.test_gold_splitted(test_data)
    draw_histogram(similarities, "trained_epoch_{}.png".format(i+1))





Related all 47 6.414294843333786
Unrelated all 94 14.119336208219256
Epoch 1
Related all 47 14.376620473055205
Unrelated all 94 -19.333111361101007
Epoch 2
Related all 47 12.055685355669828
Unrelated all 94 -32.3858667674506
Epoch 3
Related all 47 8.972893063728412
Unrelated all 94 -26.62829052708988
Epoch 4
Related all 47 8.841505251565907
Unrelated all 94 -26.52606014423365
Epoch 5
Related all 47 27.643595737420558
Unrelated all 94 -25.94652449732265
Epoch 6
Related all 47 11.090441190876653
Unrelated all 94 -32.67492675701615
Epoch 7
Related all 47 21.550440314474304
Unrelated all 94 -27.676381941857844
Epoch 8
Related all 47 13.752036448597378
Unrelated all 94 -13.542136263258612
Epoch 9
Related all 47 11.154242065755676
Unrelated all 94 -32.77386143732741
Epoch 10
Related all 47 12.697116208575821
Unrelated all 94 -16.241865823022103
CPU times: user 3min 14s, sys: 652 ms, total: 3min 14s
Wall time: 3min 18s
