In [None]:
# Notebook setup: load IPython's autoreload extension and switch it to
# mode "2" so that edited project modules are re-imported without
# restarting the kernel. These calls require an IPython session
# (get_ipython is not defined in a plain Python interpreter).
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


class ArgumentParser():
    """Hard-coded experiment settings used in place of parsed CLI arguments.

    Every setting is a plain class attribute, so an instance (or the class
    itself) can be handed to code that expects an ``args``/``options``
    object. Settings that are not commented are not read by the code
    visible in this file.
    """

    # Permission whose label column marks a document as positive.
    permission_type = "READ_CONTACTS"
    # Training corpus (CSV) and the format tag handed to the vocabulary loader.
    train = "/home/huseyinalecakir/Security/data/acnet-data/ACNET_DATASET.csv"
    train_file_type = "acnet"
    test = "/home/huseyinalecakir/Security/data/whyper/Read_Contacts.csv"
    test_file_type = "whyper"
    # Pre-trained word vectors and the format tag used when loading them.
    external_embedding = "/home/huseyinalecakir/Security/data/pretrained-embeddings/{}".format("scraped_with_porter_stemming_300.bin")
    external_embedding_type = "word2vec"
    # Word-embedding width; must match the width of the pre-trained vectors.
    wembedding_dims = 300
    # Hidden size of the sentence-level and document-level LSTMs.
    lstm_dims = 128
    sequence_type = "windowed"
    window_size = 2
    stemmer = "porter"
    # "lstm" (single direction) or "bilstm" (forward + backward).
    lstm_type = "lstm"
    # Directory for cached artefacts. The original wrapped this literal in
    # ``.format(wembedding_dims)`` although it has no placeholder, so the
    # call was a no-op; the value is unchanged.
    saved_parameters_dir = "/home/huseyinalecakir/Security/data/saved-parameters/"
    # File name (inside saved_parameters_dir) of the cached embedding subset.
    saved_prevectors = "embeddings.pickle"
    saved_vocab_test = "whyper-vocab.txt"
    saved_vocab_train = "acnet-vocab.txt"
    saved_preprocessed_whyper = "whyper-preprocessed.txt"
    saved_preprocessed_acnet = "acnet-preprocessed.txt"
    # Directory that receives roc_auc.txt and predicted_file.txt.
    outdir = "./test/{}/{}/{}".format(permission_type, lstm_type, wembedding_dims)
    external_info = "no_info"
    external_info_dim = 300

In [None]:
import csv
import os
import random
import sys

import dynet as dy
import dynet_config
import numpy as np
import pandas as pd
import scipy
from numpy import inf

from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils

# Declare GPU as the default device type
# NOTE(review): DyNet's documentation describes setting up dynet_config
# *before* `import dynet`; in this file `import dynet as dy` appears above
# these calls, so the settings may not take effect -- TODO confirm the
# intended import order.
dynet_config.set_gpu()
# Set some parameters manually (DyNet memory budget and DyNet random seed)
dynet_config.set(mem=400, random_seed=123456789)
# Initialize dynet import using above configuration in the current scope

# Seed Python's `random` module; it is used further down to shuffle the
# loaded documents.
random.seed(33)


class DocumentReport:
    """Per-application record collected while reading the dataset.

    Attributes:
        app_id: Identifier of the application the sentences belong to.
        mark: Gold label; starts as False and is flipped by the loader.
        preprocessed_sentences: Preprocessed form of each sentence.
        sentences: Raw sentences, in reading order.
        prediction_result: Model score; None until a prediction is stored.
    """

    def __init__(self, app_id):
        # Identity and gold label first, then the two (independent, initially
        # empty) sentence buffers, then the not-yet-computed prediction.
        self.app_id, self.mark = app_id, False
        self.preprocessed_sentences, self.sentences = [], []
        self.prediction_result = None


class SimilarityExperiment:
    """DyNet parameters for a sentence-then-document LSTM classifier.

    The constructor creates, in this order: a word lookup table (optionally
    initialised from pre-trained vectors), the sentence-level LSTM
    builder(s), the document-level LSTM builder(s) and a single-output
    linear layer (``mlp_w`` / ``mlp_b``).

    Args:
        w2i: Mapping from word to row index of the lookup table.
        options: Settings object. Reads ``wembedding_dims``, ``lstm_dims``,
            ``lstm_type``, ``external_embedding``,
            ``external_embedding_type``, ``saved_parameters_dir`` and
            ``saved_prevectors``.

    Raises:
        ValueError: If ``options.lstm_type`` is neither "lstm" nor "bilstm",
            or if the pre-trained vectors do not have ``wembedding_dims``
            components.
    """

    def __init__(self, w2i, options):
        print('Similarity Experiment - init')
        # Fail fast: previously an unknown lstm_type produced an object with
        # no RNNs / output layer, which only blew up later during training.
        if options.lstm_type not in ("lstm", "bilstm"):
            raise ValueError(
                "Unsupported lstm_type {!r}; expected 'lstm' or 'bilstm'".format(
                    options.lstm_type))

        self.options = options
        self.model = dy.ParameterCollection()
        self.trainer = dy.SimpleSGDTrainer(self.model)
        self.w2i = w2i
        self.wdims = options.wembedding_dims
        self.ldims = options.lstm_dims

        self.ext_embeddings = None
        # Model parameters
        self.wlookup = self.model.add_lookup_parameters((len(w2i), self.wdims))

        self.__load_model()

        # "lstm": one 2-layer builder; "bilstm": a forward and a backward
        # 1-layer builder whose final outputs get concatenated.
        if options.lstm_type == "lstm":
            layers, directions = 2, 1
        else:
            layers, directions = 1, 2
        # Width of one sentence encoding, i.e. the input width of the
        # document-level RNN. The original always used ``ldims`` here, which
        # does not match the 2*ldims concatenation produced in bilstm mode.
        encoding_dims = directions * self.ldims

        self.sentence_rnn = [
            dy.VanillaLSTMBuilder(layers, self.wdims, self.ldims, self.model)
            for _ in range(directions)
        ]
        self.document_rnn = [
            dy.VanillaLSTMBuilder(layers, encoding_dims, self.ldims, self.model)
            for _ in range(directions)
        ]
        self.mlp_w = self.model.add_parameters((1, encoding_dims))
        self.mlp_b = self.model.add_parameters(1)

    def __load_model(self):
        """Initialise the lookup table from pre-trained vectors, if configured.

        Uses the cached subset under ``saved_parameters_dir`` when that file
        exists; otherwise loads the full external embedding file and writes
        the cache for the next run.
        """
        if self.options.external_embedding is None:
            return

        cache_path = os.path.join(self.options.saved_parameters_dir,
                                  self.options.saved_prevectors)
        if os.path.isfile(cache_path):
            # NOTE(review): the "pickle" type presumably unpickles this file;
            # only point saved_parameters_dir at trusted locations.
            self.__load_external_embeddings(cache_path, "pickle")
        else:
            self.__load_external_embeddings(self.options.external_embedding,
                                            self.options.external_embedding_type)
            self.__save_model()

    def __save_model(self):
        """Write the in-vocabulary embedding subset to the cache file."""
        IOUtils.save_embeddings(os.path.join(self.options.saved_parameters_dir,
                                             self.options.saved_prevectors),
                                self.ext_embeddings)

    def __load_external_embeddings(self, embedding_file, embedding_file_type):
        """Load vectors from ``embedding_file`` and copy them into the lookup.

        Only words present in ``self.w2i`` are kept (in
        ``self.ext_embeddings``) and written into their lookup-table row.

        Raises:
            ValueError: If the loaded vectors' dimensionality differs from
                ``self.wdims``.
        """
        ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
            embedding_file,
            embedding_file_type,
            lower=True)
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if ext_emb_dim != self.wdims:
            raise ValueError(
                "Pre-trained embedding dimension {} does not match "
                "wembedding_dims {}".format(ext_emb_dim, self.wdims))

        self.ext_embeddings = {}
        print("Initializing word embeddings by pre-trained vectors")
        count = 0
        for word, row_index in self.w2i.items():
            if word in ext_embeddings:
                count += 1
                vector = ext_embeddings[word]
                self.ext_embeddings[word] = vector
                self.wlookup.init_row(row_index, vector)
        print("Vocab size: %d; #words having pretrained vectors: %d" % (len(self.w2i), count))


def __load_row_acnet_file(infile, gold_permission, stemmer):
    """Read an ACNET-style CSV and group its sentences into documents.

    Consecutive rows sharing the same ``app_id`` form one ``DocumentReport``
    (a new report starts whenever ``app_id`` changes from the previous row).
    A report is marked positive as soon as one of its rows has the value 1
    in the label column that corresponds to ``gold_permission``.

    Args:
        infile: Path of the CSV file; it must provide ``app_id``,
            ``sentence`` and the permission label columns.
        gold_permission: Android permission name, e.g. "READ_CONTACTS".
        stemmer: Passed through to ``NLPUtils.preprocess_sentence``.

    Returns:
        list of DocumentReport in file order.

    Raises:
        KeyError: If ``gold_permission`` has no known label column.
    """
    print("Loading row {} ".format(infile))
    print("Reading Train Sentences")
    tagged_train_file = pd.read_csv(infile)

    # Permission name -> label column name in the CSV.
    acnet_map = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
    }
    # Resolved once; an unsupported permission fails before any row is read.
    label_column = acnet_map[gold_permission]

    documents = []
    for _, row in tagged_train_file.iterrows():
        app_id = int(row["app_id"])
        sentence = row["sentence"]

        # First row overall, or first row of a new application.
        if not documents or documents[-1].app_id != app_id:
            documents.append(DocumentReport(app_id))
        current = documents[-1]

        # Value comparison: the original used ``is 1``, an identity test that
        # is False for numpy integers and floats even when they equal 1.
        if row[label_column] == 1:
            current.mark = True

        current.sentences.append(sentence)
        current.preprocessed_sentences.append(
            " ".join(NLPUtils.preprocess_sentence(sentence, stemmer)))
    print("Loading completed")
    return documents

def __encode_sequence(model, seq, rnn_builder):
    """Encode a sequence of input expressions into a single expression.

    Args:
        model: Object whose ``options.lstm_type`` is "lstm" or "bilstm".
        seq: Non-empty list of inputs to feed to the RNN, in order.
        rnn_builder: List of RNN builders; one for "lstm", two (forward,
            backward) for "bilstm".

    Returns:
        For "lstm", the output of the state reached after the last input.
        For "bilstm", the concatenation of the last forward output and the
        last output of the backward pass over the reversed sequence.

    Raises:
        ValueError: If ``seq`` is empty or ``lstm_type`` is not supported.
            (Previously these cases surfaced later as an IndexError or as a
            silently returned ``None``.)
    """
    if len(seq) == 0:
        raise ValueError("Cannot encode an empty sequence")

    lstm_type = model.options.lstm_type
    if lstm_type == "bilstm":
        forward_outputs = rnn_builder[0].initial_state().transduce(list(seq))
        backward_outputs = rnn_builder[1].initial_state().transduce(list(reversed(seq)))
        return dy.concatenate([forward_outputs[-1], backward_outputs[-1]])

    if lstm_type == "lstm":
        state = rnn_builder[0].initial_state()
        for entry in seq:
            state = state.add_input(entry)
        return state.output()

    raise ValueError(
        "Unsupported lstm_type {!r}; expected 'lstm' or 'bilstm'".format(lstm_type))

def __train(model, data):
    """Run one pass of per-document training over ``data``.

    For every document, each non-empty sentence is encoded with
    ``model.sentence_rnn``, the sentence encodings are encoded with
    ``model.document_rnn``, and a logistic output is trained against the
    document's ``mark`` with binary log loss. Parameters are updated and the
    computation graph is renewed after every document. The running mean loss
    is printed every 100 documents.

    Args:
        model: A ``SimilarityExperiment``-like object (``w2i``, ``wlookup``,
            ``sentence_rnn``, ``document_rnn``, ``mlp_w``, ``mlp_b``,
            ``trainer``, ``options``).
        data: Iterable of documents exposing ``preprocessed_sentences`` and
            ``mark``.
    """
    total_loss = 0
    for index, document in enumerate(data):
        sentence_encodings = []
        for sentence in document.preprocessed_sentences:
            # Preprocessed sentences are stored as space-joined strings, so
            # iterating one directly yields characters, not words. Split
            # into tokens first; already-tokenised input is used as-is.
            tokens = sentence.split() if isinstance(sentence, str) else sentence
            # Words missing from the vocabulary fall back to row 0.
            seq = [model.wlookup[int(model.w2i.get(token, 0))] for token in tokens]
            if seq:
                sentence_encodings.append(
                    __encode_sequence(model, seq, model.sentence_rnn))

        document_encoding = __encode_sequence(model, sentence_encodings, model.document_rnn)
        y_pred = dy.logistic((model.mlp_w*document_encoding) + model.mlp_b)

        target = dy.scalarInput(1 if document.mark else 0)
        loss = dy.binary_log_loss(y_pred, target)

        total_loss += loss.scalar_value()
        if index != 0 and index % 100 == 0:
            print("Index {} Loss {}".format(index, total_loss/(index+1)))
        loss.backward()
        model.trainer.update()
        # Start a fresh computation graph for the next document.
        dy.renew_cg()

def __predict(model, data):
    """Score every document in ``data`` and store the result on it.

    Each document is encoded exactly as during training (sentence RNN, then
    document RNN, then logistic output) and the scalar output is written to
    ``document.prediction_result``. Nothing is returned.

    Args:
        model: A ``SimilarityExperiment``-like object (``w2i``, ``wlookup``,
            ``sentence_rnn``, ``document_rnn``, ``mlp_w``, ``mlp_b``,
            ``options``).
        data: Iterable of documents exposing ``preprocessed_sentences``.
    """
    for document in data:
        sentence_encodings = []
        for sentence in document.preprocessed_sentences:
            # Preprocessed sentences are stored as space-joined strings, so
            # iterating one directly yields characters, not words. Split
            # into tokens first; already-tokenised input is used as-is.
            tokens = sentence.split() if isinstance(sentence, str) else sentence
            # Words missing from the vocabulary fall back to row 0.
            seq = [model.wlookup[int(model.w2i.get(token, 0))] for token in tokens]
            if seq:
                sentence_encodings.append(
                    __encode_sequence(model, seq, model.sentence_rnn))

        document_encoding = __encode_sequence(model, sentence_encodings, model.document_rnn)
        y_pred = dy.logistic((model.mlp_w*document_encoding) + model.mlp_b)
        document.prediction_result = y_pred.scalar_value()
        # Start a fresh computation graph for the next document.
        dy.renew_cg()


def run(args):
    """Run 10-fold cross-validation and write the results to ``args.outdir``.

    Loads the vocabulary and the documents, shuffles the documents, then for
    each fold trains a fresh ``SimilarityExperiment`` on the training split
    and scores the held-out split. The mean ROC-AUC and PR-AUC over the
    folds go to ``roc_auc.txt``; one ``|||``-separated line per held-out
    document (raw text, preprocessed text, gold label, score) goes to
    ``predicted_file.txt``.

    Args:
        args: Settings object (see the attributes read below and by
            ``SimilarityExperiment``).
    """
    from sklearn.metrics import average_precision_score, roc_auc_score
    from sklearn.model_selection import KFold

    print('Extracting training vocabulary')
    w2i, _ = IOUtils.load_vocab(args.train,
                                args.train_file_type,
                                args.saved_parameters_dir,
                                args.saved_vocab_train,
                                args.external_embedding,
                                args.external_embedding_type,
                                args.stemmer,
                                True)

    documents = __load_row_acnet_file(args.train, args.permission_type, args.stemmer)
    # Array form so that the index arrays produced by KFold can select folds.
    documents = np.array(documents)
    random.shuffle(documents)

    all_predictions = []
    roc_scores = []
    pr_scores = []
    # Keyword arguments: current scikit-learn accepts only n_splits
    # positionally, so ``KFold(10, True, 1)`` raises TypeError there.
    kfold = KFold(n_splits=10, shuffle=True, random_state=1)
    for foldid, (train, test) in enumerate(kfold.split(documents)):
        # A fresh model per fold so no parameters leak between folds.
        model = SimilarityExperiment(w2i, args)
        print("Fold {}:".format(foldid))
        print("Num. of train doc {}".format(len(train)))
        print("Num. of test doc {}".format(len(test)))

        test_documents = documents[test]
        train_documents = documents[train]

        __train(model, train_documents)
        __predict(model, test_documents)

        y_true = np.array([1 if r.mark else 0 for r in test_documents])
        y_scores = np.array([r.prediction_result for r in test_documents])

        roc_scores.append(roc_auc_score(y_true, y_scores))
        pr_scores.append(average_precision_score(y_true, y_scores))

        for r in test_documents:
            mark = 1 if r.mark else 0
            all_predictions.append([" ".join(r.sentences),
                                    " ".join(r.preprocessed_sentences),
                                    mark,
                                    r.prediction_result])

    # ``model.options`` is ``args``; make sure the target directory exists
    # before opening files inside it.
    outdir = args.outdir
    os.makedirs(outdir, exist_ok=True)

    roc_pr_out_dir = os.path.join(outdir, "roc_auc.txt")
    with open(roc_pr_out_dir, "w") as target:
        target.write("ROC-AUC {}\n".format(sum(roc_scores)/len(roc_scores)))
        target.write("PR-AUC {}\n".format(sum(pr_scores)/len(pr_scores)))

    predictions_dir = os.path.join(outdir, "predicted_file.txt")
    with open(predictions_dir, "w") as target:
        for p in all_predictions:
            target.write("{}\n".format("|||".join(str(i) for i in p)))


In [None]:
# Settings object shared by the following cells (all values are the
# hard-coded class attributes of ArgumentParser).
args = ArgumentParser()

In [None]:
# Build (or load) the training vocabulary; only the first returned value,
# the word -> index mapping, is kept.
print('Extracting training vocabulary')
w2i, _ = IOUtils.load_vocab(  args.train,
                                    args.train_file_type,
                                    args.saved_parameters_dir,
                                    args.saved_vocab_train,
                                    args.external_embedding,
                                    args.external_embedding_type,
                                    args.stemmer,
                                    True)

In [None]:
# Read the training CSV into one DocumentReport per application, labelled
# for the configured permission.
documents = __load_row_acnet_file(args.train,  args.permission_type, args.stemmer)


In [None]:
# Convert to an array so the index arrays produced by KFold can select
# folds, then shuffle in place with Python's `random` module (seeded with
# random.seed(33) earlier in the file).
documents = np.array(documents)
random.shuffle(documents)

In [None]:
def confusion_matrix(gold, predicted):
    """Count binary confusion-matrix cells for paired labels.

    A gold value equal to 0 is a negative; any other gold value is a
    positive. A pair counts as correct when gold and predicted compare
    equal. Pairs are formed with ``zip``, so extra items in the longer
    input are ignored.

    Returns:
        Tuple ``(TP, TN, FP, FN)`` of float counts.
    """
    tally = {"TP": 0.0, "TN": 0.0, "FP": 0.0, "FN": 0.0}
    for truth, guess in zip(gold, predicted):
        if truth == 0:
            cell = "TN" if truth == guess else "FP"
        else:
            cell = "TP" if truth == guess else "FN"
        tally[cell] += 1
    return tally["TP"], tally["TN"], tally["FP"], tally["FN"]

In [None]:
def accuracy_precision_recall(tp, tn, fp, fn):
    """Compute accuracy, precision and recall from confusion-matrix counts.

    A metric whose denominator is zero (no predicted positives, no actual
    positives, or no samples at all) is reported as 0.0 instead of raising
    ZeroDivisionError.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        Tuple ``(accuracy, precision, recall)``.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios (zero denominator) are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    return accuracy, precision, recall

In [None]:
def show_metrics(gold, predicted):
    """Print the inputs, the confusion counts and the derived metrics.

    Output order: the raw ``gold`` and ``predicted`` values, then TP/TN/FP/FN
    from ``confusion_matrix``, then accuracy/precision/recall from
    ``accuracy_precision_recall``. Nothing is returned.
    """
    counts_template = "TP : {}\nTN : {}\nFP : {}\nFN : {}"
    scores_template = "Accuracy : {}\nPrecision : {}\nRecall : {}"

    print(gold, predicted)
    tp, tn, fp, fn = confusion_matrix(gold, predicted)
    accuracy, precision, recall = accuracy_precision_recall(tp, tn, fp, fn)
    print(counts_template.format(tp, tn, fp, fn))
    print(scores_template.format(accuracy, precision, recall))

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold

# 10-fold cross-validation: train a fresh model per fold, score the held-out
# documents, collect per-fold ROC-AUC / PR-AUC and every prediction.
all_predictions = []
roc_scores = []
pr_scores = []
# Keyword arguments: current scikit-learn accepts only n_splits positionally,
# so ``KFold(10, True, 1)`` raises TypeError there.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
for foldid, (train, test) in enumerate(kfold.split(documents)):
    # A fresh model per fold so no parameters leak between folds.
    model = SimilarityExperiment(w2i, args)
    print("Fold {}:".format(foldid))
    print("Num. of train doc {}".format(len(train)))
    print("Num. of test doc {}".format(len(test)))

    test_documents = documents[test]
    train_documents = documents[train]

    __train(model, train_documents)
    __predict(model, test_documents)

    predictions = [r.prediction_result for r in test_documents]
    gold = [1 if r.mark else 0 for r in test_documents]

    y_true = np.array(gold)
    y_scores = np.array(predictions)

    roc_auc = roc_auc_score(y_true, y_scores)
    pr_auc = average_precision_score(y_true, y_scores)

    roc_scores.append(roc_auc)
    pr_scores.append(pr_auc)

    # show_metrics compares labels by equality, so the probabilities must be
    # turned into hard 0/1 decisions first; passing raw scores made almost
    # every comparison fail regardless of model quality.
    predicted_labels = [1 if score >= 0.5 else 0 for score in predictions]
    show_metrics(gold, predicted_labels)
    print("AUC : {} PR-AUC : {}".format(roc_auc, pr_auc))
    for r in test_documents:
        mark = 1 if r.mark else 0
        all_predictions.append([" ".join(r.sentences), " ".join(r.preprocessed_sentences), mark, r.prediction_result])

# Make sure the output directory exists before opening files inside it.
os.makedirs(model.options.outdir, exist_ok=True)

roc_pr_out_dir = os.path.join(model.options.outdir, "roc_auc.txt")
with open(roc_pr_out_dir, "w") as target:
    target.write("ROC-AUC {}\n".format(sum(roc_scores)/len(roc_scores)))
    target.write("PR-AUC {}\n".format(sum(pr_scores)/len(pr_scores)))

predictions_dir = os.path.join(model.options.outdir, "predicted_file.txt")
with open(predictions_dir, "w") as target:
    for p in all_predictions:
        target.write("{}\n".format("|||".join(str(i) for i in p)))

In [None]:
# Write the cross-validation results collected in the previous cell: mean
# ROC-AUC / PR-AUC over the folds, and one "|||"-separated line per scored
# document (raw text, preprocessed text, gold label, score).

# Make sure the output directory exists before opening files inside it.
os.makedirs(model.options.outdir, exist_ok=True)

roc_pr_out_dir = os.path.join(model.options.outdir, "roc_auc.txt")
with open(roc_pr_out_dir, "w") as target:
    target.write("ROC-AUC {}\n".format(sum(roc_scores)/len(roc_scores)))
    target.write("PR-AUC {}\n".format(sum(pr_scores)/len(pr_scores)))

predictions_dir = os.path.join(model.options.outdir, "predicted_file.txt")
with open(predictions_dir, "w") as target:
    for p in all_predictions:
        target.write("{}\n".format("|||".join(str(i) for i in p)))