In [17]:
import sys
import os
import csv
import random

import pickle
import scipy
import pandas as pd
import numpy as np

seed = 10

import dynet_config

# Declare GPU as the default device type
dynet_config.set_gpu()
# Set some parameters manualy
dynet_config.set(mem=400, random_seed=seed)
# Initialize dynet import using above configuration in the current scope
import dynet as dy


from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold

random.seed(seed)
np.random.seed(seed)

In [18]:
class Data:
    def __init__(self):
        self.w2i = None
        self.entries = None
        self.train_entries = None
        self.test_entries = None
        self.ext_embedding = None
        self.reviews = None
        self.predicted_reviews = None

    def to(self, device):
        if self.entries:
            for entry in self.entries:
                entry.index_tensor = entry.index_tensor.to(device=device)
        if self.reviews:
            for doc_id in self.reviews:
                for review in self.reviews[doc_id]:
                    review.index_tensor = review.index_tensor.to(device=device)
        if self.predicted_reviews:
            for doc_id in self.predicted_reviews:
                for review in self.predicted_reviews[doc_id]:
                    review.index_tensor = review.index_tensor.to(device=device)

    def load(self, infile):
        with open(infile, "rb") as target:
            self.ext_embeddings, self.entries, self.w2i = pickle.load(target)

    def save_data(self, infile):
        with open(infile, "rb") as target:
            self.ext_embeddings, self.entries, self.w2i = pickle.dump(target)

    def load_predicted_reviews(self, infile):
        with open(infile, "rb") as target:
            self.predicted_reviews = pickle.load(target)
        for app_id in self.predicted_reviews.keys():
            self.predicted_reviews[app_id].sort(
                key=lambda x: x.prediction_result.item(), reverse=True
            )

    def load_reviews(self, infile):
        with open(infile, "rb") as target:
            self.reviews = pickle.load(target)


class Model:
    def __init__(self, data, opt):
        self.opt = opt
        self.model = dy.ParameterCollection()
        self.trainer = dy.MomentumSGDTrainer(self.model)
        self.w2i = data.w2i
        self.wdims = opt.embedding_size
        self.ldims = opt.hidden_size
        self.attsize = opt.attention_size

        self.ext_embeddings = data.ext_embeddings
        # Model Parameters
        self.wlookup = self.model.add_lookup_parameters((len(self.w2i), self.wdims))

        self.__load_external_embeddings()

        if self.opt.encoder_dir == "single":
            if self.opt.encoder_type == "lstm":
                self.sentence_rnn = [
                    dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model)
                ]
            elif self.opt.encoder_type == "gru":
                self.sentence_rnn = [
                    dy.GRUBuilder(1, self.wdims, self.ldims, self.model)
                ]
            self.attention_w = self.model.add_parameters((self.attsize, self.ldims))
            self.attention_b = self.model.add_parameters(self.attsize)
            self.att_context = self.model.add_parameters(self.attsize)
            self.mlp_w = self.model.add_parameters((1, self.ldims + 2 * self.ldims))
            self.mlp_b = self.model.add_parameters(1)
        elif self.opt.encoder_dir == "bidirectional":
            if self.opt.encoder_type == "lstm":
                self.sentence_rnn = [
                    dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model),
                    dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model),
                ]
            elif self.opt.encoder_type == "gru":
                self.sentence_rnn = [
                    dy.GRUBuilder(1, self.wdims, self.ldims, self.model),
                    dy.GRUBuilder(1, self.wdims, self.ldims, self.model),
                ]

            self.attention_w = self.model.add_parameters((self.attsize, 2 * self.ldims))
            self.attention_b = self.model.add_parameters(self.attsize)
            self.att_context = self.model.add_parameters(self.attsize)
            self.mlp_w = self.model.add_parameters((1, 2 * self.ldims))
            self.mlp_b = self.model.add_parameters(1)#self.mlp_w = self.model.add_parameters((1, 2 * self.ldims + 4 * self.ldims))
            

    def __load_external_embeddings(self):
        print("Initializing word embeddings by pre-trained vectors")
        count = 0
        for word in self.w2i:
            if word in self.ext_embeddings:
                count += 1
                self.wlookup.init_row(self.w2i[word], self.ext_embeddings[word])
        print(
            "Vocab size: %d; #words having pretrained vectors: %d"
            % (len(self.w2i), count)
        )

    def save(self):
        self.model.save(self.opt.model_checkpoint)

    def load(self):
        self.model.populate(self.opt.model_checkpoint)

def write_file(filename, string):
    with open(filename, "a") as target:
        target.write("{}\n".format(string))
        target.flush()


def encode_sequence(model, seq, rnn_builder):
    def predict_sequence(builder, inputs):
        s_init = builder.initial_state()
        return s_init.transduce(inputs)

    if model.opt.encoder_dir == "bidirectional":
        f_in = [entry for entry in seq]
        b_in = [rentry for rentry in reversed(seq)]
        forward_sequence = predict_sequence(rnn_builder[0], f_in)
        backward_sequence = predict_sequence(rnn_builder[1], b_in)
        return [
            dy.concatenate([s1, s2])
            for s1, s2 in zip(forward_sequence, backward_sequence)
        ]
    elif model.opt.encoder_dir == "single":
        f_in = [entry for entry in seq]
        state = rnn_builder[0].initial_state()
        states = []
        for entry in seq:
            state = state.add_input(entry)
            states.append(state.output())
        return states


def max_pooling(encoded_sequence):
    values = np.array([encoding.value() for encoding in encoded_sequence])
    min_indexes = np.argmax(values, axis=0)
    pooled_context = dy.concatenate(
        [encoded_sequence[row][col] for col, row in enumerate(min_indexes)]
    )
    return pooled_context


def min_pooling(encoded_sequence):
    values = np.array([encoding.value() for encoding in encoded_sequence])
    min_indexes = np.argmin(values, axis=0)
    pooled_context = dy.concatenate(
        [encoded_sequence[row][col] for col, row in enumerate(min_indexes)]
    )
    return pooled_context


def average_pooling(encoded_sequence):
    averages = []
    for col in range(encoded_sequence[0].dim()[0][0]):
        avg = []
        for row in range(len(encoded_sequence)):
            avg.append(encoded_sequence[row][col])
        averages.append(dy.average(avg))
    return dy.concatenate(averages)


def train_item(args, model, sentence):
    loss = None
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        #global_max = max_pooling(encoded_sequence)
        #global_min = average_pooling(encoded_sequence)
        if len(encoded_sequence) > 0:
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))

            sum_all = dy.esum(lst)

            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum(
                [dy.cmult(p, h) for p, h in zip(probs, encoded_sequence)]
            )
            #context = dy.concatenate([att_context, global_max, global_min])
            context = dy.concatenate([att_context])
            y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)

            if sentence.permissions[args.permission_type]:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
            else:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))

            loss.backward()
            model.trainer.update()
            loss_val = loss.scalar_value()
            dy.renew_cg()
            return loss_val
    return 0


def test_item(model, sentence):
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        #global_max = max_pooling(encoded_sequence)
        #global_min = average_pooling(encoded_sequence)
        if len(encoded_sequence) > 0:
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))

            sum_all = dy.esum(lst)

            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum(
                [dy.cmult(p, h) for p, h in zip(probs, encoded_sequence)]
            )
            #context = dy.concatenate([att_context, global_max, global_min])
            context = dy.concatenate([att_context])
            y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
            sentence.prediction_result = y_pred.scalar_value()
            dy.renew_cg()
            return sentence.prediction_result
    return 0

def show_attention_weights(model, sentence):
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        if len(encoded_sequence) > 0:
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))

            sum_all = dy.esum(lst)
            probs = [dy.cdiv(e, sum_all).scalar_value() for e in lst]
            return probs


def train_all(args, model, data):
    write_file(args.outdir, "Training...")
    losses = []
    for index, sentence in enumerate(data.train_entries):
        loss = train_item(args, model, sentence)
        if index != 0:
            if index % model.opt.print_every == 0:
                write_file(
                    args.outdir,
                    "Index {} Loss {}".format(
                        index, np.mean(losses[index - model.opt.print_every :])
                    ),
                )
        losses.append(loss)


def test_all(args, model, data):
    def pr_roc_auc(predictions, gold):
        y_true = np.array(gold)
        y_scores = np.array(predictions)
        roc_auc = roc_auc_score(y_true, y_scores)
        pr_auc = average_precision_score(y_true, y_scores)
        return roc_auc, pr_auc

    write_file(args.outdir, "Predicting..")

    predictions, gold = [], []
    for index, sentence in enumerate(data.test_entries):
        pred = test_item(model, sentence)
        predictions.append(pred)
        gold.append(sentence.permissions[args.permission_type])
    return pr_roc_auc(predictions, gold)


def kfold_validation(args, data):
    data.entries = np.array(data.entries)
    random.shuffle(data.entries)

    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    roc_l, pr_l = [], []
    for foldid, (train, test) in enumerate(kfold.split(data.entries)):
        write_file(args.outdir, "Fold {}".format(foldid + 1))

        model = Model(data, args)
        data.train_entries = data.entries[train]
        data.test_entries = data.entries[test]
        max_roc_auc, max_pr_auc = 0, 0
        for epoch in range(args.num_epoch):
            train_all(args, model, data)
            roc_auc, pr_auc = test_all(args, model, data)
            if pr_auc > max_pr_auc:
                max_pr_auc = pr_auc
                max_roc_auc = roc_auc
            write_file(
                args.outdir, "Epoch {} ROC {}  PR {}".format(epoch + 1, roc_auc, pr_auc)
            )
        model.save()
        write_file(args.outdir, "ROC {} PR {}".format(max_roc_auc, max_pr_auc))
        roc_l.append(max_roc_auc)
        pr_l.append(max_pr_auc)
    write_file(
        args.outdir, "Summary : ROC {} PR {}".format(np.mean(roc_l), np.mean(pr_l))
    )


In [3]:
class Arguments:
    permission_type = "READ_CONTACTS"
    saved_data = "/Users/huseyinalecakir/huseyin/Work/Security/datasets/saved-parameters/saved-data/ac-net/fasttext_embeddings-sentences-w2i.pickle"
    outdir = "output.txt"
    stemmer = "porter"
    embedding_size = 300
    hidden_size = 128
    attention_size = 128
    output_size = 1
    print_every = 1000
    encoder_dir = "bidirectional"
    encoder_type = "gru"
    num_epoch = 1
    

In [4]:
args = Arguments()

In [5]:
data = Data()
data.load(args.saved_data)

In [6]:
data.entries = np.array(data.entries)
random.shuffle(data.entries)
from sklearn.model_selection import train_test_split
data.train_entries, data.test_entries = train_test_split(data.entries, test_size=0.10, random_state=5)

In [7]:
def load_train_test(infile):
    with open(infile, "rb") as target:
        data.entries, data.train_entries, data.test_entries = pickle.load(target)

def save_train_test(infile):
    with open(infile, "wb") as target:
        pickle.dump([data.entries, data.train_entries, data.test_entries], target)

In [9]:
#save_train_test("fasttext_train_test.pickle")
load_train_test("fasttext_train_test.pickle")

In [10]:
model = Model(data, args)
train_all(args, model, data)
roc_auc, pr_auc = test_all(args, model, data)
print(roc_auc, pr_auc)

Initializing word embeddings by pre-trained vectors
Vocab size: 18373; #words having pretrained vectors: 18373
0.964432877901 0.703991981977


In [11]:
for entry in data.test_entries:
    if type(entry.prediction_result) != float:
        entry.prediction_result = 0

In [34]:
positives = [entry for entry in data.test_entries if entry.permissions["READ_CONTACTS"]==1]
negatives = [entry for entry in data.test_entries if entry.permissions["READ_CONTACTS"]==0]

sorted_positives = sorted(positives, key=lambda x: x.prediction_result, reverse=True)
sorted_negatives = sorted(negatives, key=lambda x: x.prediction_result, reverse=True)
threshold = 0.50
TP = sum([1 for entry in sorted_positives if entry.prediction_result >= threshold])
FN = sum([1 for entry in sorted_positives if entry.prediction_result < threshold])
TN = sum([1 for entry in sorted_negatives if entry.prediction_result < threshold])
FP = sum([1 for entry in sorted_negatives if entry.prediction_result >= threshold])

precision = TP/(TP+FP)
recall = TP/(TP+FN)
acc = (TN+TP)/(TN+FP+TP+FN)
fmeasure = (2 * precision * recall) / (precision + recall)


print("TP:{} - FN:{} - TN:{} - FP:{}".format(TP, FN, TN, FP))
print("Precision:{} - Recall:{}".format(precision, recall))
print("Accuracy:{}".format(acc))
print("F-measuse:{}".format(fmeasure))

TP:61 - FN:26 - TN:2347 - FP:38
Precision:0.6161616161616161 - Recall:0.7011494252873564
Accuracy:0.9741100323624595
F-measuse:0.6559139784946237


In [13]:
with open("attention_fasttext_statement_sentences.txt", "w") as target:
    neg_keys = {}
    for idx, entry in enumerate(sorted_negatives):
        if entry.prediction_result > threshold:
            weights = show_attention_weights(model, entry)
            max_index = weights.index(max(weights))
            mit = entry.preprocessed_sentence[max_index] #most_important_token
            neg_keys[mit] = neg_keys[mit]+1 if mit in neg_keys else 1
            pairs = [(t,w)for t,w in zip(entry.preprocessed_sentence,weights)]
            target.write("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
            print("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
            for t,w in pairs:
                print("{}:{}".format(t,w))
                target.write("{}:{}\n".format(t,w))
            print()
            target.write("\n")
    sorted_lst = sorted(neg_keys.items(), key=lambda kv: kv[1], reverse=True)
    target.write("KEYWORDS\n")
    for key, value in sorted_lst:
        target.write("{}:{}\n".format(key, value))

1::Cloud backup through Dropbox and Google Drive::0.9870849251747131

cloud:0.0847850888967514
backup:0.11934180557727814
dropbox:0.6265543699264526
google:0.059158653020858765
drive:0.11016002297401428

2::Block all numbers (useful when you are sleeping) Turn blacklist off::0.9759328365325928

block:0.010004393756389618
numbers:0.3831016719341278
useful:0.05832824110984802
sleeping:0.009381955489516258
turn:0.00042666899389587343
blacklist:0.5387570858001709

3::You can send texts# emoticons# pictures# voice messages# and other files to your FreePP contacts 3::0.9675911664962769

send:2.570671995272278e-06
texts:4.946612079947954e-06
emoticons:4.8831061576493084e-06
pictures:8.424824045505375e-06
voice:3.968076725868741e-06
messages:9.33533283387078e-06
files:2.4136659703799523e-05
contacts:0.9999417066574097

4::------------------------------------------------------------This is a free version# limited to 5 contacts Problems::0.9645431041717529

free:7.699562047491781e-06
version:7.2

In [14]:
with open("attention_fasttext_permission_sentences.txt", "w") as target:
    pos_keys = {}
    for idx, entry in enumerate(sorted_positives):
        weights = show_attention_weights(model, entry)
        max_index = weights.index(max(weights))
        mit = entry.preprocessed_sentence[max_index] #most_important_token
        pos_keys[mit] = pos_keys[mit]+1 if mit in pos_keys else 1
        pairs = [(t,w)for t,w in zip(entry.preprocessed_sentence,weights)]
        print("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
        target.write("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
        for t,w in pairs:
            print("{}:{}".format(t,w))
            target.write("{}:{}\n".format(t,w))
        print()
        target.write("\n")
    sorted_lst = sorted(pos_keys.items(), key=lambda kv: kv[1], reverse=True)
    target.write("KEYWORDS\n")
    for key, value in sorted_lst:
        target.write("{}:{}\n".format(key, value))

1::CREATE your own QR CODES encoding: > EMAIL ADDRESSES > PHONE NUMBERS > CONTACTS INFORMATIONS > CALENDAR EVENTS > GEOGRAPHIC INFORMATION > SIMPLE TEXT > BOOKMARKS::0.9991316199302673

create:6.384315202012658e-05
qr:2.647963810886722e-05
codes:9.216956823365763e-05
encoding:0.00012453750241547823
email:0.0040395259857177734
addresses:0.006145660765469074
phone:0.001116777304559946
numbers:0.037141770124435425
contacts:0.9110633134841919
informations:0.026592979207634926
calendar:0.01252233050763607
events:0.0008525230805389583
geographic:0.00011304279905743897
information:1.4260299394663889e-05
simple:9.472502824792173e-06
text:9.035205039253924e-06
bookmarks:7.230095070553944e-05

2::Find & merge contacts with similar names::0.9969347715377808

find:7.061444193823263e-05
merge:9.238947677658871e-05
contacts:0.921165406703949
similar:0.05231159180402756
names:0.02636002004146576

3::There are four block modes of call blocker: Blacklist All except the whitelist All except contacts All

In [19]:
print(neg_keys)

{'dropbox': 3, 'blacklist': 1, 'contacts': 2, 'numbers': 1, 'account': 9, 'facebook': 6, 'block': 1, 'instagram': 3, 'sync': 1, 'social': 2, 'accounts': 2, 'contact': 2, 'email': 2, 'backup': 1, 'devices': 1, 'address': 1}


In [20]:
print(pos_keys)

{'contacts': 30, 'blacklist': 3, 'numbers': 1, 'accounts': 5, 'account': 4, 'twitter': 2, 'contact': 10, 'dropbox': 2, 'facebook': 8, 'devices': 2, 'mail': 1, 'sync': 1, 'address': 1, 'friends': 1, 'gmail': 1, 'book': 2, 'whatsapp': 1, 'calendar': 1, 'yahoo': 1, 'sharing': 1, 'real': 1, 'blocking': 1, 'videos': 1, 'wallet': 1, 'choose': 1, 'simpler': 1, 'items': 2, 'phonebook': 1}


In [None]:
"""test_entries = data.test_entries
data.test_entries = data.train_entries
roc_auc, pr_auc = test_all(args, model, data)

ls = []
ls.extend(data.test_entries)
ls.extend(test_entries)
false = sum([1 for entry in ls if entry.prediction_result  < 0.25 and entry.permissions["READ_CONTACTS"] == 1 ])
tagged = sum([1 for entry in ls if entry.permissions["READ_CONTACTS"] == 1])
print(false, tagged, (false/tagged))
false = sum([1 for entry in ls if entry.prediction_result  < 0.25 and entry.permissions["READ_CONTACTS"] == 1 ])
tagged = sum([1 for entry in ls if entry.permissions["READ_CONTACTS"] == 1])
print(false, tagged, (false/tagged))
for entry in ls:
    print(idx+1, entry.sentence, entry.prediction_result)
    print()
false_positives = [entry for entry in sorted_negatives if entry.prediction_result >= 0.5]
for entry in false_positives:
    print(entry.prediction_result)"""