In [1]:
import sys
import os
import csv
import random

import pickle
import scipy
import pandas as pd
import numpy as np

seed = 10

import dynet_config

# Declare GPU as the default device type
dynet_config.set_gpu()
# Set some parameters manualy
dynet_config.set(mem=400, random_seed=seed)
# Initialize dynet import using above configuration in the current scope
import dynet as dy


from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold

random.seed(seed)
np.random.seed(seed)

In [2]:
class Data:
    def __init__(self):
        self.w2i = None
        self.entries = None
        self.train_entries = None
        self.test_entries = None
        self.ext_embedding = None
        self.reviews = None
        self.predicted_reviews = None

    def to(self, device):
        if self.entries:
            for entry in self.entries:
                entry.index_tensor = entry.index_tensor.to(device=device)
        if self.reviews:
            for doc_id in self.reviews:
                for review in self.reviews[doc_id]:
                    review.index_tensor = review.index_tensor.to(device=device)
        if self.predicted_reviews:
            for doc_id in self.predicted_reviews:
                for review in self.predicted_reviews[doc_id]:
                    review.index_tensor = review.index_tensor.to(device=device)

    def load(self, infile):
        with open(infile, "rb") as target:
            self.ext_embeddings, self.entries, self.w2i = pickle.load(target)

    def save_data(self, infile):
        with open(infile, "rb") as target:
            self.ext_embeddings, self.entries, self.w2i = pickle.dump(target)

    def load_predicted_reviews(self, infile):
        with open(infile, "rb") as target:
            self.predicted_reviews = pickle.load(target)
        for app_id in self.predicted_reviews.keys():
            self.predicted_reviews[app_id].sort(
                key=lambda x: x.prediction_result.item(), reverse=True
            )

    def load_reviews(self, infile):
        with open(infile, "rb") as target:
            self.reviews = pickle.load(target)


class Model:
    def __init__(self, data, opt):
        self.opt = opt
        self.model = dy.ParameterCollection()
        self.trainer = dy.MomentumSGDTrainer(self.model)
        self.w2i = data.w2i
        self.wdims = opt.embedding_size
        self.ldims = opt.hidden_size
        self.attsize = opt.attention_size

        self.ext_embeddings = data.ext_embeddings
        # Model Parameters
        self.wlookup = self.model.add_lookup_parameters((len(self.w2i), self.wdims))

        self.__load_external_embeddings()

        if self.opt.encoder_dir == "single":
            if self.opt.encoder_type == "lstm":
                self.sentence_rnn = [
                    dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model)
                ]
            elif self.opt.encoder_type == "gru":
                self.sentence_rnn = [
                    dy.GRUBuilder(1, self.wdims, self.ldims, self.model)
                ]
            self.attention_w = self.model.add_parameters((self.attsize, self.ldims))
            self.attention_b = self.model.add_parameters(self.attsize)
            self.att_context = self.model.add_parameters(self.attsize)
            self.mlp_w = self.model.add_parameters((1, self.ldims + 2 * self.ldims))
            self.mlp_b = self.model.add_parameters(1)
        elif self.opt.encoder_dir == "bidirectional":
            if self.opt.encoder_type == "lstm":
                self.sentence_rnn = [
                    dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model),
                    dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model),
                ]
            elif self.opt.encoder_type == "gru":
                self.sentence_rnn = [
                    dy.GRUBuilder(1, self.wdims, self.ldims, self.model),
                    dy.GRUBuilder(1, self.wdims, self.ldims, self.model),
                ]

            self.attention_w = self.model.add_parameters((self.attsize, 2 * self.ldims))
            self.attention_b = self.model.add_parameters(self.attsize)
            self.att_context = self.model.add_parameters(self.attsize)
            self.mlp_w = self.model.add_parameters((1, 2 * self.ldims + 4 * self.ldims))
            self.mlp_b = self.model.add_parameters(1)
            

    def __load_external_embeddings(self):
        print("Initializing word embeddings by pre-trained vectors")
        count = 0
        for word in self.w2i:
            if word in self.ext_embeddings:
                count += 1
                self.wlookup.init_row(self.w2i[word], self.ext_embeddings[word])
        print(
            "Vocab size: %d; #words having pretrained vectors: %d"
            % (len(self.w2i), count)
        )

    def save(self):
        self.model.save(self.opt.model_checkpoint)

    def load(self):
        self.model.populate(self.opt.model_checkpoint)

def write_file(filename, string):
    with open(filename, "a") as target:
        target.write("{}\n".format(string))
        target.flush()


def encode_sequence(model, seq, rnn_builder):
    def predict_sequence(builder, inputs):
        s_init = builder.initial_state()
        return s_init.transduce(inputs)

    if model.opt.encoder_dir == "bidirectional":
        f_in = [entry for entry in seq]
        b_in = [rentry for rentry in reversed(seq)]
        forward_sequence = predict_sequence(rnn_builder[0], f_in)
        backward_sequence = predict_sequence(rnn_builder[1], b_in)
        return [
            dy.concatenate([s1, s2])
            for s1, s2 in zip(forward_sequence, backward_sequence)
        ]
    elif model.opt.encoder_dir == "single":
        f_in = [entry for entry in seq]
        state = rnn_builder[0].initial_state()
        states = []
        for entry in seq:
            state = state.add_input(entry)
            states.append(state.output())
        return states


def max_pooling(encoded_sequence):
    values = np.array([encoding.value() for encoding in encoded_sequence])
    min_indexes = np.argmax(values, axis=0)
    pooled_context = dy.concatenate(
        [encoded_sequence[row][col] for col, row in enumerate(min_indexes)]
    )
    return pooled_context


def min_pooling(encoded_sequence):
    values = np.array([encoding.value() for encoding in encoded_sequence])
    min_indexes = np.argmin(values, axis=0)
    pooled_context = dy.concatenate(
        [encoded_sequence[row][col] for col, row in enumerate(min_indexes)]
    )
    return pooled_context


def average_pooling(encoded_sequence):
    averages = []
    for col in range(encoded_sequence[0].dim()[0][0]):
        avg = []
        for row in range(len(encoded_sequence)):
            avg.append(encoded_sequence[row][col])
        averages.append(dy.average(avg))
    return dy.concatenate(averages)


def train_item(args, model, sentence):
    loss = None
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(encoded_sequence)
        global_min = average_pooling(encoded_sequence)
        if len(encoded_sequence) > 0:
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))

            sum_all = dy.esum(lst)

            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum(
                [dy.cmult(p, h) for p, h in zip(probs, encoded_sequence)]
            )
            context = dy.concatenate([att_context, global_max, global_min])
            #context = dy.concatenate([att_context])
            y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)

            if sentence.permissions[args.permission_type]:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
            else:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))

            loss.backward()
            model.trainer.update()
            loss_val = loss.scalar_value()
            dy.renew_cg()
            return loss_val
    return 0


def test_item(model, sentence):
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(encoded_sequence)
        global_min = average_pooling(encoded_sequence)
        if len(encoded_sequence) > 0:
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))

            sum_all = dy.esum(lst)

            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum(
                [dy.cmult(p, h) for p, h in zip(probs, encoded_sequence)]
            )
            context = dy.concatenate([att_context, global_max, global_min])
            #context = dy.concatenate([att_context])
            y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
            sentence.prediction_result = y_pred.scalar_value()
            dy.renew_cg()
            return sentence.prediction_result
    return 0

def show_attention_weights(model, sentence):
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        if len(encoded_sequence) > 0:
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))

            sum_all = dy.esum(lst)
            probs = [dy.cdiv(e, sum_all).scalar_value() for e in lst]
            return probs


def train_all(args, model, data):
    write_file(args.outdir, "Training...")
    losses = []
    for index, sentence in enumerate(data.train_entries):
        loss = train_item(args, model, sentence)
        if index != 0:
            if index % model.opt.print_every == 0:
                write_file(
                    args.outdir,
                    "Index {} Loss {}".format(
                        index, np.mean(losses[index - model.opt.print_every :])
                    ),
                )
        losses.append(loss)


def test_all(args, model, data):
    def pr_roc_auc(predictions, gold):
        y_true = np.array(gold)
        y_scores = np.array(predictions)
        roc_auc = roc_auc_score(y_true, y_scores)
        pr_auc = average_precision_score(y_true, y_scores)
        return roc_auc, pr_auc

    write_file(args.outdir, "Predicting..")

    predictions, gold = [], []
    for index, sentence in enumerate(data.test_entries):
        pred = test_item(model, sentence)
        predictions.append(pred)
        gold.append(sentence.permissions[args.permission_type])
    return pr_roc_auc(predictions, gold)


def kfold_validation(args, data):
    data.entries = np.array(data.entries)
    random.shuffle(data.entries)

    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    roc_l, pr_l = [], []
    for foldid, (train, test) in enumerate(kfold.split(data.entries)):
        write_file(args.outdir, "Fold {}".format(foldid + 1))

        model = Model(data, args)
        data.train_entries = data.entries[train]
        data.test_entries = data.entries[test]
        max_roc_auc, max_pr_auc = 0, 0
        for epoch in range(args.num_epoch):
            train_all(args, model, data)
            roc_auc, pr_auc = test_all(args, model, data)
            if pr_auc > max_pr_auc:
                max_pr_auc = pr_auc
                max_roc_auc = roc_auc
            write_file(
                args.outdir, "Epoch {} ROC {}  PR {}".format(epoch + 1, roc_auc, pr_auc)
            )
        model.save()
        write_file(args.outdir, "ROC {} PR {}".format(max_roc_auc, max_pr_auc))
        roc_l.append(max_roc_auc)
        pr_l.append(max_pr_auc)
    write_file(
        args.outdir, "Summary : ROC {} PR {}".format(np.mean(roc_l), np.mean(pr_l))
    )


In [3]:
class Arguments:
    permission_type = "READ_CONTACTS"
    saved_data = "/Users/huseyinalecakir/huseyin/Work/Security/datasets/saved-parameters/saved-data/ac-net/nostem-own_embeddings-sentences-w2i.pickle"
    outdir = "output.txt"
    stemmer = "porter"
    embedding_size = 300
    hidden_size = 128
    attention_size = 128
    output_size = 1
    print_every = 1000
    encoder_dir = "bidirectional"
    encoder_type = "gru"
    num_epoch = 1
    

In [4]:
args = Arguments()

In [5]:
data = Data()
data.load(args.saved_data)

In [6]:
"""data.entries = np.array(data.entries)
random.shuffle(data.entries)
from sklearn.model_selection import train_test_split
data.train_entries, data.test_entries = train_test_split(data.entries, test_size=0.10, random_state=5)"""

'data.entries = np.array(data.entries)\nrandom.shuffle(data.entries)\nfrom sklearn.model_selection import train_test_split\ndata.train_entries, data.test_entries = train_test_split(data.entries, test_size=0.10, random_state=5)'

In [7]:
def load_train_test(infile):
    with open(infile, "rb") as target:
        data.entries, data.train_entries, data.test_entries = pickle.load(target)

def save_train_test(infile):
    with open(infile, "wb") as target:
        pickle.dump([data.entries, data.train_entries, data.test_entries], target)

In [8]:
#save_train_test("nostem_train_test.pickle")
load_train_test("nostem_train_test.pickle")

In [10]:
model = Model(data, args)
train_all(args, model, data)
roc_auc, pr_auc = test_all(args, model, data)
print(roc_auc, pr_auc)

Initializing word embeddings by pre-trained vectors
Vocab size: 17760; #words having pretrained vectors: 17760
0.970004096484 0.734982115288


In [11]:
"""for entry in data.test_entries:
    if type(entry.prediction_result) != float:
        entry.prediction_result = 0"""

'for entry in data.test_entries:\n    if type(entry.prediction_result) != float:\n        entry.prediction_result = 0'

In [12]:
positives = [entry for entry in data.test_entries if entry.permissions["READ_CONTACTS"]==1]
negatives = [entry for entry in data.test_entries if entry.permissions["READ_CONTACTS"]==0]

sorted_positives = sorted(positives, key=lambda x: x.prediction_result, reverse=True)
sorted_negatives = sorted(negatives, key=lambda x: x.prediction_result, reverse=True)
threshold = 0.5
TP = sum([1 for entry in sorted_positives if entry.prediction_result >= threshold])
FN = sum([1 for entry in sorted_positives if entry.prediction_result < threshold])
TN = sum([1 for entry in sorted_negatives if entry.prediction_result < threshold])
FP = sum([1 for entry in sorted_negatives if entry.prediction_result >= threshold])

precision = TP/(TP+FP)
recall = TP/(TP+FN)
acc = (TN+TP)/(TN+FP+TP+FN)
fmeasure = (2 * precision * recall) / (precision + recall)


print("TP:{} - FN:{} - TN:{} - FP:{}".format(TP, FN, TN, FP))
print("Precision:{} - Recall:{}".format(precision, recall))
print("Accuracy:{}".format(acc))
print("F-measuse:{}".format(fmeasure))

TP:65 - FN:22 - TN:2331 - FP:54
Precision:0.5462184873949579 - Recall:0.7471264367816092
Accuracy:0.9692556634304207
F-measuse:0.6310679611650485


In [13]:
with open("attention_pooling_nostem_statement_sentences.txt", "w") as target:
    neg_keys = {}
    for idx, entry in enumerate(sorted_negatives):
        if entry.prediction_result > threshold:
            weights = show_attention_weights(model, entry)
            max_index = weights.index(max(weights))
            mit = entry.preprocessed_sentence[max_index] #most_important_token
            neg_keys[mit] = neg_keys[mit]+1 if mit in neg_keys else 1
            pairs = [(t,w)for t,w in zip(entry.preprocessed_sentence,weights)]
            target.write("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
            print("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
            for t,w in pairs:
                print("{}:{}".format(t,w))
                target.write("{}:{}\n".format(t,w))
            print()
            target.write("\n")
    sorted_lst = sorted(neg_keys.items(), key=lambda kv: kv[1], reverse=True)
    target.write("KEYWORDS\n")
    for key, value in sorted_lst:
        target.write("{}:{}\n".format(key, value))

1::Cloud backup through Dropbox and Google Drive::0.9969434142112732

cloud:0.5694823265075684
backup:0.18788458406925201
dropbox:0.10521133244037628
google:0.06281254440546036
drive:0.07460920512676239

2::You can send texts# emoticons# pictures# voice messages# and other files to your FreePP contacts 3::0.9924032688140869

send:0.2737778425216675
texts:0.1632281243801117
emoticons:0.21293653547763824
pictures:0.13522407412528992
voice:0.06813249737024307
messages:0.04893691465258598
files:0.044238146394491196
freepp:0.022625522688031197
contacts:0.03090031072497368

3::Block numbers from those you dont want to be able to contact you::0.9884445667266846

block:0.23516929149627686
numbers:0.11936233937740326
dont:0.24794796109199524
want:0.2104370892047882
able:0.08709251135587692
contact:0.09999079257249832

4::Block all numbers (useful when you are sleeping) Turn blacklist off::0.9872746467590332

block:0.21978293359279633
numbers:0.1735323667526245
useful:0.2539244592189789
sleeping

In [14]:
with open("attention_pooling_nostem_permission_sentences.txt", "w") as target:
    pos_keys = {}
    for idx, entry in enumerate(sorted_positives):
        weights = show_attention_weights(model, entry)
        max_index = weights.index(max(weights))
        mit = entry.preprocessed_sentence[max_index] #most_important_token
        pos_keys[mit] = pos_keys[mit]+1 if mit in pos_keys else 1
        pairs = [(t,w)for t,w in zip(entry.preprocessed_sentence,weights)]
        print("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
        target.write("{}::{}::{}\n".format(idx+1, entry.sentence, entry.prediction_result))
        for t,w in pairs:
            print("{}:{}".format(t,w))
            target.write("{}:{}\n".format(t,w))
        print()
        target.write("\n")
    sorted_lst = sorted(pos_keys.items(), key=lambda kv: kv[1], reverse=True)
    target.write("KEYWORDS\n")
    for key, value in sorted_lst:
        target.write("{}:{}\n".format(key, value))

1::Find & merge contacts with similar names::0.999570906162262

find:0.3219963312149048
merge:0.21467503905296326
contacts:0.16603198647499084
similar:0.15800480544567108
names:0.139291912317276

2::Quickly export your backups to Dropbox# Google Drive# Email SMART DIALER Beautiful dialer to call and add new contacts T9 Dialer::0.9994131326675415

quickly:0.22188009321689606
export:0.11256125569343567
backups:0.11081589758396149
dropbox:0.052838634699583054
google:0.0341520719230175
drive:0.03487897291779518
email:0.06586393713951111
smart:0.06546397507190704
dialer:0.05373670160770416
beautiful:0.04807166010141373
dialer:0.039178457111120224
call:0.026005340740084648
add:0.021048497408628464
new:0.019175797700881958
contacts:0.01858445629477501
dialer:0.0757443755865097

3::association of favorite contacts into groups for quick dial::0.999180018901825

association:0.3354455232620239
favorite:0.1479257494211197
contacts:0.0729619637131691
groups:0.1095021516084671
quick:0.14142668247222

In [15]:
print(neg_keys)

{'cloud': 1, 'send': 1, 'dont': 1, 'useful': 1, 'free': 1, 'securely': 1, 'sync': 2, 'use': 1, 'gmail': 1, 'instantly': 1, 'conveniently': 1, 'trace': 1, 'droid': 1, 'business': 1, 'backup': 1, 'share': 2, 'apps': 1, 'keep': 3, 'take': 1, 'save': 1, 'funds': 1, 'square': 1, 'perfect': 1, 'update': 1, 'money': 1, 'support': 1, 'new': 1, 'build': 1, 'protect': 1, 'offered': 1, 'real': 1, 'want': 1, 'long': 1, 'complete': 1, 'chat': 1, 'add': 1, 'communities': 1, 'contact': 1, 'mts': 1, 'meta': 1, 'large': 1, 'blackboard': 1, 'one': 1, 'please': 1, 'overview': 1, 'instant': 1, 'press': 1, 'twitter': 1, 'state': 1, 'moreover': 1}


In [16]:
print(pos_keys)

{'find': 1, 'quickly': 1, 'association': 1, 'makes': 1, 'need': 2, 'create': 2, 'read': 2, 'group': 2, 'browse': 1, 'features': 1, 'secret': 1, 'powerful': 1, 'nice': 1, 'highlights': 1, 'c': 1, 'four': 1, 'select': 2, 'wipe': 1, 'manage': 1, 'list': 1, 'authorize': 1, 'profile': 1, 'vips': 1, 'september': 1, 'contacts': 1, 'searching': 1, 'supports': 1, 'turn': 1, 'premium': 1, 'note': 1, 'know': 1, 'easily': 1, 'support': 2, 'free': 1, 'jorte': 1, 'pictures': 1, 'block': 1, 'schedule': 1, 'account': 1, 'reasons': 1, 'android': 2, 'automatic': 1, 'add': 1, 'everyday': 1, 'sms': 1, 'whatsapp': 1, 'rate': 1, 'preview': 1, 'set': 1, 'contact': 1, 'updates': 1, 'take': 2, 'login': 1, 'phone': 1, 'share': 3, 'type': 1, 'also': 2, 'edit': 1, 'use': 2, 'movie': 1, 'supported': 1, 'signal': 1, 'always': 1, 'run': 1, 'google': 1, 'try': 1, 'big': 1, 'using': 1, 'recall': 1, 'magicapp': 1, 'post': 1, 'application': 1, 'multiple': 1, 'price': 1, 'buy': 1}


In [None]:
"""test_entries = data.test_entries
data.test_entries = data.train_entries
roc_auc, pr_auc = test_all(args, model, data)

ls = []
ls.extend(data.test_entries)
ls.extend(test_entries)
false = sum([1 for entry in ls if entry.prediction_result  < 0.25 and entry.permissions["READ_CONTACTS"] == 1 ])
tagged = sum([1 for entry in ls if entry.permissions["READ_CONTACTS"] == 1])
print(false, tagged, (false/tagged))
false = sum([1 for entry in ls if entry.prediction_result  < 0.25 and entry.permissions["READ_CONTACTS"] == 1 ])
tagged = sum([1 for entry in ls if entry.permissions["READ_CONTACTS"] == 1])
print(false, tagged, (false/tagged))
for entry in ls:
    print(idx+1, entry.sentence, entry.prediction_result)
    print()
false_positives = [entry for entry in sorted_negatives if entry.prediction_result >= 0.5]
for entry in false_positives:
    print(entry.prediction_result)"""