In [None]:
import sys
import os
import csv
import random

import scipy
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.init as init
from torch import optim

from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils

# One seed shared by every random number generator the notebook touches, so
# that shuffles and weight initialisation repeat from run to run.
seed = 10

# Seed Python's `random`, PyTorch and NumPy, in that order, with the same value.
for _seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    _seed_rng(seed)
del _seed_rng

In [None]:
# Script form of the `%load_ext autoreload` / `%autoreload 2` line magics:
# load IPython's autoreload extension and switch it to mode "2" so edits to
# imported modules are picked up live. Only works inside an IPython kernel,
# where `get_ipython` is defined.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

class ArgumentParser():
    """Static experiment configuration, used in place of command-line arguments.

    Every setting is a plain class attribute. All data locations are derived
    from ``data_root`` so the notebook can be pointed at another machine's data
    directory by setting the ``ACNET_DATA_ROOT`` environment variable before
    this cell runs; without it the original locations are used unchanged.
    """
    # Directory containing the dataset, the pretrained embeddings and the
    # cached parameters.
    data_root = os.environ.get("ACNET_DATA_ROOT", "/home/huseyinalecakir/Security/data")

    # Permission whose label column is used as the classification target.
    permission_type = "READ_CONTACTS"
    # Labelled sentences (csv) and the reader type handed to IOUtils.load_vocab.
    train = os.path.join(data_root, "acnet-data", "ACNET_DATASET.csv")
    train_file_type = "acnet"
    # Pretrained word vectors and the format they are stored in.
    external_embedding = os.path.join(data_root, "pretrained-embeddings", "scraped_with_porter_stemming_300.bin")
    external_embedding_type = "word2vec"
    stemmer = "porter"
    # Cache directory (trailing separator preserved) and the cache file names.
    saved_parameters_dir = os.path.join(data_root, "saved-parameters", "")
    saved_prevectors = "embeddings.pickle"
    saved_vocab_train = "acnet-vocab.txt"
    lower = True
    outdir = "./test/{}".format(permission_type)

In [None]:
class TorchOptions():
    """Hyper-parameters read by the model classes and the training loop."""

    def __init__(self):
        # --- model ---
        self.rnn_size = 300        # hidden size; also used as the embedding width
        self.dropout = 0           # dropout probability in Encoder / Classifier (0 disables)
        self.dropoutrec = 0        # dropout probability inside the LSTM cell (0 disables)
        self.init_weight = 0.08    # parameters are drawn from U(-init_weight, init_weight)

        # --- optimisation ---
        self.learning_rate = 0.0001
        self.decay_rate = 0.985
        self.grad_clip = 5         # gradient value clipping bound; -1 turns clipping off
        self.learning_rate_decay = 0.985
        self.learning_rate_decay_after = 1

        # --- reporting ---
        self.print_every = 2500    # training items between running-loss prints
        self.plot_every = 2500

In [None]:
class SentenceReport:
    """One labelled sentence together with the values derived from it.

    Args:
        id: identifier of the app the sentence belongs to (stored as ``app_id``).
        sentence: the raw sentence text.
        mark: the gold label of the sentence for the permission under study.
    """
    def __init__(self, id, sentence, mark):
        self.app_id = id
        self.mark = mark
        self.preprocessed_sentence = None
        self.sentence = sentence
        self.prediction_result = None
        # Vocabulary indices of the preprocessed tokens. This is the name that
        # create_index_tensors, trainItem and predict use, so it is initialised
        # here to keep the attribute defined before the tensors are built.
        self.index_tensor = None
        # Earlier (plural) spelling, kept so code that reads it keeps working.
        self.index_tensors = None

In [None]:
def load_row_acnet(infile, gold_permission, stemmer, embeddings):
    """Read the ACNET csv and turn every row into a SentenceReport.

    Args:
        infile: csv path (or buffer) with "app_id" and "sentence" columns plus
            the label column that corresponds to ``gold_permission``.
        gold_permission: Android permission name; must be a key of the
            permission-to-column table below.
        stemmer: forwarded to NLPUtils.preprocess_sentence.
        embeddings: container supporting ``in``; preprocessed tokens that are
            not in it are dropped.

    Returns:
        List of SentenceReport objects in file order, each with
        ``preprocessed_sentence`` filled in.
    """
    print("Loading row {} ".format(infile))
    # read training data
    print("Reading Train Sentences")
    frame = pd.read_csv(infile)
    reports = []
    # Android permission name -> label column in the ACNET csv.
    permission_to_column = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
    }
    for _, record in frame.iterrows():
        report = SentenceReport(
            int(record["app_id"]),
            record["sentence"],
            record[permission_to_column[gold_permission]],
        )
        tokens = NLPUtils.preprocess_sentence(report.sentence, stemmer)
        # Keep only tokens that have a pretrained vector.
        report.preprocessed_sentence = [token for token in tokens if token in embeddings]
        reports.append(report)
    print("Loading completed")
    return reports

In [None]:
class LSTM(nn.Module):
    """A single LSTM time step built from two linear projections.

    ``i2h`` projects the input and ``h2h`` the previous hidden state; their sum
    is split into the input, forget, cell and output gate pre-activations.
    ``opt`` must provide ``rnn_size`` and ``dropoutrec``.
    """
    def __init__(self, opt):
        super().__init__()
        self.opt = opt
        gate_width = 4 * opt.rnn_size  # four gates, each rnn_size wide
        self.i2h = nn.Linear(opt.rnn_size, gate_width)
        self.h2h = nn.Linear(opt.rnn_size, gate_width)
        if opt.dropoutrec > 0:
            self.dropout = nn.Dropout(opt.dropoutrec)

    def forward(self, x, prev_c, prev_h):
        """Advance the cell by one step and return ``(cy, hy)``.

        The gates are split along dimension 1, so all three arguments are
        expected to be 2-D with ``rnn_size`` columns.
        """
        projected = self.i2h(x) + self.h2h(prev_h)
        in_pre, forget_pre, cell_pre, out_pre = torch.chunk(projected, 4, dim=1)
        candidate = torch.tanh(cell_pre)
        if self.opt.dropoutrec > 0:
            # Dropout is applied to the candidate cell values only.
            candidate = self.dropout(candidate)
        cy = (torch.sigmoid(forget_pre) * prev_c) + (torch.sigmoid(in_pre) * candidate)
        hy = torch.sigmoid(out_pre) * torch.tanh(cy)
        return cy, hy

In [None]:
class Encoder(nn.Module):
    """Embedding lookup followed by one LSTM step.

    Args:
        opt: options object; ``rnn_size`` (hidden size and embedding width),
            ``dropout`` and ``init_weight`` are read here.
        w2i: mapping from word to row index in the embedding table.
        ext_embeddings: mapping from word to its pretrained vector; it must
            contain every word of ``w2i`` (a missing word raises KeyError) and
            the vectors must be ``rnn_size`` wide.
    """
    def __init__(self, opt, w2i, ext_embeddings):
        super(Encoder, self).__init__()
        self.opt = opt
        self.w2i = w2i
        self.hidden_size = opt.rnn_size

        self.embedding = nn.Embedding(len(w2i), self.hidden_size)
        self.lstm = LSTM(self.opt)
        if opt.dropout > 0:
            self.dropout = nn.Dropout(opt.dropout)
        # Random initialisation first, then the embedding table is replaced by
        # the pretrained vectors.
        self.__initParameters()
        self.__initalizedPretrainedEmbeddings(ext_embeddings)

    def __initParameters(self):
        """Draw every trainable parameter from U(-init_weight, init_weight)."""
        # Use the options given to this instance, not a module-level `opt`.
        bound = self.opt.init_weight
        for name, param in self.named_parameters():
            if param.requires_grad:
                init.uniform_(param, -bound, bound)

    def __initalizedPretrainedEmbeddings(self, embeddings):
        """Replace the embedding table with the pretrained vectors of the vocabulary."""
        weights_matrix = np.zeros((len(self.w2i), self.hidden_size))

        for word in self.w2i:
            weights_matrix[self.w2i[word]] = embeddings[word]
        # `Embedding.from_pretrained` is a classmethod that returns a NEW module,
        # so its result has to be assigned; calling it on the instance and
        # discarding the result leaves the table randomly initialised.
        # freeze=False keeps the vectors trainable, as the table was before.
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(weights_matrix, dtype=torch.float), freeze=False
        )

    def forward(self, input_src, prev_c, prev_h):
        """Embed ``input_src`` and advance the LSTM by one step.

        ``input_src`` holds the token indices of the current time step (the
        training loop in this notebook passes a tensor of shape ``(1,)``).
        Returns the new ``(cell_state, hidden_state)`` pair.
        """
        src_emb = self.embedding(input_src)
        if self.opt.dropout > 0:
            src_emb = self.dropout(src_emb)
        prev_cy, prev_hy = self.lstm(src_emb, prev_c, prev_h)
        return prev_cy, prev_hy

In [None]:
class Classifier(nn.Module):
    """Linear layer plus sigmoid on top of a hidden state.

    ``opt`` supplies ``rnn_size`` (input width), ``dropout`` and
    ``init_weight``; ``output_size`` is the number of sigmoid outputs.
    """
    def __init__(self, opt, output_size):
        super().__init__()
        self.opt = opt
        self.hidden_size = opt.rnn_size
        self.linear = nn.Linear(self.hidden_size, output_size)

        if opt.dropout > 0:
            self.dropout = nn.Dropout(opt.dropout)

        self.sigmoid = nn.Sigmoid()
        self.__initParameters()

    def __initParameters(self):
        """Draw every trainable parameter from U(-init_weight, init_weight)."""
        bound = self.opt.init_weight
        for param in self.parameters():
            if not param.requires_grad:
                continue
            init.uniform_(param, -bound, bound)

    def forward(self, prev_h):
        """Return sigmoid(linear(prev_h)), with dropout on the input when enabled."""
        features = self.dropout(prev_h) if self.opt.dropout > 0 else prev_h
        return self.sigmoid(self.linear(features))

In [None]:
def create_index_tensors(data, w2i):
    """Attach an ``index_tensor`` to every report in ``data`` (in place).

    The tensor has shape ``(1, number_of_tokens)`` and dtype ``torch.long``;
    column ``k`` holds ``w2i[token_k]`` for the k-th preprocessed token.
    A token missing from ``w2i`` raises KeyError.
    """
    for report in data:
        tokens = report.preprocessed_sentence
        indices = torch.zeros((1, len(tokens)), dtype=torch.long)
        report.index_tensor = indices
        for position, token in enumerate(tokens):
            indices[0, position] = w2i[token]

In [None]:
def load_embeddings(options):
    """Return the external word embeddings, using the on-disk cache when present.

    If ``saved_parameters_dir/saved_prevectors`` exists it is loaded with the
    "pickle" file type. Otherwise the embeddings are read from
    ``options.external_embedding`` and then written to that cache path.

    Raises:
        Exception: if ``options.external_embedding`` is None.
    """
    if options.external_embedding is None:
        raise Exception("external_embedding option is None")

    cache_path = os.path.join(options.saved_parameters_dir, options.saved_prevectors)
    if os.path.isfile(cache_path):
        vectors, _ = IOUtils.load_embeddings_file(cache_path, "pickle", options.lower)
        return vectors

    # No cache yet: load from the original file and cache it for the next run.
    vectors, _ = IOUtils.load_embeddings_file(
        options.external_embedding,
        options.external_embedding_type,
        options.lower,
    )
    IOUtils.save_embeddings(cache_path, vectors)
    return vectors

In [None]:
def trainItem(opt, sentence, encoder, classifier, optimizer, criterion):
    """Run one optimisation step on a single sentence.

    The sentence's token indices are fed to ``encoder`` one column at a time,
    starting from zero cell/hidden states; ``classifier`` is applied to the
    last hidden state and compared with ``sentence.mark`` through ``criterion``.
    Gradients of both modules are value-clipped to ``opt.grad_clip`` unless it
    is -1.

    Returns:
        The loss tensor of this item.
    """
    optimizer.zero_grad()

    state_shape = (1, opt.rnn_size)
    cell_state = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)
    hidden_state = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)
    token_ids = sentence.index_tensor
    for position in range(token_ids.size(1)):
        cell_state, hidden_state = encoder(token_ids[:, position], cell_state, hidden_state)

    prediction = classifier(hidden_state)
    target = torch.tensor([[sentence.mark]], dtype=torch.float)
    loss = criterion(prediction, target)
    loss.backward()

    if opt.grad_clip != -1:
        for module in (encoder, classifier):
            torch.nn.utils.clip_grad_value_(module.parameters(), opt.grad_clip)
    optimizer.step()

    return loss

In [None]:
def predict(opt, sentence, encoder, classifier):
    """Return the classifier output for one sentence.

    Starting from zero cell/hidden states, the token indices in
    ``sentence.index_tensor`` are fed to ``encoder`` column by column and
    ``classifier`` is applied to the final hidden state. No gradient context is
    set up here; that is left to the caller.
    """
    state_shape = (1, opt.rnn_size)
    cell_state = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)
    hidden_state = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)

    token_ids = sentence.index_tensor
    for position in range(token_ids.size(1)):
        cell_state, hidden_state = encoder(token_ids[:, position], cell_state, hidden_state)

    return classifier(hidden_state)

In [None]:
# Experiment configuration (paths, permission, stemmer) and the model/training
# hyper-parameters; both objects are read as globals by the cells below.
args = ArgumentParser()
opt = TorchOptions()

In [None]:
# Load the pretrained word vectors (from the cache when it exists).
ext_embeddings = load_embeddings(args)
# Reuse the vector of "contact" for "contacts". For now the "contacts" entry has
# to be added after loading; TODO: handle this in the vocabulary-creation step.
ext_embeddings["contacts"] = ext_embeddings["contact"]

In [None]:
# Build (or load) the training vocabulary. `w2i` maps each word to the index
# used for its embedding row; the second return value is not needed here.
print('Extracting training vocabulary')
w2i, _ = IOUtils.load_vocab(args.train,
                            args.train_file_type,
                            args.saved_parameters_dir,
                            args.saved_vocab_train,
                            args.external_embedding,
                            args.external_embedding_type,
                            args.stemmer,
                            args.lower)

In [None]:
# One SentenceReport per csv row, labelled for the configured permission and
# restricted to tokens that have a pretrained embedding.
sentences = load_row_acnet(args.train,  args.permission_type, args.stemmer, ext_embeddings)


In [None]:
# Attach to every report (in place) the tensor of vocabulary indices that the
# training and prediction functions iterate over.
create_index_tensors(sentences, w2i)

In [None]:
def train_and_test(opt, epoch_num, w2i, train_sentences, test_sentences, fp):
    """Train a fresh encoder/classifier pair and score it after every epoch.

    Args:
        opt: hyper-parameter object. Read here: ``learning_rate``,
            ``print_every``, ``learning_rate_decay`` and
            ``learning_rate_decay_after``; it is also passed on to the model
            classes, ``trainItem`` and ``predict``.
        epoch_num: number of passes over ``train_sentences``.
        w2i: word -> index mapping handed to the Encoder.
        train_sentences: reports (with ``index_tensor`` and ``mark``) to train on.
        test_sentences: reports to score after each epoch.
        fp: currently unused; kept so existing callers that pass a log file
            keep working.

    Returns:
        ``(roc_scores, pr_scores)``: two lists with one ROC-AUC / average
        precision value per epoch, computed on ``test_sentences``.

    Note:
        Reads the module-level ``seed`` and ``ext_embeddings``, and expects
        ``roc_auc_score`` / ``average_precision_score`` to be imported before
        it is called.
    """
    # Reset every RNG so each call starts from the same initial weights.
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    encoder = Encoder(opt, w2i, ext_embeddings)
    classifier = Classifier(opt, 1)

    params = list(encoder.parameters()) + list(classifier.parameters())
    # Start Adam at the configured learning rate so that the per-epoch decay
    # below continues from the rate that was actually used (Adam's own default
    # would otherwise apply until the first decay step).
    learning_rate = opt.learning_rate
    optimizer = optim.Adam(params, lr=learning_rate)

    criterion = nn.BCELoss()

    pr_scores = []
    roc_scores = []
    losses = []

    for epoch in range(epoch_num):
        print("---Epoch {}---\n".format(epoch+1))

        print("Training...")
        encoder.train()
        classifier.train()
        # Iterate over the data passed in, not over notebook globals.
        for index, sentence in enumerate(train_sentences):
            loss = trainItem(opt, sentence, encoder, classifier, optimizer, criterion)
            if index != 0 and index % opt.print_every == 0:
                # Mean loss of the `print_every` items preceding this one.
                print("Index {} Loss {}".format(index, np.mean(losses[-opt.print_every:])))
            losses.append(loss.item())

        # Learning rate decay; a single optimizer holds the parameters of both
        # modules, so there is only one set of param groups to update.
        if opt.learning_rate_decay < 1 and epoch >= opt.learning_rate_decay_after:
            learning_rate = learning_rate * opt.learning_rate_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        print("Predicting..")
        encoder.eval()
        classifier.eval()
        predictions = []
        gold = []
        with torch.no_grad():
            for sentence in test_sentences:
                pred = predict(opt, sentence, encoder, classifier)
                # Store a plain float so the scores form a 1-D array.
                predictions.append(pred.item())
                gold.append(sentence.mark)

        y_true = np.array(gold)
        y_scores = np.array(predictions)
        roc_auc = roc_auc_score(y_true, y_scores)
        pr_auc = average_precision_score(y_true, y_scores)
        pr_scores.append(pr_auc)
        roc_scores.append(roc_auc)
        print("Scores ROC {} PR {}".format(roc_auc, pr_auc))
    return roc_scores, pr_scores

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold

# Wrap the reports in a NumPy array so the index arrays produced by the k-fold
# split can select the train/test subsets (`documents[train]`).
documents = np.array(sentences)
# In-place shuffle driven by the current state of Python's `random` module.
random.shuffle(documents)

# Re-seed all RNGs after the shuffle so later cells start from a known state.
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)


In [None]:
# 10-fold cross-validation over the shuffled reports. `shuffle` and
# `random_state` are passed by keyword: they are keyword-only in current
# scikit-learn releases, where the positional form raises TypeError.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
kfold_splits = kfold.split(documents)


chunkend_losses = []

# One entry per fold; each entry is the list of per-epoch scores of that fold.
roc_scores = []
pr_scores = []
# The context manager closes out.txt even when a fold raises.
with open("out.txt", "w") as file:
    for foldid, (train, test) in enumerate(kfold_splits):
        print("\nFOLD ID {}\n".format(foldid))
        # `train` / `test` are index arrays into `documents`.
        train_data = documents[train]
        test_data = documents[test]

        roc, pr = train_and_test(opt, 5, w2i, train_data, test_data, file)
        print("Fold Results :\n")
        print("ROC : {}\n".format(roc))
        print("PR : {}\n".format(pr))

        roc_scores.append(roc)
        pr_scores.append(pr)