In [None]:
import sys
import os
import csv
import random

import pickle
import scipy
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.init as init
from torch import optim

from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils
from common import *

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold

# One fixed seed shared by every random number generator the notebook touches,
# so a fresh "Restart & Run All" starts from the same RNG state each time.
seed = 10

random.seed(seed)        # Python's built-in generator
torch.manual_seed(seed)  # PyTorch generator
np.random.seed(seed)     # NumPy legacy global generator

In [None]:
class ArgumentParser:
    """Static run configuration: which permission to model and where the data lives.

    Despite the name nothing is parsed; every setting is a plain class attribute.
    """

    # Key used to look up the gold label in each sentence's `permissions` mapping.
    permission_type = "READ_CONTACTS"

    # Training corpus and the format tag handed to the vocabulary loader.
    train = "/home/huseyinalecakir/Security/data/acnet-data/ACNET_DATASET.csv"
    train_file_type = "acnet"

    # Pre-trained word vectors and the format they are stored in.
    external_embedding = (
        "/home/huseyinalecakir/Security/data/pretrained-embeddings/"
        + "scraped_with_porter_stemming_300.bin"
    )
    external_embedding_type = "word2vec"

    # Text normalisation settings.
    stemmer = "porter"
    lower = True

    # Cache locations for derived artefacts and model checkpoints.
    saved_parameters_dir = "/home/huseyinalecakir/Security/data/saved-parameters/"
    saved_prevectors = "embeddings.pickle"
    saved_vocab_train = "acnet-vocab.txt"
    # The directory above already ends with "/", so this value contains "//".
    saved_all_data = saved_parameters_dir + "/all_data"

    # Raw application reviews.
    reviews = "/home/huseyinalecakir/Security/data/reviews/acnet-reviews/acnet_initial/app_reviews_original.csv"

    # Output directory, one sub-directory per permission.
    outdir = "./test/" + permission_type

class TorchOptions:
    """Hyper-parameters shared by the encoder, the classifier and the training loop."""

    # --- model ---
    # Width of the recurrent state: the (c, h) tensors are created as (1, rnn_size).
    rnn_size = 300
    # Not read in this notebook; presumably consumed by the model classes — TODO confirm.
    init_weight = 0.08
    dropout = 0
    dropoutrec = 0

    # --- optimisation ---
    learning_rate = 1e-4
    decay_rate = 0.985
    # Gradients are clipped element-wise to this value; -1 disables clipping.
    grad_clip = 5
    # The learning rate is multiplied by `learning_rate_decay` at the end of every
    # epoch whose zero-based index is >= `learning_rate_decay_after`.
    learning_rate_decay = 0.985
    learning_rate_decay_after = 1

    # --- reporting ---
    # Running loss is printed every `print_every` training items.
    print_every = 2500
    plot_every = 2500

In [None]:
%%time
# One-off preprocessing pipeline, currently disabled: every call below is either
# commented out or wrapped in a string literal, so running this cell performs no
# preprocessing. The cached result is loaded from disk by a later cell instead.
# To re-run the pipeline, `args` must be created first (it is defined further down).
#ext_embeddings = load_embeddings(args)
#reviews = load_reviews(args.reviews, args.stemmer, ext_embeddings)
#sentences = load_row_acnet(args.train, args.stemmer, ext_embeddings)
"""
print('Extracting training vocabulary')
w2i = IOUtils.load_vocab(args.train, 
                         args.train_file_type, 
                         args.saved_parameters_dir, 
                         args.saved_vocab_train,
                         args.external_embedding,
                         args.external_embedding_type,
                         args.stemmer,
                         args.lower)

#Update review vocab
review_vocab = reviews_vocab(reviews)
for w in review_vocab:
    if w not in w2i:
        w2i[w] = len(w2i)
"""
#create_index_tensors(sentences, reviews, w2i)
#save_all_data(args.saved_all_data, ext_embeddings, rev2, sentences, w2i)

In [None]:
def train_item(opt, args, sentence, encoder, classifier, optimizer, criterion):
    """Run one optimisation step on a single sentence and return its loss.

    Args:
        opt: options object; `rnn_size` sets the width of the recurrent state and
            `grad_clip` the element-wise clipping bound (-1 turns clipping off).
        args: run configuration; `permission_type` selects the label to learn.
        sentence: sample exposing `index_tensor` (consumed column by column along
            dim 1) and a `permissions` mapping from permission name to label.
        encoder: callable invoked as `encoder(column, c, h)`, returning the next `(c, h)`.
        classifier: callable mapping the final `h` to a prediction.
        optimizer: optimizer whose gradients are reset, then stepped once.
        criterion: loss function called as `criterion(prediction, target)`.

    Returns:
        The loss tensor returned by `criterion`; `backward()` has already been called on it.
    """
    optimizer.zero_grad()

    # Both recurrent states start out as zeros of shape (1, rnn_size).
    state_shape = (1, opt.rnn_size)
    cell = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)
    hidden = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)

    # Feed the sentence through the encoder one column at a time.
    tokens = sentence.index_tensor
    for position in range(tokens.size(1)):
        cell, hidden = encoder(tokens[:, position], cell, hidden)

    prediction = classifier(hidden)
    label = sentence.permissions[args.permission_type]
    target = torch.tensor([[label]], dtype=torch.float)
    loss = criterion(prediction, target)
    loss.backward()

    clipping_enabled = opt.grad_clip != -1
    if clipping_enabled:
        for module in (encoder, classifier):
            torch.nn.utils.clip_grad_value_(module.parameters(), opt.grad_clip)

    optimizer.step()
    return loss

def predict(opt, sentence, encoder, classifier):
    """Encode one sentence and return the classifier's output for it.

    Args:
        opt: options object; `rnn_size` sets the width of the recurrent state.
        sentence: sample exposing `index_tensor`, consumed column by column along dim 1.
        encoder: callable invoked as `encoder(column, c, h)`, returning the next `(c, h)`.
        classifier: callable mapping the final `h` to a prediction.

    Returns:
        Whatever `classifier` returns for the final hidden state.
    """
    # Both recurrent states start out as zeros of shape (1, rnn_size).
    state_shape = (1, opt.rnn_size)
    cell = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)
    hidden = torch.zeros(state_shape, dtype=torch.float, requires_grad=True)

    tokens = sentence.index_tensor
    for position in range(tokens.size(1)):
        cell, hidden = encoder(tokens[:, position], cell, hidden)

    return classifier(hidden)

def train_and_test(opt, args, epoch_num, w2i, train_data, test_data, foldid):
    """Train a fresh encoder/classifier pair and score it on `test_data` after every epoch.

    The model is trained one sentence at a time with `train_item`. After each epoch
    ROC-AUC and average precision are computed on `test_data`, and a checkpoint is
    written to `<args.saved_parameters_dir>/models/model_for_review_prediction.pt`.
    The same file is overwritten on every epoch, so only the last epoch survives.

    Note:
        The pre-trained embeddings are read from the module-level name
        `ext_embeddings`, which must exist before this function is called.

    Args:
        opt: hyper-parameters (`learning_rate`, `learning_rate_decay`,
            `learning_rate_decay_after`, `print_every`, plus whatever the model
            classes and `train_item` read).
        args: run configuration (`permission_type`, `saved_parameters_dir`).
        epoch_num: number of passes over `train_data`.
        w2i: vocabulary passed to `Encoder`.
        train_data: sized, iterable collection of training sentences.
        test_data: iterable of evaluation sentences.
        foldid: identifier stored in the checkpoint.

    Returns:
        `(roc_scores, pr_scores)`: two lists with one entry per epoch.
    """
    encoder = Encoder(opt, w2i, ext_embeddings)
    classifier = Classifier(opt, 1)

    params = list(encoder.parameters()) + list(classifier.parameters())
    # Start Adam at the configured rate. Without an explicit `lr` the optimizer
    # would run at its library default until the first decay step overwrote it.
    learning_rate = opt.learning_rate
    optimizer = optim.Adam(params, lr=learning_rate)
    criterion = nn.BCELoss()

    model_save_dir = os.path.join(args.saved_parameters_dir, "models", "model_for_review_prediction.pt")

    pr_scores = []
    roc_scores = []
    losses = []
    # Stays None when `train_data` is empty so the checkpoint can still be written.
    loss = None

    for epoch in range(epoch_num):
        print("---Epoch {}---\n".format(epoch+1))

        print("Training...")
        encoder.train()
        classifier.train()
        for index, sentence in enumerate(train_data):
            loss = train_item(opt, args, sentence, encoder, classifier, optimizer, criterion)
            if index != 0 and index % opt.print_every == 0:
                # `losses` does not contain the current item yet, so this averages
                # the `print_every` items that precede it.
                print("Index {} Loss {}".format(index, np.mean(losses[-opt.print_every:])))
            losses.append(loss.item())

        # Learning Rate Decay Optimization
        if opt.learning_rate_decay < 1 and epoch >= opt.learning_rate_decay_after:
            learning_rate = learning_rate * opt.learning_rate_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        print("Predicting..")
        encoder.eval()
        classifier.eval()
        predictions = []
        gold = []
        with torch.no_grad():
            for sentence in test_data:
                pred = predict(opt, sentence, encoder, classifier)
                # The classifier emits a single value per sentence; store it as a
                # plain float so the metrics receive a 1-D score vector rather
                # than a stack of (1, 1) tensors.
                predictions.append(pred.item())
                gold.append(sentence.permissions[args.permission_type])

        y_true = np.array(gold)
        y_scores = np.array(predictions)
        roc_auc = roc_auc_score(y_true, y_scores)
        pr_auc = average_precision_score(y_true, y_scores)
        pr_scores.append(pr_auc)
        roc_scores.append(roc_auc)
        print("Scores ROC {} PR {}".format(roc_auc, pr_auc))

        #Save Model
        os.makedirs(os.path.dirname(model_save_dir), exist_ok=True)
        torch.save({
            'encoder': encoder.state_dict(),
            'classifier': classifier.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch' : epoch,
            # Detached so the checkpoint holds only the value of the last training loss.
            'loss' : None if loss is None else loss.detach(),
            'foldid' : foldid,
            'pr_auc' : pr_auc,
            'roc_auc' : roc_auc
            }, model_save_dir)
    return roc_scores, pr_scores

def load_model_and_predict_reviews(args, model_path, reviews):
    """Restore a saved model and attach a prediction to every review.

    Note:
        `opt`, `w2i` and `ext_embeddings` are read from the module-level namespace
        and must be defined before this function is called. `args` is accepted
        but not used.

    Args:
        args: run configuration (unused here).
        model_path: checkpoint file containing at least the keys "encoder",
            "classifier", "pr_auc" and "roc_auc".
        reviews: mapping from app id to an iterable of review objects; each
            review is passed to `predict` and receives a `prediction_result`
            attribute holding the value `predict` returned.

    Returns:
        None. The review objects are modified in place.
    """
    encoder = Encoder(opt, w2i, ext_embeddings)
    classifier = Classifier(opt, 1)

    checkpoint = torch.load(model_path)
    encoder.load_state_dict(checkpoint["encoder"])
    classifier.load_state_dict(checkpoint["classifier"])

    # Inference only: switch both modules to evaluation mode, as the evaluation
    # loop in `train_and_test` does, so train-only behaviour stays off.
    encoder.eval()
    classifier.eval()

    pr_auc = checkpoint["pr_auc"]
    roc_auc = checkpoint["roc_auc"]
    print(pr_auc, roc_auc)
    with torch.no_grad():
        for app_id in reviews:
            for review in reviews[app_id]:
                review.prediction_result = predict(opt, review, encoder, classifier)
                

In [None]:
# Instantiate the run configuration and the model hyper-parameters used by the cells below.
args, opt = ArgumentParser(), TorchOptions()

In [None]:
# Load the cached preprocessing artefacts. The four names unpacked here are read as
# globals by the training and prediction cells further down.
# NOTE(review): the ".pickle" extension suggests `load_all_data` unpickles this file —
# confirm, and only load it from a trusted location.
save_dir = os.path.join(args.saved_all_data, "without_prediction.pickle")
ext_embeddings, reviews, sentences, w2i = load_all_data(save_dir)

In [None]:
# Copy the sentences into an array and shuffle it in place. The resulting order depends
# on the state of Python's `random` generator when this cell runs, so re-running the
# cell without re-seeding produces a different order.
# NOTE(review): `random.shuffle` swaps elements by index, which is only safe while
# `documents` is 1-D — confirm `np.array(sentences)` never yields a 2-D array.
documents = np.array(sentences)
random.shuffle(documents)

In [None]:
# Quick single-epoch run on a small subset of the shuffled documents.
# Train and evaluation slices are disjoint: scoring on the very documents the model
# was trained on would report training-set scores rather than held-out performance.
SUBSET_SIZE = 100
train_docs = documents[:SUBSET_SIZE]
test_docs = documents[SUBSET_SIZE:2 * SUBSET_SIZE]
train_and_test(opt, args, 1, w2i, train_docs, test_docs, 0)

In [None]:
# First app id in `reviews` (not referenced by any other cell visible here).
key = list(reviews)[0]

In [None]:
# Reload the checkpoint written during training and attach a prediction to every review.
# The loaded `reviews` mapping is passed directly: the name previously used here (`nr`)
# is not defined by any cell, so the call failed on a fresh kernel.
model_save_dir = os.path.join(args.saved_parameters_dir, "models", "model_for_review_prediction.pt")
load_model_and_predict_reviews(args, model_save_dir, reviews)

In [None]:
# Persist everything again, now that the review objects carry their predictions.
# `reviews` is saved directly: the name previously used here (`nr`) is not defined by
# any cell, so the call failed on a fresh kernel.
save_dir = os.path.join(args.saved_all_data, "with_prediction.pickle")
save_all_data(save_dir, ext_embeddings, reviews, sentences, w2i)