In [90]:
# Enable IPython's autoreload extension in mode 2, so that the project modules
# imported below (`common`, `utils`) are re-imported when their source changes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [120]:
from common import SentenceReport, DocumentReport
from utils.nlp_utils import NLPUtils
from utils.io_utils import IOUtils

import os
import csv
import pandas as pd
import pickle

In [111]:
def load_sentences(infile, permission, stemmer, embeddings):
    """Build one SentenceReport per usable tagged sentence of a CSV file.

    The file is read with pandas and must provide the columns ``app_id``,
    ``sentence`` and the column named by ``permission``. A row whose
    ``app_id`` starts with "##" sets the app id attached to the sentences
    that follow it.

    A sentence is kept only when all of the following hold:
      * it is a string that does not start with "##", "Description Tag"
        or "CATEGORY";
      * its tag in the ``permission`` column converts to the integer 0 or 1;
      * at least one of its preprocessed tokens is found in ``embeddings``.

    Args:
        infile: CSV path (anything ``pd.read_csv`` accepts).
        permission: Name of the tag column; also used as the key written
            into ``SentenceReport.permissions``.
        stemmer: Passed through unchanged to ``NLPUtils.preprocess_sentence``.
        embeddings: Container supporting ``token in embeddings``.

    Returns:
        list of SentenceReport, in file order.

    Raises:
        KeyError: if one of the required columns is missing. (The previous
            bare ``except`` hid a missing tag column and silently returned
            an empty list.)
    """
    print("Loading row {} ".format(infile))
    tagged_train_file = pd.read_csv(infile)
    skipped_prefixes = ("##", "Description Tag", "CATEGORY")
    sentences = []

    app_id = None
    for _, row in tagged_train_file.iterrows():
        sentence_id = str(row["app_id"])
        sentence = row["sentence"]
        if sentence_id.startswith("##"):
            app_id = sentence_id

        # An empty CSV cell is read as NaN (a float), which has no startswith.
        if not isinstance(sentence, str) or sentence.startswith(skipped_prefixes):
            continue

        # Only the tag conversion is guarded: errors raised by preprocessing
        # are real failures and must not be mistaken for "no tag".
        try:
            tag = int(row[permission])
        except (TypeError, ValueError, OverflowError):
            continue  # sentences with no (or a non-numeric) tag
        if tag not in (0, 1):
            continue  # eliminate tags other than zero and one

        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        in_vocabulary = [word for word in preprocessed if word in embeddings]
        if not in_vocabulary:
            continue  # nothing left that the embeddings can represent

        sentence_report = SentenceReport(app_id, sentence)
        sentence_report.permissions[permission] = tag
        sentence_report.preprocessed_sentence = in_vocabulary
        sentences.append(sentence_report)
    print("Loading completed")
    return sentences

In [125]:
def load_documents(infile, permission, stemmer, embeddings):
    """Group the tagged sentences of a CSV file into one DocumentReport per app.

    The file is read with pandas and must provide the columns ``app_id``,
    ``sentence`` and the column named by ``permission``. Every row whose
    ``app_id`` starts with "##" opens a new document whose permission label
    starts at 0. Each following sentence whose tag converts to 0 or 1 is
    appended to that document, and a tag of 1 switches the document label
    to 1.

    Unlike the sentence loader, a sentence is kept even when none of its
    tokens is present in ``embeddings`` (its token list is then empty), so
    ``sentences`` and ``preprocessed_sentences`` always have equal length.

    Args:
        infile: CSV path (anything ``pd.read_csv`` accepts).
        permission: Name of the tag column; also used as the key written
            into ``DocumentReport.permissions``.
        stemmer: Passed through unchanged to ``NLPUtils.preprocess_sentence``.
        embeddings: Container supporting ``token in embeddings``.

    Returns:
        list of DocumentReport, in file order.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    print("Loading row {} ".format(infile))
    tagged_train_file = pd.read_csv(infile)
    skipped_prefixes = ("##", "Description Tag", "CATEGORY")
    documents = []

    for _, row in tagged_train_file.iterrows():
        sentence_id = str(row["app_id"])
        sentence = row["sentence"]
        if sentence_id.startswith("##"):
            document = DocumentReport(sentence_id)
            document.permissions[permission] = 0
            documents.append(document)

        # An empty CSV cell is read as NaN (a float), which has no startswith.
        if not isinstance(sentence, str) or sentence.startswith(skipped_prefixes):
            continue
        if not documents:
            # Sentence seen before any "##" header: there is no document to
            # attach it to. It was dropped before as well, via a swallowed
            # IndexError.
            continue

        try:
            tag = int(row[permission])
        except (TypeError, ValueError, OverflowError):
            continue  # sentences with no (or a non-numeric) tag
        if tag not in (0, 1):
            continue  # eliminate tags other than zero and one

        # Preprocess before touching the document so both per-sentence lists
        # are extended together or not at all.
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        filtered = [word for word in preprocessed if word in embeddings]

        current = documents[-1]
        if tag == 1:
            # Compare the converted tag: the raw cell may be "1" or 1.0.
            current.permissions[permission] = 1
        current.sentences.append(sentence)
        current.preprocessed_sentences.append(filtered)
    print("Loading completed")
    return documents

In [113]:
def vocab(infile, permission, stemmer, embeddings):
    """Build a token -> index mapping from the tagged sentences of a CSV file.

    The file is read with pandas and must provide the column ``sentence`` and
    the column named by ``permission``. Only string sentences that do not
    start with "##", "Description Tag" or "CATEGORY", and whose tag converts
    to the integer 0 or 1, contribute tokens. Tokens are the output of
    ``NLPUtils.preprocess_sentence`` restricted to those found in
    ``embeddings``. Indices are assigned in order of first appearance,
    starting at 0.

    Args:
        infile: CSV path (anything ``pd.read_csv`` accepts).
        permission: Name of the tag column.
        stemmer: Passed through unchanged to ``NLPUtils.preprocess_sentence``.
        embeddings: Container supporting ``token in embeddings``.

    Returns:
        dict mapping each token to its integer index.

    Raises:
        KeyError: if one of the required columns is missing. (The previous
            bare ``except`` hid a missing tag column and silently returned
            an empty mapping.)
    """
    print("Loading row {} ".format(infile))
    tagged_train_file = pd.read_csv(infile)
    skipped_prefixes = ("##", "Description Tag", "CATEGORY")
    w2i = {}
    for _, row in tagged_train_file.iterrows():
        sentence = row["sentence"]

        # An empty CSV cell is read as NaN (a float), which has no startswith.
        if not isinstance(sentence, str) or sentence.startswith(skipped_prefixes):
            continue

        # Only the tag conversion is guarded, so preprocessing errors surface.
        try:
            tag = int(row[permission])
        except (TypeError, ValueError, OverflowError):
            continue  # sentences with no (or a non-numeric) tag
        if tag not in (0, 1):
            continue  # eliminate tags other than zero and one

        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        for token in preprocessed:
            if token in embeddings and token not in w2i:
                w2i[token] = len(w2i)
    print("Loading completed")
    return w2i

In [114]:
def filtered_vocab_embeddings(w2i, embeddings):
    """Return the entries of ``embeddings`` whose keys appear in ``w2i``.

    Args:
        w2i: Iterable of tokens (typically the token -> index dict); only its
            keys are used.
        embeddings: Object indexable by token.

    Returns:
        dict mapping each token of ``w2i`` to ``embeddings[token]``, in the
        iteration order of ``w2i``.

    Raises:
        KeyError: if a token of ``w2i`` is missing from a dict-like
            ``embeddings``.
    """
    return {token: embeddings[token] for token in w2i}

In [115]:
def save_sentence_based_dataset(infile, permission, embeddings, stemmer, outfile):
    """Pickle the sentence-level dataset built from ``infile`` into ``outfile``.

    The pickled object is the list
    ``[subset_embeddings, sentences, w2i]`` where ``sentences`` comes from
    ``load_sentences``, ``w2i`` from ``vocab`` and ``subset_embeddings`` from
    ``filtered_vocab_embeddings``. ``infile`` is read twice, once by each
    loader. ``outfile`` is opened in binary write mode, replacing any
    existing file.
    """
    sentence_reports = load_sentences(infile, permission, stemmer, embeddings)
    word_to_index = vocab(infile, permission, stemmer, embeddings)
    payload = [
        filtered_vocab_embeddings(word_to_index, embeddings),
        sentence_reports,
        word_to_index,
    ]
    with open(outfile, "wb") as sink:
        pickle.dump(payload, sink)

In [126]:
def save_document_based_dataset(infile, permission, embeddings, stemmer, outfile):
    """Pickle the document-level dataset built from ``infile`` into ``outfile``.

    The pickled object is the list
    ``[subset_embeddings, documents, w2i]`` where ``documents`` comes from
    ``load_documents``, ``w2i`` from ``vocab`` and ``subset_embeddings`` from
    ``filtered_vocab_embeddings``. ``infile`` is read twice, once by each
    loader. ``outfile`` is opened in binary write mode, replacing any
    existing file.
    """
    document_reports = load_documents(infile, permission, stemmer, embeddings)
    word_to_index = vocab(infile, permission, stemmer, embeddings)
    payload = [
        filtered_vocab_embeddings(word_to_index, embeddings),
        document_reports,
        word_to_index,
    ]
    with open(outfile, "wb") as sink:
        pickle.dump(payload, sink)

In [117]:
# Stemmer identifier forwarded to NLPUtils.preprocess_sentence by the loaders.
stemmer = "porter"
# Embeddings file located under the directory named by the SECURITY_DATASETS
# environment variable; os.environ[...] raises KeyError when it is not set.
embeddings_file = os.path.join(os.environ["SECURITY_DATASETS"], "embeddings/own-embeddings/scraped_with_porter_stemming_300.bin")

In [118]:
# Load the file selected above with the "word2vec" loader. The call returns a
# pair that is bound to the globals `embeddings` and `embeddings_dim`.
# NOTE(review): `lower=True` is interpreted by IOUtils — presumably it
# lower-cases the vocabulary; confirm in utils.io_utils.
embeddings, embeddings_dim = IOUtils.load_embeddings_file(embeddings_file, "word2vec", lower=True)

In [121]:
# Create the sentence-based datasets, one pickle per permission.
# Each permission is read from "<PERMISSION>.csv" in the working directory and
# written to "<PERMISSION>-embeddings-sentences-w2i.pickle". After the loop,
# `permission`, `infile` and `outfile` hold the values of the last permission.
for permission in ("STORAGE", "RECORD_AUDIO", "READ_CONTACTS"):
    infile = "{}.csv".format(permission)
    outfile = "{}-embeddings-sentences-w2i.pickle".format(permission)
    save_sentence_based_dataset(infile, permission, embeddings, stemmer, outfile)

Loading row STORAGE.csv 
Loading completed
Loading row STORAGE.csv 
Loading completed
Loading row RECORD_AUDIO.csv 
Loading completed
Loading row RECORD_AUDIO.csv 
Loading completed
Loading row READ_CONTACTS.csv 
Loading completed
Loading row READ_CONTACTS.csv 
Loading completed


In [127]:
# Create the document-based datasets, one pickle per permission.
# Each permission is read from "<PERMISSION>.csv" in the working directory and
# written to "<PERMISSION>-embeddings-documents-w2i.pickle". After the loop,
# `permission`, `infile` and `outfile` hold the values of the last permission.
for permission in ("STORAGE", "RECORD_AUDIO", "READ_CONTACTS"):
    infile = "{}.csv".format(permission)
    outfile = "{}-embeddings-documents-w2i.pickle".format(permission)
    save_document_based_dataset(infile, permission, embeddings, stemmer, outfile)

Loading row STORAGE.csv 
Loading completed
Loading row STORAGE.csv 
Loading completed
Loading row RECORD_AUDIO.csv 
Loading completed
Loading row RECORD_AUDIO.csv 
Loading completed
Loading row READ_CONTACTS.csv 
Loading completed
Loading row READ_CONTACTS.csv 
Loading completed


In [128]:
# fastText variant: point `embeddings_file` at the cc.en.300 binary under the
# SECURITY_DATASETS directory. This rebinds the global used by the word2vec
# cells above, so re-running those cells afterwards loads this file instead.
embeddings_file = os.path.join(os.environ["SECURITY_DATASETS"], "embeddings/cc.en.300.bin")

In [129]:
# Load the fastText file with the "fasttext" loader. This overwrites the
# globals `embeddings` and `embeddings_dim` that held the word2vec vectors.
embeddings, embeddings_dim = IOUtils.load_embeddings_file(embeddings_file, "fasttext", lower=True)

In [130]:
# Create the sentence-based datasets with the fastText embeddings, one pickle
# per permission, written to "<PERMISSION>-fasttext-embeddings-sentences-w2i.pickle".
# After the loop, `permission`, `infile` and `outfile` hold the values of the
# last permission.
# NOTE(review): `stemmer` is still the "porter" value set for the word2vec run
# while `embeddings` now holds the fastText vectors — confirm that stemmed
# tokens are the intended lookup keys for this model.
for permission in ("STORAGE", "RECORD_AUDIO", "READ_CONTACTS"):
    infile = "{}.csv".format(permission)
    outfile = "{}-fasttext-embeddings-sentences-w2i.pickle".format(permission)
    save_sentence_based_dataset(infile, permission, embeddings, stemmer, outfile)

Loading row STORAGE.csv 
Loading completed
Loading row STORAGE.csv 
Loading completed
Loading row RECORD_AUDIO.csv 
Loading completed
Loading row RECORD_AUDIO.csv 
Loading completed
Loading row READ_CONTACTS.csv 
Loading completed
Loading row READ_CONTACTS.csv 
Loading completed
