In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from common import SentenceReport, DocumentReport
from utils.nlp_utils import NLPUtils
from utils.io_utils import IOUtils

import os
import csv
import pandas as pd
import pickle

In [34]:
class OwnDataCreator:
    """Builds pickled datasets from the per-permission tagged CSV files.

    The CSV read by every loader must provide the columns ``app_id``,
    ``sentence`` and one column named after the permission being processed.
    A row whose ``app_id`` starts with ``#`` begins a new application; rows
    whose sentence starts with one of ``_MARKER_PREFIXES`` are skipped as
    non-content rows. Only rows whose permission tag converts to the integer
    0 or 1 are used.

    All methods are static: the class is only a namespace and keeps no state.
    """

    # Sentences starting with any of these prefixes are never used as content.
    _MARKER_PREFIXES = ("##", "Description Tag", "CATEGORY")

    @staticmethod
    def _is_content_sentence(sentence):
        """Return True when ``sentence`` is a string that is not a marker row.

        Empty CSV cells come back from pandas as float NaN; those are treated
        as "no sentence" instead of crashing on ``.startswith``.
        """
        return isinstance(sentence, str) and not sentence.startswith(
            OwnDataCreator._MARKER_PREFIXES
        )

    @staticmethod
    def _binary_tag(row, permission):
        """Return the row's tag for ``permission`` as int 0/1, or None.

        None is returned when the column is missing, the cell is empty (NaN),
        the value cannot be converted with ``int()``, or the converted value
        is something other than 0 or 1.
        """
        try:
            tag = int(row[permission])
        except (KeyError, ValueError, TypeError, OverflowError):
            return None
        return tag if tag in (0, 1) else None

    @staticmethod
    def _known_tokens(sentence, stemmer, embeddings):
        """Preprocess ``sentence`` and keep only tokens present in ``embeddings``."""
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        return [word for word in preprocessed if word in embeddings]

    @staticmethod
    def load_sentences(infile, permission, stemmer, embeddings):
        """Load one ``SentenceReport`` per usable tagged sentence.

        Args:
            infile: path of the tagged CSV file.
            permission: name of the tag column; also used as the key in
                ``SentenceReport.permissions``.
            stemmer: passed through to ``NLPUtils.preprocess_sentence``.
            embeddings: container supporting ``in``; tokens not in it are dropped.

        Returns:
            List of ``SentenceReport`` objects that have at least one token
            left after filtering. ``app_id`` of a report is the most recent
            ``#``-prefixed id seen, or None if none was seen yet.
        """
        print("Loading row {} ".format(infile))
        tagged_train_file = pd.read_csv(infile)
        sentences = []

        app_id = None
        for _, row in tagged_train_file.iterrows():
            sentence_id = str(row["app_id"])
            sentence = row["sentence"]
            if sentence_id.startswith("#"):
                app_id = sentence_id
            if not OwnDataCreator._is_content_sentence(sentence):
                continue
            tag = OwnDataCreator._binary_tag(row, permission)
            if tag is None:
                continue  # untagged sentence or a tag other than zero/one

            sentence_report = SentenceReport(app_id, sentence)
            sentence_report.permissions[permission] = tag
            sentence_report.preprocessed_sentence = OwnDataCreator._known_tokens(
                sentence, stemmer, embeddings
            )
            if sentence_report.preprocessed_sentence:
                sentences.append(sentence_report)
        print("Loading completed")
        return sentences

    @staticmethod
    def load_documents(infile, permission, stemmer, embeddings):
        """Load one ``DocumentReport`` per application.

        A document's permission label starts at 0 and becomes 1 as soon as
        any of its usable sentences is tagged 1. The label is decided on the
        parsed integer tag, so a tag column that pandas read as strings
        (because it also contains non-numeric tags) is labelled correctly.

        Args:
            infile: path of the tagged CSV file.
            permission: name of the tag column / key in ``permissions``.
            stemmer: passed through to ``NLPUtils.preprocess_sentence``.
            embeddings: container supporting ``in``; tokens not in it are dropped.

        Returns:
            List of ``DocumentReport`` objects that ended up with at least
            one non-empty preprocessed sentence.
        """
        print("Loading row {} ".format(infile))
        tagged_train_file = pd.read_csv(infile)
        documents = []

        for _, row in tagged_train_file.iterrows():
            sentence_id = str(row["app_id"])
            sentence = row["sentence"]
            if sentence_id.startswith("#"):
                documents.append(DocumentReport(sentence_id))
                documents[-1].permissions[permission] = 0
            if not OwnDataCreator._is_content_sentence(sentence):
                continue
            if not documents:
                continue  # sentence appears before any "#" application row
            tag = OwnDataCreator._binary_tag(row, permission)
            if tag is None:
                continue  # untagged sentence or a tag other than zero/one

            current = documents[-1]
            if tag == 1:
                current.permissions[permission] = 1
            current.sentences.append(sentence)
            filtered = OwnDataCreator._known_tokens(sentence, stemmer, embeddings)
            if filtered:
                current.preprocessed_sentences.append(filtered)

        non_empty_documents = [
            doc for doc in documents if len(doc.preprocessed_sentences) != 0
        ]

        print("Loading completed")
        return non_empty_documents

    @staticmethod
    def vocab(infile, permission, stemmer, embeddings):
        """Build the word-to-index map over all usable sentences of ``infile``.

        Uses the same row selection as the loaders (content sentence, tag 0/1)
        and assigns indices in order of first appearance, starting at 0.

        Returns:
            dict mapping token -> int index.
        """
        print("Loading row {} ".format(infile))
        tagged_train_file = pd.read_csv(infile)
        w2i = {}
        for _, row in tagged_train_file.iterrows():
            sentence = row["sentence"]
            if not OwnDataCreator._is_content_sentence(sentence):
                continue
            if OwnDataCreator._binary_tag(row, permission) is None:
                continue  # untagged sentence or a tag other than zero/one
            for token in OwnDataCreator._known_tokens(sentence, stemmer, embeddings):
                if token not in w2i:
                    w2i[token] = len(w2i)
        print("Loading completed")
        return w2i

    @staticmethod
    def filtered_vocab_embeddings(w2i, embeddings):
        """Return ``{token: embeddings[token]}`` for every token in ``w2i``."""
        return {key: embeddings[key] for key in w2i}

    @staticmethod
    def save_sentence_based_dataset(infile, permission, embeddings, stemmer, outfile):
        """Pickle ``[subset_embeddings, sentences, w2i]`` for ``infile`` into ``outfile``."""
        sentences = OwnDataCreator.load_sentences(infile, permission, stemmer, embeddings)
        w2i = OwnDataCreator.vocab(infile, permission, stemmer, embeddings)
        subset_embeddings = OwnDataCreator.filtered_vocab_embeddings(w2i, embeddings)
        with open(outfile, "wb") as target:
            pickle.dump([subset_embeddings, sentences, w2i], target)

    @staticmethod
    def save_document_based_dataset(infile, permission, embeddings, stemmer, outfile):
        """Pickle ``[subset_embeddings, documents, w2i]`` for ``infile`` into ``outfile``."""
        documents = OwnDataCreator.load_documents(infile, permission, stemmer, embeddings)
        w2i = OwnDataCreator.vocab(infile, permission, stemmer, embeddings)
        subset_embeddings = OwnDataCreator.filtered_vocab_embeddings(w2i, embeddings)
        with open(outfile, "wb") as target:
            pickle.dump([subset_embeddings, documents, w2i], target)

    @staticmethod
    def run(
        permissions=("STORAGE", "RECORD_AUDIO", "READ_CONTACTS"),
        sentence_based=False,
        document_based=True,
    ):
        """Create the pickled datasets for each permission.

        Paths are resolved under the ``SECURITY_DATASETS`` environment
        variable. With the defaults this writes only the document-based
        datasets for STORAGE, RECORD_AUDIO and READ_CONTACTS using the
        porter-stemmed word2vec embeddings.

        Args:
            permissions: permission names; each needs ``created-data/<name>.csv``.
            sentence_based: also write the ``-sentences-`` pickles.
            document_based: write the ``-documents-`` pickles.

        Raises:
            KeyError: if ``SECURITY_DATASETS`` is not set.
        """
        datasets_root = os.environ["SECURITY_DATASETS"]

        # Other embedding setups recorded as notes (not executed):
        #   fastText: stemmer = "nostemmer",
        #     embeddings file "embeddings/cc.en.300.bin", loader type "fasttext",
        #     outfiles "saved-parameters/saved-data/saved-data/own-data/
        #       {}-fasttext-embeddings-sentences-w2i.pickle" / "...-documents-w2i.pickle"
        #   own embeddings without stemming: stemmer = "nostemmer",
        #     embeddings file "embeddings/own-embeddings/sscraped_no_stemming_300.bin",
        #     loader type "word2vec",
        #     outfiles "saved-parameters/saved-data/saved-data/own-data/
        #       {}-nostem-embeddings-sentences-w2i.pickle" / "...-documents-w2i.pickle"
        stemmer = "porter"
        embeddings_file = os.path.join(
            datasets_root,
            "embeddings/own-embeddings/scraped_with_porter_stemming_300.bin",
        )
        embeddings, _ = IOUtils.load_embeddings_file(embeddings_file, "word2vec", lower=True)

        # Sentence datasets (if enabled) are written for all permissions first,
        # then the document datasets.
        jobs = []
        if sentence_based:
            jobs.append(("sentences", OwnDataCreator.save_sentence_based_dataset))
        if document_based:
            jobs.append(("documents", OwnDataCreator.save_document_based_dataset))

        for kind, save_dataset in jobs:
            for permission in permissions:
                infile = os.path.join(
                    datasets_root, "created-data/{}.csv".format(permission)
                )
                outfile = os.path.join(
                    datasets_root,
                    "saved-parameters/saved-data/own-data/{}-embeddings-{}-w2i.pickle".format(
                        permission, kind
                    ),
                )
                save_dataset(infile, permission, embeddings, stemmer, outfile)

In [7]:
class AcNetDataCreator:
    """Builds pickled sentence- or document-level datasets from the AC-Net CSV.

    The CSV must provide the columns ``app_id`` and ``sentence`` plus one
    column for every value of ``_ACNET_MAP``. Consecutive rows sharing the
    same ``app_id`` are treated as one document.

    All methods are static: the class is only a namespace and keeps no state.
    """

    # Permission name used in the reports -> column name in the AC-Net CSV.
    # Kept in one place so the sentence and document loaders cannot drift apart.
    _ACNET_MAP = {
        "RECORD_AUDIO": "MICROPHONE",
        "READ_CONTACTS": "CONTACTS",
        "READ_CALENDAR": "CALENDAR",
        "ACCESS_FINE_LOCATION": "LOCATION",
        "CAMERA": "CAMERA",
        "READ_SMS": "SMS",
        "READ_CALL_LOGS": "CALL_LOG",
        "CALL_PHONE": "PHONE",
        "WRITE_SETTINGS": "SETTINGS",
        "GET_TASKS": "TASKS",
        "STORAGE": "STORAGE",
    }

    @staticmethod
    def _known_tokens(sentence, stemmer, embeddings):
        """Preprocess ``sentence`` and keep only tokens present in ``embeddings``."""
        preprocessed = NLPUtils.preprocess_sentence(sentence, stemmer)
        return [word for word in preprocessed if word in embeddings]

    @staticmethod
    def load_documents(infile, stemmer, embeddings):
        """Load one ``DocumentReport`` per run of rows with the same ``app_id``.

        For every permission the document keeps the value of the first row,
        and is overwritten by any later row whose value equals 1.

        Args:
            infile: path of the AC-Net CSV file.
            stemmer: passed through to ``NLPUtils.preprocess_sentence``.
            embeddings: container supporting ``in``; tokens not in it are dropped.

        Returns:
            List of ``DocumentReport`` objects in file order. Documents are
            returned even when none of their sentences kept any token.
        """
        print("Loading row {} ".format(infile))
        # read training data
        tagged_file = pd.read_csv(infile)
        documents = []
        acnet_map = AcNetDataCreator._ACNET_MAP

        for _, row in tagged_file.iterrows():
            app_id = row["app_id"]
            sentence = row["sentence"]

            # first row of the file, or the app_id changed: start a new document
            if not documents or documents[-1].app_id != app_id:
                documents.append(DocumentReport(app_id))
            current = documents[-1]

            for permission, column in acnet_map.items():
                if permission not in current.permissions or row[column] == 1:
                    current.permissions[permission] = row[column]

            current.sentences.append(sentence)
            filtered = AcNetDataCreator._known_tokens(sentence, stemmer, embeddings)
            if filtered:
                current.preprocessed_sentences.append(filtered)
        print("Loading completed")
        return documents

    @staticmethod
    def load_sentences(infile, stemmer, embeddings):
        """Load one ``SentenceReport`` per row that keeps at least one token.

        Each report carries the row's value for every permission in
        ``_ACNET_MAP``.

        Args:
            infile: path of the AC-Net CSV file.
            stemmer: passed through to ``NLPUtils.preprocess_sentence``.
            embeddings: container supporting ``in``; tokens not in it are dropped.

        Returns:
            List of ``SentenceReport`` objects in file order.
        """
        print("Loading row {} ".format(infile))
        # read training data
        tagged_file = pd.read_csv(infile)
        sentences = []
        acnet_map = AcNetDataCreator._ACNET_MAP

        for _, row in tagged_file.iterrows():
            app_id = row["app_id"]
            sentence = row["sentence"]
            sentence_report = SentenceReport(app_id, sentence)

            for permission, column in acnet_map.items():
                sentence_report.permissions[permission] = row[column]

            sentence_report.preprocessed_sentence = AcNetDataCreator._known_tokens(
                sentence, stemmer, embeddings
            )
            if sentence_report.preprocessed_sentence:
                sentences.append(sentence_report)
        print("Loading completed")
        return sentences

    @staticmethod
    def vocab(infile, stemmer, embeddings):
        """Build the word-to-index map over every sentence of ``infile``.

        Indices are assigned in order of first appearance, starting at 0.

        Returns:
            dict mapping token -> int index.
        """
        print("Loading row {} ".format(infile))
        tagged_file = pd.read_csv(infile)
        w2i = {}
        for _, row in tagged_file.iterrows():
            sentence = row["sentence"]
            for token in AcNetDataCreator._known_tokens(sentence, stemmer, embeddings):
                if token not in w2i:
                    w2i[token] = len(w2i)
        print("Loading completed")
        return w2i

    @staticmethod
    def filtered_vocab_embeddings(w2i, embeddings):
        """Return ``{token: embeddings[token]}`` for every token in ``w2i``."""
        return {key: embeddings[key] for key in w2i}

    @staticmethod
    def save_sentence_based_dataset(infile, embeddings, stemmer, outfile):
        """Pickle ``[subset_embeddings, sentences, w2i]`` for ``infile`` into ``outfile``."""
        sentences = AcNetDataCreator.load_sentences(infile, stemmer, embeddings)
        w2i = AcNetDataCreator.vocab(infile, stemmer, embeddings)
        subset_embeddings = AcNetDataCreator.filtered_vocab_embeddings(w2i, embeddings)
        with open(outfile, "wb") as target:
            pickle.dump([subset_embeddings, sentences, w2i], target)

    @staticmethod
    def save_document_based_dataset(infile, embeddings, stemmer, outfile):
        """Pickle ``[subset_embeddings, documents, w2i]`` for ``infile`` into ``outfile``."""
        documents = AcNetDataCreator.load_documents(infile, stemmer, embeddings)
        w2i = AcNetDataCreator.vocab(infile, stemmer, embeddings)
        subset_embeddings = AcNetDataCreator.filtered_vocab_embeddings(w2i, embeddings)
        with open(outfile, "wb") as target:
            pickle.dump([subset_embeddings, documents, w2i], target)

    @staticmethod
    def run(sentence_based=False, document_based=True):
        """Create the pickled AC-Net datasets.

        Paths are resolved under the ``SECURITY_DATASETS`` environment
        variable. With the defaults this writes only the document-based
        dataset using the porter-stemmed word2vec embeddings.

        Args:
            sentence_based: also write ``embeddings-sentences-w2i.pickle``.
            document_based: write ``embeddings-documents-w2i.pickle``.

        Raises:
            KeyError: if ``SECURITY_DATASETS`` is not set.
        """
        datasets_root = os.environ["SECURITY_DATASETS"]

        # Other embedding setups recorded as notes (not executed):
        #   fastText: stemmer = "nostemmer",
        #     embeddings file "embeddings/cc.en.300.bin", loader type "fasttext",
        #     outfiles "saved-parameters/saved-data/saved-data/ac-net/
        #       fasttext-embeddings-sentences-w2i.pickle" / "...-documents-w2i.pickle"
        #   own embeddings without stemming: stemmer = "nostemmer",
        #     embeddings file "embeddings/own-embeddings/sscraped_no_stemming_300.bin",
        #     loader type "word2vec",
        #     outfiles "saved-parameters/saved-data/saved-data/ac-net/
        #       nostem-embeddings-sentences-w2i.pickle" / "...-documents-w2i.pickle"
        stemmer = "porter"  # "nostemmer"
        embeddings_file = os.path.join(
            datasets_root,
            "embeddings/own-embeddings/scraped_with_porter_stemming_300.bin",
        )
        embeddings, _ = IOUtils.load_embeddings_file(embeddings_file, "word2vec", lower=True)

        infile = os.path.join(datasets_root, "acnet-data/ACNET_DATASET.csv")

        if sentence_based:
            outfile = os.path.join(
                datasets_root,
                "saved-parameters/saved-data/ac-net/embeddings-sentences-w2i.pickle",
            )
            AcNetDataCreator.save_sentence_based_dataset(infile, embeddings, stemmer, outfile)

        if document_based:
            outfile = os.path.join(
                datasets_root,
                "saved-parameters/saved-data/ac-net/embeddings-documents-w2i.pickle",
            )
            AcNetDataCreator.save_document_based_dataset(infile, embeddings, stemmer, outfile)

In [35]:
# Run the OwnDataCreator pipeline. Per the recorded output below, each of the
# STORAGE, RECORD_AUDIO and READ_CONTACTS CSVs is read twice: once to load the
# dataset and once for the vocabulary pass.
OwnDataCreator.run()

Loading row /Users/huseyinalecakir/huseyin/Work/Security/datasets/created-data/STORAGE.csv 
Loading completed
Loading row /Users/huseyinalecakir/huseyin/Work/Security/datasets/created-data/STORAGE.csv 
Loading completed
Loading row /Users/huseyinalecakir/huseyin/Work/Security/datasets/created-data/RECORD_AUDIO.csv 
Loading completed
Loading row /Users/huseyinalecakir/huseyin/Work/Security/datasets/created-data/RECORD_AUDIO.csv 
Loading completed
Loading row /Users/huseyinalecakir/huseyin/Work/Security/datasets/created-data/READ_CONTACTS.csv 
Loading completed
Loading row /Users/huseyinalecakir/huseyin/Work/Security/datasets/created-data/READ_CONTACTS.csv 
Loading completed


In [30]:
# Scratch configuration for inspecting OwnDataCreator.load_documents on a single
# permission. Requires the SECURITY_DATASETS environment variable to be set.
stemmer = "porter" # "nostemmer"
embeddings_file = os.path.join(os.environ["SECURITY_DATASETS"], "embeddings/own-embeddings/scraped_with_porter_stemming_300.bin")
# embeddings_dim is unpacked here but not used by any cell shown in this notebook.
embeddings, embeddings_dim = IOUtils.load_embeddings_file(embeddings_file, "word2vec", lower=True)
permission = "STORAGE"
infile = os.path.join(os.environ["SECURITY_DATASETS"], "created-data/{}.csv".format(permission))
# outfile is computed for completeness; the cells shown below only call
# load_documents, which does not write to it.
outfile = os.path.join(os.environ["SECURITY_DATASETS"],"saved-parameters/saved-data/own-data/{}-embeddings-documents-w2i.pickle".format(permission))


In [32]:
# Load the documents for the permission configured in the previous cell so they
# can be inspected interactively.
documents = OwnDataCreator.load_documents(infile, permission, stemmer, embeddings)


Loading row /Users/huseyinalecakir/huseyin/Work/Security/datasets/created-data/STORAGE.csv 
Loading completed


TypeError: object of type 'DocumentReport' has no len()

In [33]:
# Debug check: print every loaded document that has no preprocessed sentences.
# NOTE(review): load_documents as defined above already drops such documents, so
# this is expected to print nothing; the recorded output presumably predates
# that filter. Confirm by re-running on a fresh kernel.
for doc in documents:
    if len(doc.preprocessed_sentences) == 0:
        print(doc)

<common.DocumentReport object at 0x1a22095f98>
<common.DocumentReport object at 0x1a2124feb8>
<common.DocumentReport object at 0x1a21c85b70>
<common.DocumentReport object at 0x1a221cd908>
