In [1]:
import os
import re
import sys
import inspect
import time
import csv
import requests
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile

# Put the parent of this file's directory at the front of sys.path so the
# project-local `utils` package imported just below can be resolved.
# NOTE(review): inside a notebook kernel inspect.getfile() of the current frame
# may not be a real file path, in which case abspath() resolves it against the
# working directory — confirm this points where intended.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 

from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils

Using the default treebank "en_ewt" for language "en".
Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/huseyinalecakir/Security/source/PermissionDescriptionFidelity/utils/../../../data/models/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: depparse
With settings: 
{'model_path': '/home/huseyinalecakir/Security/source/PermissionDescriptionFidelity/utils/../../../data/models/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/huseyinalecakir/Security/source/PermissionDescriptionFidelity/utils/../../../data/models/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [2]:
def get_descriptions(id_file, out_file):
    """Download the description of every listed app into a CSV file.

    Every non-blank line of ``id_file`` is taken as an application id. For each
    id a GET request is sent to ``http://localhost:3000/api/apps/<id>/?lang=en``.
    When the response status is 200 and its JSON body has a ``"description"``
    key, one ``[application_id, description]`` row is written to ``out_file``;
    all other ids are skipped silently.

    Args:
        id_file: Path of a text file holding one application id per line.
        out_file: Path of the CSV file to create (an existing file is replaced).
    """
    # Get application ids beforehand; the dict keeps them unique, in file order.
    app_infos = {}
    with open(id_file, "r") as target:
        for line in target:
            app_id = line.rstrip()
            if app_id:  # a blank line would otherwise trigger a request for ""
                app_infos[app_id] = None

    # Get application descriptions
    base_url = "http://localhost:3000/api/apps/"
    lang = "/?lang=en"
    # newline="" is what the csv module requires for files it writes: without
    # it, line endings are translated a second time on some platforms, which
    # corrupts rows whose description spans several lines.
    with open(out_file, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["application_id", "description"])
        for app_id in app_infos:
            response = requests.get(base_url + app_id + lang)
            if response.status_code != 200:
                continue
            payload = response.json()  # decode the body once, not per access
            if "description" in payload:
                writer.writerow([app_id, payload["description"]])
                    
def get_app_category(app_id):
    """Look up the upper-cased genre id of an application.

    Sends a GET request to ``http://localhost:3000/api/apps/<app_id>`` and
    prints a one-line progress message for every outcome.

    Args:
        app_id: Application id appended to the API base URL.

    Returns:
        The ``"genreId"`` value of the JSON response, upper-cased, or the
        string ``"NOT_FOUND"`` when the status code is not 200 or the response
        has no ``"genreId"`` key.
    """
    base_url = 'http://localhost:3000/api/apps/'
    response = requests.get(base_url + app_id)
    if response.status_code != 200:
        print(app_id, " category cannot be found")
        return "NOT_FOUND"

    # Decode the body once; every response.json() call re-parses the payload.
    payload = response.json()
    if "genreId" not in payload:
        print("Not found genre id in", app_id)
        return "NOT_FOUND"

    gid = payload["genreId"].upper()
    print(app_id, gid)
    return gid

def update_sheets(file_name, sheetname):
    """Re-number the apps of an annotation sheet and add category/tag rows.

    The sheet is read with pandas and reduced to the columns ``Count``,
    ``Sentences``, ``Manually Marked``, ``uses-permission`` and ``permission``.
    A row whose ``Sentences`` text starts with ``"##"`` is an app header of the
    form ``##<app_id>``. For every header row:

    * ``Count`` is replaced by ``"##<n>"`` where ``n`` counts headers from 1;
    * the header's ``Manually Marked`` value is remembered as the app's
      description tag and cleared on the header row;
    * a ``CATEGORY/<genre>`` row (genre from ``get_app_category``) is inserted
      right after the header;
    * a ``"Description Tag"`` row carrying the remembered tag is inserted
      after the app's last row.

    All other rows are copied through, with ``Sentences`` converted to ``str``.

    Args:
        file_name: Path of the Excel workbook to read.
        sheetname: Name of the sheet inside the workbook.

    Returns:
        A list of five-element row lists in the column order given above.
    """
    df = pd.read_excel(file_name, sheet_name=sheetname)
    data = [
        [row["Count"], row["Sentences"], row["Manually Marked"], row["uses-permission"], row["permission"]]
        for _, row in df.iterrows()
    ]

    def description_row(tag):
        # Closing row of an app block; holds the tag taken from its header.
        return [np.nan, "Description Tag", tag, np.nan, np.nan]

    updated_data = []
    app_order = 1
    description_tag = None
    for row in data:
        row[1] = str(row[1])
        if not row[1].startswith("##"):
            updated_data.append(row)
            continue

        # A new header closes the previous app's block (if there was one).
        if app_order != 1:
            updated_data.append(description_row(description_tag))
        row[0] = "##{}".format(app_order)
        app_order += 1
        app_id = row[1].split("##")[1].strip()
        category = get_app_category(app_id)
        category_row = [np.nan, "CATEGORY/" + category, np.nan, np.nan, np.nan]
        description_tag = row[2]
        row[2] = np.nan
        updated_data.append(row)
        updated_data.append(category_row)

    # The last app has no following header to close its block, so its
    # description tag row has to be written here; previously it was lost.
    if app_order != 1:
        updated_data.append(description_row(description_tag))
    return updated_data

def write_to_excel(updated_data, output_file):
    """Write the rows to an Excel file, bolding header and tag sentences.

    Args:
        updated_data: List of five-element rows ordered as ``Count``,
            ``Sentences``, ``Manually Marked``, ``uses-permission``,
            ``permission``.
        output_file: Destination path passed to ``Styler.to_excel`` (written
            with the ``openpyxl`` engine).
    """
    def background_apply(value):
        # Only text cells can be headings; NaN and other non-string cells
        # must not reach str.startswith.
        is_heading = isinstance(value, str) and value.startswith(("##", "Description Tag"))
        font_weight = 'bold' if is_heading else 'normal'
        return 'font-weight: %s' % font_weight

    updated_df = pd.DataFrame(updated_data, columns = ["Count", "Sentences", "Manually Marked", "uses-permission", "permission"])
    styler = updated_df.style
    # Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map;
    # prefer the new name when it exists and fall back on older versions.
    apply_elementwise = getattr(styler, "map", None) or styler.applymap
    styled = apply_elementwise(background_apply, subset=['Sentences'])
    styled.to_excel(output_file, engine='openpyxl')

def clean_play_store_data(file_path):
    """Load the descriptions CSV and split each description into sentences.

    The first CSV row is treated as a header and skipped. For every following
    row, column 0 is the application id and column 1 the description text,
    which is split with ``NLPUtils.sentence_tokenization``. If an id occurs
    more than once, the last occurrence wins.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A dict mapping application id to the list of its sentences. An empty
        file yields an empty dict.
    """
    data = {}
    # newline="" lets the csv module handle line endings itself, so newlines
    # embedded in quoted description fields are read back unchanged.
    with open(file_path, newline="") as stream:
        reader = csv.reader(stream)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if len(row) < 2:
                # csv.reader returns [] for blank lines; also skip short rows
                continue
            app_id, text = row[0], row[1]
            data[app_id] = list(NLPUtils.sentence_tokenization(text))
    return data

def remove_given_pattern(regex, data):
    """Delete every match of ``regex`` from each line of each entry.

    Args:
        regex: Pattern handed to ``re.sub``.
        data: Dict mapping a key to a list of strings.

    Returns:
        A new dict with the same keys. Each list has the same length as its
        input; a line that is emptied by the substitution is kept as ``""``.
    """
    return {
        key: [re.sub(regex, '', line) for line in lines]
        for key, lines in data.items()
    }

def remove_emoji(data):
    """Strip emoji from every line and drop lines that end up empty.

    ``demoji`` is imported here and ``demoji.download_codes()`` is called on
    every invocation before any line is processed.

    Args:
        data: Dict mapping a key to a list of strings.

    Returns:
        A new dict with the same keys. Each line has had
        ``demoji.replace(line, repl="")`` applied and surrounding whitespace
        stripped; lines that are empty afterwards are left out.
    """
    import demoji
    demoji.download_codes()

    cleaned = {}
    for key, lines in data.items():
        stripped_lines = (demoji.replace(line, repl="").strip() for line in lines)
        cleaned[key] = [line for line in stripped_lines if line]
    return cleaned

def select_given_apps(data, file_path):
    """Keep only the apps listed in a permission file and attach their permissions.

    The file is read line by line (right-stripped). A line starting with
    ``"%%"`` opens a new app whose id is the text after the marker. Until the
    next such line, lines starting with ``"uses-permission"`` or
    ``"permission"`` are collected under that app; all other lines are ignored.

    Args:
        data: Dict mapping application id to its sentence list.
        file_path: Path of the permission listing.

    Returns:
        A dict with one entry per app that appears both in the file and in
        ``data``, in file order::

            {app_id: {"data": data[app_id],
                      "permissions": {"uses-permission": [...],
                                      "permission": [...]}}}
    """
    permission_kinds = ("uses-permission", "permission")
    selected_apps = {}
    current_app = None
    with open(file_path) as stream:
        for raw_line in stream:
            entry = raw_line.rstrip()
            if entry.startswith("%%"):
                current_app = entry.split("%%")[1]
                selected_apps[current_app] = {"uses-permission" : [], "permission" : []}
                continue
            # An empty or unrelated line matches no prefix and is skipped.
            for kind in permission_kinds:
                if entry.startswith(kind):
                    selected_apps[current_app][kind].append(entry)
                    break

    return {
        app_id: {"data": data[app_id], "permissions": permissions}
        for app_id, permissions in selected_apps.items()
        if app_id in data
    }

def write_excel(data, outfile, eliminated_apps={}, count=1000):
    """Write apps, their permissions and their sentences to an xlwt workbook.

    Row 0 holds the bold column titles. Each written app takes one bold header
    row (``#<n>`` in column 0, ``##<app_id>`` in column 1, the colon-joined
    permission lists prefixed with ``:`` in columns 3 and 4) followed by one
    row per sentence in column 1. Column 2 is left empty.

    Args:
        data: Dict as produced by ``select_given_apps``: each value has a
            ``"data"`` sentence list and a ``"permissions"`` dict with
            ``"uses-permission"`` and ``"permission"`` lists.
        outfile: Path handed to ``Workbook.save``.
        eliminated_apps: Container of app ids to skip; only membership is
            tested, it is never modified.
        count: Only the first ``count`` keys of ``data`` are considered
            (skipped apps count towards this limit).
    """
    import xlwt
    header = ["Count", "Sentences", "Manually Marked", "uses-permission", "permission"]

    # Bold cell style shared by the title row and the per-app header rows.
    bold = xlwt.XFStyle()
    bold_font = xlwt.Font()
    bold_font.bold = True
    bold.font = bold_font

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('Sheet')
    for column, title in enumerate(header):
        sheet.write(0, column, title, style=bold)

    next_row = 1
    app_num = 1
    for _, app_id in zip(range(count), data):
        if app_id in eliminated_apps:
            continue
        sheet.write(next_row, 0, "#{}".format(app_num), style=bold)
        sheet.write(next_row, 1, "##{}".format(app_id), style=bold)
        app_permissions = data[app_id]["permissions"]
        sheet.write(next_row, 3, ":{}".format(":".join(app_permissions["uses-permission"])))
        sheet.write(next_row, 4, ":{}".format(":".join(app_permissions["permission"])))
        next_row += 1
        for sentence in data[app_id]["data"]:
            sheet.write(next_row, 1, sentence)
            next_row += 1
        app_num += 1
    workbook.save(outfile)


def run(file_name, sheet_name):
    """Rebuild one sheet with ``update_sheets`` and write it back to ``file_name``.

    Args:
        file_name: Workbook path; it is both read and overwritten.
        sheet_name: Name of the sheet to read.
    """
    write_to_excel(update_sheets(file_name, sheet_name), file_name)

In [4]:
# Locations of the play-store data files. os.path.abspath('') is the current
# working directory, so every path is relative to where the kernel was started.
DIR_NAME = os.path.abspath('')
# Application ids, one per line (read by get_descriptions).
ID_PATH = os.path.join(DIR_NAME, "../../../data/play_store_data/Apps_ReadContacts_989_id.txt")
# Downloaded descriptions (written by get_descriptions, read by clean_play_store_data).
CSV_PATH = os.path.join(DIR_NAME, "../../../data/play_store_data/Apps_ReadContacts_989_id.csv")
# Per-app permission listing consumed by select_given_apps.
PERMISSION_LIST_PATH = os.path.join(DIR_NAME, "../../../data/play_store_data/Apps_ReadContacts_989_permissionsByAAPT.txt")
# Excel workbook written by write_excel and rewritten in place by run.
OUT_PATH = os.path.join(DIR_NAME, "../../../data/play_store_data/Apps_ReadContacts_989_id.xls")


In [None]:
# Download the descriptions for every id in ID_PATH into CSV_PATH.
# Requires the apps API on http://localhost:3000 to be running.
get_descriptions(ID_PATH, CSV_PATH)

In [11]:
# Matches a leading run of characters that are neither word characters nor one
# of the explicitly allowed punctuation marks, i.e. bullet/decoration prefixes.
regex = r"^[^\w\!\?\\\(\)\[\]\“\‘\"]+"
# Pipeline: tokenize into sentences -> strip leading decoration -> strip emoji
# -> keep only apps that have a permission listing.
data = clean_play_store_data(CSV_PATH)
data = remove_given_pattern(regex, data)
data = remove_emoji(data)
data = select_given_apps(data, PERMISSION_LIST_PATH)
# NOTE(review): run() reads OUT_PATH, but write_excel(data, OUT_PATH) only
# appears in a later cell. On a fresh kernel this call needs the workbook to
# exist already — confirm the intended cell order.
# NOTE(review): run() writes a ".xls" path through the openpyxl engine —
# verify that combination is accepted by the installed pandas.
run(OUT_PATH,"Sheet")

[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 5.49 seconds)
[33mWriting emoji data to /home/huseyinalecakir/.demoji/codes.json ...[0m
[92m... OK[0m


In [14]:
# Leftover, disabled call; get_prev_versions and VERSION1 are not defined in
# the visible cells.
#prev_id_list = get_prev_versions(VERSION1)
# Write the selected apps (result of select_given_apps) to the workbook at OUT_PATH.
write_excel(data, OUT_PATH)