In [1]:
import os
import re
import sys
import inspect
import time
import csv
import requests
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile

# Put the parent of this file's directory at the front of sys.path so that the
# project-local `utils` package imported just below can be resolved.
# NOTE(review): inside a notebook kernel the current frame has no real source
# file, so `current_dir` presumably ends up being the working directory — confirm.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 

from utils.io_utils import IOUtils
from utils.nlp_utils import NLPUtils

Using the default treebank "en_ewt" for language "en".
Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/huseyinalecakir/Security/source/PermissionDescriptionFidelity/utils/../../../data/models/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: depparse
With settings: 
{'model_path': '/home/huseyinalecakir/Security/source/PermissionDescriptionFidelity/utils/../../../data/models/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/huseyinalecakir/Security/source/PermissionDescriptionFidelity/utils/../../../data/models/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [8]:
def get_descriptions(id_file, out_file):
    """Download the description of every listed app into a two-column CSV.

    Parameters
    ----------
    id_file : str
        Text file with one application id per line. Trailing whitespace is
        stripped, blank lines are ignored and a repeated id is requested once.
    out_file : str
        Destination CSV. It is overwritten and always starts with the header
        row ``application_id,description``.

    Apps are left out of the CSV when the API answers with a status other
    than 200 or when the JSON payload has no ``"description"`` key.
    """
    # A dict keeps the ids unique while preserving the order of the id file.
    app_infos = {}
    with open(id_file, "r") as target:
        for line in target:
            app_id = line.rstrip()
            if app_id:  # a blank line would otherwise become a request for ""
                app_infos[app_id] = None

    # Get application descriptions
    base_url = "http://localhost:3000/api/apps/"
    lang = "/?lang=en"
    # newline="" is required by the csv module so that line breaks embedded in
    # a description are written verbatim instead of being translated.
    with open(out_file, "w", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["application_id", "description"])
        for app_id in app_infos:
            response = requests.get(base_url + app_id + lang)
            if response.status_code != 200:
                continue
            payload = response.json()  # decode the body only once
            if "description" in payload:
                writer.writerow([app_id, payload["description"]])
                    
def get_app_category(app_id):
    """Look up the genre of ``app_id`` through the API at localhost:3000.

    Returns the upper-cased ``genreId`` taken from the JSON payload. When the
    request does not answer with HTTP 200, or the payload carries no usable
    ``genreId`` (key missing or null), a notice is printed and the string
    ``"NOT_FOUND"`` is returned instead.
    """
    base_url = 'http://localhost:3000/api/apps/'
    response = requests.get(base_url+app_id)
    if response.status_code != 200:
        print(app_id, " category cannot be found")
        return "NOT_FOUND"

    payload = response.json()  # decode the body only once
    genre_id = payload.get("genreId") if "genreId" in payload else None
    if genre_id is None:
        # A null genre would otherwise crash on .upper() below.
        print("Not found genre id in", app_id)
        return "NOT_FOUND"
    return genre_id.upper()

def update_sheets(file_name, sheetname):
    """Rebuild the rows of an annotation sheet with category and tag rows added.

    The sheet is read with ``pd.read_excel`` and must provide the columns
    ``Count``, ``Sentences``, ``Manually Marked``, ``uses-permission`` and
    ``permission``. A row whose ``Sentences`` text starts with ``##`` opens a
    new app. For every app:

    * the header row gets ``Count`` rewritten to ``##<running number>`` and its
      ``Manually Marked`` value moved out (the cell is blanked);
    * a ``CATEGORY/<genre>`` row (genre from ``get_app_category``) is inserted
      right after the header;
    * a ``Description Tag`` row holding the moved value is appended after the
      app's last row — including the final app of the sheet, which previously
      never received one.

    Returns
    -------
    list of list
        Rows in output order, each ``[Count, Sentences, Manually Marked,
        uses-permission, permission]``. ``Sentences`` is always a ``str``.
    """
    columns = ["Count", "Sentences", "Manually Marked", "uses-permission", "permission"]
    df = pd.read_excel(file_name, sheet_name=sheetname)
    data = [[row[name] for name in columns] for _, row in df.iterrows()]

    def description_row(tag):
        return [np.nan, "Description Tag", tag, np.nan, np.nan]

    updated_data = []
    app_order = 1
    description_tag = None
    for row in data:
        # Empty cells arrive as NaN; downstream styling expects text here.
        row[1] = str(row[1])
        if not row[1].startswith("##"):
            updated_data.append(row)
            continue

        if app_order != 1:
            # Close the previous app before opening the next one.
            updated_data.append(description_row(description_tag))
        row[0] = "##{}".format(app_order)
        app_order += 1
        app_id = row[1].split("##")[1].strip()
        category = get_app_category(app_id)
        app_category = "CATEGORY/" + category
        category_row = [np.nan, app_category , np.nan, np.nan, np.nan]
        description_tag = row[2]
        row[2] = np.nan
        updated_data.append(row)
        updated_data.append(category_row)

    if app_order != 1:
        # The last app has no following header to trigger its tag row.
        updated_data.append(description_row(description_tag))
    return updated_data

def write_to_excel(updated_data, output_file):
    """Write the rebuilt rows to an Excel file with heading rows in bold.

    Parameters
    ----------
    updated_data : list of list
        Rows of ``[Count, Sentences, Manually Marked, uses-permission,
        permission]``.
    output_file : str
        Path handed to ``Styler.to_excel`` (written with the openpyxl engine).

    A ``Sentences`` cell is rendered bold when its text starts with ``##`` or
    ``Description Tag``; every other cell in that column is rendered normal.
    """
    def background_apply(value):
        # Non-text cells (e.g. NaN) have no startswith; treat them as normal
        # instead of failing the whole export.
        is_heading = isinstance(value, str) and value.startswith(("##", "Description Tag"))
        return 'font-weight: %s' % ('bold' if is_heading else 'normal')

    updated_df = pd.DataFrame(updated_data, columns = ["Count", "Sentences", "Manually Marked", "uses-permission", "permission"])
    styler = updated_df.style
    # Newer pandas renamed Styler.applymap to Styler.map; use whichever exists.
    apply_cellwise = getattr(styler, "map", None)
    if apply_cellwise is None:
        apply_cellwise = styler.applymap
    styled = apply_cellwise(background_apply, subset=['Sentences'])
    styled.to_excel(output_file, engine='openpyxl')

def clean_play_store_data(file_path):
    """Read the description CSV and split every description into sentences.

    Parameters
    ----------
    file_path : str
        CSV whose first row is a header and whose following rows hold the
        application id in column 0 and the description text in column 1.

    Returns
    -------
    dict
        Application id mapped to the list of sentences yielded by
        ``NLPUtils.sentence_tokenization`` for its description. An empty file
        gives an empty dict; rows with fewer than two fields are skipped.
    """
    data = {}
    # newline="" lets the csv module handle line breaks embedded in quoted
    # descriptions itself.
    with open(file_path, newline="") as stream:
        reader = csv.reader(stream)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if len(row) < 2:
                # Blank or truncated line: nothing to tokenise.
                continue
            app_id, text = row[0], row[1]
            data[app_id] = list(NLPUtils.sentence_tokenization(text))
    return data

def remove_given_pattern(regex, data):
    """Return a new dict in which every match of ``regex`` is deleted from each line.

    ``data`` maps a key to a list of strings; the input is left untouched and
    every line is kept, even when it becomes empty.
    """
    stripped = {}
    for app_id, lines in data.items():
        stripped[app_id] = [re.sub(regex, '', text) for text in lines]
    return stripped

def remove_emoji(data):
    """Return a new dict with emoji removed from every line.

    Each line is passed through ``demoji.replace`` with an empty replacement
    and then stripped; lines that end up empty are dropped. Note that
    ``demoji.download_codes()`` is invoked on every call.
    """
    import demoji
    demoji.download_codes()

    def _without_emoji(lines):
        candidates = (demoji.replace(text, repl="").strip() for text in lines)
        return [text for text in candidates if text]

    return {app_id: _without_emoji(lines) for app_id, lines in data.items()}

def select_given_apps(data, file_path):
    """Keep only the apps named in a permission listing and attach their permissions.

    The listing at ``file_path`` is read line by line: a line starting with
    ``%%`` opens a new app (the id is the text between the first two ``%%``
    separators), and following lines starting with ``uses-permission`` or
    ``permission`` are collected for that app. Other lines are ignored.

    Returns a dict mapping each listed app id that also exists in ``data`` to
    ``{"data": data[app_id], "permissions": {"uses-permission": [...],
    "permission": [...]}}``.
    """
    permissions_by_app = {}
    current_app = None
    with open(file_path) as listing:
        for raw_line in listing:
            entry = raw_line.rstrip()
            if not entry:
                continue
            if entry.startswith("%%"):
                current_app = entry.split("%%")[1]
                permissions_by_app[current_app] = {"uses-permission" : [], "permission" : []}
                continue
            # "uses-permission" is tested first, mirroring the if/elif order.
            for kind in ("uses-permission", "permission"):
                if entry.startswith(kind):
                    permissions_by_app[current_app][kind].append(entry)
                    break

    return {
        app_id: {"data": data[app_id], "permissions": granted}
        for app_id, granted in permissions_by_app.items()
        if app_id in data
    }

def write_excel(data, outfile, eliminated_apps={}, count=1000):
    """Write the selected apps and their sentences to an ``.xls`` workbook.

    A bold header row is followed, per app, by one bold title row
    (``#<n>`` / ``##<app id>`` plus the ``:``-joined permission lists in the
    last two columns) and one row per sentence in the ``Sentences`` column.

    Only the first ``count`` entries of ``data`` are looked at; entries found
    in ``eliminated_apps`` are skipped but still count towards that limit.
    The workbook has a single sheet named ``Sheet`` and is saved to
    ``outfile``.
    """
    import xlwt

    # Bold style shared by the header row and the per-app title cells.
    bold = xlwt.XFStyle()
    bold_font = xlwt.Font()
    bold_font.bold = True
    bold.font = bold_font

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('Sheet')

    header = ["Count", "Sentences", "Manually Marked", "uses-permission", "permission"]
    for column, title in enumerate(header):
        sheet.write(0, column, title, style=bold)

    next_row = 1
    app_num = 1
    for _, app_id in zip(range(count), data):
        if app_id in eliminated_apps:
            continue
        granted = data[app_id]["permissions"]
        sheet.write(next_row, 0, "#{}".format(app_num), style=bold)
        sheet.write(next_row, 1, "##{}".format(app_id), style=bold)
        sheet.write(next_row, 3, ":{}".format(":".join(granted["uses-permission"])))
        sheet.write(next_row, 4, ":{}".format(":".join(granted["permission"])))
        next_row += 1
        for sentence in data[app_id]["data"]:
            sheet.write(next_row, 1, sentence)
            next_row += 1
        app_num += 1
    workbook.save(outfile)


def run(file_name, sheet_name):
    """Rebuild the rows of ``sheet_name`` in ``file_name`` and write them back to the same path."""
    write_to_excel(update_sheets(file_name, sheet_name), file_name)

In [4]:
# All inputs and outputs are addressed relative to the notebook's working directory.
DIR_NAME = os.path.abspath('')
ID_PATH, CSV_PATH, PERMISSION_LIST_PATH, OUT_PATH = (
    os.path.join(DIR_NAME, relative_path)
    for relative_path in (
        "../../../data/play_store_data/Apps_ReadContacts_989_id.txt",  # app ids, one per line
        "../../../data/play_store_data/Apps_ReadContacts_989_id.csv",  # fetched descriptions
        "../../../data/play_store_data/Apps_ReadContacts_989_permissionsByAAPT.txt",  # permission listing
        "../../../data/play_store_data/Apps_ReadContacts_989_id.xls",  # generated workbook
    )
)


In [5]:
# Fetch the description of every app id listed in ID_PATH and store them in CSV_PATH.
# This issues one HTTP request per app id against the API at localhost:3000.
get_descriptions(ID_PATH, CSV_PATH)

In [7]:
# Matches a leading run of characters that are none of: word characters, "!", "?",
# a backslash, round or square brackets, or the quotes “ ‘ ".
regex = r"^[^\w\!\?\\\(\)\[\]\“\‘\"]+"
# Pipeline: split descriptions into sentences, delete the leading run matched by
# `regex`, remove emoji, keep only apps present in the permission listing, write
# the workbook, then rebuild that same workbook in place via run().
data = clean_play_store_data(CSV_PATH)
data = remove_given_pattern(regex, data)
data = remove_emoji(data)
data = select_given_apps(data, PERMISSION_LIST_PATH)
write_excel(data, OUT_PATH)
run(OUT_PATH,"Sheet")

[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 0.39 seconds)
[33mWriting emoji data to /home/huseyinalecakir/.demoji/codes.json ...[0m
[92m... OK[0m
com.narvii.amino.x42 SOCIAL
com.bijoysingh.clipo PRODUCTIVITY
coocent.tools.music.ringtonemaker MUSIC_AND_AUDIO
net.invoice.bee BUSINESS
com.tgclab.adicon LIFESTYLE
com.kakao.talk COMMUNICATION
mega.privacy.android.app PRODUCTIVITY
com.storypark.families.android EDUCATION
air.app.scb.breeze.android.main.my.prod FINANCE
com.zoho.mail PRODUCTIVITY
com.fi6715.godough FINANCE
com.mteducare.mtrobomateplus EDUCATION
com.zepp.zgolf SPORTS
ws.coverme.im COMMUNICATION
jim.mykeyboard.myphotokeyboard PHOTOGRAPHY
com.jpm.sig.android FINANCE
com.microsoft.office.powerpoint PRODUCTIVITY
co.village SHOPPING
ranat.torkia.ranat_torkia.turkish.ringtones MUSIC_AND_AUDIO
com.careem.acma MAPS_AND_NAVIGATION
ru.euphoria.doggy SOCIAL
com.adslinfotech.simpleaccounting FINANCE
com.hi5.app SOCIAL
com.tomatomusic.loudrts PERSONALIZATION
com

com.whatsapp.w4b COMMUNICATION
com.beta.bcard PRODUCTIVITY
com.onewaycab TRAVEL_AND_LOCAL
finarea.HotVoip SOCIAL
tw.com.taishinbank.richart FINANCE
com.cbs.sportsapp.android.psu SPORTS
com.xero.touch BUSINESS
com.synergygb.mercantil.Tpago FINANCE
ch.threema.app.work COMMUNICATION
app.mywed.android EVENTS
bestringtones2019.topringtones2020.android LIBRARIES_AND_DEMO
com.octopuscards.nfc_reader TOOLS
com.microsoft.office.word PRODUCTIVITY
com.SmoreGames.WordWinner GAME_WORD
com.shenyaocn.android.barmaker TOOLS
fax.app BUSINESS
mozat.rings.loops SOCIAL
in.justride TRAVEL_AND_LOCAL
es.lacaixa.mobile.android.newwapicon FINANCE
com.alltrails.alltrails HEALTH_AND_FITNESS
com.sporteasy.android SPORTS
com.gettechnology.incomeon_income SOCIAL
com.vipera.chebanca FINANCE
com.gigaset.elements LIFESTYLE
com.winrgames.solitaire GAME_CARD
com.music.ringtone.maker.mp3 TOOLS
com.pack.myshiftwork PRODUCTIVITY
com.wf.wellsfargomobile FINANCE
eu.eleader.mobilebanking.pekao FINANCE
com.apps.power.super.cle

com.shinsegae.mobile.froyo SHOPPING
com.babyfish.policesiren ENTERTAINMENT
com.airtelmyplan BUSINESS
com.projectstar.ishredder.android.standard TOOLS
com.oscprofessionals.sales_assistant BUSINESS
ru.mw FINANCE
com.lucidchart.android.chart PRODUCTIVITY
com.zappoint.zappoint BUSINESS
com.punchbowl.mobile LIFESTYLE
com.narvii.amino.x237406644 SOCIAL
ru.kykyryza FINANCE
com.colorcall.colorphone.flash.dialer PERSONALIZATION
com.breezeghana.ui FINANCE
com.keepcalling.ui COMMUNICATION
com.callpod.android_apps.keeper PRODUCTIVITY
com.nudgeyourself.nudge HEALTH_AND_FITNESS
krk.joker.jokerkeyboard PERSONALIZATION
haygot.togyah.app EDUCATION
com.ifs.mobilebanking.fiid3164 FINANCE
org.myklos.inote PRODUCTIVITY
com.safetyculture.iauditor BUSINESS
Not found genre id in com.g5e.smmmtch3pg.android
com.tomatomusic.birdsrts PERSONALIZATION
com.opentecheng.paginegialle.dream TRAVEL_AND_LOCAL
com.intuit.quickbooks BUSINESS
com.fragileheart.musiccutter MUSIC_AND_AUDIO
com.narvii.amino.x250948626 SOCIAL
ard

com.ovuline.fertility MEDICAL
com.emeint.android.mwallet.bm FINANCE
com.yinzcam.nfl.seahawks SPORTS
org.telegram.BifToGram COMMUNICATION
com.appgenix.bizcal PRODUCTIVITY
com.armut.armutha LIFESTYLE
com.makelifesimple.duplicatedetector TOOLS
com.fotoable.faceswap.c403 PHOTOGRAPHY
com.iledger.book BUSINESS
br.com.maceda.android.antifurtow TOOLS
com.ubercab MAPS_AND_NAVIGATION
com.mykronoz.zefit4 HEALTH_AND_FITNESS
finarea.Telbo COMMUNICATION
com.planetfitness HEALTH_AND_FITNESS
me.swiftgift.swiftgift SHOPPING
com.rayg.sirens ENTERTAINMENT
com.applegends.ringtones.free_songs ENTERTAINMENT
com.momobills.billsapp BUSINESS
co.alegra.app BUSINESS
com.jrj.bellsandwhistles PERSONALIZATION
de.eos.uptrade.android.fahrinfo.berlin MAPS_AND_NAVIGATION
com.fi7233.godough FINANCE
com.cisco.im COMMUNICATION
com.chamberlain.myq.chamberlain LIFESTYLE
la.droid.qr PRODUCTIVITY
com.narvii.amino.x231629506 SOCIAL
com.grandcinema.gcapp.screens ENTERTAINMENT
Not found genre id in ir.mobillet.app
com.obhai TRAV

com.ifs.banking.fiid1425 FINANCE
com.guruinfomedia.ebook.pdfviewer PRODUCTIVITY
com.homescreen.phone.theme PRODUCTIVITY
vn.mobifone.mobifonenext BUSINESS
ru.gelin.android.sendtosd TOOLS
cb.ibank FINANCE
com.paltalk.chat.android SOCIAL
ua.com.cs.ifobs.mobile.android.otp BUSINESS
com.nearbuy.nearbuymobile FOOD_AND_DRINK
com.blocktrail.mywallet FINANCE
com.naver.nozzle PRODUCTIVITY
com.niceringtonesapps.funrts PERSONALIZATION
es.mrcl.app.juasapp ENTERTAINMENT
com.three60.cabioclient TRAVEL_AND_LOCAL
com.datainfosys.datamail COMMUNICATION
com.iflytek.freeringtones MUSIC_AND_AUDIO
com.s4m FINANCE
com.tac.woodproof HEALTH_AND_FITNESS
com.meihillman.ringtonemaker VIDEO_PLAYERS
com.sony.songpal MUSIC_AND_AUDIO
com.google.android.apps.tachyon COMMUNICATION
com.signnow.android PRODUCTIVITY
tursky.jan.settings TOOLS
au.com.nab.mobile FINANCE
nl.afas.pocket2 PRODUCTIVITY
in.fulldive.shell ENTERTAINMENT
com.publicbank.PBBVOCA FINANCE
com.bitdefender.centralmgmt TOOLS
finarea.VoipStunt SOCIAL
com.mo