# Шаг 3: Декомпиляция APK и генерация набора данных

In [1]:
import json
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder
import subprocess
import os
import json
import csv
import pandas as pd
import time
import numpy as np
import shutil
import glob

In [2]:
# Filesystem locations used by the dataset-generation steps below.
# Every path ends with a separator because file names are appended by plain
# string concatenation.
APK_BASE_PATH = "D:\\ETU\\VKR\\scanned_apps\\"  # directory listed to find the APKs
SMALI_PATH = "D:\\ETU\\VKR\\decompile\\"  # apktool output, one sub-directory per APK
APKTOOL_PATH = "C:\\Windows\\"  # directory that contains apktool.bat
JSON_PATH = "D:\\ETU\\VKR\\json\\"  # "<apk>.json" reports are read from here
CSV_PATH = "D:\\ETU\\VKR\\"  # NOTE(review): not referenced in the visible code

# Column order of every dataset row and of the final CSV.
headers = [
    "code",
    "CWE_ID",
    "description",
    "vulnerability_status",
]

In [3]:
def default_data_row(row):
    """Build an unlabeled dataset row for one source line.

    Args:
        row: The source line; it is converted with ``str`` and stripped of
            surrounding whitespace.

    Returns:
        A list ``[code, CWE_ID, description, vulnerability_status]`` where the
        CWE id and description are empty strings and the status is ``0``.
    """
    return [str(row).strip(), "", "", 0]

In [4]:
def delete_irrelevant_rows(file):
    """Drop rows whose code field starts with an ignored prefix.

    Args:
        file: An iterable of rows; the first element of each row is the code
            string that is inspected (after stripping whitespace).

    Returns:
        A new list with the rows whose code does not start with any of the
        ignored prefixes, in their original order.
    """
    ignored_prefixes = (
        ".", ":", "#", "goto", "value", "\"", "}",
        "return", "throw", "accessFlags", "name",
    )
    return [
        record
        for record in file
        if not record[0].strip().startswith(ignored_prefixes)
    ]

In [5]:
def delete_rows_with_no_code(df):
    """Return the rows of *df* whose ``code`` column is not an empty string.

    The original implementation called ``df.reset_index()`` and discarded the
    result, so the index was never actually reset. The index is now renumbered
    from 0 with ``drop=True`` (the old index is not added as a column, so the
    column layout is unchanged), and the result is a new frame rather than a
    filtered slice of the input.

    Args:
        df: A DataFrame that has a ``code`` column.

    Returns:
        A new DataFrame with the same columns, containing only the rows with
        non-empty ``code`` and a fresh 0..n-1 index. The input is not modified.
    """
    has_code = df["code"] != ""
    return df[has_code].reset_index(drop=True)

In [6]:
def get_csv_source(apk_source_path, path):
    """Read a ``.smali`` file into a DataFrame.

    The file is parsed with ``"\\0"`` as the delimiter and without a header
    row; blank lines are kept and missing values are replaced by ``""``.
    Presumably the NUL delimiter is chosen so that each source line ends up
    in a single column -- confirm against the pandas version in use.

    Args:
        apk_source_path: Prefix of the file location (string).
        path: File path without the ``.smali`` extension, appended directly to
            ``apk_source_path``.

    Returns:
        The parsed DataFrame with NaN values replaced by empty strings.
    """
    smali_file = "".join((apk_source_path, path, ".smali"))
    frame = pd.read_csv(
        smali_file,
        encoding='utf-8',
        header=None,
        delimiter="\0",
        skip_blank_lines=False,
    )
    return frame.fillna("")

In [7]:
def decompile_with_apktool(apk):
    """Decompile one APK with apktool into ``SMALI_PATH + apk``.

    The command is passed to ``subprocess.run`` as an argument list instead of
    being concatenated into a single string for ``os.system``. That way APK
    file names containing spaces or shell metacharacters are handed to apktool
    as a single argument and cannot alter the command.

    The call stays best-effort, as before: a failure to start apktool or a
    non-zero exit code is reported on stdout but never raised.

    Args:
        apk: File name of the APK inside ``APK_BASE_PATH``; also used as the
            name of the output directory inside ``SMALI_PATH``.

    Returns:
        None.
    """
    command = [
        APKTOOL_PATH + "apktool.bat",
        "d",
        APK_BASE_PATH + apk,
        "-o",
        SMALI_PATH + apk,
    ]
    try:
        result = subprocess.run(command, check=False)
    except OSError as error:
        # os.system never raised when the tool was missing; keep it that way.
        print("apktool could not be started: " + str(error))
        return
    if result.returncode != 0:
        print("apktool exited with code " + str(result.returncode) + " for " + apk)

## Генерация набора данных

In [8]:
# Build the labeled dataset.
# For every APK in APK_BASE_PATH: load its "<apk>.json" report, decompile the
# APK, and for each reported file turn every non-empty source line into a
# dataset row. A row is labeled when it is an "invoke" of the reported target
# inside the reported caller method. The decompiled sources are removed again
# after each APK, and all rows are finally written to one CSV file.
apk_list = os.listdir(APK_BASE_PATH)
vulnerability_dataset = []

for apk in apk_list:
    print("Generating vulnerability dataset from: " + apk)

    # Load the report; an APK with a missing or unreadable report is skipped.
    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(JSON_PATH + apk + ".json") as json_data:
            d = json.load(json_data)
        cwe_list = d["cwe"]
    except Exception as ej:
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(str(ej))
        continue

    # Nothing reported for this APK: do not spend time decompiling it.
    if not cwe_list:
        continue

    decompile_with_apktool(apk)

    # Collect the full path of every file produced by the decompilation.
    file_list = []
    apk_source_path = SMALI_PATH + apk

    for path, subdirs, files in os.walk(apk_source_path, topdown=False):
        for name in files:
            file_list.append(os.path.join(path, name))

    for entry in cwe_list:
        try:
            # The first decompiled file whose path contains "<path>.smali".
            smali_suffix = entry["path"] + ".smali"
            matches = [s for s in file_list if smali_suffix in s]
            if not matches:
                # Skip only this entry. The previous `break` dropped all
                # remaining entries of the APK as soon as one file was not
                # found, which also happened for a second entry on an already
                # processed file (processed files are removed from file_list
                # below).
                continue
            file_path = matches[0]
            source = pd.read_csv(file_path, encoding='utf-8', header=None, delimiter="\0",skip_blank_lines=False).fillna("")

            # One unlabeled row per source line, then drop the empty lines.
            file_df = []
            for index, row in source.iterrows():
                file_df.append(default_data_row(row[0]))

            file_df = pd.DataFrame(file_df, columns = headers)
            file_df = delete_rows_with_no_code(file_df)

            for cwe_data in entry["list"]:
                is_caller = False
                for index, row in file_df.iterrows():
                    code = str(row["code"])
                    if all(x in code for x in [".method", cwe_data["caller"]]):
                        # Entered a method whose declaration mentions the caller.
                        is_caller = True
                        continue
                    if code.startswith(".end method"):
                        # Left the current method: without this reset, an
                        # invoke in any later method would be labeled as if it
                        # were inside the caller.
                        is_caller = False
                        continue
                    if is_caller:
                        if all(x in code for x in ["invoke", cwe_data["target"]]):
                            file_df.at[index, "CWE_ID"] = cwe_data["id"]
                            file_df.at[index, "description"] = cwe_data["description"]
                            file_df.at[index, "vulnerability_status"] = 1
                            # Only the first matching invoke is labeled.
                            break
        except Exception as ei:
            # Best effort: a file that cannot be processed is skipped.
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(str(ei))
            continue

        file_tolist = file_df.values.tolist()
        vulnerability_dataset.extend(delete_irrelevant_rows(file_tolist))
        # Each decompiled file contributes its rows to the dataset only once.
        file_list = [value for value in file_list if value != file_path]

    # Remove the decompiled sources of this APK before moving to the next one.
    if os.path.isdir(SMALI_PATH + apk):
        shutil.rmtree(SMALI_PATH + apk)

# NOTE(review): the variable name contains a typo ("vulberability"); it is kept
# because code outside this block may refer to it.
vulberability_dataframe = pd.DataFrame(vulnerability_dataset, columns = headers)

vulberability_dataframe.to_csv('vulnerability_dataset_BALANCE.csv', sep=',', encoding='utf-8', index=False)

Generating vulnerability dataset from: acca.apk
Generating vulnerability dataset from: acdisplay.apk
Generating vulnerability dataset from: activityforcenewtask.apk
Generating vulnerability dataset from: activity-launcher.apk
Generating vulnerability dataset from: activity-launcher-fork.apk
Generating vulnerability dataset from: adaptive-brightness-tile.apk
Generating vulnerability dataset from: adbio.apk
Generating vulnerability dataset from: adblock-plus.apk
Generating vulnerability dataset from: adfree.apk
Generating vulnerability dataset from: adguard-content-blocker.apk
Generating vulnerability dataset from: admincontrol.apk
Generating vulnerability dataset from: adsilence.apk
Generating vulnerability dataset from: afh-downloader.apk
Generating vulnerability dataset from: afwall+.apk
Generating vulnerability dataset from: ag-store.apk
Generating vulnerability dataset from: aicia.apk
Generating vulnerability dataset from: aiproute.apk
Generating vulnerability dataset from: airmessa