In [1]:
import collections
import hashlib
import json
import math
import os

import pandas
from pandas import DataFrame

In [3]:

def json_file_list(path: str, sections: list) -> dict:
    """
    Lists the contents of each section folder found under a base folder.

    Args:
        path: Base folder that contains one sub-folder per section.  A
            trailing path separator is optional.
        sections: Names of the section sub-folders to list.

    Returns:
        A dictionary mapping each section name to the list of entry names
        returned by ``os.listdir`` for that section's folder.

    Raises:
        OSError: If a section folder does not exist or cannot be read.
    """
    # os.path.join inserts the separator only when it is missing, so both
    # "corpus/" and "corpus" resolve to the same section folder.
    return {
        section: os.listdir(os.path.join(path, section))
        for section in sections
    }


def read_json(filename: str) -> dict:
    """
    Loads the JSON file.

    Args:
        filename: Path of the JSON document to read.

    Returns:
        The decoded content of the file, as produced by ``json.load``.

    Raises:
        Exception: If the file cannot be opened, decoded as UTF-8 or parsed
            as JSON.  The underlying error is attached as ``__cause__``.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as err:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        # KeyboardInterrupt/SystemExit are deliberately left to propagate.
        raise Exception(f"Reading {filename}: file encountered an error") from err


def read_file(path: str, section: str) -> list:
    """
    Loads the TXT file, but only when its path belongs to the given section.

    Args:
        path: Path of the text file to read.
        section: Section name that must occur somewhere in ``path``.

    Returns:
        The list of lines of the file (line endings preserved), or ``None``
        when ``section`` does not occur in ``path``.

    Raises:
        Exception: If the file cannot be opened or decoded.  The underlying
            error is attached as ``__cause__``.
    """
    if section not in path:
        # Files from other sections are skipped; callers receive None.
        return None
    try:
        with open(path, "r") as f:
            return f.readlines()
    except (OSError, ValueError) as err:
        # ValueError covers UnicodeDecodeError; KeyboardInterrupt/SystemExit
        # are deliberately left to propagate.
        raise Exception(f"Reading {path} file encountered an error") from err


def save_dataframe(dataframe: DataFrame, path: str, section: str, df_name: str = "") -> None:
    """
    Saves the dataframe in CSV format.

    The output file name is the plain concatenation
    ``path + section + df_name + ".csv"``, written without the index column.

    Args:
        dataframe: Table to write.
        path: Prefix of the output file (a folder path must end with its
            separator, because the parts are concatenated as-is).
        section: Section name, used as the first part of the file name.
        df_name: Optional suffix appended after the section name.  Defaults
            to an empty string so that callers passing only
            ``(dataframe, path, section)`` keep working.
    """
    dataframe.to_csv(path + section + df_name + ".csv", index=False)


def flatten(d: dict, sep=".") -> collections.OrderedDict:
    """
    Collapses a nested structure of dicts and lists into a single-level
    ordered dictionary.

    Every leaf value is stored under the path that leads to it: dictionary
    keys and list positions are joined with ``sep`` (a dot by default).
    Empty lists and empty dicts contribute no entries.
    """
    def walk(node, prefix):
        # Yield (joined_key, leaf_value) pairs in depth-first order.
        if isinstance(node, list):
            children = ((str(index), item) for index, item in enumerate(node))
        elif isinstance(node, dict):
            children = node.items()
        else:
            yield prefix, node
            return
        for name, child in children:
            # A falsy prefix means there is nothing to join in front of name.
            yield from walk(child, prefix + sep + name if prefix else name)

    return collections.OrderedDict(walk(d, ""))


def normalize(flat_json):
    """
    Turns a flat (dot-separated keys) dictionary into a one-row dataframe.

    Only the last dot-separated segment of each key is kept as the column
    name; values whose keys share the same last segment are added together.
    Columns appear in order of first occurrence.
    """
    totals = {}
    for full_key, amount in flat_json.items():
        column = full_key.split(sep='.')[-1]
        # Start every column at 0 and accumulate in key order.
        totals[column] = totals.get(column, 0) + amount
    return pandas.json_normalize(totals)


def compute_tf(word_dict):
    """
    Calculates the term frequency (TF) of a document.
    Term frequency is the number of times a term appears in a document divided by the total number of terms in that document.

    Args:
        word_dict: Mapping of term -> number of occurrences in the document.

    Returns:
        A dictionary mapping each term to ``count / total``, where ``total``
        is the sum of all counts in ``word_dict``.  An empty document yields
        an empty dictionary; a document whose counts sum to zero yields 0.0
        for every term.
    """
    # The denominator is the total number of term occurrences, not the number
    # of distinct terms: dividing by len(word_dict) would not yield a frequency.
    total_terms = float(sum(word_dict.values()))
    if total_terms == 0:
        return {word: 0.0 for word in word_dict}
    return {word: count / total_terms for word, count in word_dict.items()}


def compute_idf(corpus: dict, wordlist: list) -> dict:
    """
    Calculates a smoothed inverse document frequency (IDF) for every term
    in ``wordlist``.

    For each term the document frequency ``df`` is the number of documents
    in ``corpus`` where the term has a value greater than zero, and the
    result is ``log10((1 + N) / (1 + df) + 1)`` with ``N`` the number of
    documents.  A term that occurs in the corpus with a positive value but
    is missing from ``wordlist`` raises ``KeyError``.
    """
    doc_freq = {term: 0.0 for term in wordlist}
    for term_counts in corpus.values():
        for term, count in term_counts.items():
            if count > 0:
                doc_freq[term] = doc_freq[term] + 1
    total_docs = len(corpus)
    return {
        term: math.log10(((1 + total_docs) / (1 + freq)) + 1)
        for term, freq in doc_freq.items()
    }


def compute_tfidf(tf: dict, idf: dict) -> dict:
    """
    Combines per-document term frequencies with corpus-wide IDF weights.

    ``tf`` maps document id -> {term: tf value}; ``idf`` maps term -> idf
    value.  The result has the same shape as ``tf`` with every value
    multiplied by the IDF of its term.  A term missing from ``idf`` raises
    ``KeyError``.
    """
    return {
        doc_id: {term: weight * idf[term] for term, weight in term_weights.items()}
        for doc_id, term_weights in tf.items()
    }


def create_wordlist(corpus: dict) -> list:
    """
    Builds the list of distinct terms found across all documents of the
    corpus (a dictionary of document id -> {term: value}).

    Terms are returned in order of first appearance.
    """
    # dict.fromkeys de-duplicates in one pass while keeping insertion order,
    # avoiding a linear "term not in wordlist" scan for every term.
    every_term = (term for terms in corpus.values() for term in terms)
    return list(dict.fromkeys(every_term))


def convert_to_dataframe(tf_idf: dict) -> DataFrame:
    """
    Converts the dictionary with TF-IDF values into a dataframe.

    Each document becomes one row (documents are processed in ascending id
    order); the columns are the union of the terms of all documents, with
    missing values left as NaN.  A final ``id`` column holds the document ids.

    Args:
        tf_idf: Mapping of document id -> {term: tf-idf value}.

    Returns:
        A dataframe with one row per document plus the ``id`` column.  An
        empty ``tf_idf`` yields an empty dataframe that only has the ``id``
        column.
    """
    sample_id = sorted(tf_idf)
    if not sample_id:
        # Nothing to convert: return an empty table instead of failing later.
        return pandas.DataFrame({"id": sample_id})

    rows = []
    for cont, report in enumerate(sample_id):
        print(cont, "- Preparing the dictionary of sample", report, "to transform into DataFrame", end='')
        rows.append(normalize(flatten(tf_idf[report])))
        print("......completed")

    # Concatenate once at the end: growing the table with one concat per
    # document re-copies the whole table every time (quadratic cost).
    print("Concatenating DataFrames", end='')
    table = pandas.concat(rows, ignore_index=True, axis=0)
    print("......completed")
    print("DataFrame with", len(table.columns), "columns")

    table["id"] = sample_id
    return table


def df_process(table: DataFrame) -> DataFrame:
    """
    Replaces N/A values with zero in the dataframe.

    The dataframe is modified in place and the same object is returned.

    Args:
        table: Dataframe whose missing values should become 0.

    Returns:
        ``table`` itself, with every missing value replaced by 0.
    """
    # Fill on the frame itself: ``table[column].fillna(..., inplace=True)``
    # acts on an intermediate Series, which pandas warns about and which
    # leaves ``table`` untouched when Copy-on-Write is enabled.
    table.fillna(0, inplace=True)
    return table


def strip_chars(dictionary: dict) -> dict:
    """
    Wildcard function, used to implement intermediate processing on the dataframe.
    Replaces every dictionary key with a short hexadecimal digest of the key,
    so that keys never contain dots (or other characters) that would
    interfere with the flatten function.

    Args:
        dictionary: Mapping whose keys should be made safe.

    Returns:
        A new dictionary with the same values, keyed by a 16-character
        hexadecimal BLAKE2b digest of each original key.  If two keys
        produce the same digest, the later value replaces the earlier one.
    """
    # Use a real digest instead of the built-in hash(): str hashes are salted
    # per interpreter process, so hash()-based column names would change from
    # one run to the next and tables from different runs could not be matched.
    hashed_dict = {}
    for key, value in dictionary.items():
        digest = hashlib.blake2b(str(key).encode("utf-8"), digest_size=8).hexdigest()
        hashed_dict[digest] = value
    return hashed_dict


In [11]:
# Report sections to process; each one is a sub-folder of the dataset folder.
sections = ['behavior']

# Base folder of the dataset (the trailing slash matters: paths are built by concatenation).
path = "augusto_ds/"

# Result holders, filled in by the processing cell.
tf = {}
idf = {}
tf_idf = {}


In [12]:
# Collect the names of the files stored in each section folder.
print("Getting list of files in section folder", end="")
file_list_dict = json_file_list(path=path, sections=sections)
print("............completed")

Getting list of files in section folder............completed


In [13]:
# Notebook display of the section -> file-names mapping built by json_file_list.
file_list_dict

{'behavior': ['953 - behavior.json',
  '507 - behavior.json',
  '448 - behavior.json',
  '733 - behavior.json',
  '887 - behavior.json',
  '641 - behavior.json',
  '475 - behavior.json',
  '821 - behavior.json',
  '795 - behavior.json',
  '1105 - behavior.json',
  '310 - behavior.json',
  '1331 - behavior.json',
  '1138 - behavior.json',
  '262 - behavior.json',
  '1077 - behavior.json',
  '774 - behavior.json',
  '494 - behavior.json',
  '914 - behavior.json',
  '540 - behavior.json',
  '749 - behavior.json',
  '432 - behavior.json',
  '866 - behavior.json',
  '1413 - behavior.json',
  '606 - behavior.json',
  '929 - behavior.json',
  '1239 - behavior.json',
  '218 - behavior.json',
  '283 - behavior.json',
  '1142 - behavior.json',
  '1030 - behavior.json',
  '1204 - behavior.json',
  '225 - behavior.json',
  '1220 - behavior.json',
  '201 - behavior.json',
  '1014 - behavior.json',
  '1166 - behavior.json',
  '1286 - behavior.json',
  '1029 - behavior.json',
  '1352 - behavior.json'

In [None]:
# Suffix appended to the section name in the output CSV file name.
# NOTE(review): the original call omitted this required argument and failed
# with a TypeError; set it to a label for the selection being exported (the
# verification cell further down reads "behaviornorevil.csv") — confirm.
df_name = ""

for section in sections:
    # Start every section from a clean slate.  ``tf`` is created in an earlier
    # cell, so without this reset a re-run (or a second section) would carry
    # stale documents into the TF-IDF step.
    tf = {}
    section_corpus = {}

    for file in file_list_dict[section]:
        # File names look like "953 - behavior.json": the sample id is the
        # number before the first dash (int() ignores the surrounding spaces).
        document_id = int(file.split("-", 1)[0])

        # Selection of parts of the corpus to compose the dataframe.
        # Other selections that have been used:
        #   NoREVIL: (document_id <= 611) or (711 <= document_id <= 784) # After adding goodware, the file became 700MB and colab cannot load it
        #   REVIL: (612 <= document_id <=  784) or (791 <= document_id <= 1446)
        #   CLOP CONI EGREGOR LOCKBIT: (document_id <=  451) or (712 <= document_id <= 784)
        #   MOUNTLOCKER NETWALKER RYUK: (452 <= document_id <=  468) or (480 <= document_id <= 611) or (712 <= document_id <=  784)
        selected = (
            (452 <= document_id <= 468)
            or (480 <= document_id <= 611)
            or (712 <= document_id <= 784)
        )
        if not selected:
            continue

        file_path = os.path.join(path, section, file)

        print("Loading file", file, end="")
        document_json = read_json(file_path)
        stripped_document = strip_chars(document_json)
        print("..............completed")

        print("Calculating term frequency (TF)", end="")
        tf[document_id] = compute_tf(stripped_document)
        print("..............completed")

        print("Creating the section_corpus with the documents", end="")
        section_corpus[document_id] = stripped_document
        print("..............completed")

    print("Creating wordlist", end="")
    wordlist = create_wordlist(section_corpus)
    print("..............completed")
    print("Wordlist with " + str(len(wordlist)) + " terms")

    print("Calculating inverse document frequency (IDF)", end="")
    idf = compute_idf(section_corpus, wordlist)
    print("..............completed")

    print("Calculating TF-IDF", end="")
    tf_idf = compute_tfidf(tf, idf)
    print("..............completed")

    print("Transforming the table", section, "into dataframe")
    dataframe_tf_idf = convert_to_dataframe(tf_idf)

    print("Adjusting the dataframe", end="")
    processed_dataframe_tf_idf = df_process(dataframe_tf_idf)
    print("..............completed")

    print("Saving dataframe file to disk", end="")
    save_dataframe(processed_dataframe_tf_idf, path, section, df_name)
    print("..............completed\n")

In [None]:
# Sanity check: reload the CSV from disk and display it to confirm the table was written correctly.
import pandas

file_path = "/content/drive/MyDrive/Corpus/behaviornorevil.csv"
with open(file_path, mode="r") as f:
    table = pandas.read_csv(f)
table

FileNotFoundError: ignored

In [None]:
# Inclusive sample-id ranges and the family label assigned to each of them.
# Ids outside every range are labelled "unknown".
_FAMILY_ID_RANGES = (
    (192, 206, "clop"),
    (207, 310, "conti"),
    (311, 355, "egregor"),
    (403, 435, "lockbit"),
    (438, 451, "lockbit"),
    (452, 468, "mountlocker"),
    (480, 557, "netwalker"),
    (559, 611, "ryuk"),
    (612, 711, "revil"),
    (712, 784, "goodware"),
    # Both ends are inclusive like every other range (and like the
    # 791..1446 REVIL selection used when building the corpus); strict
    # comparisons here would label ids 791 and 1446 as "unknown".
    (791, 1446, "revil"),
)


def _family_of(sample_id):
    """Return the family label for a sample id, or "unknown" if no range matches."""
    for first_id, last_id, label in _FAMILY_ID_RANGES:
        if first_id <= sample_id <= last_id:
            return label
    return "unknown"


# One label per row of the table, in row order.
family = [_family_of(sample_id) for sample_id in table["id"]]

In [None]:
# Show each sample id next to the family label assigned to it.
for i, label in enumerate(family):
    print(table['id'][i], label)