In [1]:
import os
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [35]:
# JSON file loaded into `df_corpus`; its keys become the vocabulary/columns and its
# values are used as the divisor in the IDF formula (presumably per-term document
# frequencies -- confirm against the step that writes this file).
DF_CORPUS_FILE = 'df_corpus.json'
# NOTE(review): LABELS_FILE and CORPUS_FILE are not read by any of the visible cells.
LABELS_FILE = 'labels.json'
CORPUS_FILE = 'corpus.json'

# Input directory: every regular file in it is parsed as a JSON {term: count} document.
TXT_DIR = '3_compiled_txt'
# Output directory for the generated CSV; created when missing.
CSV_DIR = '4_compiled_csv'

# File name of the TF-IDF dataset written inside CSV_DIR.
TFIDF_DS_FILE = 'tfidf_ds.csv'

In [3]:
def compute_tf(corpus, txt_dict):
    """Compute log-scaled term frequencies for one document over a fixed vocabulary.

    Args:
        corpus: Mapping whose keys define the vocabulary; its values are not read.
        txt_dict: Mapping of term -> raw occurrence count for a single document.

    Returns:
        dict with exactly one entry per key of ``corpus``, in the same order.
        Terms present in ``txt_dict`` get ``math.log(1 + count)``; every other
        term keeps the initial ``0``.

    Terms of ``txt_dict`` that are not part of ``corpus`` are ignored: they have
    no column in the output vector, and looking them up used to raise KeyError.
    """
    # Start from an all-zero vector so every document shares the same keys.
    tf_dict = dict.fromkeys(corpus, 0)
    for term, count in txt_dict.items():
        # Skip out-of-vocabulary terms instead of failing on them.
        if term in tf_dict:
            tf_dict[term] = math.log(1 + count)
    return tf_dict

In [4]:
def compute_idf(corpus, number_of_docs=None):
    """Compute the inverse document frequency of every term in ``corpus``.

    Args:
        corpus: Mapping of term -> value used as the divisor of the IDF formula
            (presumably the number of documents containing the term).
        number_of_docs: Total number of documents. When omitted, it is the
            number of regular files directly inside ``TXT_DIR``.

    Returns:
        dict mapping each term to ``math.log(number_of_docs / corpus[term])``,
        in the iteration order of ``corpus``.

    Raises:
        ZeroDivisionError: if a term's value in ``corpus`` is 0.
        ValueError: if the ratio is not positive (e.g. ``number_of_docs`` is 0).
    """
    if number_of_docs is None:
        # Count regular files only, so sub-directories such as
        # .ipynb_checkpoints do not inflate N. This matches the is_file()
        # filter used when the document list itself is built.
        with os.scandir(TXT_DIR) as entries:
            number_of_docs = sum(1 for entry in entries if entry.is_file())
    return {
        term: math.log(number_of_docs / doc_freq)
        for term, doc_freq in corpus.items()
    }

In [5]:
def compute_tf_idf(tf_dict, idf_dict):
    """Weight each term frequency by that term's IDF.

    Args:
        tf_dict: Mapping of term -> term frequency for one document.
        idf_dict: Mapping of term -> IDF weight; must contain every key of
            ``tf_dict`` (a missing term raises KeyError).

    Returns:
        New dict mapping each term of ``tf_dict`` to ``tf * idf``, preserving
        the key order of ``tf_dict``.
    """
    return {term: tf * idf_dict[term] for term, tf in tf_dict.items()}

In [6]:
# Names of the regular files directly inside TXT_DIR (sub-directories are skipped).
# The context manager closes the directory iterator as soon as the list is built.
with os.scandir(TXT_DIR) as entries:
    json_files = [entry.name for entry in entries if entry.is_file()]

# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(CSV_DIR, exist_ok=True)

In [7]:
# Accumulator filled by the per-file loop: file base name -> {term: tf-idf weight}.
json_tf_idf = {}

In [29]:
# Load the corpus mapping whose keys define the vocabulary. Decode explicitly as
# UTF-8 so the result does not depend on the platform's locale default encoding.
with open(DF_CORPUS_FILE, 'r', encoding='utf-8') as f:
    df_corpus = json.load(f)

In [9]:
# IDF weights are shared by every document, so they are computed once, before the per-file loop.
idf_dict = compute_idf(df_corpus)

In [10]:
for file in json_files:
    # The text before the first '.' of the file name is used as the document key.
    file_name = file.split('.')[0]

    # Decode explicitly as UTF-8 so parsing does not depend on the locale default.
    with open(os.path.join(TXT_DIR, file), 'r', encoding='utf-8') as f:
        txt_dict = json.load(f)

    # The file is already closed here; only the parsed counts are needed to
    # compute the document's TF vector and weight it by the shared IDF values.
    tf_dict = compute_tf(df_corpus, txt_dict)
    tf_idf_dict = compute_tf_idf(tf_dict, idf_dict)
    json_tf_idf[file_name] = tf_idf_dict

In [30]:
# Column layout of the output table: the identifier column first, then one
# column per corpus term in the corpus's own key order.
list_columns = ['FILE_ID', *df_corpus.keys()]

In [31]:
# Empty frame that only fixes the column layout (FILE_ID followed by one column per term).
df = pd.DataFrame(columns=list_columns)
# Display it to check the header.
df

Unnamed: 0,FILE_ID,msvbvm60.dll,methcallengine,Unnamed: 4,event_sink_addref,dllfunctioncall,event_sink_release,event_sink_queryinterface,__vbaexcepthandler,proccallengine,...,\registry\machine\software\microsoft\windows nt\currentversion\image file execution options\dllnxoptions\virusshare_890baaa6efa59f1405a2d869de9a4e22.exe-query_value_key,\registry\machine\software\microsoft\windows nt\currentversion\image file execution options\virusshare_890baaa6efa59f1405a2d869de9a4e22.exe-open_key_ex,\registry\machine\software\wow6432node\microsoft\windows nt\currentversion\compatibility32\virusshare_890baaa6efa59f1405a2d869de9a4e22-query_value_key,\registry\machine\system\controlset001\services\winsock2\parameters\appid_catalog\124465f8-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02qssdstdgqqbihm-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02mpktfnqqxqjmkf-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02ixrqxmqjokqagk-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-21-4189341723-2754120001-2032404538-500\02qvhzntkcjczhqk-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02pmivjjiqtrvqem-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-21-4189341723-2754120001-2032404538-500\02yxmgkjyzedtuub-open_key_ex


In [33]:
# One record per document. Each record is a new dict, so the TF-IDF vectors
# stored in json_tf_idf are not polluted with a non-numeric 'FILE_ID' entry.
rows = [{**tf_idf, 'FILE_ID': file} for file, tf_idf in json_tf_idf.items()]

# Build the table in a single constructor call with the intended column order.
# This replaces the concat onto an empty frame, which emits a pandas
# FutureWarning and appended duplicate rows every time the cell was re-run.
df = pd.DataFrame(rows, columns=list_columns)

  df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)


In [34]:
# Preview the assembled TF-IDF table (one row per document, one column per term).
df

Unnamed: 0,FILE_ID,msvbvm60.dll,methcallengine,Unnamed: 4,event_sink_addref,dllfunctioncall,event_sink_release,event_sink_queryinterface,__vbaexcepthandler,proccallengine,...,\registry\machine\software\microsoft\windows nt\currentversion\image file execution options\dllnxoptions\virusshare_890baaa6efa59f1405a2d869de9a4e22.exe-query_value_key,\registry\machine\software\microsoft\windows nt\currentversion\image file execution options\virusshare_890baaa6efa59f1405a2d869de9a4e22.exe-open_key_ex,\registry\machine\software\wow6432node\microsoft\windows nt\currentversion\compatibility32\virusshare_890baaa6efa59f1405a2d869de9a4e22-query_value_key,\registry\machine\system\controlset001\services\winsock2\parameters\appid_catalog\124465f8-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02qssdstdgqqbihm-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02mpktfnqqxqjmkf-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02ixrqxmqjokqagk-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-21-4189341723-2754120001-2032404538-500\02qvhzntkcjczhqk-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-18\02pmivjjiqtrvqem-open_key_ex,\registry\user\.default\software\microsoft\identitycrl\deviceidentities\production\s-1-5-21-4189341723-2754120001-2032404538-500\02yxmgkjyzedtuub-open_key_ex
0,EPSN21,1.591941,2.032775,3.309417,1.616847,1.629644,1.616847,1.616847,1.612633,2.032775,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,FQ0GOW,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,WVMWHB,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,LMS9JO,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,Y5GV4L,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1685,XXRWLL,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1686,2F8V81,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1687,61JW7Y,0.000000,0.000000,0.704065,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1688,71NDSS,0.000000,0.000000,5.445741,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [36]:
# Write the dataset to CSV_DIR/TFIDF_DS_FILE. index=False leaves the positional
# row index out of the file; the FILE_ID column identifies each row.
df.to_csv(os.path.join(CSV_DIR, TFIDF_DS_FILE), index=False)