In [3]:
import os
import pandas as pd
import numpy as np
import scipy.stats as stats
from collections import Counter
from math import log2

Calculating entropy

In [4]:
def calculate_entropy(byte_data):
    """Return the Shannon entropy (in bits) of the symbols in ``byte_data``.

    Parameters
    ----------
    byte_data : sized iterable of hashable symbols
        For example a ``bytes`` object or a sequence of integers.

    Returns
    -------
    int or float
        ``0`` when ``byte_data`` is empty, otherwise
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct symbol.
    """
    total = len(byte_data)
    if total == 0:
        return 0

    # Relative frequency of every distinct symbol, in first-seen order.
    probabilities = [occurrences / total for occurrences in Counter(byte_data).values()]
    return -sum(p * log2(p) for p in probabilities)

Feature extraction from file

In [5]:
def extract_values(file_path):
    """Compute byte-level statistical features for one file.

    Parameters
    ----------
    file_path : str or path-like
        File to read in binary mode.

    Returns
    -------
    list or None
        ``None`` when the file is empty. Otherwise a list of 264 values:
        ``[entropy, mean, variance, std_dev, skewness, kurtosis, energy, rms]``
        followed by the 256 per-byte-value counts (index ``i`` holds the
        number of bytes equal to ``i``). Variance/std use NumPy defaults and
        skewness/kurtosis use ``scipy.stats`` defaults.

    Raises
    ------
    OSError
        Propagated from ``open`` when the file cannot be read.
    """
    with open(file_path, 'rb') as file:
        byte_data = file.read()

    if not byte_data:
        return None

    byte_array = np.frombuffer(byte_data, dtype=np.uint8)

    # Counting over the raw bytes gives the same frequencies as counting over
    # the uint8 array, but iterates plain ints instead of NumPy scalars.
    entropy = calculate_entropy(byte_data)
    mean = np.mean(byte_array)
    variance = np.var(byte_array)
    std_dev = np.std(byte_array)
    skewness = stats.skew(byte_array)
    kurtosis = stats.kurtosis(byte_array)

    # Squaring a uint8 array stays in uint8 and wraps modulo 256
    # (e.g. 16 ** 2 -> 0), which corrupts both energy and RMS. Widen first.
    squares = byte_array.astype(np.int64) ** 2
    energy = np.sum(squares)
    rms = np.sqrt(np.mean(squares))

    # One bin per possible byte value (0..255).
    hist = np.bincount(byte_array, minlength=256)

    return [entropy, mean, variance, std_dev, skewness, kurtosis, energy, rms] + hist.tolist()

File preprocessing

In [6]:
def process_encrypted_files(input_dir, output_dir):
    """Extract features for every file under ``input_dir`` and save them as CSV.

    The expected layout is ``input_dir/<format>/<algorithm>/<file>``. Entries
    that are not directories at the format or algorithm level, and entries
    that are not regular files at the file level, are skipped. Files for which
    ``extract_values`` returns nothing (e.g. empty files) are skipped too.

    Parameters
    ----------
    input_dir : str
        Root directory containing one sub-directory per format.
    output_dir : str
        Directory for the CSV output; created if it does not exist.

    Side effects
    ------------
    Writes ``features_<format>.csv`` for each format that produced rows and
    ``features_all.csv`` with all rows combined. Nothing is written for a
    format (or overall) when no rows were produced.

    Returns
    -------
    str
        The literal ``"Extraction ended."``.
    """
    # Single definition of the CSV schema, shared by every output file.
    columns = (
        ["file_name", "format", "entropy", "mean", "variance", "std_dev",
         "skewness", "kurtosis", "energy", "rms"]
        + [f"histogram_{i}" for i in range(256)]
        + ["algorithm"]
    )

    all_data = []
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the generated CSVs reproducible across runs and machines.
    for format_dir in sorted(os.listdir(input_dir)):
        format_path = os.path.join(input_dir, format_dir)
        if not os.path.isdir(format_path):
            continue

        format_data = []

        for algo in sorted(os.listdir(format_path)):
            algo_path = os.path.join(format_path, algo)
            if not os.path.isdir(algo_path):
                continue

            for file_name in sorted(os.listdir(algo_path)):
                file_path = os.path.join(algo_path, file_name)
                # A nested directory here would make open() raise.
                if not os.path.isfile(file_path):
                    continue

                features = extract_values(file_path)
                if not features:
                    continue

                format_data.append([file_name, format_dir] + features + [algo])

        if format_data:
            df_format = pd.DataFrame(format_data, columns=columns)
            df_format.to_csv(os.path.join(output_dir, f"features_{format_dir}.csv"), index=False)
            all_data.extend(format_data)

    if all_data:
        df_all = pd.DataFrame(all_data, columns=columns)
        df_all.to_csv(os.path.join(output_dir, "features_all.csv"), index=False)

    return "Extraction ended."

Extraction

In [6]:
# Extract features from the encrypted_ecb_1000_1key tree into features_ecb_1000_1key_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_1000/encrypted_ecb_1000_1key',
    output_dir='features_ecb/features_ecb_1000/features_ecb_1000_1key_csv',
)

'Extraction ended.'

In [7]:
# Extract features from the encrypted_ecb_1800_1key tree into features_ecb_1800_1key_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_1800/encrypted_ecb_1800_1key',
    output_dir='features_ecb/features_ecb_1800/features_ecb_1800_1key_csv',
)


'Extraction ended.'

In [None]:
# Extract features from the encrypted_ecb_3000_1key tree into features_ecb_3000_1key_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_3000/encrypted_ecb_3000_1key',
    output_dir='features_ecb/features_ecb_3000/features_ecb_3000_1key_csv',
)


In [6]:
# Extract features from the encrypted_ecb_1000_3keys tree into features_ecb_1000_3keys_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_1000/encrypted_ecb_1000_3keys',
    output_dir='features_ecb/features_ecb_1000/features_ecb_1000_3keys_csv',
)

'Extraction ended.'

In [7]:
# Extract features from the encrypted_ecb_1800_3keys tree into features_ecb_1800_3keys_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_1800/encrypted_ecb_1800_3keys',
    output_dir='features_ecb/features_ecb_1800/features_ecb_1800_3keys_csv',
)

'Extraction ended.'

In [8]:
# Extract features from the encrypted_ecb_1800_6keys tree into features_ecb_1800_6keys_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_1800/encrypted_ecb_1800_6keys',
    output_dir='features_ecb/features_ecb_1800/features_ecb_1800_6keys_csv',
)

'Extraction ended.'

In [9]:
# Extract features from the encrypted_ecb_3000_3keys tree into features_ecb_3000_3keys_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_3000/encrypted_ecb_3000_3keys',
    output_dir='features_ecb/features_ecb_3000/features_ecb_3000_3keys_csv',
)


'Extraction ended.'

In [10]:
# Extract features from the encrypted_ecb_3000_6keys tree into features_ecb_3000_6keys_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_3000/encrypted_ecb_3000_6keys',
    output_dir='features_ecb/features_ecb_3000/features_ecb_3000_6keys_csv',
)


'Extraction ended.'

In [11]:
# Extract features from the encrypted_ecb_1000_6keys tree into features_ecb_1000_6keys_csv.
process_encrypted_files(
    input_dir='encrypted_ecb/encrypted_ecb_1000/encrypted_ecb_1000_6keys',
    output_dir='features_ecb/features_ecb_1000/features_ecb_1000_6keys_csv',
)

'Extraction ended.'

In [7]:
# Extract features from the encrypted_cbc_1000_1key tree into features_cbc_1000_1key_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_1000/encrypted_cbc_1000_1key',
    output_dir='features_cbc/features_cbc_1000/features_cbc_1000_1key_csv',
)

'Extraction ended.'

In [8]:
# Extract features from the encrypted_cbc_1000_3keys tree into features_cbc_1000_3keys_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_1000/encrypted_cbc_1000_3keys',
    output_dir='features_cbc/features_cbc_1000/features_cbc_1000_3keys_csv',
)

'Extraction ended.'

In [9]:
# Extract features from the encrypted_cbc_1000_6keys tree into features_cbc_1000_6keys_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_1000/encrypted_cbc_1000_6keys',
    output_dir='features_cbc/features_cbc_1000/features_cbc_1000_6keys_csv',
)

'Extraction ended.'

In [10]:
# Extract features from the encrypted_cbc_1800_1key tree into features_cbc_1800_1key_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_1800/encrypted_cbc_1800_1key',
    output_dir='features_cbc/features_cbc_1800/features_cbc_1800_1key_csv',
)

'Extraction ended.'

In [11]:
# Extract features from the encrypted_cbc_1800_3keys tree into features_cbc_1800_3keys_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_1800/encrypted_cbc_1800_3keys',
    output_dir='features_cbc/features_cbc_1800/features_cbc_1800_3keys_csv',
)

'Extraction ended.'

In [12]:
# Extract features from the encrypted_cbc_1800_6keys tree into features_cbc_1800_6keys_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_1800/encrypted_cbc_1800_6keys',
    output_dir='features_cbc/features_cbc_1800/features_cbc_1800_6keys_csv',
)

'Extraction ended.'

In [13]:
# Extract features from the encrypted_cbc_3000_1key tree into features_cbc_3000_1key_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_3000/encrypted_cbc_3000_1key',
    output_dir='features_cbc/features_cbc_3000/features_cbc_3000_1key_csv',
)

'Extraction ended.'

In [14]:
# Extract features from the encrypted_cbc_3000_3keys tree into features_cbc_3000_3keys_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_3000/encrypted_cbc_3000_3keys',
    output_dir='features_cbc/features_cbc_3000/features_cbc_3000_3keys_csv',
)

'Extraction ended.'

In [15]:
# Extract features from the encrypted_cbc_3000_6keys tree into features_cbc_3000_6keys_csv.
process_encrypted_files(
    input_dir='encrypted_cbc/encrypted_cbc_3000/encrypted_cbc_3000_6keys',
    output_dir='features_cbc/features_cbc_3000/features_cbc_3000_6keys_csv',
)

'Extraction ended.'