# Imports

In [None]:
from io import BytesIO
from functools import reduce
from google.cloud import storage
import google.auth
import numpy as np
from operator import mul
import os
import pandas as pd
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from tqdm import tqdm
from typing import Dict, List

# Guard for the bucket-to-bucket copy cells further down: they only call
# do_copy() when this is True. Only turn it on if you're intending to update
# the notebook to copy samples to another new bucket / location for
# conversion to images & saving to another images prefix.
DO_COPY = False

# Data

In [13]:
# load the file-size analysis metadata, then add an empty dest_path column
# as a placeholder for each sample's destination path
df = pd.read_csv('file_sizes_analysis_data/sorel_pdf_dataset.csv')
df['dest_path'] = ''
df.head()

Unnamed: 0,size_bytes,modify_time_utc,path,size_kilobytes,size_megabytes,dataset,kind,dest_path
0,396936,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,396.936,0.396936,pdf,benign,
1,250800,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,250.8,0.2508,pdf,benign,
2,246975,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,246.975,0.246975,pdf,benign,
3,367605,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,367.605,0.367605,pdf,benign,
4,32782,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/03...,32.782,0.032782,pdf,benign,


In [14]:
# destination layout: gs://<dest_bucket>/<kind prefix><file name>
dest_bucket = 'dsci591_g4mic'
dest_bucket_benign_prefix = 'raw/benign/'
dest_bucket_malicious_prefix = 'raw/malicious/'
# GCS URIs always use '/', whatever the host OS is, so build them with plain
# string formatting; os.path.join would insert the local path separator.
dest_bucket_benign_path = f'gs://{dest_bucket}/{dest_bucket_benign_prefix}'
dest_bucket_malware_path = f'gs://{dest_bucket}/{dest_bucket_malicious_prefix}'

# file name = everything after the last '/' of the source path
s_file_names = df.path.str.rsplit('/', n=1).str[-1]
benign_mask = df.kind == 'benign'
# every row that is not benign is routed to the malicious prefix
df.loc[benign_mask, 'dest_path'] = dest_bucket_benign_path + s_file_names[benign_mask]
df.loc[~benign_mask, 'dest_path'] = dest_bucket_malware_path + s_file_names[~benign_mask]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24604 entries, 0 to 24603
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   size_bytes       24604 non-null  int64  
 1   modify_time_utc  24604 non-null  object 
 2   path             24604 non-null  object 
 3   size_kilobytes   24604 non-null  float64
 4   size_megabytes   24604 non-null  float64
 5   dataset          24604 non-null  object 
 6   kind             24604 non-null  object 
 7   dest_path        24604 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.5+ MB


# Bucket Data Copying

The copy is split into two sections because the source datasets live in two different projects at the moment. After fixing up the permissions on a DSCI591-G4 project service account so that it can manage objects in both projects, this split became redundant: it could have been a single section for both file sets, since the src/dst paths are already built for both datasets. Keeping the sections separate at least isolates the info and permission sanity checks for each origin dataset.

In [15]:
def do_copy(src_bucket, dst_bucket, data, verbose=True, exit_on_failure=True):
    '''helper to facilitate copying data from one bucket to another using paths provided in dataset

    args
        src_bucket: bucket object to copy from; must provide blob(name) and
            copy_blob(blob, destination_bucket, new_name)
        dst_bucket: bucket object handed to copy_blob as the destination
        data: dataframe with `path` (source) and `dest_path` (destination) columns
            holding gs://bucket/prefix style paths
        verbose: when True, print every recorded failure followed by a summary line
            once the loop has finished
        exit_on_failure: when True, print the first error and return immediately
            (no summary is printed); when False, record the failure and carry on
            with the remaining files

    returns
        None
    '''
    # just grabbing the prefix part of what are otherwise gs://bucket/prefix.
    # using the full gs:// path is good w/ the tensorflow API, since it works
    # with the VFS, and is quick in a Colab env. but it's slow locally.
    # `n` is passed by keyword because newer pandas releases reject it positionally.
    to_copy = (data
               .loc[:, ['path', 'dest_path']]
               .apply(lambda df_x: df_x.str.split('/', n=3).str[-1])
               )
    total = to_copy.shape[0]
    copied = 0
    # no "already exists" check is implemented, so this always stays 0; it is
    # kept because it is part of the summary message.
    skipped = 0
    failures = []

    # the storage API is used directly here: going through tf.io.gfile.copy
    # was horribly slow in comparison.
    for src, dst in tqdm(zip(to_copy['path'], to_copy['dest_path']), total=total):
        try:
            src_bucket.copy_blob(src_bucket.blob(src), dst_bucket, dst)
        except Exception as ex:
            if exit_on_failure:
                print(ex)
                return
            # remember what failed so it can be reported instead of silently dropped
            failures.append((src, dst, ex))
            continue
        copied += 1

    if verbose:
        for src, dst, ex in failures:
            print(f'failed to copy {src} -> {dst}: {ex}')
        print(f'completed copying {copied}/{total} files with {skipped} already exists and {len(failures)} failures')

## Sorel

In [16]:
# narrow the metadata down to the sorel rows, then show the per-kind counts
# and a preview of the subset
df_sorel = df[df.dataset == 'sorel']
print(f'nunique dataset (should be 1): {df_sorel.dataset.nunique()}')
display(
    df_sorel.groupby('kind')['kind'].agg(['count']),
    df_sorel.head(),
)

nunique dataset (should be 1): 1


Unnamed: 0_level_0,count
kind,Unnamed: 1_level_1
malicious,13066


Unnamed: 0,size_bytes,modify_time_utc,path,size_kilobytes,size_megabytes,dataset,kind,dest_path
11538,681788,2022-05-04T21:42:21Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,681.788,0.681788,sorel,malicious,gs://dsci591_g4mic/raw/malicious/000024a1bdbd0...
11539,116261,2022-05-04T21:40:22Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,116.261,0.116261,sorel,malicious,gs://dsci591_g4mic/raw/malicious/000148b941e11...
11540,106209,2022-05-04T21:38:53Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,106.209,0.106209,sorel,malicious,gs://dsci591_g4mic/raw/malicious/0001ee2b649f6...
11541,23503,2022-05-04T21:24:19Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,23.503,0.023503,sorel,malicious,gs://dsci591_g4mic/raw/malicious/0009398f92bb3...
11542,245423,2022-05-04T21:13:38Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,245.423,0.245423,sorel,malicious,gs://dsci591_g4mic/raw/malicious/0010e4169bb95...


In [17]:
# load the credentials from the file named by GOOGLE_APPLICATION_CREDENTIALS and
# hand them to the client explicitly, so the client is guaranteed to use the
# same identity that the project id was read from.
credentials, project = google.auth.load_credentials_from_file(os.environ['GOOGLE_APPLICATION_CREDENTIALS'])
client = storage.Client(project=project, credentials=credentials)
src_bucket = client.bucket('dsci591_g4_sorel20m')
dst_bucket = client.bucket(dest_bucket)

# sanity check! the first sorel source path should exist when looked up through tf.io.gfile
tf.io.gfile.exists(df_sorel.iloc[0].path)

True

In [18]:
# the copy is opt-in: it only runs when DO_COPY is True. exit_on_failure is
# left at its default, so the first failed copy stops the whole run.
if DO_COPY:
    do_copy(src_bucket, dst_bucket, df_sorel)

100%|██████████| 13066/13066 [18:50<00:00, 11.56it/s]

completed copying 13066/13066 files with 0 already exists and 0 failures





## PDF

In [19]:
# point the source at the bucket holding the pdf samples
src_bucket = client.bucket('drexel_dsci_2022_g4mic')
df_pdf = df.loc[lambda df_x: df_x.dataset == 'pdf']
# the check has to look at df_pdf, the subset built in this cell
print(f'nunique dataset (should be 1): {df_pdf.dataset.nunique()}')
display(df_pdf.groupby('kind')['kind'].agg(['count']))
display(df_pdf.head())

nunique dataset (should be 1): 1


Unnamed: 0_level_0,count
kind,Unnamed: 1_level_1
benign,8361
malicious,3177


Unnamed: 0,size_bytes,modify_time_utc,path,size_kilobytes,size_megabytes,dataset,kind,dest_path
0,396936,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,396.936,0.396936,pdf,benign,gs://dsci591_g4mic/raw/benign/02eounrel.pdf
1,250800,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,250.8,0.2508,pdf,benign,gs://dsci591_g4mic/raw/benign/02frrltr.pdf
2,246975,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,246.975,0.246975,pdf,benign,gs://dsci591_g4mic/raw/benign/02govbnd.pdf
3,367605,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,367.605,0.367605,pdf,benign,gs://dsci591_g4mic/raw/benign/02solp.pdf
4,32782,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/03...,32.782,0.032782,pdf,benign,gs://dsci591_g4mic/raw/benign/030.pdf


In [20]:
# sanity check! the first pdf sample's source path should exist when looked
# up through tf.io.gfile
tf.io.gfile.exists(df_pdf.iloc[0].path)

True

In [22]:
# the copy is opt-in: it only runs when DO_COPY is True.
# exit_on_failure=False makes do_copy count failed copies and keep going
# instead of stopping at the first error.
if DO_COPY:
    do_copy(src_bucket, dst_bucket, df_pdf, exit_on_failure=False)

100%|██████████| 11538/11538 [37:29<00:00,  5.13it/s] 

completed copying 11510/11538 files with 0 already exists and 28 failures





# Image Conversion

In [62]:
def read_file(path: str,
              dtype: str = 'uint8',
              ) -> np.ndarray:
    '''reads a file from GCS into a numpy array
    
    args
        path: GCS path to file in gs://bucket/prefix/filename format
        dtype: dtype used to interpret the file's raw bytes. the file size must be a
            multiple of the dtype's item size (always true for the default uint8).
    
    returns
        read-only 1D numpy array viewing the file's content as :param:`dtype` items
    '''
    raw = file_io.read_file_to_string(path, binary_mode=True)
    # honour the caller's dtype; np.frombuffer reads the bytes object directly,
    # so no intermediate BytesIO buffer is needed.
    return np.frombuffer(raw, dtype=dtype)

def write_file(data: np.ndarray,
               path: str,
               ext: str = '.npy',
               ):
    '''writes a numpy array to a file in a bucket on GCS using numpy's .npy format
    
    args
        data: numpy array to write to GCS
        path: path on GCS to save the file to. any existing extension on the last
            path component is replaced by :param:`ext`.
        ext: extension to use for saved files. default is numpy's canonical .npy.
    '''
    pre, _ = os.path.splitext(path)
    # the context manager guarantees the handle is closed (and the write
    # finalised) even when np.save raises; np.save emits bytes, hence 'wb'.
    with file_io.FileIO(pre + ext, 'wb') as fout:
        np.save(fout, data)

def convert_to_image(data: np.ndarray,
                     dimension: int,
                     ) -> np.ndarray:
    '''implements file conversion from binary to square bytes used for grayscale image representation
    
    args:
        data: a numpy array containing a file's contents with dtype=uint8
        dimension: value used for H and W
    
    returns
        a 2D numpy array of dtype uint8 and shape (dimension, dimension) holding the
        file's bytes in row-major order, truncated or zero-padded to fit
    '''
    total_bytes_allowed = dimension * dimension
    # uint8 keeps one byte per pixel; the np.zeros default (float64) would
    # store every byte as an 8-byte float.
    image = np.zeros(shape=(total_bytes_allowed,), dtype=np.uint8)
    flat = np.asarray(data).ravel()
    num_bytes = min(flat.shape[0], total_bytes_allowed)
    image[:num_bytes] = flat[:num_bytes]
    
    # hand back the square image promised by the contract rather than the flat buffer
    return image.reshape(dimension, dimension)

def convert_to_images(dataset_meta: pd.DataFrame,
                      dimension: int,
                      src_prefix: str = '/raw/',
                      dst_prefix: str = '/images/',
                      ):
    '''converts files in a Google Cloud Storage bucket to their representation as an image. save format at
    destination is a numpy array (*.npy).
    
    n.b. an extremely helpful ref used to help produce all of this: https://stackoverflow.com/questions/41633748/load-numpy-array-in-google-cloud-ml-job
    
    args
        dataset_meta: dataframe containing the sorel_pdf_dataset.csv data from which dest_path will be used
        dimension: value used for H and W
        src_prefix: prefix value containing raw data; must be a substring of every dataset_meta.dest_path
        dst_prefix: prefix value where converted image data will be saved
    
    raises
        ValueError: if a dest_path does not contain src_prefix, since the converted
            file would otherwise be written back under the source location
    '''
    # counterintuitive naming: dest_path is with respect to what the sorel_pdf_dataset.csv specified
    # when the data was originally copied from path -> dest_path to consolidate sorel + pdf datasets
    # to one bucket for a common image conversion to be applied
    for src_path in tqdm(dataset_meta['dest_path'].tolist()):
        if src_prefix not in src_path:
            raise ValueError(f'{src_path!r} does not contain the source prefix {src_prefix!r}')
        
        image_data = convert_to_image(read_file(src_path), dimension)
        
        dst_path = src_path.replace(src_prefix, dst_prefix)
        # save the converted array, not the raw bytes that were read
        write_file(image_data, dst_path)
        

In [63]:
# kind of slow but works well! it's because this is using the same stuff that the tf.io.GFile API is using
# side length passed as `dimension` (used for both H and W of each converted image)
IMAGE_DIMENSION = 648
convert_to_images(df, IMAGE_DIMENSION)

  1%|          | 172/24604 [02:21<5:00:59,  1.35it/s]