# Imports

In [12]:
from google.cloud import storage
import google.auth
import os
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

# Data

In [13]:
df = (pd.read_csv('file_sizes_analysis_data/sorel_pdf_dataset.csv')
      .assign(dest_path = '')
     )
df.head()

Unnamed: 0,size_bytes,modify_time_utc,path,size_kilobytes,size_megabytes,dataset,kind,dest_path
0,396936,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,396.936,0.396936,pdf,benign,
1,250800,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,250.8,0.2508,pdf,benign,
2,246975,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,246.975,0.246975,pdf,benign,
3,367605,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,367.605,0.367605,pdf,benign,
4,32782,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/03...,32.782,0.032782,pdf,benign,


In [14]:
dest_bucket = 'dsci591_g4mic'
dest_bucket_benign_prefix = 'raw/benign/'
dest_bucket_malicious_prefix = 'raw/malicious/'
dest_bucket_benign_path = os.path.join('gs://', dest_bucket, dest_bucket_benign_prefix)
dest_bucket_malware_path = os.path.join('gs://', dest_bucket, dest_bucket_malicious_prefix)

s_file_names = df.path.str.split('/').apply(lambda x: x[-1])
benign_mask = df.kind == 'benign'
df.loc[benign_mask, 'dest_path'] = dest_bucket_benign_path + s_file_names[benign_mask]
df.loc[~benign_mask, 'dest_path'] = dest_bucket_malware_path + s_file_names[~benign_mask]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24604 entries, 0 to 24603
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   size_bytes       24604 non-null  int64  
 1   modify_time_utc  24604 non-null  object 
 2   path             24604 non-null  object 
 3   size_kilobytes   24604 non-null  float64
 4   size_megabytes   24604 non-null  float64
 5   dataset          24604 non-null  object 
 6   kind             24604 non-null  object 
 7   dest_path        24604 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.5+ MB


# Bucket Data Copying

Split up because these are in two different projects at the moment. After fixing up perms on a DSCI591-G4 project service account so that it can manage objects in both projects, this became redundant. It could've just been one section for both file sets, since the src/dst paths are all built for both datasets. It separates info & permission sanity check for the origin datasets at least.

In [15]:
def do_copy(src_bucket, dst_bucket, data, verbose=True, exit_on_failure=True):
    '''helper to facilitate copying data from one bucket to another using paths provided in dataset'''
    to_copy = (data
               .loc[:, ['path', 'dest_path']]
               # just grabbing the prefix part of what are otherwise gs://bucket/prefix.
               # using the full gs:// path is good w/ the tensorflow API, since it works
               # with the VFS, and is quick in a Colab env. but it's slow locally.
               .apply(lambda df_x: (df_x
                                   .str.split('/', 3)
                                   .apply(lambda r_x: r_x[-1])
                                   )
                      )
               )
    skipped = 0
    copy_failures = 0
    for _, rec in tqdm(list(to_copy.iterrows())):
        dst = rec.dest_path
        src = rec.path
        
        try:
            from_blob = src_bucket.blob(src)
            _ = src_bucket.copy_blob(from_blob, dst_bucket, dst)
        except Exception as ex:
            if exit_on_failure:
                print(ex)
                return
            copy_failures += 1
        
        # too bad this is horribly slow compared to just using the storage API directly!
        #tf.io.gfile.copy(src, dst, overwrite=True)

    if verbose:
        print(f'completed copying {data.shape[0] - copy_failures - skipped}/{data.shape[0]} files with {skipped} already exists and {copy_failures} failures')

## Sorel

In [16]:
df_sorel = df.loc[lambda df_x: df_x.dataset == 'sorel']
print(f'nunique dataset (should be 1): {df_sorel.dataset.nunique()}')
display(df_sorel.groupby('kind')['kind'].agg(['count']))
display(df_sorel.head())

nunique dataset (should be 1): 1


Unnamed: 0_level_0,count
kind,Unnamed: 1_level_1
malicious,13066


Unnamed: 0,size_bytes,modify_time_utc,path,size_kilobytes,size_megabytes,dataset,kind,dest_path
11538,681788,2022-05-04T21:42:21Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,681.788,0.681788,sorel,malicious,gs://dsci591_g4mic/raw/malicious/000024a1bdbd0...
11539,116261,2022-05-04T21:40:22Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,116.261,0.116261,sorel,malicious,gs://dsci591_g4mic/raw/malicious/000148b941e11...
11540,106209,2022-05-04T21:38:53Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,106.209,0.106209,sorel,malicious,gs://dsci591_g4mic/raw/malicious/0001ee2b649f6...
11541,23503,2022-05-04T21:24:19Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,23.503,0.023503,sorel,malicious,gs://dsci591_g4mic/raw/malicious/0009398f92bb3...
11542,245423,2022-05-04T21:13:38Z,gs://dsci591_g4_sorel20m/binaries_resampled/00...,245.423,0.245423,sorel,malicious,gs://dsci591_g4mic/raw/malicious/0010e4169bb95...


In [17]:
credentials, project = google.auth.load_credentials_from_file(os.environ['GOOGLE_APPLICATION_CREDENTIALS'])
client = storage.Client(project=project)
src_bucket = client.bucket('dsci591_g4_sorel20m')
dst_bucket = client.bucket(dest_bucket)

# sanity check!
tf.io.gfile.exists(df_sorel.iloc[0].path)

True

In [18]:
do_copy(src_bucket, dst_bucket, df_sorel)

100%|██████████| 13066/13066 [18:50<00:00, 11.56it/s]

completed copying 13066/13066 files with 0 already exists and 0 failures





## PDF

In [19]:
src_bucket = client.bucket('drexel_dsci_2022_g4mic')
df_pdf = df.loc[lambda df_x: df_x.dataset == 'pdf']
print(f'nunique dataset (should be 1): {df_sorel.dataset.nunique()}')
display(df_pdf.groupby('kind')['kind'].agg(['count']))
display(df_pdf.head())

nunique dataset (should be 1): 1


Unnamed: 0_level_0,count
kind,Unnamed: 1_level_1
benign,8361
malicious,3177


Unnamed: 0,size_bytes,modify_time_utc,path,size_kilobytes,size_megabytes,dataset,kind,dest_path
0,396936,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,396.936,0.396936,pdf,benign,gs://dsci591_g4mic/raw/benign/02eounrel.pdf
1,250800,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,250.8,0.2508,pdf,benign,gs://dsci591_g4mic/raw/benign/02frrltr.pdf
2,246975,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,246.975,0.246975,pdf,benign,gs://dsci591_g4mic/raw/benign/02govbnd.pdf
3,367605,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/02...,367.605,0.367605,pdf,benign,gs://dsci591_g4mic/raw/benign/02solp.pdf
4,32782,2022-04-30T14:28:00Z,gs://drexel_dsci_2022_g4mic/Benign/unzipped/03...,32.782,0.032782,pdf,benign,gs://dsci591_g4mic/raw/benign/030.pdf


In [20]:
# sanity check!
tf.io.gfile.exists(df_pdf.iloc[0].path)

True

In [22]:
do_copy(src_bucket, dst_bucket, df_pdf, exit_on_failure=False)

100%|██████████| 11538/11538 [37:29<00:00,  5.13it/s] 

completed copying 11510/11538 files with 0 already exists and 28 failures



