## File System Feature Engineering
#### Author: Nathan Tibbetts
#### Date: 28 March 2020
#### Class: ACME Volume 3

In [68]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime

In [84]:
### Engineer our Data

# Read the pickled file-system table.
#   NOTE(review): unpickling can execute arbitrary code -- only load this
#   file if it comes from a trusted source.
a = pd.read_pickle("linux_stem_filesystem0_public.pkl")

# Columns that our current analyses never use are dropped as we go.
a = a.drop(columns=["Inode", "Device", "Group ID"])

# Define our estimation of what's in the user-space.
#   "Irregular" flags rows that are not a directory, not a regular file,
#   and whose "Is Link To" value is negative.
not_directory = a["Is Directory"] == 0
not_regular_file = a["Is Regular File"] == 0
negative_link_target = a["Is Link To"] < 0
a["Irregular"] = not_directory & not_regular_file & negative_link_target

#   "Userspace" requires every one of the conditions below to hold.
non_root_owner = a["User ID"] != 0
outside_hidden = a["Sub-Hidden"] == 0
owner_can_read = a["User Read"] == 1
owner_can_write = a["User Write"] == 1
under_desktop_parent = a["Sub-Desktop-Parent"] == 1
is_regular = a["Irregular"] == 0
a["Userspace"] = (non_root_owner & outside_hidden & owner_can_read &
                  owner_can_write & under_desktop_parent & is_regular)
print("Defined Userspace")

# Process time format
#   Ideally we would know how often each file is used; the closest proxy
#   available is how long ago it was last modified, measured against the
#   most recent access time found anywhere in the table.
access_time = pd.to_datetime(a["Access Time"])
modify_time = pd.to_datetime(a["Modify Time"])
newest = max(access_time)
a["Modification Recency"] = newest - modify_time
a = a.drop(columns=["Metachange Time", "Access Time", "Modify Time"])
print("Defined Time")

Defined Userspace
Defined Time


In [85]:
# Feature engineering for tree stuff
#   "Child Count": how many rows name this row as their Parent.
#   "Depth": how many ancestors a row has; a row whose Parent is negative
#            is treated as a root and gets depth 0.
#
#   Only the Parent column is needed, so it is extracted once rather than
#   materialising `a.values` (an object array of the whole frame) and doing
#   a pandas `.at` lookup for every step of every walk up to the root.
parent_labels = a["Parent"].to_numpy()
n = len(parent_labels)

# Translate Parent labels into row positions so that both features agree on
# which row is meant.  Negative parents are roots (-1); get_indexer also
# returns -1 for labels that are not present in the index.
parent_pos = np.where(parent_labels < 0, -1, a.index.get_indexer(parent_labels))
dangling = (parent_pos < 0) & (parent_labels >= 0)
if dangling.any():
    raise KeyError("Parent refers to rows missing from the index, e.g. "
                   + str(parent_labels[dangling][:5].tolist()))

# Number of children: count how often each position occurs as a parent.
children = np.bincount(parent_pos[parent_pos >= 0], minlength=n).astype(np.uint32)

# Depth of node in Tree, memoised: each walk stops at the first ancestor
# whose depth is already known, so every row is resolved exactly once.
parent_of = parent_pos.tolist()   # plain Python ints are faster to index in a loop
known_depth = [-1] * n            # -1 marks "not computed yet"
for i in range(n):
    chain = []
    j = i
    while j >= 0 and known_depth[j] < 0:
        chain.append(j)
        if len(chain) > n:
            # A legitimate ancestor chain can never be longer than the table.
            raise ValueError("Cycle detected in Parent links")
        j = parent_of[j]
    # Depth of the node the walk stopped on; -1 so that a root ends up at 0.
    d = known_depth[j] if j >= 0 else -1
    for k in reversed(chain):
        d += 1
        known_depth[k] = d

    if i % 1000 == 0: print(i, end="\r")

depth = np.asarray(known_depth, dtype=np.uint32)

a["Child Count"] = children
a["Depth"] = depth
print(n - 1)
print("Defined Children and Depth")

1008683
Defined Children and Depth


In [86]:
# Do a little more necessary feature engineering, generating log_2 of file sizes.
#   We do this because sizes span such a wide range; on a log scale the
#   distribution becomes closer to normal or bimodal and is easier to read.
#   Zero-byte entries give log2(0) = -inf; we replace those with -1 for
#   graphability.  The divide-by-zero warning is silenced because that case
#   is expected and handled by the clip on the next line.
with np.errstate(divide="ignore"):
    size_log2 = np.log2(a["Size"])
a["Size Log2"] = size_log2.clip(lower=-1)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [87]:
# Define non-recursive importance
#   The score is the mean of the terms listed below.  All but one are 0/1
#   indicators; the recency term is 1 / (1 + whole days since modification).
recency_days = a["Modification Recency"].dt.days.to_numpy()
importance_terms = [
    (a["User ID"] != 0).astype(int),
    (a["User Read"] == 1).astype(int),
    (a["User Write"] == 1).astype(int),
    1 / (1 + recency_days),
    (a["Size"] > 4).astype(int),
    (a["Irregular"] == 0).astype(int),
    (a["Sub-Hidden"] == 0).astype(int),
    (a["Sub-Desktop"] == 1).astype(int),
    (a["Sub-Desktop-Parent"] == 1).astype(int),
]

# Accumulate left to right, in the listed order.
importance_total = importance_terms[0]
for term in importance_terms[1:]:
    importance_total = importance_total + term
a["Importance"] = importance_total / len(importance_terms)

# Keep a root-ownership flag, since the raw owner id is dropped next.
a["Belongs to Root"] = a["User ID"] == 0
a = a.drop(columns=["User ID"])
print("Defined Importance")

Defined Importance


In [88]:
# Define recursive weight
#   Every row starts with weight 1 (plain) or its own Importance (modulated)
#   and passes 1/DEPTH_FACTOR of its accumulated total on to its parent.
DEPTH_FACTOR = 4
P = list(a.columns).index("Parent")
row_count = len(a)
parent_of = a.iloc[:, P].tolist()
weights = np.ones(row_count)
weights_modulated = a["Importance"].to_numpy(copy=True)

# Bottom-up approach: visit rows from last to first.
for i in range(row_count):
    node = row_count - 1 - i
    j = parent_of[node]
    if j >= 0:
        share = weights[node] / DEPTH_FACTOR
        share_modulated = weights_modulated[node] / DEPTH_FACTOR
        weights[j] += share
        weights_modulated[j] += share_modulated
    if i % 1000 == 0: print(i, end="\r")

a["Recursive Weight"] = weights
a["Recursive Importance"] = weights_modulated

print("Defined Recursive Weight and Recursive Importance")

Defined Recursive Weight and Recursive Importance


In [89]:
# Recursive Importance of the rows flagged as user-space.
a.loc[a["Userspace"] == 1, "Recursive Importance"]

Index
848763    60.494321
848764    29.918764
848770    13.022908
848771    21.500743
848775    21.276254
            ...    
989939     0.783951
989940     0.781250
989941     0.779184
989942     0.780864
989943     0.785185
Name: Recursive Importance, Length: 72708, dtype: float64

Recursive Weight:

This is, for each object, essentially the ratio of space (measured in ratio of areas, not widths) it will need compared to its own drawing size in order to draw all of its subsidiaries. So if the root is 742, that means that drawing the root's box can only take up 1/742 of the visual space if we are to draw ALL of its children. Perhaps we ought to set a max, saying it cannot take less than 1/50 on the top layer, 1/20 on the next, 1/10 on all others, or some such. This effectively limits the number of generations we can draw based on their size, rather than on some arbitrary promise we can't keep to draw them all, because when we get too small we simply stop. It does however give us the ratio between the children of any given node by looking at their numbers relative to each other.

If we assume all things are drawn (or replacing ratios with truncated ones as we go) and given the overall size of a drawing space, we can calculate the exact size an object will be drawn by traveling down to it in the tree, dividing the space by each number as we go down.

This can also be applied when we have Recursive Importance (recursive weight modulated by self importance). Note, however, that if we still draw each thing in a given generation the same size, this will mean either more space between objects (less densely packed) or drawing more generations in more important branches than in less important ones, because they have been given more relative space without changing object size schemes.

In [90]:
# Correct some data types
#   The index and the tree-related columns are all cast to plain int.
a.index = a.index.astype(int)
a = a.astype({
    "Parent": int,
    "Is Link To": int,
    "Child Count": int,
    "Depth": int,
})
### TODO! Correct others that are no longer being dropped!

# Show a bit of the user space
a.loc[a["Userspace"] == 1]

Unnamed: 0_level_0,Parent,Size,Hidden,Sub-Hidden,Sticky,User Read,User Write,User Execute,Group Read,Group Write,...,Irregular,Userspace,Modification Recency,Child Count,Depth,Size Log2,Importance,Belongs to Root,Recursive Weight,Recursive Importance
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
848763,22,4096,False,False,False,True,True,True,True,False,...,False,True,0 days 00:37:20.644943,37,2,12.000000,0.888889,False,81.755875,60.494321
848764,848763,4096,False,False,False,True,True,True,True,True,...,False,True,72 days 02:15:50.955432,30,3,12.000000,0.779300,False,39.175744,29.918764
848770,848763,4096,False,False,False,True,True,True,True,False,...,False,True,29 days 18:36:28.881459,7,3,12.000000,0.892593,False,14.656820,13.022908
848771,848763,4096,False,False,False,True,True,True,True,False,...,False,True,3 days 15:06:10.032516,24,3,12.000000,0.805556,False,27.637107,21.500743
848775,848763,4096,False,False,False,True,True,True,True,False,...,False,True,66 days 10:50:52.985739,4,3,12.000000,0.779436,False,27.345947,21.276254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989939,848789,2405497,False,False,False,True,True,False,True,True,...,False,True,17 days 14:01:30.772572,0,4,21.197904,0.783951,False,1.000000,0.783951
989940,848789,2409462,False,False,False,True,True,False,True,True,...,False,True,31 days 21:03:26.888391,0,4,21.200280,0.781250,False,1.000000,0.781250
989941,848789,2784701,False,False,False,True,True,False,True,True,...,False,True,78 days 20:25:01.157452,0,4,21.409091,0.779184,False,1.000000,0.779184
989942,848789,2411318,False,False,False,True,True,False,True,True,...,False,True,35 days 23:45:49.229982,0,4,21.201390,0.780864,False,1.000000,0.780864


In [91]:
# Python type of the value stored under index label 0, one entry per column.
[type(column[0]) for _, column in a.items()]

[numpy.int64,
 numpy.int64,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 numpy.int64,
 numpy.bool_,
 numpy.bool_,
 numpy.bool_,
 str,
 numpy.bool_,
 numpy.bool_,
 pandas._libs.tslibs.timedeltas.Timedelta,
 numpy.int64,
 numpy.int64,
 numpy.float64,
 numpy.float64,
 numpy.bool_,
 numpy.float64,
 numpy.float64]

In [92]:
# Show the column labels of the engineered frame.
a.columns

Index(['Parent', 'Size', 'Hidden', 'Sub-Hidden', 'Sticky', 'User Read',
       'User Write', 'User Execute', 'Group Read', 'Group Write',
       'Group Execute', 'Other Read', 'Other Write', 'Other Execute',
       'Is Directory', 'Is Regular File', 'Is Link To', 'Desktop',
       'Sub-Desktop', 'Sub-Desktop-Parent', 'Path', 'Irregular', 'Userspace',
       'Modification Recency', 'Child Count', 'Depth', 'Size Log2',
       'Importance', 'Belongs to Root', 'Recursive Weight',
       'Recursive Importance'],
      dtype='object')

In [93]:
# Write the engineered frame to a pickle file.
a.to_pickle("data0_ML.pkl")