In [29]:
import numpy as np
import pandas as pd

def load_dataset(path="auth-small.csv"):
    """Read the authentication log CSV into a DataFrame with named columns.

    The file is parsed as comma-separated text without a header row, and
    the resulting columns are then labelled, in order, as
    T, SUser, DUser, SC, DC, AuthType, LogonType, AuthOrie, Success.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to "auth-small.csv", so calling
        ``load_dataset()`` with no argument behaves as before.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the nine column labels above.

    Raises
    ------
    ValueError
        Raised by pandas when the labels are assigned if the file does not
        contain exactly nine columns.
    """
    column_names = [
        "T", "SUser", "DUser", "SC", "DC",
        "AuthType", "LogonType", "AuthOrie", "Success",
    ]
    df = pd.read_csv(path, sep=",", header=None)
    # Assign the labels after parsing so a file with the wrong number of
    # columns fails loudly instead of being silently re-shaped.
    df.columns = column_names
    return df

In [30]:
# Number of aggregated features; used as the column count of the feature
# matrix assembled further down in this notebook.
NUM_FEATURES = 9

# Load the raw authentication events once and preview the first rows.
df = load_dataset()
df.head(5)

Unnamed: 0,T,SUser,DUser,SC,DC,AuthType,LogonType,AuthOrie,Success
0,1,ANONYMOUS LOGON@C586,ANONYMOUS LOGON@C586,C1250,C586,NTLM,Network,LogOn,Success
1,1,ANONYMOUS LOGON@C586,ANONYMOUS LOGON@C586,C586,C586,?,Network,LogOff,Success
2,1,C101$@DOM1,C101$@DOM1,C988,C988,?,Network,LogOff,Success
3,1,C1020$@DOM1,SYSTEM@C1020,C1020,C1020,Negotiate,Service,LogOn,Success
4,1,C1021$@DOM1,C1021$@DOM1,C1021,C625,Kerberos,Network,LogOn,Success


In [31]:
def _distinct_count(values):
    """Return the number of distinct values in one group's Series."""
    return values.nunique()


def _failure_count(values):
    """Return how many entries of one group's Series equal 'Failure'."""
    # NOTE(review): the data preview only shows 'Success'; confirm that the
    # failing outcome is really spelled 'Failure' in the input file.
    return (values == 'Failure').sum()


# Each spec maps a source column to {feature name: aggregation}.

# Features F1-F5: applied to rows grouped by (T, SUser).
SUser_agg = {
    "SC": {"F1": _distinct_count},
    "DC": {"F2": _distinct_count},
    "Success": {"F3": _failure_count},
    "AuthOrie": {"F4": "count"},
    "DUser": {"F5": _distinct_count},
}

# Features F6-F8: applied to rows grouped by (T, DUser).
DUser_agg = {
    "SC": {"F6": _distinct_count},
    "DC": {"F7": _distinct_count},
    "AuthOrie": {"F8": "count"},
}

# Feature F9: applied to rows grouped by (T, DC).
DC_agg = {
    "SC": {"F9": _distinct_count},
}

In [32]:

def _max_per_second(frame, key, spec):
    """Aggregate `frame` per (T, key) group, then keep the per-T maximum.

    Parameters
    ----------
    frame : pandas.DataFrame
        Event table containing a "T" column, the `key` column and every
        source column named in `spec`.
    key : str
        Second grouping column (the first is always "T").
    spec : dict
        Nested mapping ``{source column: {feature name: aggregation}}``,
        where each aggregation is a callable or a pandas aggregation name.

    Returns
    -------
    pandas.DataFrame
        Indexed by T, with two-level columns (source column, feature name),
        so ``result[column][feature]`` yields one Series per feature.
    """
    grouped = frame.groupby(["T", key])
    # Nested-dict renaming inside .agg() was removed in pandas 1.0, so each
    # feature is aggregated on its own and keyed by (column, feature); the
    # tuple keys become a two-level column index.
    per_entity = pd.DataFrame({
        (column, feature): grouped[column].agg(func)
        for column, named_funcs in spec.items()
        for feature, func in named_funcs.items()
    })
    # Equivalent of the removed `.max(level=0)`: collapse the second
    # grouping level by taking the maximum for each T.
    return per_entity.groupby(level=0).max()


FS1 = _max_per_second(df, "SUser", SUser_agg)
FS2 = _max_per_second(df, "DUser", DUser_agg)
FS3 = _max_per_second(df, "DC", DC_agg)

# Feature number -> per-T Series holding that feature's maximum.
FEATURES = {
    1: FS1["SC"]["F1"],
    2: FS1["DC"]["F2"],
    3: FS1["Success"]["F3"],
    4: FS1["AuthOrie"]["F4"],
    5: FS1["DUser"]["F5"],
    6: FS2["SC"]["F6"],
    7: FS2["DC"]["F7"],
    8: FS2["AuthOrie"]["F8"],
    9: FS3["SC"]["F9"],
}
FEATURES

{1: T
 1         7
 2         6
 3         6
 4         4
 5         3
 6         4
 7         3
 8         5
 9         4
 10        3
 11        5
 12        3
 13        4
 14        3
 15        4
 16        4
 17        5
 18        2
 19        2
 20        3
 21        2
 22        3
 23        3
 24        4
 25        3
 26        4
 27        3
 28        3
 29        4
 30        4
          ..
 54628     7
 54629     4
 54630     8
 54631     5
 54632    10
 54633     4
 54634     4
 54635     6
 54636     4
 54637     4
 54638     9
 54639     4
 54640     3
 54641     7
 54642     4
 54643     4
 54644     7
 54645     4
 54646     4
 54647     3
 54648     9
 54649     8
 54650     4
 54651    10
 54652     6
 54653     4
 54654     7
 54655     8
 54656     6
 54657     6
 Name: F1, dtype: int64, 2: T
 1         9
 2        10
 3         8
 4         5
 5         4
 6         6
 7         4
 8         5
 9         4
 10        3
 11        4
 12        3
 13        4
 1

In [33]:
# One row per distinct T value in the aggregated tables.
N = FS1.shape[0]

# `np.int` was removed in NumPy 1.24; an explicit 64-bit integer dtype keeps
# the matrix integral and identical across platforms.
feature_set = np.zeros((N, NUM_FEATURES), dtype=np.int64)

# FEATURES is keyed 1..NUM_FEATURES, while matrix columns are 0-based.
for i in range(NUM_FEATURES):
    feature_set[:, i] = FEATURES[i + 1]

feature_set.shape

(54657, 9)

In [37]:
# Persist the aggregated feature matrix as comma-separated text.
np.savetxt(
    fname='auth_1s_aggregation.csv',
    X=feature_set,
    delimiter=',',
)