In [2]:
import multiprocessing
import numpy as np
import pandas as pd
import pickle
import gensim
from gensim.models import KeyedVectors

In [3]:
def operator_hadamard(u, v):
    return u * v


def process_df_function(df):

    output_df = df.copy()
    graph_structure = (1, 3, 2)
    client_to_ip_pred = []
    ip_to_service_pred = []
    for index, row in output_df.iterrows():
        try: 
            client_embedding = wv[(row[graph_structure[0]])]
            ip_embedding = wv[(row[graph_structure[1]])]
            service_embedding = wv[(row[graph_structure[2]])]
            client_to_ip_embedding = [operator_hadamard(client_embedding, ip_embedding)]
            ip_to_service_embedding = [operator_hadamard(ip_embedding, service_embedding)]
            client_to_ip = clf.predict_proba(client_to_ip_embedding)
            ip_to_service = clf.predict_proba(ip_to_service_embedding)
            client_to_ip_pred.append(client_to_ip[0][1])
            ip_to_service_pred.append(ip_to_service[0][1])
        except: 
            client_to_ip_pred.append(None)
            ip_to_service_pred.append(None)
    output_df['client_to_ip_pred'] = client_to_ip_pred
    output_df['ip_to_service_pred'] = ip_to_service_pred  

    output_df.reset_index(drop=True, inplace=True)

    return output_df


In [4]:

name='2_18_500_epochs'
with open(f'../lm-vol/{name}_lr.pkl', 'rb') as file:
    clf = pickle.load(file)
model = gensim.models.Word2Vec.load(f'../lm-vol/{name}_word2vec.model')
wv = KeyedVectors.load(f'../lm-vol/{name}_word2vec.wordvectors', mmap='r')


In [5]:
%%time

for chunk in pd.read_csv('../lm-vol/LANL_test.csv', header=None, chunksize=1000000):
    NUM_CORES = 20
    # split the raw dataframe into chunks
    df_chunks = np.array_split(chunk ,NUM_CORES)
    # use a pool to spawn multiple proecsses
    with multiprocessing.Pool(NUM_CORES) as pool:
        # concatenate all processed chunks together.
        # process_df_function is the function you defined in the previous block
        processed_df = pd.concat(pool.map(process_df_function, df_chunks), ignore_index=True)
    processed_anomalies = processed_df[(processed_df["ip_to_service_pred"].astype(float) <= 0.1) | (processed_df["client_to_ip_pred"].astype(float) <= 0.1)]


CPU times: user 1.63 s, sys: 532 ms, total: 2.17 s
Wall time: 20.1 s


In [6]:
%%time
for chunk in pd.read_csv('../lm-vol/LANL_test.csv', header=None, chunksize=1000000):
    process_df_function(chunk)
    break

CPU times: user 6min 17s, sys: 13.4 s, total: 6min 30s
Wall time: 7min 1s
