In [30]:
#!pip install sentence-transformers
import numpy as np
import pandas as pd
import pandas_gbq

In [5]:
from sentence_transformers import SentenceTransformer, util
# Load the pretrained 'all-MiniLM-L6-v2' checkpoint; `model` is the encoder
# used by every embedding cell in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Set the model's maximum sequence length to 512.
# NOTE(review): presumably this is a token limit and longer inputs are
# truncated by the library -- confirm, since long activity strings may then
# be only partially embedded.
model.max_seq_length = 512

The model runs on the GPU by default when one is available.

# Example

Similarity between users' daily behavior sequences

In [6]:
# Three example activity sequences, each a single space-separated string of
# event names, used as input for the similarity demo in this cell.
# NOTE(review): the first two strings appear to be identical -- confirm that
# this duplicate is intentional (it would make that pair the top match).
sentences = [
         "file_deleted file_deleted file_deleted login_attempt file_deleted login_successful file_deleted file_created login_successful login_attempt file_created login_attempt file_created login_attempt file_deleted login_successful unshared_link login_successful unshared_link login_successful login_successful file_written file_deleted file_deleted file_accessed login_attempt login_successful shared_link file_deleted file_updated file_deleted file_created file_deleted shared_link file_written login_successful login_attempt login_successful login_attempt login_successful login_attempt login_attempt login_attempt login_attempt login_attempt file_written login_successful",
         "file_deleted file_deleted file_deleted login_attempt file_deleted login_successful file_deleted file_created login_successful login_attempt file_created login_attempt file_created login_attempt file_deleted login_successful unshared_link login_successful unshared_link login_successful login_successful file_written file_deleted file_deleted file_accessed login_attempt login_successful shared_link file_deleted file_updated file_deleted file_created file_deleted shared_link file_written login_successful login_attempt login_successful login_attempt login_successful login_attempt login_attempt login_attempt login_attempt login_attempt file_written login_successful",
         "file_written login_attempt login_attempt login_successful file_updated file_updated file_updated login_attempt login_successful login_successful file_written login_attempt file_updated login_attempt file_written login_attempt file_updated file_updated file_written login_successful login_successful file_written login_attempt login_attempt login_attempt login_successful login_successful file_updated file_written file_written file_written file_written login_attempt file_updated file_updated file_written login_attempt login_successful file_updated file_updated file_updated file_updated file_written login_attempt login_successful file_updated file_written login_successful"
            ]
# Embed all example sequences in a single call, keeping the result as a tensor.
embeddings = model.encode(sentences, convert_to_tensor=True)
# Cosine similarity of every embedding against every other embedding.
cosine_scores = util.cos_sim(embeddings, embeddings)
# Record each unordered pair (i < j) exactly once, together with its score.
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(len(cosine_scores) - 1)
    for j in range(i + 1, len(cosine_scores))
]
# Order the pairs from most to least similar.
pairs.sort(key=lambda entry: entry['score'], reverse=True)
# Print up to the ten best-scoring pairs: both sequences, then the score.
for pair in pairs[:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

file_deleted file_deleted file_deleted login_attempt file_deleted login_successful file_deleted file_created login_successful login_attempt file_created login_attempt file_created login_attempt file_deleted login_successful unshared_link login_successful unshared_link login_successful login_successful file_written file_deleted file_deleted file_accessed login_attempt login_successful shared_link file_deleted file_updated file_deleted file_created file_deleted shared_link file_written login_successful login_attempt login_successful login_attempt login_successful login_attempt login_attempt login_attempt login_attempt login_attempt file_written login_successful 		 file_deleted file_deleted file_deleted login_attempt file_deleted login_successful file_deleted file_created login_successful login_attempt file_created login_attempt file_created login_attempt file_deleted login_successful unshared_link login_successful unshared_link login_successful login_successful file_written file_deleted 

### Getting aggregated data on daily user activities

In [7]:
# Load the activity table from BigQuery into `df_a`, ordered by date.
#
# pandas_gbq.read_gbq is called directly: the module is already imported by
# this notebook (and used for the upload later on), whereas the
# pd.io.gbq.read_gbq wrapper has been deprecated by pandas in favour of
# pandas-gbq.
# NOTE(review): ORDER BY DATE alone may not fix the order of rows that share
# a date -- confirm whether a tie-breaker column is needed for reproducible
# row order across runs.
project_id = "phd1-374514"
df_a = pandas_gbq.read_gbq(
    '''SELECT *  FROM `phd1-374514.train.anomaly_activity_type` ORDER BY DATE''',
    project_id=project_id,
    dialect='standard',
    # The pandas wrapper forwarded progress_bar_type=None; keep the download
    # silent as before instead of picking up pandas-gbq's own default.
    progress_bar_type=None,
)
df_a

Unnamed: 0,uid,date,normal_activity,actions,anomaly
0,fine-rose-kangaroo-gambler,2017-07-07,file_updated file_written file_updated file_wr...,139,0
1,developed-harlequin-falcon-radiodirector,2017-07-07,file_accessed file_accessed file_accessed file...,671,0
2,dirty-apricot-sawfish-metalengineer,2017-07-07,login_attempt login_successful file_created fi...,102,0
3,logical-coral-pig-warehouseman,2017-07-07,file_accessed file_accessed file_accessed file...,639,0
4,ethnic-lavender-gerbil-gamingclubmanager,2017-07-07,file_accessed file_accessed,2,0
...,...,...,...,...,...
83142,surrounding-white-haddock-seamstress,2022-09-29,login_attempt login_attempt login_successful a...,21,0
83143,light-fuchsia-cattle-poolattendant,2022-09-29,login_attempt login_attempt login_attempt logi...,5,0
83144,big-maroon-lynx-radiocontroller,2022-09-29,login_attempt login_successful file_updated fi...,8,0
83145,shrill-ivory-dormouse-astrologer,2022-09-29,login_attempt,1,0


In [10]:
# Assign the 512 sequence-length limit again; the same value is already set
# in the cell that creates `model`.
model.max_seq_length = 512

In [11]:
# Take the `normal_activity` column of the activity table and turn it into a
# plain Python list, the form handed to model.encode in the batching cell.
text = df_a["normal_activity"]
sentences = text.to_list()

In [14]:
# Show how many activity strings were extracted from the table.
len(sentences)

83147

Creating 384-dimensional embeddings

In [15]:
# Encode the activity strings in consecutive chunks of `batch_size` and keep
# one embedding tensor per chunk in `embeddings` (a list, stacked into a
# single array by a later cell).
batch_size = 20
progress_every = 10000  # print a progress line about once per this many sentences
num_sentences = len(sentences)
embeddings = []

for i in range(0, num_sentences, batch_size):
    batch_sentences = sentences[i : i + batch_size]
    batch_embeddings = model.encode(batch_sentences, convert_to_tensor=True)
    embeddings.append(batch_embeddings)
    # Report the first batch that starts at or after each multiple of
    # progress_every. Unlike an exact `i % 10000 == 0` test, this keeps
    # printing when batch_size does not divide the reporting interval; with
    # batch_size = 20 the printed offsets are unchanged (0, 10000, 20000, ...).
    if i % progress_every < batch_size:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000


In [18]:
# Move every per-batch embedding tensor to the CPU and convert it to NumPy.
numpy_arrays = [batch_tensor.cpu().numpy() for batch_tensor in embeddings]
# Stack the batches row-wise into one 2-D array.
concatenated_array = np.vstack(tuple(numpy_arrays))
# Wrap the stacked array in a DataFrame with default integer row/column labels.
df_e = pd.DataFrame(data=concatenated_array)

In [19]:
# Display the frame built from the stacked batch embeddings.
df_e

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.001862,0.028964,-0.032206,-0.013161,-0.002489,-0.010694,0.079223,-0.005928,0.009117,0.125857,...,-0.026126,0.057509,0.073051,0.089623,0.027199,0.036084,0.075247,0.035460,-0.040925,0.050898
1,0.006868,0.044669,-0.042298,-0.036730,-0.025512,-0.018258,0.073692,-0.007901,-0.003433,0.115777,...,-0.027679,0.051347,0.062427,0.073091,0.012538,0.056702,0.087636,0.033654,-0.059923,0.056338
2,-0.005356,-0.017417,-0.009455,0.004525,0.028756,0.023458,0.061395,-0.017600,0.003139,0.107803,...,-0.000535,0.048298,0.087201,0.083798,0.037264,0.002962,0.094681,0.046697,-0.033465,0.065849
3,0.003355,0.046362,-0.048074,-0.032983,-0.026461,-0.017325,0.068941,-0.007742,-0.010320,0.112046,...,-0.028125,0.053865,0.052474,0.077065,0.005759,0.057341,0.081656,0.033664,-0.063469,0.056420
4,0.005168,0.065368,-0.074115,-0.002009,-0.015829,-0.031042,0.090803,0.065894,0.064219,0.068394,...,-0.000522,0.012184,0.039752,0.020480,-0.058370,0.042130,-0.014331,0.049943,-0.033928,0.052840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83142,0.081539,-0.002543,-0.013620,-0.060213,-0.024384,0.007030,0.065708,0.017007,-0.072044,-0.000833,...,0.015923,0.037603,0.053202,0.046457,0.041549,0.031706,0.062639,0.100987,-0.009037,0.013796
83143,0.066211,0.038113,0.011046,-0.087347,-0.021226,-0.036528,0.130147,-0.024527,-0.015163,-0.015763,...,0.006757,0.025086,0.028866,0.029029,-0.035938,-0.005973,0.110573,0.080465,0.062784,0.005670
83144,0.079238,0.020987,-0.047965,-0.059953,0.010153,-0.030729,0.098300,0.019034,0.036254,0.090081,...,-0.040990,-0.010079,0.074034,0.016206,0.048779,-0.018890,0.048820,0.038070,0.039847,0.034776
83145,0.031391,0.021276,-0.018026,-0.084589,-0.005917,-0.048382,0.122493,-0.010112,-0.021105,-0.014053,...,0.021885,-0.006977,0.003869,0.031290,-0.037791,-0.017588,0.059647,0.067484,0.000020,-0.027823


In [20]:
# Give the embedding columns string names col_0 ... col_{n-1} and write the
# frame to 'type_embeddings_384.parquet'.
#
# The names are derived from column positions rather than by prefixing the
# current labels, so the cell is safe to re-run: a second execution produces
# the same names instead of 'col_col_0', 'col_col_1', ...
df_e.columns = ['col_{}'.format(position) for position in range(df_e.shape[1])]
df_e.to_parquet('type_embeddings_384.parquet')

In [25]:
# Write the activity table to 'type_user_activity.parquet'.
df_a.to_parquet('type_user_activity.parquet')

In [31]:
# Join the activity rows with their embedding columns side by side and upload
# the combined frame to the BigQuery table 'features.type_embeddings_384_full'.
#
# pd.concat(axis=1) aligns rows on the index. If df_a and df_e have drifted
# apart (e.g. one of them was rebuilt in a re-run cell), the result would be
# silently padded with NaN rows, so check alignment before uploading.
assert len(df_a) == len(df_e), (
    "row count mismatch: {} activity rows vs {} embedding rows".format(len(df_a), len(df_e))
)
assert df_a.index.equals(df_e.index), (
    "df_a and df_e have different indexes; rows would be misaligned"
)
df = pd.concat([df_a, df_e], axis=1)
# NOTE(review): to_gbq is left at its default handling of an already-existing
# destination table -- confirm what should happen when this cell is re-run.
pandas_gbq.to_gbq(df, 'features.type_embeddings_384_full', project_id=project_id)

100%|██████████| 1/1 [00:00<00:00, 12372.58it/s]


In [28]:
# Preview the first rows of the combined activity + embedding frame.
df.head()

Unnamed: 0,uid,date,normal_activity,actions,anomaly,col_0,col_1,col_2,col_3,col_4,...,col_374,col_375,col_376,col_377,col_378,col_379,col_380,col_381,col_382,col_383
0,fine-rose-kangaroo-gambler,2017-07-07,file_updated file_written file_updated file_wr...,139,0,0.001862,0.028964,-0.032206,-0.013161,-0.002489,...,-0.026126,0.057509,0.073051,0.089623,0.027199,0.036084,0.075247,0.03546,-0.040925,0.050898
1,developed-harlequin-falcon-radiodirector,2017-07-07,file_accessed file_accessed file_accessed file...,671,0,0.006868,0.044669,-0.042298,-0.03673,-0.025512,...,-0.027679,0.051347,0.062427,0.073091,0.012538,0.056702,0.087636,0.033654,-0.059923,0.056338
2,dirty-apricot-sawfish-metalengineer,2017-07-07,login_attempt login_successful file_created fi...,102,0,-0.005356,-0.017417,-0.009455,0.004525,0.028756,...,-0.000535,0.048298,0.087201,0.083798,0.037264,0.002962,0.094681,0.046697,-0.033465,0.065849
3,logical-coral-pig-warehouseman,2017-07-07,file_accessed file_accessed file_accessed file...,639,0,0.003355,0.046362,-0.048074,-0.032983,-0.026461,...,-0.028125,0.053865,0.052474,0.077065,0.005759,0.057341,0.081656,0.033664,-0.063469,0.05642
4,ethnic-lavender-gerbil-gamingclubmanager,2017-07-07,file_accessed file_accessed,2,0,0.005168,0.065368,-0.074115,-0.002009,-0.015829,...,-0.000522,0.012184,0.039752,0.02048,-0.05837,0.04213,-0.014331,0.049943,-0.033928,0.05284


In [32]:
# Run the nvidia-smi shell command to show the current GPU status.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Aug 11 07:58:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    33W /  70W |   1599MiB / 15360MiB |      0%      Default |
|                               |            