In [1]:
#Run the optimal kmeans to remove the benign cluster from data (without capture 9)
import pandas as pd
import os
from pycaret.clustering import *


In [2]:
#load data
os.chdir('../..') #to change working directory of the notebook when restarted
#Initialize empty df
df_all=pd.DataFrame()
for root, dirs, files in os.walk("./data/csv/captures_features"):
    for ctuName in files:
        #ignore capture 9
        if ctuName == 'capture20110817.binetflow.labels-positive-weights.labeled.csv':
            continue
        df=pd.read_csv("./data/csv/captures_features/"+ctuName)
        #add to general df as new rows
        df_all=df_all.append(df, ignore_index=True)
print(df_all.shape)


(2874213, 9)


In [3]:
#Change background and normal to 0 and infected values to 1
df_all['label']=df_all['label'].replace(['background','normal'],0)
df_all['label']=df_all['label'].replace(['infected'],1)
#drop node column
df_all=df_all.drop(['node'], axis=1)

#keep first 10000 rows
df_reduced=df_all.head(100)
df_reduced

Unnamed: 0,ID,OD,IDW,ODW,BC,LCC,AC,label
0,0,1,0,1639,0.0,0.0,1.0,0
1,6,0,2113,0,0.0,0.0,16.895299,0
2,1,1,2,2,0.0,0.0,0.997643,0
3,135476,140755,1397778,1398024,30805840000.0,0.0,-0.117842,0
4,1,1,7,7,0.0,0.0,0.999671,0
5,651,664,194078,160979,146235600.0,0.0,-0.004701,0
6,1,1,1,1,0.0,0.0,0.991775,0
7,19269,19665,83768,78421,5721328000.0,0.0,-0.822473,0
8,1,1,2,2,0.0,0.0,0.997643,0
9,1,1,1,1,0.0,0.0,0.998822,0


In [4]:
#Insert artificial bots for testing the algorithm
df_reduced['label'].iloc[3:5]=1
df_reduced['label'].value_counts()

0    98
1     2
Name: label, dtype: int64

In [5]:
df_nolabel=df_reduced.drop(['label'], axis=1)

cluster = setup(df_nolabel, session_id = 7652)

Unnamed: 0,Description,Value
0,session_id,7652
1,Original Data,"(100, 7)"
2,Missing Values,False
3,Numeric Features,5
4,Categorical Features,2
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(100, 28)"
9,CPU Jobs,-1


In [6]:
model = create_model('kmeans',num_clusters=3)
#Add cluster labels to the training data
kmeans_df = assign_model(model)
#add original labels to the df 
kmeans_df['label']=df_all['label']
kmeans_df

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.9852,65022.565,0.0042,0,0,0


Unnamed: 0,ID,OD,IDW,ODW,BC,LCC,AC,Cluster,label
0,0,1,0,1639,0.000000e+00,0.0,1.000000,Cluster 0,0
1,6,0,2113,0,0.000000e+00,0.0,16.895299,Cluster 0,0
2,1,1,2,2,0.000000e+00,0.0,0.997643,Cluster 0,0
3,135476,140755,1397778,1398024,3.080584e+10,0.0,-0.117842,Cluster 1,1
4,1,1,7,7,0.000000e+00,0.0,0.999671,Cluster 0,1
...,...,...,...,...,...,...,...,...,...
95,1,1,3,3,0.000000e+00,0.0,0.996465,Cluster 0,0
96,1,1,1,1,0.000000e+00,0.0,0.998822,Cluster 0,0
97,1,1,3,3,0.000000e+00,0.0,0.996465,Cluster 0,0
98,3,3,74,74,0.000000e+00,0.0,0.909281,Cluster 0,0


In [10]:
#get cluster centroids
#calculate the mean for each column for each cluster
Centroids_df=pd.DataFrame(kmeans_df.groupby('Cluster')['ID'].mean())
Centroids_df['OD']=kmeans_df.groupby('Cluster')['OD'].mean()
Centroids_df['IDW']=kmeans_df.groupby('Cluster')['IDW'].mean()
Centroids_df['ODW']=kmeans_df.groupby('Cluster')['ODW'].mean()
Centroids_df['BC']=kmeans_df.groupby('Cluster')['BC'].mean()
Centroids_df['LCC']=kmeans_df.groupby('Cluster')['LCC'].mean()
Centroids_df['AC']=kmeans_df.groupby('Cluster')['AC'].mean()
Centroids_df



Unnamed: 0_level_0,ID,OD,IDW,ODW,BC,LCC,AC
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cluster 0,53.175258,48.463918,3495.68,2755.32,13305700.0,7.282219e-08,1.11379
Cluster 1,135476.0,140755.0,1397778.0,1398024.0,30805840000.0,0.0,-0.117842
Cluster 2,18213.5,19177.5,4081080.0,2312474.0,5728189000.0,2.558235e-06,-0.411319


In [11]:
#drop data inside the benign cluster
#drop most common cluster
benign_str=kmeans_df['Cluster'].value_counts().idxmax()
phase2_data=kmeans_df[kmeans_df['Cluster']!=benign_str]
#drop Cluster column
phase2_data.drop(['Cluster'], axis=1, inplace=True)
phase2_data


Unnamed: 0,ID,OD,IDW,ODW,BC,LCC,AC,label
3,135476,140755,1397778,1398024,30805840000.0,0.0,-0.117842,1
7,19269,19665,83768,78421,5721328000.0,0.0,-0.822473,0
88,17158,18690,8078393,4546527,5735049000.0,5e-06,-0.000164,0


In [12]:
#save data to csv
phase2_data.to_csv('./data/csv/phase2_data.csv', index=False)
Centroids_df.to_csv('./data/csv/phase1_centroids.csv', index=False)
