In [1]:
import pandas as pd

# Load the CICMalDroid2020 dataset from disk.
# A raw string keeps the Windows backslashes literal: '\D' and '\C' are not
# valid escape sequences, so the non-raw form only worked by accident and
# emits an invalid-escape warning on recent Python versions.
dataset_path = r'C:\Dataset\CICMalDroid2020.csv'  # the actual file path
df = pd.read_csv(dataset_path)

# Report the raw size and the number of samples per class label.
print("Original shape:", df.shape)
print("Original classes:\n", df['Class'].value_counts())

Original shape: (11598, 471)
Original classes:
 Class
3    3904
4    2546
2    2100
5    1795
1    1253
Name: count, dtype: int64


In [2]:
# Step 1: Handling Missing Values
# Discard every row that contains at least one missing entry
# (axis=0 / how="any" are the dropna defaults, spelled out for the reader).
df = df.dropna(axis=0, how="any")
print("After Step 1 - Handling Missing Values:", df.shape)

After Step 1 - Handling Missing Values: (11598, 471)


In [3]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

# Step 2: Removing Duplicates
# Flag rows that repeat an earlier row across all columns, then keep only
# the unflagged ones (i.e. the first occurrence of each distinct row).
is_repeated_row = df.duplicated()
df = df[~is_repeated_row]
print("After Step 2 - Removing Duplicates:", df.shape)

After Step 2 - Removing Duplicates: (11526, 471)


In [4]:
# Step 3: Features Removal (drop features with >90% zero values)
# Separate the label column from the feature columns.
y = df['Class']
X = df.drop(columns='Class')

# Count the zero entries per feature and compare against 90% of the row count.
zero_counts = X.eq(0).sum()
zero_limit = 0.9 * len(X)
high_zero_features = zero_counts.index[(zero_counts > zero_limit).to_numpy()]

# Remove the mostly-zero features.
X = X.drop(high_zero_features, axis=1)
print(f"After Step 3 - Removed {len(high_zero_features)} high-zero features. New shape: {X.shape}")

After Step 3 - Removed 312 high-zero features. New shape: (11526, 158)


In [5]:
# Step 4: Handle infinite and null values (if any)
# Infinities are mapped to NaN first so a single dropna() removes both kinds
# of unusable rows.
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Keep the labels in step with the surviving feature rows; without this, X and
# y end up with different lengths whenever a row is dropped here, which breaks
# any later step that pairs them positionally.
y = y.loc[X.index]
print("After Step 4 - Handled infinities and nulls:", X.shape)

After Step 4 - Handled infinities and nulls: (11526, 158)


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [10]:
# a) Add label back for correlation matrix
# assign() returns a new frame (X itself is left untouched) with the label
# appended as the "Class" column, aligned to X by index.
df_corr = X.assign(Class=y)

In [11]:
# b) Correlation of each feature with the target
# Full pairwise correlation matrix over the features plus the label.
cor_matrix = df_corr.corr()
# Take the label's column as absolute values and remove the label's own entry.
class_column = cor_matrix["Class"]
target_corr = class_column.abs().drop(labels="Class")


In [12]:
# c) Select features strongly correlated with the class (threshold = 0.1)
# Features whose correlation is NaN fail the comparison and are left out.
selected_by_target = [
    name for name, strength in target_corr.items() if strength > 0.1
]

In [13]:
# d) Remove highly inter-correlated features (correlation > 0.9)
# Absolute pairwise correlations among the features kept in the previous step.
reduced_corr_matrix = df_corr[selected_by_target].corr().abs()

# Keep only the entries strictly above the diagonal so each feature pair is
# inspected once and a feature is never compared with itself.
strict_upper_mask = np.triu(np.ones(reduced_corr_matrix.shape, dtype=bool), k=1)
upper = reduced_corr_matrix.where(strict_upper_mask)

# A column is dropped when it correlates above 0.9 with any column before it.
exceeds_limit = (upper > 0.9).any(axis=0)
to_drop = upper.columns[exceeds_limit.to_numpy()].tolist()


In [14]:
# e) Final selected features
# Keep the target-correlated features, in their original order, except those
# marked as redundant in the previous step.
redundant = set(to_drop)
selected_features_cfs = [
    name for name in selected_by_target if name not in redundant
]

In [15]:
# Print the final feature list, numbered from 1.
print("\nSelected Features by CFS (Numbered):")
for position, feature_name in enumerate(selected_features_cfs, 1):
    print(f"{position}. {feature_name}")


Selected Features by CFS (Numbered):
1. CREATE_FOLDER_____
2. CREATE_PROCESS`_____
3. CREATE_THREAD_____
4. FS_ACCESS____
5. FS_ACCESS()____
6. FS_ACCESS(CREATE)____
7. FS_ACCESS(CREATE__READ)__
8. FS_ACCESS(CREATE__READ__WRITE)
9. FS_ACCESS(CREATE__WRITE)__
10. FS_ACCESS(WRITE)____
11. FS_PIPE_ACCESS___
12. FS_PIPE_ACCESS(READ)___
13. FS_PIPE_ACCESS(READ__WRITE)_
14. NETWORK_ACCESS____
15. NETWORK_ACCESS()____
16. NETWORK_ACCESS(READ__WRITE)__
17. NETWORK_ACCESS(WRITE__)__
18. __arm_nr_cacheflush
19. brk
20. chmod
21. close
22. epoll_create
23. fchmod
24. finishDrawing
25. flock
26. fstat64
27. getActivityInfo
28. getDataNetworkType
29. getIccSerialNumber
30. getStreamVolume
31. getSubscriberId
32. getdents64
33. getegid32
34. getsockopt
35. hasSystemFeature
36. ioctl
37. isSpeakerphoneOn
38. listen
39. lseek
40. madvise
41. mkdir
42. mmap2
43. mprotect
44. msync
45. prctl
46. pwrite64
47. registerContentObserver
48. remove
49. setpgid
50. setsockopt
51. sigaction
52. socket
53. stat

In [None]:

# Save the CFS-selected features together with the class label.
X_cfs = X[selected_features_cfs].copy()
X_cfs["Class"] = y  # column assignment aligns y to X_cfs by index
# A raw string keeps the Windows backslashes literal: '\D' and '\c' are not
# valid escape sequences, so the non-raw form only worked by accident and
# emits an invalid-escape warning on recent Python versions.
output_csv = r'C:\Dataset\cleaned_DF_with_CFS.csv'  # Output file path
X_cfs.to_csv(output_csv, index=False)
print("\n Dataset saved to cleaned_DF_with_CFS.csv")


💾 Dataset saved to cleaned_DF_with_CFS.csv
