In [2]:
import pandas as pd

# Load the CICMalDroid2020 dataset.
# A raw string keeps the Windows backslashes literal: '\D' and '\C' are not
# valid escape sequences and produce a warning on newer Python versions.
dataset_path = r'C:\Dataset\CICMalDroid2020.csv'  # the actual file path
df = pd.read_csv(dataset_path)

# Report the raw size and the per-class sample counts before any cleaning.
print("Original shape:", df.shape)
print("Original classes:\n", df['Class'].value_counts())


Original shape: (11598, 471)
Original classes:
 Class
3    3904
4    2546
2    2100
5    1795
1    1253
Name: count, dtype: int64


In [3]:
# Step 1: Handling Missing Values
# Keep only the rows in which every column holds a value.
df = df.dropna(axis=0, how='any')
print("After Step 1 - Handling Missing Values:", df.shape)

After Step 1 - Handling Missing Values: (11598, 471)


In [4]:

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

# Step 2: Removing Duplicates
# A row that repeats an earlier row in every column is discarded;
# the first occurrence is the one that stays.
df = df.drop_duplicates(keep='first')
print("After Step 2 - Removing Duplicates:", df.shape)

After Step 2 - Removing Duplicates: (11526, 471)


In [5]:
# Step 3: Features Removal (drop features with >90% zero values)
# Separate the label column from the feature columns.
y = df['Class']
X = df.drop(columns=['Class'])

# Count the zeros in each feature column and flag the columns whose zero
# count exceeds 90% of the number of rows.
zero_counts = X.eq(0).sum(axis=0)
high_zero_features = zero_counts.loc[zero_counts > 0.9 * len(X)].index

X = X.drop(columns=high_zero_features)
print(f"After Step 3 - Removed {len(high_zero_features)} high-zero features. New shape: {X.shape}")


After Step 3 - Removed 312 high-zero features. New shape: (11526, 158)


In [6]:
# Step 4: Handle infinite and null values (if any)
# Infinities are converted to NaN first so that a single dropna() removes
# rows containing either kind of unusable value.
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Keep the labels aligned with the surviving feature rows; without this,
# any dropped row would leave X and y with different lengths.
y = y.loc[X.index]
print("After Step 4 - Handled infinities and nulls:", X.shape)

After Step 4 - Handled infinities and nulls: (11526, 158)


In [8]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [16]:
# GA Parameters
POP_SIZE, N_GEN = 30, 50        # chromosomes per generation, maximum number of generations
CXPB, MUTPB = 0.7, 0.1          # crossover probability per pair, flip probability per bit
TOURN_SIZE = 3                  # candidates drawn for each tournament
EARLY_STOPPING_PATIENCE = 5     # generations without improvement before stopping
N_FEATURES = len(X.columns)     # chromosome length: one bit per feature column

In [17]:
# Fitness function
def fitness(individual):
    """Score a chromosome by cross-validated accuracy on its selected columns.

    Args:
        individual: sequence of bits; position i set to 1 selects the i-th
            column of the module-level feature frame ``X``.

    Returns:
        0 when no bit is set; otherwise the mean accuracy of a
        100-tree random forest over 3 cross-validation folds, trained on the
        selected columns of ``X`` against the module-level labels ``y``.
    """
    chosen_columns = [pos for pos, gene in enumerate(individual) if gene == 1]
    if not chosen_columns:
        # An empty feature subset cannot be evaluated.
        return 0

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    fold_scores = cross_val_score(
        model,
        X.iloc[:, chosen_columns],
        y,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
    )
    return fold_scores.mean()

In [18]:
# Tournament selection
def tournament_selection(pop, fitnesses):
    """Pick one parent by tournament.

    Draws ``TOURN_SIZE`` positions uniformly at random (with replacement)
    from the population and returns the drawn individual with the highest
    fitness; on a tie the earliest drawn one wins.

    Args:
        pop: list of individuals.
        fitnesses: fitness values, index-aligned with ``pop``.

    Returns:
        The winning individual (the same object stored in ``pop``).
    """
    last_position = len(pop) - 1
    drawn = [random.randint(0, last_position) for _ in range(TOURN_SIZE)]
    winner = max(drawn, key=lambda position: fitnesses[position])
    return pop[winner]

In [19]:
# Crossover
def crossover(p1, p2):
    """Single-point crossover of two parent chromosomes.

    The cut point is drawn from the parents' own length rather than from a
    module-level constant, so the function works for any chromosome size.

    Args:
        p1: first parent, a list of bits.
        p2: second parent, a list of bits (expected to be as long as ``p1``).

    Returns:
        A pair of new lists: the head of one parent joined to the tail of
        the other, and vice versa. Parents shorter than two genes have no
        interior cut point and are returned as unchanged copies.
    """
    length = len(p1)
    if length < 2:
        # randint(1, 0) would raise; there is nothing to exchange anyway.
        return p1[:], p2[:]
    # The cut lies strictly inside the chromosome so both children mix genes.
    point = random.randint(1, length - 1)
    return p1[:point] + p2[point:], p2[:point] + p1[point:]

In [20]:
# Mutation
def mutate(ind):
    """Return a mutated copy of a chromosome.

    Each bit is flipped independently with probability ``MUTPB``; the input
    list is left untouched.

    Args:
        ind: list of 0/1 genes.

    Returns:
        A new list of the same length.
    """
    mutated = []
    for gene in ind:
        flip = random.random() < MUTPB
        mutated.append(1 - gene if flip else gene)
    return mutated


In [21]:
from copy import deepcopy

# Seed the RNG so the initial population and every later selection,
# crossover and mutation draw are the same on each run.
GA_SEED = 42
random.seed(GA_SEED)

# Initialize population: POP_SIZE random bit strings, one bit per feature.
population = [[random.randint(0, 1) for _ in range(N_FEATURES)] for _ in range(POP_SIZE)]
best_overall = None           # best chromosome seen in any generation
best_overall_score = 0        # its fitness
no_improvement_counter = 0    # consecutive generations without a new best

for gen in range(N_GEN):
    # Evaluate every chromosome of the current generation.
    fitnesses = [fitness(ind) for ind in population]
    best_idx = np.argmax(fitnesses)
    best_score = fitnesses[best_idx]

    print(f"Generation {gen + 1}: Best Accuracy = {best_score:.4f}")

    # Track the best chromosome across generations. The next generation is
    # built only from children, so the per-generation best may drop; the
    # copy kept here is what the search finally reports.
    if best_score > best_overall_score:
        best_overall_score = best_score
        best_overall = deepcopy(population[best_idx])
        no_improvement_counter = 0
    else:
        no_improvement_counter += 1

    # Stop once the overall best has not improved for the configured
    # number of consecutive generations.
    if no_improvement_counter >= EARLY_STOPPING_PATIENCE:
        print("Early stopping triggered.")
        break

    # Breed the next generation: tournament-selected parents, crossover with
    # probability CXPB (otherwise plain copies), then mutation of each child.
    new_population = []
    while len(new_population) < POP_SIZE:
        parent1 = tournament_selection(population, fitnesses)
        parent2 = tournament_selection(population, fitnesses)
        if random.random() < CXPB:
            child1, child2 = crossover(parent1, parent2)
        else:
            child1, child2 = parent1[:], parent2[:]
        new_population.extend([mutate(child1), mutate(child2)])

    # Children are added in pairs, so trim any overshoot.
    population = new_population[:POP_SIZE]


Generation 1: Best Accuracy = 0.9413
Generation 2: Best Accuracy = 0.9428
Generation 3: Best Accuracy = 0.9422
Generation 4: Best Accuracy = 0.9429
Generation 5: Best Accuracy = 0.9428
Generation 6: Best Accuracy = 0.9433
Generation 7: Best Accuracy = 0.9432
Generation 8: Best Accuracy = 0.9435
Generation 9: Best Accuracy = 0.9439
Generation 10: Best Accuracy = 0.9439
Generation 11: Best Accuracy = 0.9435
Generation 12: Best Accuracy = 0.9433
Generation 13: Best Accuracy = 0.9429
Generation 14: Best Accuracy = 0.9442
Generation 15: Best Accuracy = 0.9445
Generation 16: Best Accuracy = 0.9436
Generation 17: Best Accuracy = 0.9443
Generation 18: Best Accuracy = 0.9442
Generation 19: Best Accuracy = 0.9448
Generation 20: Best Accuracy = 0.9436
Generation 21: Best Accuracy = 0.9440
Generation 22: Best Accuracy = 0.9445
Generation 23: Best Accuracy = 0.9447
Generation 24: Best Accuracy = 0.9447
Early stopping triggered.


In [23]:
# Final feature selection
# Translate the set bits of the best chromosome into column positions,
# then into the corresponding column labels of X.
selected_indices = [position for position, gene in enumerate(best_overall) if gene == 1]
selected_features = X.columns[selected_indices]

# List the chosen features, numbered from 1.
print("\nSelected Features with GA + Early Stopping (Numbered):")
for idx, feat in enumerate(selected_features, 1):
    print(f"{idx}. {feat}")



Selected Features with GA + Early Stopping (Numbered):
1. ACCESS_PERSONAL_INFO___
2. CREATE_THREAD_____
3. EXECUTE_____
4. FS_ACCESS____
5. FS_ACCESS(CREATE)____
6. FS_ACCESS(CREATE__READ__WRITE)
7. FS_ACCESS(READ)____
8. FS_PIPE_ACCESS(READ)___
9. NETWORK_ACCESS____
10. NETWORK_ACCESS(WRITE)____
11. TERMINATE_THREAD
12. __arm_nr_cacheflush
13. _newselect
14. addClient
15. addToDisplay
16. bind
17. brk
18. capset
19. checkPermission
20. close
21. connect
22. dup
23. dup2
24. fdatasync
25. flock
26. fork
27. fsync
28. ftruncate
29. getActiveNetworkInfo
30. getActivePhoneType
31. getActivityInfo
32. getApplicationInfo
33. getConnectionInfo
34. getDisplayInfo
35. getIccSerialNumber
36. getInTouchMode
37. getInputDeviceIds
38. getInstalledPackages
39. getLine1Number
40. getNetworkInfo
41. getPackageInfo
42. getProxy
43. getSubscriberId
44. getegid32
45. getsockopt
46. hasSystemFeature
47. isSpellCheckerEnabled
48. listen
49. lseek
50. lstat64
51. madvise
52. mkdir
53. mprotect
54. munmap


In [26]:
# Save the selected feature dataset
# Copy the chosen columns so that adding the label column does not touch X.
X_ga = X[selected_features].copy()
X_ga['Class'] = y  # Series assignment aligns the labels on the row index
# A raw string keeps the Windows backslashes literal: '\D' and '\c' are not
# valid escape sequences and produce a warning on newer Python versions.
output_csv = r'C:\Dataset\cleaned_DF_with_GA.csv'  # Output file path

X_ga.to_csv(output_csv, index=False)
print("\nSaved to cleaned_DF_with_GA.csv")


Saved to cleaned_DF_with_GA.csv
