#Training

In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
import xgboost as xgb
from sklearn.svm import SVC, OneClassSVM
from sklearn.ensemble import RandomForestClassifier

In [4]:
data = np.load('/content/bodmas.npz')
X = pd.DataFrame(data['X'])
y = pd.DataFrame(data['y'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print((X_train.shape, X_test.shape))

((94104, 2381), (40331, 2381))


In [6]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    verbosity=1,
    device='cuda'
)
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [7]:
y_pred_xgb = xgb_model.predict(X_test)

In [8]:
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy XGB: {accuracy_xgb}")
print(classification_report(y_test, y_pred_xgb))

Accuracy XGB: 0.9950410354318019
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23170
           1       0.99      0.99      0.99     17161

    accuracy                           1.00     40331
   macro avg       0.99      0.99      0.99     40331
weighted avg       1.00      1.00      1.00     40331



In [9]:
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [11]:
model_RF = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=100)
model_RF.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [12]:
y_pred_RF = model_RF.predict(X_test)

In [13]:
accuracy_RF = accuracy_score(y_test, y_pred_RF)
print(f"Accuracy XGB: {accuracy_RF}")
print(classification_report(y_test, y_pred_RF))

Accuracy XGB: 0.986313257791773
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     23170
           1       0.98      0.98      0.98     17161

    accuracy                           0.99     40331
   macro avg       0.99      0.99      0.99     40331
weighted avg       0.99      0.99      0.99     40331



In [14]:
with open('randomForest_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [15]:
xgb_probas = xgb_model.predict_proba(X_test)
rf_probas = model_RF.predict_proba(X_test)

In [24]:
xgbindices = []
for i in range(len(xgb_probas)):
  maxproba = max(xgb_probas[i])
  if maxproba < 0.85:
    xgbindices.append(i)

In [25]:
rfindices = []
for i in range(len(rfindices)):
  maxproba = max(rfindices[i])
  if maxproba < 0.85:
    rfindices.append(i)

In [26]:
print(len(xgbindices))
print(len(rfindices))

633
0


### XGBoost is more accurate but unsure; whereas Random Forest is less accurate but more sure of it's predictions! We take XGB as our soft classifier and RF as the hard classifier
### Threshold values is 85%

#Family Detection

In [22]:
metadata = pd.read_csv('/content/bodmas_malware_category.csv')
metadata.head()

Unnamed: 0,sha256,category
0,6a695877f571d043fe08d3cc715d9d4b4af85ffe837fa0...,worm
1,9ef9439795cac85e711b59df296a19e7ac43c144035f2f...,trojan
2,32de655f9010d8d152db16c6e5bbad215fa09286a08ff1...,worm
3,a68f7fb26ad84859625002395cf67f22ea0956996ed9c8...,downloader
4,d5c74472adfda20166a65f8b2886819a014ebcb67b999e...,trojan


In [24]:
print(len(metadata['category'].unique()))

14


In [25]:
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score

In [26]:
H_train, H_test, c_train, c_test = train_test_split(metadata['sha256'], metadata['category'], test_size=0.3, random_state=42)

In [27]:
spectral_model = SpectralClustering(n_clusters=14, affinity='nearest_neighbors', assign_labels='discretize', random_state=42)

In [28]:
cluster_labels = spectral_model.fit_predict(H_train)

ValueError: could not convert string to float: 'ea40592d96e9b8b3d52bacb32c8c18f5b2e0cc4adc7eddc739b99253d3c24762'

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load your data
data = pd.read_csv('/content/bodmas_malware_category.csv')

# Step 1: Function to calculate character frequency
def hash_to_features(hash_string):
    features = np.zeros(16)  # One feature for each hexadecimal character (0-9, a-f)
    for char in hash_string:
        if char.isdigit():
            index = int(char)
        elif char.isalpha():
            index = ord(char.lower()) - ord('a') + 10
        else:
            continue
        features[index] += 1
    return features / len(hash_string)  # Normalize by hash length

# Apply the function to each SHA-256 hash
X = np.array([hash_to_features(h) for h in data['sha256']])

# Step 2: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Apply Spectral Clustering
spectral_model = SpectralClustering(n_clusters=14, affinity='nearest_neighbors', assign_labels='discretize', random_state=42)
cluster_labels = spectral_model.fit_predict(X_scaled)

# Step 4: Evaluate the clustering (optional)
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")

# Add cluster labels to data
data['cluster'] = cluster_labels
print(data[['sha256', 'category', 'cluster']])
