Preparing and Training the Model

In [4]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib  # For saving the trained model

# 1. Load the Dataset
# Replace 'MalwareData.csv' with the actual path to your dataset file.
maldata = pd.read_csv("MalwareData.csv", sep="|")

# Separate features (X) and labels (y)
X = maldata.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = maldata['legitimate'].values

# 2. Feature Selection with ExtraTreesClassifier
# ExtraTreesClassifier identifies the most important features.
extratrees = ExtraTreesClassifier().fit(X, y)
select = SelectFromModel(extratrees, prefit=True)
X_selected = select.transform(X)  # Reduced feature set

# 3. Split the Data
# Split into training and test sets for model validation.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# 4. Train the Model with RandomForestClassifier
# Random forests are good for binary classification (malware vs. legit).
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# Evaluate the Model
train_accuracy = accuracy_score(y_train, model.predict(X_train)) * 100
test_accuracy = accuracy_score(y_test, model.predict(X_test)) * 100
print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")

# 5. Save the Trained Model for Later Use
# Weâ€™ll save the model so we can use it to scan files.
joblib.dump(model, 'malware_detection_model.pkl')


# Save the selected feature indices
selected_features = select.get_support(indices=True)
joblib.dump(selected_features, 'selected_features.pkl')


Training Accuracy: 99.99%
Test Accuracy: 99.42%


['selected_features.pkl']

 File Scanning Function

In [5]:
def scan_file(file_features):
    # Load the trained model and selected features
    model = joblib.load('malware_detection_model.pkl')
    selected_features = joblib.load('selected_features.pkl')
    
    # Select only the required features from the input file features
    file_features_selected = [file_features[i] for i in selected_features]
    
    # Transform to the expected format for prediction
    file_features_selected = [file_features_selected]  # Reshape to 2D array
    
    # Predict using the model
    prediction = model.predict(file_features_selected)
    
    # Output the result
    if prediction[0] == 1:
        print("This file is legitimate.")
    else:
        print("Warning: This file is detected as malware!")


Testing the Scanner

In [11]:
sample_file_features = [
    224, 258, 9, 361984, 115712, 0, 6135, 4096, 372736, 4194304,
    4096, 512, 0, 0, 1, 0, 1036288, 1024, 485887, 16, 1024, 1048576,
    4096, 1048576, 4096, 0, 16, 8, 5.76, 3.60, 7.22, 59712, 1024,
    325120, 126875.87, 896, 551848, 0, 0, 0, 0, 4, 3.26, 2.56, 3.53,
    8797, 216, 18032, 0, 16, 1
]


scan_file(sample_file_features)


IndexError: list index out of range

In [9]:
pyinstaller --onefile malware_detection_script.py


SyntaxError: invalid syntax (3525510705.py, line 1)