In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import xgboost as xgb

# --------------------------
# Load and Prepare the Data
# --------------------------
# Replace 'your_dataset.csv' with your actual dataset path
df = pd.read_csv('data.csv')

# Select 10 important columns and the target label
features = ['Machine', 'DebugSize', 'MajorImageVersion', 'ExportSize', 
            'IatVRA', 'NumberOfSections', 'SizeOfStackReserve', 
            'DllCharacteristics', 'ResourceSize', 'BitcoinAddresses']
target = 'Benign'

X = df[features]
y = df[target]

# Split the data into training and validation sets (80/20 split)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------
# Define the XGBoost Model
# --------------------------
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# --------------------------
# Train and Evaluate Model
# --------------------------
print("\nTraining XGBoost...")
model.fit(X_train, y_train)

# Predictions on the validation set
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"Accuracy for XGBoost: {acc:.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("\n--------------------------------")
print(f"Model: XGBoost with Accuracy: {acc:.4f}")
print("--------------------------------")

# Save the model to disk
joblib.dump(model, 'xgboost_model.pkl')
print("Model saved to 'xgboost_model.pkl'")

# --------------------------
# Prediction on Sample Data
# --------------------------
# Load the saved model
model = joblib.load('xgboost_model.pkl')
print("Loaded XGBoost model from 'xgboost_model.pkl'.")

# Define the 10 feature names used in the model
features = ['Machine', 'DebugSize', 'MajorImageVersion', 'ExportSize',
            'IatVRA', 'NumberOfSections', 'SizeOfStackReserve',
            'DllCharacteristics', 'ResourceSize', 'BitcoinAddresses']

# Provide a sample input as a dictionary (update these values as needed)
sample_data = {
    'Machine': 332,
    'DebugSize': 0,
    'MajorImageVersion': 0,
    'ExportSize': 0,
    'IatVRA': 2073,
    'NumberOfSections': 32768,
    'SizeOfStackReserve': 12,
    'DllCharacteristics': 1048576,
    'ResourceSize': 24044,
    'BitcoinAddresses': 0
}

# Create a DataFrame for the single sample
sample_df = pd.DataFrame([sample_data])
print("\nInput Data:")
print(sample_df)

# Make a prediction
prediction = model.predict(sample_df)[0]

# Get the probabilities
probabilities = model.predict_proba(sample_df)[0]
confidence = max(probabilities)

# Map the prediction (assuming 1 = Benign, 0 = Malicious)
label_mapping = {1: "Benign", 0: "Malicious"}
predicted_label = label_mapping.get(prediction, "Unknown")

print("\nPrediction:")
print(f"Predicted Class: {predicted_label}")
print(f"Confidence: {confidence * 100:.2f}%")


Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy for XGBoost: 0.9951
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7073
           1       0.99      0.99      0.99      5424

    accuracy                           1.00     12497
   macro avg       1.00      1.00      1.00     12497
weighted avg       1.00      1.00      1.00     12497


--------------------------------
Model: XGBoost with Accuracy: 0.9951
--------------------------------
Model saved to 'xgboost_model.pkl'
Loaded XGBoost model from 'xgboost_model.pkl'.

Input Data:
   Machine  DebugSize  MajorImageVersion  ExportSize  IatVRA  \
0      332          0                  0           0    2073   

   NumberOfSections  SizeOfStackReserve  DllCharacteristics  ResourceSize  \
0             32768                  12             1048576         24044   

   BitcoinAddresses  
0                 0  

Prediction:
Predicted Class: Malicious
Confidence: 97.78%


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import xgboost as xgb

# --------------------------
# Load and Prepare the Data
# --------------------------
df = pd.read_csv('data.csv')

features = ['Machine', 'DebugSize', 'MajorImageVersion', 'ExportSize', 
            'IatVRA', 'NumberOfSections', 'SizeOfStackReserve', 
            'DllCharacteristics', 'ResourceSize', 'BitcoinAddresses']
target = 'Benign'

X = df[features]
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------
# Define and Train Model
# --------------------------
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

print("\nTraining XGBoost...")
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"Accuracy for XGBoost: {acc:.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("\n--------------------------------")
print(f"Model: XGBoost with Accuracy: {acc:.4f}")
print("--------------------------------")

# Save model
joblib.dump(model, 'xgboost_model.pkl')
print("Model saved to 'xgboost_model.pkl'.")

# --------------------------
# Load Model and Take Dynamic Input
# --------------------------
model = joblib.load('xgboost_model.pkl')
print("\nLoaded XGBoost model from 'xgboost_model.pkl'.")

# Now take dynamic input from user
input_data = {}

print("\nPlease enter the following details:")
for feature in features:
    value = float(input(f"Enter value for {feature}: "))
    input_data[feature] = value

# Create DataFrame from input
sample_df = pd.DataFrame([input_data])

print("\nInput Data:")
print(sample_df)

# Make prediction
prediction = model.predict(sample_df)[0]
probabilities = model.predict_proba(sample_df)[0]
confidence = max(probabilities)

# Map prediction
label_mapping = {1: "Benign", 0: "Malicious"}
predicted_label = label_mapping.get(prediction, "Unknown")

print("\nPrediction:")
print(f"Predicted Class: {predicted_label}")
print(f"Confidence: {confidence * 100:.2f}%")

SyntaxError: invalid non-printable character U+00A0 (2260701710.py, line 78)