In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import optuna

# Charger le dataset Malware
data = pd.read_csv("../OLD_SCRIPT/DatasetmalwareExtrait.csv")

# Séparer les caractéristiques (X) et la cible (y)
X = data.drop(columns=["legitimate"])
y = data["legitimate"]

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fonction objectif pour Optuna
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 10, 30, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    return accuracy

# Créer une étude Optuna et optimiser
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, n_jobs=-1)

# Meilleurs hyperparamètres trouvés
print("Meilleurs hyperparamètres :", study.best_params)

# Entraîner le modèle avec les meilleurs hyperparamètres
best_params = study.best_params
best_model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=42
)
best_model.fit(X_train, y_train)

# Prédictions sur l'ensemble de test
y_pred = best_model.predict(X_test)

# Rapport de classification
print("\nRapport de classification :\n")
print(classification_report(y_test, y_pred))

# Méthode pour évaluer le modèle
def evaluate_model(model, X_test, y_test):
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"\nPrécision du modèle : {accuracy:.2f}")
    return accuracy

# Évaluer le modèle
evaluate_model(best_model, X_test, y_test)

[I 2024-12-18 14:05:53,280] A new study created in memory with name: no-name-8fce449b-4ad0-4ae8-8ae5-3a34e2358190
[I 2024-12-18 14:06:03,562] Trial 8 finished with value: 0.9919604205318491 and parameters: {'n_estimators': 79, 'max_depth': 23, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 8 with value: 0.9919604205318491.
[I 2024-12-18 14:06:04,810] Trial 0 finished with value: 0.9881770890174252 and parameters: {'n_estimators': 96, 'max_depth': 27, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 8 with value: 0.9919604205318491.
[I 2024-12-18 14:06:05,862] Trial 9 finished with value: 0.9872312561388191 and parameters: {'n_estimators': 107, 'max_depth': 26, 'min_samples_split': 5, 'min_samples_leaf': 5}. Best is trial 8 with value: 0.9919604205318491.
[I 2024-12-18 14:06:06,260] Trial 10 finished with value: 0.9875950380152061 and parameters: {'n_estimators': 107, 'max_depth': 16, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 8 with value:

Meilleurs hyperparamètres : {'n_estimators': 79, 'max_depth': 23, 'min_samples_split': 2, 'min_samples_leaf': 1}

Rapport de classification :

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     19186
           1       0.99      0.99      0.99      8303

    accuracy                           0.99     27489
   macro avg       0.99      0.99      0.99     27489
weighted avg       0.99      0.99      0.99     27489


Précision du modèle : 0.99


0.9919604205318491

In [2]:
import pickle

In [3]:
with open('streamlit/model_pickle', 'wb') as f:
    pickle.dump(best_model, f)

In [4]:
with open('streamlit/model_pickle', 'rb') as f:
    n_model = pickle.load(f)

In [None]:
import numpy as np

In [13]:
import pefile

def extract_pe_characteristics(file_path):
    # Load the PE file
    pe = pefile.PE(file_path)

    # Extract the characteristics
    characteristics = {
        "AddressOfEntryPoint": pe.OPTIONAL_HEADER.AddressOfEntryPoint,
        "MajorLinkerVersion": pe.OPTIONAL_HEADER.MajorLinkerVersion,
        "MajorImageVersion": pe.OPTIONAL_HEADER.MajorImageVersion,
        "MajorOperatingSystemVersion": pe.OPTIONAL_HEADER.MajorOperatingSystemVersion,
        "DllCharacteristics": pe.OPTIONAL_HEADER.DllCharacteristics,
        "SizeOfStackReserve": pe.OPTIONAL_HEADER.SizeOfStackReserve,
        "NumberOfSections": len(pe.sections),  # Count of sections in the file
        "ResourceSize": 0  # Placeholder for Resource Size
    }

    # Extract the resource size if available
    try:
        resource_directory = pe.DIRECTORY_ENTRY_RESOURCE
        characteristics["ResourceSize"] = resource_directory.struct.Size
    except AttributeError:
        characteristics["ResourceSize"] = 0  # If no resource directory, set to 0

    # Close the PE file
    pe.close()

    return characteristics

In [None]:
# Example usage
file_path = "exe_files/test2.exe"  # Path to your PE file
pe_characteristics = extract_pe_characteristics(file_path)
final_array = list(pe_characteristics.values())
print(final_array)

In [33]:
entry_array = np.array(final_array).reshape(1, -1)
prediction = n_model.predict(entry_array)
print(prediction)

[0]


