In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [70]:
# charger le fichier log_export.parquet
import pandas as pd

# Charger le fichier .parquet
df = pd.read_parquet('log_export.parquet')

# Afficher les premières lignes pour vérifier le chargement
print(df.head())


                  date           ipsrc          ipdst proto  portsrc  portdst  \
0  2024-03-20 01:29:24    94.102.61.47  159.84.146.99   TCP  52502.0   3178.0   
1  2024-03-20 01:29:25  176.111.174.85  159.84.146.99   TCP  48739.0   2231.0   
2  2024-03-20 01:29:27   66.249.65.106  159.84.146.99   TCP  50501.0    443.0   
3  2024-03-20 01:29:34   89.248.163.75  159.84.146.99   TCP  43312.0   8845.0   
4  2024-03-20 01:29:38   42.58.163.244  159.84.146.99   TCP   9746.0     23.0   

   regle  action interface_in Interface_out  divers  
0  999.0    DENY         eth0          None     6.0  
1  999.0    DENY         eth0          None     6.0  
2    1.0  PERMIT         eth0          None     6.0  
3  999.0    DENY         eth0          None     6.0  
4    7.0    DENY         eth0          None     6.0  


In [71]:
df.drop(columns=['Interface_out', 'interface_in', 'divers'])  

Unnamed: 0,date,ipsrc,ipdst,proto,portsrc,portdst,regle,action
0,2024-03-20 01:29:24,94.102.61.47,159.84.146.99,TCP,52502.0,3178.0,999.0,DENY
1,2024-03-20 01:29:25,176.111.174.85,159.84.146.99,TCP,48739.0,2231.0,999.0,DENY
2,2024-03-20 01:29:27,66.249.65.106,159.84.146.99,TCP,50501.0,443.0,1.0,PERMIT
3,2024-03-20 01:29:34,89.248.163.75,159.84.146.99,TCP,43312.0,8845.0,999.0,DENY
4,2024-03-20 01:29:38,42.58.163.244,159.84.146.99,TCP,9746.0,23.0,7.0,DENY
...,...,...,...,...,...,...,...,...
11997647,2025-02-12 10:08:30,23.22.35.162,159.84.146.99,TCP,12680.0,443.0,1.0,PERMIT
11997648,2025-02-12 10:08:33,114.45.140.91,159.84.146.99,TCP,42640.0,443.0,1.0,PERMIT
11997649,2025-02-12 10:08:34,3.224.220.101,159.84.146.99,TCP,24581.0,443.0,1.0,PERMIT
11997650,2025-02-12 10:08:36,192.44.68.176,159.84.146.99,TCP,13575.0,443.0,1.0,PERMIT


In [72]:
df = df.dropna()

In [73]:
df

Unnamed: 0,date,ipsrc,ipdst,proto,portsrc,portdst,regle,action,interface_in,Interface_out,divers


In [57]:
# Conversion des adresses IP en entiers
def ip_to_int(ip):
    try:
        return int(ipaddress.ip_address(ip))
    except ValueError:
        return 0

df['ipsrc'] = df['ipsrc'].apply(ip_to_int)
df['ipdst'] = df['ipdst'].apply(ip_to_int)

# Définir la variable cible : si l'action est DENY plusieurs fois, l'IP est suspecte
df['action'] = df['action'].apply(lambda x: 1 if x == 'DENY' else 0)

# Encodage des variables catégorielles avec One-Hot Encoding
df = pd.get_dummies(df, columns=['proto', 'interface_in'], drop_first=True)

KeyboardInterrupt: 

In [None]:
df

Unnamed: 0,date,ipsrc,ipdst,portsrc,portdst,regle,action,Interface_out,divers,proto_TCP,interface_in_eth0


In [51]:
# Vérification des valeurs manquantes par colonne
missing_values = df.isnull().sum()
print(missing_values)

date                 0
ipsrc                0
ipdst                0
portsrc              0
portdst              0
regle                0
action               0
Interface_out        0
divers               0
proto_TCP            0
interface_in_eth0    0
dtype: int64


In [52]:
df['action']

Series([], Name: action, dtype: int64)

In [42]:
# Sélection des features (X) et de la cible (y)
X = df.drop(columns=['action'])  # Garder les colonnes pertinentes
y = df['action']  # Nouvelle colonne cible

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
df.info()

In [None]:
# Modèle Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [None]:
y_pred_rf = rf_model.predict(X_test)
print('Random Forest Classification Report:')
print(classification_report(y_test, y_pred_rf))

In [None]:
# Visualisation : Carte thermique
conf_matrix = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Prédiction')
plt.ylabel('Réel')
plt.title('Matrice de Confusion - Random Forest')
plt.show()

In [None]:
# Liste dynamique des adresses IP suspectes les plus actives
ip_activity = df[df['action'] == 0]['ipsrc'].value_counts().head(10)
plt.figure(figsize=(10,6))
sns.barplot(x=ip_activity.values, y=ip_activity.index, palette='Reds')
plt.xlabel('Nombre d'activités suspectes')
plt.title('Top 10 des adresses IP suspectes les plus actives')
plt.show()