In [28]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [19]:
# 1. Charger les Données
data = {
    "Adresse IP Source": ["192.168.1.1", "10.0.0.2", "192.168.1.1", "172.16.0.3", "10.0.0.2"],
    "Port Source": [5000, 80, 5000, 443, 80],
    "Adresse IP Destination": ["192.168.1.2", "10.0.0.1", "192.168.1.3", "172.16.0.4", "10.0.0.1"],
    "Port Destination": [80, 5000, 443, 80, 5000],
    "Protocole": ["TCP", "HTTP", "TCP", "HTTPS", "HTTP"],
    "Durée (s)": [10, 5, 8, 12, 6],
    "Paquets": [100, 50, 80, 120, 60],
    "Label": ["Normal", "Malveillant", "Normal", "Malveillant", "Normal"]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Adresse IP Source,Port Source,Adresse IP Destination,Port Destination,Protocole,Durée (s),Paquets,Label
0,192.168.1.1,5000,192.168.1.2,80,TCP,10,100,Normal
1,10.0.0.2,80,10.0.0.1,5000,HTTP,5,50,Malveillant
2,192.168.1.1,5000,192.168.1.3,443,TCP,8,80,Normal
3,172.16.0.3,443,172.16.0.4,80,HTTPS,12,120,Malveillant
4,10.0.0.2,80,10.0.0.1,5000,HTTP,6,60,Normal


In [20]:
# 2. Nettoyage des Données
df["Durée (s)"].fillna(df["Durée (s)"].mean(), inplace=True)
df.drop_duplicates(inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Durée (s)"].fillna(df["Durée (s)"].mean(), inplace=True)


Unnamed: 0,Adresse IP Source,Port Source,Adresse IP Destination,Port Destination,Protocole,Durée (s),Paquets,Label
0,192.168.1.1,5000,192.168.1.2,80,TCP,10,100,Normal
1,10.0.0.2,80,10.0.0.1,5000,HTTP,5,50,Malveillant
2,192.168.1.1,5000,192.168.1.3,443,TCP,8,80,Normal
3,172.16.0.3,443,172.16.0.4,80,HTTPS,12,120,Malveillant
4,10.0.0.2,80,10.0.0.1,5000,HTTP,6,60,Normal


In [21]:
# 3. Encodage des Données
df = pd.get_dummies(df, columns=["Protocole"], prefix="Protocole")
encoder = LabelEncoder()
df["Label Encode"] = encoder.fit_transform(df["Label"])
df

Unnamed: 0,Adresse IP Source,Port Source,Adresse IP Destination,Port Destination,Durée (s),Paquets,Label,Protocole_HTTP,Protocole_HTTPS,Protocole_TCP,Label Encode
0,192.168.1.1,5000,192.168.1.2,80,10,100,Normal,False,False,True,1
1,10.0.0.2,80,10.0.0.1,5000,5,50,Malveillant,True,False,False,0
2,192.168.1.1,5000,192.168.1.3,443,8,80,Normal,False,False,True,1
3,172.16.0.3,443,172.16.0.4,80,12,120,Malveillant,False,True,False,0
4,10.0.0.2,80,10.0.0.1,5000,6,60,Normal,True,False,False,1


In [25]:
# 4. Normalisation
scaler = MinMaxScaler()
df[["Durée (s)", "Paquets"]] = scaler.fit_transform(df[["Durée (s)", "Paquets"]])
df

Unnamed: 0,Adresse IP Source,Port Source,Adresse IP Destination,Port Destination,Durée (s),Paquets,Label,Protocole_HTTP,Protocole_HTTPS,Protocole_TCP,Label Encode
0,192.168.1.1,5000,192.168.1.2,80,0.714286,0.714286,Normal,False,False,True,1
1,10.0.0.2,80,10.0.0.1,5000,0.0,0.0,Malveillant,True,False,False,0
2,192.168.1.1,5000,192.168.1.3,443,0.428571,0.428571,Normal,False,False,True,1
3,172.16.0.3,443,172.16.0.4,80,1.0,1.0,Malveillant,False,True,False,0
4,10.0.0.2,80,10.0.0.1,5000,0.142857,0.142857,Normal,True,False,False,1


In [27]:
def extract_octets(ip):
    return list(map(int, ip.split('.')))
df[["Octet1 Source", "Octet2 Source", "Octet3 Source", "Octet4 Source"]] = df["Adresse IP Source"].apply(lambda x: pd.Series(extract_octets(x)))
df[["Octet1 Destination", "Octet2 Destination", "Octet3 Destination", "Octet4 Destination"]] = df["Adresse IP Destination"].apply(lambda x: pd.Series(extract_octets(x)))
def classifier_port(port):
    if 0 <= port <= 1023:
        return "Connu"
    elif 1024 <= port <= 49151:
        return "Enregistré"
    else:
        return "Dynamique"
df["Type de Port Source"] = df["Port Source"].apply(classifier_port)
df["Type de Port Destination"] = df["Port Destination"].apply(classifier_port)
df = pd.get_dummies(df, columns=["Type de Port Source", "Type de Port Destination"], prefix=["PortSrc", "PortDst"])
df

Unnamed: 0,Adresse IP Source,Port Source,Adresse IP Destination,Port Destination,Durée (s),Paquets,Label,Protocole_HTTP,Protocole_HTTPS,Protocole_TCP,...,Octet3 Source,Octet4 Source,Octet1 Destination,Octet2 Destination,Octet3 Destination,Octet4 Destination,PortSrc_Connu,PortSrc_Enregistré,PortDst_Connu,PortDst_Enregistré
0,192.168.1.1,5000,192.168.1.2,80,0.714286,0.714286,Normal,False,False,True,...,1,1,192,168,1,2,False,True,True,False
1,10.0.0.2,80,10.0.0.1,5000,0.0,0.0,Malveillant,True,False,False,...,0,2,10,0,0,1,True,False,False,True
2,192.168.1.1,5000,192.168.1.3,443,0.428571,0.428571,Normal,False,False,True,...,1,1,192,168,1,3,False,True,True,False
3,172.16.0.3,443,172.16.0.4,80,1.0,1.0,Malveillant,False,True,False,...,0,3,172,16,0,4,True,False,True,False
4,10.0.0.2,80,10.0.0.1,5000,0.142857,0.142857,Normal,True,False,False,...,0,2,10,0,0,1,True,False,False,True


In [31]:
# 6. Entraînement du Modèle
X = df.drop(columns=["Label", "Label Encode", "Adresse IP Source", "Adresse IP Destination"])
y = df["Label Encode"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("\nRapport de classification :")
print(classification_report(y_test, y_pred))
print("\nMatrice de confusion :")
print(confusion_matrix(y_test, y_pred))


Rapport de classification :
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0


Matrice de confusion :
[[0 1]
 [0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
