In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [190]:
#librerias que se van a utilizar
import os
import numpy as np
from matplotlib import pyplot
from scipy import optimize
import pandas as pd # para el preprocesamiento
from sklearn import preprocessing
import csv
%matplotlib inline

In [191]:
#cargado de datos
df = pd.read_csv("/content/drive/MyDrive/IA/laboratorios/MontañoMolinaJuanJose/Firewall_Rule_Classification.csv", sep=",")
#uso de diccionarios
diccionario = {"allow": 0, "deny": 1, "drop":2, "reset-both":3}
df["Class"] = df["Class"].replace(diccionario)
#se eliminan los datos que entran en conflicto con las operaciones
df = df.drop('Unnamed: 0', axis=1)
df.drop_duplicates()
df.dropna(axis=0, thresh=2) #Elimina filas con n o más valores nulos.

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received,Class
0,33313,443,29395,443,9675,2389,7286,43,143,21,22,0
1,53871,443,15841,443,7797,1807,5990,15,135,7,8,0
2,56628,443,13354,443,8240,3521,4719,25,18,15,10,0
3,50270,445,0,0,70,70,0,1,0,1,0,2
4,64113,53,33491,53,294,134,160,2,30,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
262123,61522,64147,0,0,66,66,0,1,0,1,0,1
262124,54276,39004,0,0,66,66,0,1,0,1,0,1
262125,56601,445,0,0,70,70,0,1,0,1,0,2
262126,53007,445,0,0,70,70,0,1,0,1,0,2


In [213]:
#ayuda a utilizar de manera mas eficiente los diccionarios
df["Class"].value_counts()

Class
0    150314
1     60053
2     51539
3       222
Name: count, dtype: int64

In [193]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262128 entries, 0 to 262127
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Source Port           262128 non-null  int64
 1   Destination Port      262128 non-null  int64
 2   NAT Source Port       262128 non-null  int64
 3   NAT Destination Port  262128 non-null  int64
 4   Bytes                 262128 non-null  int64
 5   Bytes Sent            262128 non-null  int64
 6   Bytes Received        262128 non-null  int64
 7   Packets               262128 non-null  int64
 8   Elapsed Time (sec)    262128 non-null  int64
 9   pkts_sent             262128 non-null  int64
 10  pkts_received         262128 non-null  int64
 11  Class                 262128 non-null  int64
dtypes: int64(12)
memory usage: 24.0 MB
None


In [194]:
# La entrada es de 12 elemento contando con x0
input_layer_size  = 12

num_labels = 4
#Se asignan los valore de X y X_test respectivamente 70 30
X = df.iloc[:183490, :-1]
X_test = df.iloc[183490:, :-1]
#Se asignan los valore de y e y_test respectivamente 70 30
y = df.iloc[:183490, df.columns.get_loc("Class")]
y_test = df.iloc[183490:, df.columns.get_loc("Class")]

In [195]:
print(X.iloc[0,: ])
print(y)

Source Port             33313
Destination Port          443
NAT Source Port         29395
NAT Destination Port      443
Bytes                    9675
Bytes Sent               2389
Bytes Received           7286
Packets                    43
Elapsed Time (sec)        143
pkts_sent                  21
pkts_received              22
Name: 0, dtype: int64
0         0
1         0
2         0
3         2
4         0
         ..
183485    2
183486    2
183487    0
183488    0
183489    0
Name: Class, Length: 183490, dtype: int64


In [196]:
#Se normalizan los datos para agilizar el proceso
def  featureNormalize(X):
    X_norm = X.copy()
    mu = np.zeros(X.shape[1])
    sigma = np.zeros(X.shape[1])

    mu = np.mean(X, axis = 0)
    sigma = np.std(X, axis = 0)
    X_norm = (X - mu) / sigma

    return X_norm, mu, sigma

In [197]:
X_norm, mu, sigma = featureNormalize(X)

In [198]:
# Se agrega la columna de 1 e usa para representar el término de sesgo en la regresión lineal o logística.
m, n = X.shape
X = X_norm

In [206]:
#Esta función se utiliza para modelar la probabilidad de un evento en la regresión logística.
def sigmoid(z):
    z = np.clip(z, -5, 5)  # Evita valores demasiado grandes o pequeños
    g = 1.0 / (1.0 + np.exp(-z))
    return g + np.finfo(float).eps  # Agrega un valor pequeño para evitar errores de 'log(0)'

In [207]:
#calcula el coste de la función
def lrCostFunction(theta, X, y, lambda_):
    m = y.size
    if y.dtype == bool:
        y = y.astype(int)  # Convierte a entero si es booleano

    J = 0
    grad = np.zeros(theta.shape)

    h = sigmoid(X.dot(theta.T))

    temp = theta
    temp[0] = 0  # No penaliza el término de sesgo
    J = (1 / m) * np.sum(-y * np.log(h) - (1 - y) * np.log(1 - h)) + (lambda_ / (2 * m)) * np.sum(np.square(temp))

    grad = (1 / m) * (h - y).dot(X)
    grad = grad + (lambda_ / m) * temp

    return J, grad

In [208]:
#Esta técnica permite entrenar un clasificador multiclase a partir de clasificadores binarios.
def oneVsAll(X, y, num_labels, lambda_):
    m, n = X.shape

    all_theta = np.zeros((num_labels, n + 1))
    X = np.concatenate([np.ones((m, 1)), X], axis=1)

    for c in np.arange(num_labels):
        initial_theta = np.zeros(n + 1)
        options = {'maxiter': 50}
        res = optimize.minimize(lrCostFunction,
                                initial_theta,
                                (X, (y == c), lambda_),
                                jac=True,
                                method='CG',
                                options=options)

        all_theta[c] = res.x

    return all_theta

In [209]:
lambda_ = 0.1
all_theta = oneVsAll(X, y, num_labels, lambda_)

In [210]:
print(all_theta)
print(all_theta.shape)

[[ 1.13954277e+02 -1.04265519e+00 -8.02199398e+01  1.48355047e+02
   5.47526238e+01  2.03246425e+00  4.10129100e-02  4.48895814e+00
   3.03695071e+00  7.96518376e+01  1.72420990e+00  4.49427377e+00]
 [-5.09976254e+00 -7.82388389e-01  1.21450027e+01 -7.69386001e+00
  -6.37107858e+00 -1.09644558e-01 -1.76841338e-02 -2.18547326e-01
  -1.44716059e-01 -3.27886456e+00 -8.39453263e-02 -2.11531800e-01]
 [-2.16086620e+01  2.59653961e+00 -1.23950812e+01 -1.80038691e+01
   1.02359955e+00 -1.72681309e-01  4.35826570e-02 -4.53234790e-01
  -2.52632430e-01 -5.71999670e+00 -1.15604689e-01 -4.14863208e-01]
 [-4.20358770e+00 -1.84956817e-02  7.39155794e-03 -5.63476256e-03
  -4.25867050e-03  5.85766516e-04  5.93282006e-04  4.06164789e-04
   5.42667290e-04 -2.67306109e-03  5.65999406e-04  4.23050564e-04]]
(4, 12)


In [211]:
#realiza predicciones con un modelo multiclase entrenado con la técnica
def predictOneVsAll(all_theta, X):

    m = X.shape[0];
    num_labels = all_theta.shape[0]

    p = np.zeros(m)

    # Add ones to the X data matrix
    X = np.concatenate([np.ones((m, 1)), X], axis=1)
    p = np.argmax(sigmoid(X.dot(all_theta.T)), axis = 1)

    return p

In [212]:
#analizar la precisión del modelo en el conjunto de entrenamiento y comparar las predicciones con las etiquetas reales en el conjunto de prueba para evaluar su rendimiento en datos nuevos.
print(X.shape)
pred = predictOneVsAll(all_theta, X)
print('Precision del conjuto de entrenamiento: {:.2f}%'.format(np.mean(pred == y) * 100))
XPrueba = X_test.copy()
print(XPrueba.shape)
XPrueba = np.concatenate([np.ones((78638, 1)), XPrueba], axis=1)
print(XPrueba.shape)
p = np.argmax(sigmoid(XPrueba.dot(all_theta.T)), axis = 1)
print(p)
print(y_test)

(183490, 11)
Precision del conjuto de entrenamiento: 98.35%
(78638, 11)
(78638, 12)
[2 2 0 ... 2 2 1]
183490    2
183491    2
183492    0
183493    0
183494    0
         ..
262123    1
262124    1
262125    2
262126    2
262127    1
Name: Class, Length: 78638, dtype: int64
