## **Proyecto de Clasificación con Machine Learning**

In [1]:
# Librerías y dependencias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
# Ignorar warnings
warnings.filterwarnings('ignore')

# Evitamos la notación científica y rendondeamos los valores a 2 dígitos después de la coma
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
data = pd.read_parquet('https://github.com/jorge-robledo11/Datasets/blob/main/Datasets/TotalFeatures-ISCXFlowMeter.parquet?raw=true')
data.head()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
0,1020586,668,1641,35692,2276876,52,52,679,1390,53.43,...,0.0,-1,0.0,2,4194240,1853440,1640,668,32,benign
1,80794,1,1,75,124,75,124,75,124,75.0,...,0.0,-1,0.0,2,0,0,0,1,0,benign
2,998,3,0,187,0,52,-1,83,-1,62.33,...,0.0,-1,0.0,4,101888,-1,0,3,32,benign
3,189868,9,9,1448,6200,52,52,706,1390,160.89,...,0.0,-1,0.0,2,4194240,2722560,8,9,32,benign
4,110577,4,6,528,1422,52,52,331,1005,132.0,...,0.0,-1,0.0,2,155136,31232,5,4,32,benign


In [4]:
data.shape

(631955, 80)

In [5]:
data.isnull().sum()

duration                   0
total_fpackets             0
total_bpackets             0
total_fpktl                0
total_bpktl                0
                          ..
Init_Win_bytes_backward    0
RRT_samples_clnt           0
Act_data_pkt_forward       0
min_seg_size_forward       0
calss                      0
Length: 80, dtype: int64

In [6]:
data.dtypes

duration                    int64
total_fpackets              int64
total_bpackets              int64
total_fpktl                 int64
total_bpktl                 int64
                            ...  
Init_Win_bytes_backward     int64
RRT_samples_clnt            int64
Act_data_pkt_forward        int64
min_seg_size_forward        int64
calss                      object
Length: 80, dtype: object

### **Renombrar**

In [7]:
# Renombramos
data = data.rename(columns = {
    'calss': 'class'
})

In [8]:
data.describe()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
count,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,...,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0
mean,21952447.25,6.73,10.43,954.02,12060.42,141.48,44.36,263.68,183.25,174.96,...,19973267.78,20312275.63,20752376.06,466387.49,2.36,962079.57,310451.9,9.73,6.72,19.97
std,190057828.67,174.16,349.42,82350.4,482471.61,157.68,89.1,289.64,371.86,162.02,...,189798628.15,189790187.1,189972060.0,6199704.33,3.04,1705655.08,664795.64,347.88,174.14,14.91
min,-18.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,2.0,-1.0,-1.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,69.0,0.0,52.0,-1.0,52.0,-1.0,52.0,...,-1.0,0.0,-1.0,0.0,2.0,0.0,-1.0,0.0,1.0,0.0
50%,24450.0,1.0,0.0,184.0,0.0,52.0,-1.0,83.0,-1.0,83.0,...,-1.0,0.0,-1.0,0.0,2.0,87616.0,-1.0,0.0,1.0,32.0
75%,1759751.0,3.0,1.0,427.0,167.0,108.0,52.0,421.0,115.0,356.0,...,1013498.0,1291379.0,1306116.0,0.0,2.0,304640.0,90496.0,1.0,3.0,32.0
max,44310755494.0,48255.0,74768.0,40496443.0,103922175.0,1390.0,1390.0,1500.0,1390.0,1390.0,...,44310722867.0,44300000000.0,44310722867.0,847000000.0,2269.0,4194240.0,4194240.0,74524.0,48255.0,44.0


In [9]:
data['class'].value_counts()

benign            471597
asware            155613
GeneralMalware      4745
Name: class, dtype: int64

In [10]:
# Seleccionamos features
X = data.drop(['class'], axis=1)
y = data[['class']]

In [11]:
# Dimensionamos
X.shape, y.shape

((631955, 79), (631955, 1))

### **Etiquetar**

In [12]:
# Etiquetamos
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y['class'] = le.fit_transform(y['class'])

In [13]:
# Resultados
y.value_counts()

class
2        471597
1        155613
0          4745
dtype: int64

### **Escalar**

In [14]:
from sklearn.preprocessing import RobustScaler

# Instanciamos
robust = RobustScaler()

# Atributos a escalar
atr = X

# Transformación
X_esc = robust.fit_transform(atr)

In [15]:
header = ['duration', 'total_fpackets', 'total_bpackets', 'total_fpktl',
       'total_bpktl', 'min_fpktl', 'min_bpktl', 'max_fpktl', 'max_bpktl',
       'mean_fpktl', 'mean_bpktl', 'std_fpktl', 'std_bpktl', 'total_fiat',
       'total_biat', 'min_fiat', 'min_biat', 'max_fiat', 'max_biat',
       'mean_fiat', 'mean_biat', 'std_fiat', 'std_biat', 'fpsh_cnt',
       'bpsh_cnt', 'furg_cnt', 'burg_cnt', 'total_fhlen', 'total_bhlen',
       'fPktsPerSecond', 'bPktsPerSecond', 'flowPktsPerSecond',
       'flowBytesPerSecond', 'min_flowpktl', 'max_flowpktl', 'mean_flowpktl',
       'std_flowpktl', 'min_flowiat', 'max_flowiat', 'mean_flowiat',
       'std_flowiat', 'flow_fin', 'flow_syn', 'flow_rst', 'flow_psh',
       'flow_ack', 'flow_urg', 'flow_cwr', 'flow_ece', 'downUpRatio',
       'avgPacketSize', 'fAvgSegmentSize', 'fHeaderBytes', 'fAvgBytesPerBulk',
       'fAvgPacketsPerBulk', 'fAvgBulkRate', 'bVarianceDataBytes',
       'bAvgSegmentSize', 'bAvgBytesPerBulk', 'bAvgPacketsPerBulk',
       'bAvgBulkRate', 'sflow_fpacket', 'sflow_fbytes', 'sflow_bpacket',
       'sflow_bbytes', 'min_active', 'mean_active', 'max_active', 'std_active',
       'min_idle', 'mean_idle', 'max_idle', 'std_idle', 'FFNEPD',
       'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'RRT_samples_clnt',
       'Act_data_pkt_forward', 'min_seg_size_forward']


X = pd.DataFrame(X_esc, columns=header)
X.head()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
0,0.57,333.5,1641.0,99.18,13633.99,0.0,1.0,1.62,11.99,-0.1,...,0.0,0.0,0.0,0.0,0.0,13.48,20.48,1640.0,333.5,0.0
1,0.03,0.0,1.0,-0.3,0.74,0.41,2.36,-0.02,1.08,-0.03,...,0.0,0.0,0.0,0.0,0.0,-0.29,0.0,0.0,0.0,-1.0
2,-0.01,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,-0.07,...,0.0,0.0,0.0,0.0,2.0,0.05,0.0,0.0,1.0,0.0
3,0.09,4.0,9.0,3.53,37.13,0.0,1.0,1.69,11.99,0.26,...,0.0,0.0,0.0,0.0,0.0,13.48,30.08,8.0,4.0,0.0
4,0.05,1.5,6.0,0.96,8.51,0.0,1.0,0.67,8.67,0.16,...,0.0,0.0,0.0,0.0,0.0,0.22,0.35,5.0,1.5,0.0


In [16]:
X.shape, y.shape

((631955, 79), (631955, 1))

### **Modelar**

In [17]:
# Reducir dimensionalidad
from sklearn.decomposition import PCA

# Número de componentes a 99.9% de la varianza haremos predicciones más eficientes
pca = PCA(n_components=0.999, random_state=0)
modelo = pca.fit_transform(X)

In [18]:
# Número de dimensiones del nuevo conjunto
print('Número de componentes', pca.n_components_)

Número de componentes 8


In [19]:
# Calculamos la proporción de la varianza que se ha preservado
varianza = pca.explained_variance_ratio_

for i in varianza:
    print('Varianza por cada componente es {}'.format(round(i,4)))

Varianza por cada componente es 0.8497
Varianza por cada componente es 0.1217
Varianza por cada componente es 0.0105
Varianza por cada componente es 0.0076
Varianza por cada componente es 0.0048
Varianza por cada componente es 0.0034
Varianza por cada componente es 0.0012
Varianza por cada componente es 0.001


### **Datos de entrenamiento, prueba y validación**

In [20]:
from sklearn.model_selection import train_test_split

# Train y Test
X_train, X_test, y_train, y_test = train_test_split(modelo, y, test_size = 0.3, random_state = 0, shuffle=True)

# Test y Validación
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0, shuffle=True)

print('Datos de train:', X_train.shape, y_train.shape)
print('Datos de test:', X_test.shape, y_test.shape)
print('Datos de validation:', X_val.shape, y_val.shape)

# Casting
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
X_val = np.array(X_val)
y_val = np.array(y_val)

Datos de train: (442368, 8) (442368, 1)
Datos de test: (94794, 8) (94794, 1)
Datos de validation: (94793, 8) (94793, 1)


### **Entrenar**

In [21]:
from sklearn.ensemble import RandomForestClassifier

clf_rfc = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=0, n_jobs=-1)
clf_rfc.fit(X_train, y_train)

RandomForestClassifier(max_depth=30, n_estimators=200, n_jobs=-1,
                       random_state=0)

### **Predecir**

In [22]:
# Predicción
y_pred_rfc = clf_rfc.predict(X_val)
y_pred_rfc

array([2, 2, 2, ..., 2, 2, 2])

In [23]:
# Dimensionamos
y_val.shape, y_pred_rfc.shape

((94793, 1), (94793,))

In [24]:
# Redimensionamos
y_pred = y_pred_rfc.reshape((94793, 1))
y_val = y_val.reshape((94793, 1))

### **Evaluar**

In [25]:
from sklearn.metrics import classification_report

# Métricas de evaluación
print(classification_report(y_val, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.70      0.35      0.47       724
           1       0.78      0.44      0.56     23359
           2       0.84      0.96      0.89     70710

    accuracy                           0.83     94793
   macro avg       0.77      0.58      0.64     94793
weighted avg       0.82      0.83      0.81     94793



In [26]:
from sklearn.metrics import f1_score

# F1 Score validation set
# f1_score_test = f1_score(y_pred, y_test, average='weighted')
# f1_score_test = round(f1_score_test, 4)
# print('F1 Score validation set:',f1_score_test*100, '%')

# F1 Score validation set
f1_score_val = f1_score(y_pred_rfc, y_val, average='weighted')
f1_score_val = round(f1_score_val, 4)
print('F1 Score validation set:',f1_score_val*100, '%')

F1 Score validation set: 84.6 %


In [27]:
# Accuracy de los resultados
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_val, y_pred_rfc)
acc = round(acc, 4)
print('Accuracy del', acc*100, '%')

Accuracy del 82.75 %


### **Entrenar**

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

clf_gbc = GradientBoostingClassifier(learning_rate=0.01, validation_fraction=0.15, random_state=0)
clf_gbc.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.01, random_state=0,
                           validation_fraction=0.15)

### **Predecir**

In [29]:
# Predicción
y_pred_gbc = clf_gbc.predict(X_val)
y_pred_gbc

array([2, 2, 2, ..., 2, 2, 2])

In [30]:
# Dimensionamos
y_val.shape, y_pred_gbc.shape

((94793, 1), (94793,))

In [31]:
# Redimensionamos
y_pred_gbc = y_pred_gbc.reshape((94793, 1))
y_val = y_val.reshape((94793, 1))

### **Evaluar**

In [32]:
# Métricas de evaluación
print(classification_report(y_val, y_pred_gbc))

              precision    recall  f1-score   support

           0       0.98      0.09      0.16       724
           1       0.81      0.15      0.25     23359
           2       0.77      0.99      0.87     70710

    accuracy                           0.77     94793
   macro avg       0.86      0.41      0.43     94793
weighted avg       0.78      0.77      0.71     94793



In [33]:
# F1 Score validation set
f1_score_val2 = f1_score(y_pred_gbc, y_val, average='weighted')
f1_score_val2 = round(f1_score_val2, 3)
print('F1 Score validation set:',f1_score_val2*100, '%')

F1 Score validation set: 84.0 %


In [34]:
# Casteamos a Dataframes
y_val = pd.DataFrame(y_val, columns=['Datos de Validación'])
y_pred_rfc = pd.DataFrame(y_pred_rfc, columns=['Predicciones RFC'])
y_pred_gbp = pd.DataFrame(y_pred_gbc, columns=['Predicciones GBC'])

### **Comparar**

In [35]:
# Comparando los datos de validación y las predicciones
comparacion = pd.concat([y_val, y_pred_rfc, y_pred_gbp], axis=1)
comparacion.head(20)

Unnamed: 0,Datos de validación,Predicciones RFC,Predicciones GBC
0,2,2,2
1,2,2,2
2,1,2,2
3,2,2,2
4,2,2,2
5,1,2,2
6,2,2,2
7,2,2,2
8,2,2,2
9,2,2,2


### **Exportar**

In [36]:
# CSV de exportación
comparacion.to_csv(r'D:\Python Scripts & Notebooks\Jupyter Notebooks\Artificial Intelligence\Machine Learning\Proyectos de Machine Learning\resultados.csv', index=False, encode='utf-8')