In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # must stay before the IterativeImputer import
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from termcolor import colored

# Datos Puros

In [24]:
# Load the first dataset (the one processed under the "Datos Puros" heading).
# NOTE(review): the path literal is empty in this copy of the notebook; the real
# CSV location has to be filled in before this cell can run.
data = pd.read_csv('')
# Show only the first row as a quick look at the available columns.
data.head(1)

Unnamed: 0,Netflows,First_Protocol,Second_Protocol,Third_Protocol,p1_d,p2_d,p3_d,duration,max_d,min_d,...,second_dp,third_dp,p1_ip,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib,Type,Cryptocurrency
0,82,TCP,UDP,,2.9595,103.6315,135.19425,6709.539,184.327,0.0,...,5355.0,80.0,3.0,5.0,5.75,84.0,160.0,248.0,not_mine,Does not have


In [25]:
# Remove the second/third protocol columns and the Cryptocurrency column.
data = data.drop(columns=['Second_Protocol', 'Third_Protocol', 'Cryptocurrency'])

## Imputación de valores ausentes

In [26]:
# Collect the four columns that go through imputation into one frame.
punter = pd.concat(
    objs=[data[name] for name in ('second_sp', 'second_dp', 'third_sp', 'third_dp')],
    axis='columns',
)
# NaN entries are replaced by the median of their own column.
imputer = SimpleImputer(strategy="median", missing_values=np.nan)
values = imputer.fit_transform(punter)

In [27]:
# Rebuild a labelled frame from the imputer's array output.
# The original index is passed explicitly: the assignments below align on row
# labels, so a freshly generated 0..n-1 index would only match `data` by
# coincidence (i.e. only while `data` still carries its default index).
punter = pd.DataFrame(values, columns=punter.columns, index=punter.index)
# Overwrite each column in `data` with its imputed counterpart.
for col in ('second_sp', 'second_dp', 'third_sp', 'third_dp'):
    data[col] = punter[col]
data.head(2)

Unnamed: 0,Netflows,First_Protocol,p1_d,p2_d,p3_d,duration,max_d,min_d,#packets,Avg_bps,...,first_dp,second_dp,third_dp,p1_ip,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib,Type
0,82,TCP,2.9595,103.6315,135.19425,6709.539,184.327,0.0,407,1266,...,443,5355.0,80.0,3.0,5.0,5.75,84.0,160.0,248.0,not_mine
1,117,TCP,57.953,64.0,90.113,7092.561,148.642,0.0,462,1301,...,443,5355.0,123.0,2.0,3.0,4.0,84.0,96.0,160.0,not_mine


# Exclusión de variables con varianza próxima a cero

## OneHotEncoder

In [28]:
# Keep only the object-dtype (string) columns.
data_categoric = data.select_dtypes(include=['object'])
# Encoder that drops the first category of every column. The encoded sparse
# matrix is only displayed by this cell; it is not stored in a variable.
one_hot = OneHotEncoder(drop="first")
one_hot.fit_transform(data_categoric)

<4733x4 sparse matrix of type '<class 'numpy.float64'>'
	with 7233 stored elements in Compressed Sparse Row format>

In [29]:
# Inspect the categories found by the fitted encoder (one array per encoded column).
one_hot.categories_

[array(['ICMP', 'ICMP6', 'TCP', 'UDP'], dtype=object),
 array(['mine', 'not_mine'], dtype=object)]

In [30]:
# Dummy-encode the categorical columns with pandas; every category gets its own column.
dataDummy = pd.get_dummies(data=data_categoric)
dataDummy.head(n=5)

Unnamed: 0,First_Protocol_ICMP,First_Protocol_ICMP6,First_Protocol_TCP,First_Protocol_UDP,Type_mine,Type_not_mine
0,0,0,1,0,0,1
1,0,0,1,0,0,1
2,0,0,1,0,0,1
3,0,0,1,0,1,0
4,0,0,1,0,0,1


# Multicolinealidad

In [31]:
# Frame used for the multicollinearity check: the original columns side by side
# with the dummy-encoded categorical columns.
# (The statsmodels import that used to live in this cell belongs with the other
# imports in the notebook's first cell.)
multicolinialidad = pd.concat([data, dataDummy], axis=1)
multicolinialidad.columns

Index(['Netflows', 'First_Protocol', 'p1_d', 'p2_d', 'p3_d', 'duration',
       'max_d', 'min_d', '#packets', 'Avg_bps', 'Avg_pps', 'Avg_bpp', '#Bytes',
       '#sp', '#dp', 'first_sp', 'second_sp', 'third_sp', 'first_dp',
       'second_dp', 'third_dp', 'p1_ip', 'p2_ip', 'p3_ip', 'p1_ib', 'p2_ib',
       'p3_ib', 'Type', 'First_Protocol_ICMP', 'First_Protocol_ICMP6',
       'First_Protocol_TCP', 'First_Protocol_UDP', 'Type_mine',
       'Type_not_mine'],
      dtype='object')

In [33]:
# Remove the Type columns (raw and dummy-encoded) and the raw First_Protocol
# string column, leaving the numeric columns and the First_Protocol dummies.
multicolinialidad = multicolinialidad.drop(
    columns=['Type_not_mine', 'Type_mine', 'Type', 'First_Protocol']
)

In [34]:
# Variance inflation factor for every column of `multicolinialidad`.
# The design matrix is materialised once, as floats:
#   * calling `.values` inside the comprehension rebuilt the whole array for
#     every column;
#   * an explicit float dtype avoids an object array when the dummy columns
#     are boolean (the default output of get_dummies in newer pandas releases).
vif_matrix = multicolinialidad.to_numpy(dtype=float)
vif_data = pd.DataFrame({
    "feature": multicolinialidad.columns,
    "VIF": [
        variance_inflation_factor(vif_matrix, col_idx)
        for col_idx in range(vif_matrix.shape[1])
    ],
})

In [35]:
# Display the feature / VIF table.
vif_data

Unnamed: 0,feature,VIF
0,Netflows,4.675045
1,p1_d,117.174609
2,p2_d,125.662776
3,p3_d,93.853328
4,duration,4.794896
5,max_d,26.144148
6,min_d,73.665596
7,#packets,65.891817
8,Avg_bps,15.425059
9,Avg_pps,3.594882


## Estandarización

In [42]:
# Numeric subset of `data`: the float64 and int64 columns.
data_numeric = data.select_dtypes(include=['float64', 'int64'])

In [43]:
# Column-wise preprocessing: standard-scale every numeric column; any column
# not listed would be passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('scale', StandardScaler(), data_numeric.columns)],
    remainder='passthrough',
)

In [44]:
# Fit the preprocessor on the numeric columns and transform them in one step.
values = preprocessor.fit_transform(data_numeric)
# Display the resulting array.
values

array([[-0.08478692, -0.68540958, -0.24470927, ..., -0.4171446 ,
        -0.47609459, -0.55869393],
       [ 0.09792905, -0.3966615 , -0.4587796 , ..., -0.4171446 ,
        -0.48255709, -0.56545917],
       [-0.02214144, -0.63218965, -0.07436965, ..., -0.41066924,
        -0.47367115, -0.42331227],
       ...,
       [ 2.03993881, -0.70094869, -0.80447685, ..., -0.4247973 ,
        -0.48901959, -0.56822677],
       [-0.50764388,  2.4109369 ,  2.39686337, ..., 10.59508937,
         7.07210498,  5.181304  ],
       [-0.34580973, -0.70094869, -0.80446875, ..., -0.42192754,
        -0.48255709, -0.56691985]])

In [45]:
# Wrap the scaled array back into a DataFrame with the original column names.
# The index of `data_numeric` is carried over so that later index-aligned
# operations (such as a column-wise concat) pair each scaled row with the row
# it came from, instead of relying on a regenerated 0..n-1 index.
data_standarizada = pd.DataFrame(
    values,
    columns=data_numeric.columns,
    index=data_numeric.index,
)
data_standarizada.head(1)

Unnamed: 0,Netflows,p1_d,p2_d,p3_d,duration,max_d,min_d,#packets,Avg_bps,Avg_pps,...,third_sp,first_dp,second_dp,third_dp,p1_ip,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib
0,-0.084787,-0.68541,-0.244709,-0.196223,0.361252,-0.132463,-0.625504,-0.317443,-0.359303,-0.336093,...,2.777318,-0.507971,-0.598943,-0.36157,-0.338855,-0.343648,-0.398552,-0.417145,-0.476095,-0.558694


### Concatenación de los conjuntos de datos


In [46]:
# Place the scaled numeric columns and the dummy columns side by side.
data_p = pd.concat(objs=[data_standarizada, dataDummy], axis='columns')
data_p.columns

Index(['Netflows', 'p1_d', 'p2_d', 'p3_d', 'duration', 'max_d', 'min_d',
       '#packets', 'Avg_bps', 'Avg_pps', 'Avg_bpp', '#Bytes', '#sp', '#dp',
       'first_sp', 'second_sp', 'third_sp', 'first_dp', 'second_dp',
       'third_dp', 'p1_ip', 'p2_ip', 'p3_ip', 'p1_ib', 'p2_ib', 'p3_ib',
       'First_Protocol_ICMP', 'First_Protocol_ICMP6', 'First_Protocol_TCP',
       'First_Protocol_UDP', 'Type_mine', 'Type_not_mine'],
      dtype='object')

In [47]:
# Type_mine stays in the frame, so its complement column is removed.
data_p = data_p.drop(columns='Type_not_mine')

In [48]:
# Write the processed frame to CSV without the index column.
# NOTE(review): the destination path is empty in this copy of the notebook; set
# it before running.
data_p.to_csv('', index=False)

# Datos No Puros

In [49]:
# Load the second dataset and discard its Name column straight away.
# NOTE(review): the path literal is empty in this copy of the notebook.
data_n = pd.read_csv('').drop(columns=['Name'])
data_n.head(n=1)

Unnamed: 0,Netflows,First_Protocol,Second_Protocol,Third_Protocol,p1_d,p2_d,p3_d,duration,max_d,min_d,...,first_dp,second_dp,third_dp,p1_ip,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib,Type
0,1564,TCP,UDP,ICMP,0.0,0.101,1.721,10701.93,177.706,0.0,...,53,443.0,80.0,1.0,2.0,16.0,40.0,115.5,1883.5,mine


In [50]:
# Remove the second/third protocol columns.
data_n = data_n.drop(columns=['Second_Protocol', 'Third_Protocol'])

## Identificar valores nulos

In [51]:
# Number of missing values in each column.
data_n.isna().sum()

Netflows              0
First_Protocol        0
p1_d                  0
p2_d                  0
p3_d                  0
duration              0
max_d                 0
min_d                 0
#packets              0
Avg_bps               0
Avg_pps               0
Avg_bpp               0
#Bytes                0
#sp                   0
#dp                   0
first_sp              0
second_sp          7495
third_sp          10953
first_dp              0
second_dp          1705
third_dp           7064
p1_ip                 0
p2_ip                 0
p3_ip                 0
p1_ib                 0
p2_ib                 0
p3_ib                 0
Type                  0
dtype: int64

## Imputar valores

In [52]:
# Collect the four columns that contain missing values into one frame.
punter = pd.concat(
    objs=[data_n[name] for name in ('second_sp', 'third_sp', 'second_dp', 'third_dp')],
    axis='columns',
)
# NaN entries are replaced by the median of their own column.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
values = imputer.fit_transform(punter)

In [53]:
# Turn the imputer's array output back into a labelled frame. The index of
# `punter` is reused so that index-aligned column assignments made from this
# frame match the source rows by label rather than by a regenerated 0..n-1 index.
values = pd.DataFrame(values, columns=punter.columns, index=punter.index)

In [54]:
# Overwrite each column in `data_n` with its imputed counterpart.
for column in ('second_sp', 'third_sp', 'second_dp', 'third_dp'):
    data_n[column] = values[column]

## OneHotEncoder

In [55]:
# Object-dtype (string) columns of `data_n`.
data_categoric = data_n.select_dtypes(include=['object'])
data_categoric.columns

Index(['First_Protocol', 'Type'], dtype='object')

In [56]:
# Fit a one-hot encoder on the categorical columns. No `drop` option is passed
# here, so every category keeps its own output column.
one_hot =  OneHotEncoder()
# The encoded sparse matrix is only displayed by this cell; it is not stored.
one_hot.fit_transform(data_categoric)

<37053x5 sparse matrix of type '<class 'numpy.float64'>'
	with 74106 stored elements in Compressed Sparse Row format>

In [57]:
# Dummy-encode the categorical columns with pandas; every category gets its own column.
dataDummy = pd.get_dummies(data=data_categoric)
dataDummy.head(n=5)

Unnamed: 0,First_Protocol_ICMP,First_Protocol_ICMP6,First_Protocol_TCP,First_Protocol_UDP,Type_mine
0,0,0,1,0,1
1,0,0,1,0,1
2,0,0,1,0,1
3,0,0,1,0,1
4,0,0,1,0,1


## Estandarización

In [58]:
# Numeric subset of `data_n`: the int64 and float64 columns.
data_numeric = data_n.select_dtypes(include=['int64', 'float64'])

In [59]:
# Column-wise preprocessing: standard-scale every numeric column; any column
# not listed would be passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('scale', StandardScaler(), data_numeric.columns)],
    remainder='passthrough',
)

In [60]:
# Fit the preprocessor on this dataset's numeric columns and transform them.
# NOTE(review): this fits a new scaler on `data_n` instead of reusing the one
# fitted on the first dataset — confirm that separate scaling is intended.
values = preprocessor.fit_transform(data_numeric)

In [61]:
# Wrap the scaled array back into a DataFrame with the original column names.
# The index of `data_numeric` is carried over so that later index-aligned
# operations (such as a column-wise concat) pair each scaled row with the row
# it came from, instead of relying on a regenerated 0..n-1 index.
data_estandarizada = pd.DataFrame(
    values,
    columns=data_numeric.columns,
    index=data_numeric.index,
)
data_estandarizada.head(1)

Unnamed: 0,Netflows,p1_d,p2_d,p3_d,duration,max_d,min_d,#packets,Avg_bps,Avg_pps,...,third_sp,first_dp,second_dp,third_dp,p1_ip,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib
0,4.553656,-0.037927,-0.055406,-0.142137,15.85042,-0.126105,-0.01812,43.732192,127.789591,32.184816,...,-2.089595,-3.320085,-0.610708,-0.918693,-0.140926,0.725482,5.594429,-0.046617,0.189553,3.916348


### Concatenación de los conjuntos de datos

In [62]:
# Combine the scaled numeric columns with the dummy columns.
# Note: this rebinds `data_n`, so cells that expect the raw columns (for
# example the Second_Protocol/Third_Protocol drop) fail if re-run afterwards.
data_n = pd.concat(objs=[data_estandarizada, dataDummy], axis='columns')
data_n.head(n=5)

Unnamed: 0,Netflows,p1_d,p2_d,p3_d,duration,max_d,min_d,#packets,Avg_bps,Avg_pps,...,p2_ip,p3_ip,p1_ib,p2_ib,p3_ib,First_Protocol_ICMP,First_Protocol_ICMP6,First_Protocol_TCP,First_Protocol_UDP,Type_mine
0,4.553656,-0.037927,-0.055406,-0.142137,15.85042,-0.126105,-0.01812,43.732192,127.789591,32.184816,...,0.725482,5.594429,-0.046617,0.189553,3.916348,0,0,1,0,1
1,150.05602,-0.013185,-0.053547,-0.17892,17.251858,-0.711131,-0.01812,146.492862,71.507445,165.641598,...,0.725482,0.096911,-0.046617,-0.035718,-0.233964,0,0,1,0,1
2,-0.083846,75.568495,34.015101,8.265983,0.330741,0.782195,91.815849,-0.015422,-0.003079,-0.015512,...,29.051386,13.448026,23.838681,25.276246,17.906751,0,0,1,0,1
3,-0.030404,-0.037927,0.321143,-0.004401,-0.676051,-0.920795,-0.01812,0.187019,7.298901,2.894156,...,10.31974,6.183449,0.01842,3.118086,16.694365,0,0,1,0,1
4,0.866219,-0.037927,0.11721,0.453844,9.373725,0.476526,-0.01812,4.984884,3.312031,1.536311,...,3.466698,3.63103,0.013882,1.049877,4.835751,0,0,1,0,1


In [64]:
# Write the processed frame to CSV without the index column.
# NOTE(review): the destination path is empty in this copy of the notebook; set
# it before running.
data_n.to_csv('', index = False)