In [2]:
import pandas as pd
import numpy as np
import json

# Lectura de los datos

**Cargamos los datasets**

In [3]:
# Load the ISCX JSON file. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale (JSON text is UTF-8).
with open('ISCX.json', 'r', encoding='utf-8') as file:
    ISCX_data = json.load(file)
# The records are nested under dataroot -> TestbedSunJun13Flows.
df_ISCX = pd.DataFrame(data=ISCX_data['dataroot']['TestbedSunJun13Flows'])

In [4]:
# Preview the first rows of the loaded frame (head() shows five by default).
df_ISCX.head()

Unnamed: 0,Tag,appName,destination,destinationPayloadAsBase64,destinationPayloadAsUTF,destinationPort,destinationTCPFlagsDescription,direction,protocolName,source,sourcePayloadAsBase64,sourcePayloadAsUTF,sourcePort,sourceTCPFlagsDescription,startDateTime,stopDateTime,totalDestinationBytes,totalDestinationPackets,totalSourceBytes,totalSourcePackets
0,Normal,Unknown_UDP,224.0.0.251,,,5353,,L2R,udp_ip,192.168.5.122,,,5353,,2010-06-12T23:57:24,2010-06-13T09:24:52,0,0,2633658,28971
1,Normal,Unknown_UDP,224.0.0.251,,,5353,,L2R,udp_ip,192.168.5.122,,,5353,,2010-06-12T23:57:24,2010-06-13T09:24:52,0,0,2633658,28971
2,Normal,HTTPWeb,192.168.5.122,,,80,R,L2L,tcp_ip,192.168.2.113,,,4191,"F,A",2010-06-12T23:57:38,2010-06-12T23:59:20,128,2,64,1
3,Normal,HTTPWeb,192.168.5.122,,,80,R,L2L,tcp_ip,192.168.2.113,,,4191,"F,A",2010-06-12T23:57:38,2010-06-12T23:59:20,128,2,64,1
4,Normal,HTTPWeb,207.241.148.80,,,80,"F,A",L2R,tcp_ip,192.168.2.113,,,4192,"F,A",2010-06-12T23:57:40,2010-06-12T23:59:20,64,1,128,2


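**Nos quedamos con las siguientes cinco columnas y las renombramos: Source IP (`Src IP Addr`), Source Port (`Src Pt`), Destination IP (`Dst IP Addr`), Destination Port (`Dst Pt`) y class (antes `Tag`). Los valores de la columna class deben ser 'Attack' y 'Normal'; lo comprobamos en la celda siguiente.**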

In [5]:
# Keep only the two endpoints (address + port) and the label, and rename
# them in the same step.
df_ISCX = df_ISCX[['source', 'sourcePort', 'destination', 'destinationPort', 'Tag']].rename(
    columns={
        'source': 'Src IP Addr',
        'sourcePort': 'Src Pt',
        'destination': 'Dst IP Addr',
        'destinationPort': 'Dst Pt',
        'Tag': 'class',
    }
)

**Comprobamos que la columna class solo contiene los valores 'Normal' y 'Attack'**

In [6]:
# Print the distinct labels found in the 'class' column.
print(pd.unique(df_ISCX['class']))

['Normal' 'Attack']


**Funciones que calculan las nuevas columnas: Same_ip_src, Same_ip_dst, Same_ip_src_port_dst, Same_ip_dst_port_src**

In [7]:
def valuesNewColumns(source, destination, sourceP, destinationP):
    """Look up the four occurrence counts for one flow.

    Reads the module-level count tables ``ip_src``, ``ip_dst``,
    ``ip_src_port_dst`` and ``ip_dst_port_src``; they must be defined
    before this function is called.

    Args:
        source: Source IP address of the flow.
        destination: Destination IP address of the flow.
        sourceP: Source port of the flow.
        destinationP: Destination port of the flow.

    Returns:
        Tuple of the entries found for ``source`` in ``ip_src``,
        ``destination`` in ``ip_dst``, ``(source, destinationP)`` in
        ``ip_src_port_dst`` and ``(destination, sourceP)`` in
        ``ip_dst_port_src``, in that order.
    """
    same_src = ip_src[source]
    same_dst = ip_dst[destination]
    same_src_dst_port = ip_src_port_dst.loc[source, destinationP]
    same_dst_src_port = ip_dst_port_src.loc[destination, sourceP]
    return same_src, same_dst, same_src_dst_port, same_dst_src_port

def calculoNewsColumns(dataframe):
    """Add the four occurrence-count columns to ``dataframe`` in place.

    Each new column holds, for every row, the number of rows in
    ``dataframe`` that share the same key:

    * ``Same_ip_src``: same ``Src IP Addr``
    * ``Same_ip_dst``: same ``Dst IP Addr``
    * ``Same_ip_src_port_dst``: same ``Src IP Addr`` and ``Dst Pt``
    * ``Same_ip_dst_port_src``: same ``Dst IP Addr`` and ``Src Pt``

    The counts are computed from ``dataframe`` itself with
    ``groupby(...).transform('size')``. This avoids depending on count
    tables prepared in module scope and replaces the row-by-row ``apply``
    with vectorised group operations.

    Args:
        dataframe: DataFrame containing the columns ``Src IP Addr``,
            ``Src Pt``, ``Dst IP Addr`` and ``Dst Pt``.

    Returns:
        None. The columns are appended to (or overwritten in) ``dataframe``.
    """
    # New column -> key columns whose combined value is counted.
    # Dict order fixes the order in which the columns are appended.
    count_keys = {
        'Same_ip_src': ['Src IP Addr'],
        'Same_ip_dst': ['Dst IP Addr'],
        'Same_ip_src_port_dst': ['Src IP Addr', 'Dst Pt'],
        'Same_ip_dst_port_src': ['Dst IP Addr', 'Src Pt'],
    }
    for column, keys in count_keys.items():
        # transform('size') broadcasts each group's row count back to its rows.
        dataframe[column] = dataframe.groupby(keys)[keys[0]].transform('size')
    

In [8]:
# Count tables: number of rows per key, indexed by that key.
ip_src = df_ISCX.groupby('Src IP Addr').size()
ip_dst = df_ISCX.groupby('Dst IP Addr').size()
# Two-level keys: (address, port at the opposite end of the flow).
ip_src_port_dst = df_ISCX.groupby(['Src IP Addr', 'Dst Pt']).size()
ip_dst_port_src = df_ISCX.groupby(['Dst IP Addr', 'Src Pt']).size()

# Append the four Same_* columns to df_ISCX.
calculoNewsColumns(df_ISCX)

**Función que convierte la clase (o tag) de cada traza a un valor numérico (0: Normal, 1: Attack)**

In [9]:
def create_class_numeric(dataframe):
    """Add a ``class_numeric`` column encoding ``class`` as 0/1, in place.

    ``'Normal'`` is encoded as 0 and ``'Attack'`` as 1. Any other value in
    ``class`` is copied to ``class_numeric`` unchanged.

    The column is built and assigned in a single step rather than by
    calling ``replace(..., inplace=True)`` on the object returned by
    ``.loc[...]``: that chained in-place update acts on a temporary under
    pandas Copy-on-Write and would leave the column unencoded.

    Args:
        dataframe: DataFrame with a ``class`` column.

    Returns:
        None. ``dataframe`` gains (or has overwritten) ``class_numeric``.
    """
    codes = {'Normal': 0, 'Attack': 1}
    # Fall back to the original label so unknown values pass through as-is.
    dataframe['class_numeric'] = dataframe['class'].map(
        lambda label: codes.get(label, label)
    )

In [10]:
# Add the numeric label column (class_numeric) to df_ISCX in place.
create_class_numeric(df_ISCX)

In [11]:
# Preview the processed frame with the added columns (first five rows).
df_ISCX.head()

Unnamed: 0,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,class,Same_ip_src,Same_ip_dst,Same_ip_src_port_dst,Same_ip_dst_port_src,class_numeric
0,192.168.5.122,5353,224.0.0.251,5353,Normal,37698,16,14,16,0
1,192.168.5.122,5353,224.0.0.251,5353,Normal,37698,16,14,16,0
2,192.168.2.113,4191,192.168.5.122,80,Normal,3026,27078,988,12,0
3,192.168.2.113,4191,192.168.5.122,80,Normal,3026,27078,988,12,0
4,192.168.2.113,4192,207.241.148.80,80,Normal,3026,154,988,2,0


In [12]:
# Save the processed frame. to_csv is called with its defaults, so the row
# index is written as the first (unnamed) column of the file.
df_ISCX.to_csv('ISCX_trabajados.csv')