In [1]:
import pandas as pd
import numpy as np

# Lectura de los datos

**Cargamos el dataset UNSW desde el fichero CSV**

In [2]:
# Load the raw UNSW capture: the first line of the file is skipped and the
# column labels are the integers 1..49.
df_UNSW = pd.read_csv(
    'UNSW.csv',
    skiprows=1,
    names=np.arange(1, 50),
)

In [3]:
# Preview the first five rows of the raw frame (head() defaults to n=5).
df_UNSW.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,40,41,42,43,44,45,46,47,48,49
0,59.166.0.9,7045,149.171.126.7,25,tcp,FIN,0.201886,37552,3380,31,...,,2,2,7,4,1,1,3,,0
1,59.166.0.9,9685,149.171.126.2,80,tcp,FIN,5.864748,19410,1087890,31,...,,3,1,4,4,1,1,1,,0
2,59.166.0.2,1421,149.171.126.4,53,udp,CON,0.001391,146,178,31,...,,3,5,2,7,1,1,4,,0
3,59.166.0.2,21553,149.171.126.2,25,tcp,FIN,0.053948,37812,3380,31,...,,1,1,4,7,1,1,3,,0
4,59.166.0.8,45212,149.171.126.4,53,udp,CON,0.000953,146,178,31,...,,2,5,2,1,1,1,2,,0


**Nos quedamos las siguientes cinco columnas: Source IP, Source Port, Destination IP, Destination Port y class. Además, normalizamos los valores de la columna class a 'Attack' y 'Normal'**

In [4]:
# Keep only the endpoint columns (1-4) and the label column (49). The explicit
# .copy() detaches the result from the raw frame, so the assignments below act
# on an independent frame instead of a slice of the original.
df_UNSW = df_UNSW[[1, 2, 3, 4, 49]].copy()
df_UNSW.columns = ['Src IP Addr', 'Src Pt', 'Dst IP Addr', 'Dst Pt', 'class']
# Normalise the label values to 'Normal' / 'Attack'. The replaced Series is
# assigned back to the frame: calling replace(..., inplace=True) on the
# temporary returned by .loc[:, 'class'] is a chained in-place update, which
# does not reach the frame when pandas Copy-on-Write is active.
df_UNSW['class'] = (
    df_UNSW['class']
    .replace(['normal', 0], 'Normal')
    .replace(['attacker', 1], 'Attack')
)

**Comprobamos que la columna class solo contiene los valores 'Normal' y 'Attack'**

In [5]:
# Print the distinct labels present in the 'class' column.
print(df_UNSW.loc[:, 'class'].unique())

['Normal' 'Attack']


**Funciones que calculan las nuevas columnas: Same_ip_src, Same_ip_dst, Same_ip_src_port_dst, Same_ip_dst_port_src**

In [6]:
def valuesNewColumns(source, destination, sourceP, destinationP):
    """Look up the four count values for a single record.

    The lookups read the module-level tables ``ip_src``, ``ip_dst``,
    ``ip_src_port_dst`` and ``ip_dst_port_src``, which must exist before this
    function is called.

    Args:
        source: key looked up in ``ip_src``; first key of the
            ``ip_src_port_dst`` lookup.
        destination: key looked up in ``ip_dst``; first key of the
            ``ip_dst_port_src`` lookup.
        sourceP: second key of the ``ip_dst_port_src`` lookup.
        destinationP: second key of the ``ip_src_port_dst`` lookup.

    Returns:
        A 4-tuple: ``ip_src[source]``, ``ip_dst[destination]``, the
        ``ip_src_port_dst`` entry for (source, destinationP) and the
        ``ip_dst_port_src`` entry for (destination, sourceP).
    """
    by_src = ip_src[source]
    by_dst = ip_dst[destination]
    # Each two-level table is addressed with a (ip, port) key pair.
    by_src_and_dst_port = ip_src_port_dst.loc[(source, destinationP)]
    by_dst_and_src_port = ip_dst_port_src.loc[(destination, sourceP)]
    return by_src, by_dst, by_src_and_dst_port, by_dst_and_src_port

def calculoNewsColumns(dataframe):
    """Add the four count columns to ``dataframe`` in place.

    For every row, ``valuesNewColumns`` is called with the row's
    'Src IP Addr', 'Dst IP Addr', 'Src Pt' and 'Dst Pt' values, and the four
    values it returns are stored in 'Same_ip_src', 'Same_ip_dst',
    'Same_ip_src_port_dst' and 'Same_ip_dst_port_src' (in that order).

    Args:
        dataframe: pandas DataFrame containing the four address/port columns
            named above. It is modified in place.

    Returns:
        None.
    """
    new_columns = ['Same_ip_src', 'Same_ip_dst', 'Same_ip_src_port_dst', 'Same_ip_dst_port_src']
    # Walk the four input columns in lockstep rather than using
    # DataFrame.apply(axis=1), which materialises a full Series for every row.
    counts = [
        valuesNewColumns(src_ip, dst_ip, src_port, dst_port)
        for src_ip, dst_ip, src_port, dst_port in zip(
            dataframe['Src IP Addr'],
            dataframe['Dst IP Addr'],
            dataframe['Src Pt'],
            dataframe['Dst Pt'],
        )
    ]
    # Naming the columns explicitly keeps the result four columns wide even
    # when the input frame has no rows.
    counts_frame = pd.DataFrame(counts, columns=new_columns)
    for column in new_columns:
        # .values assigns by position, so the frame's index labels play no part.
        dataframe[column] = counts_frame[column].values
    

In [7]:
# Lookup tables read by valuesNewColumns.
# Row count per source IP and per destination IP.
ip_src = df_UNSW.groupby('Src IP Addr').size()
ip_dst = df_UNSW.groupby('Dst IP Addr').size()
# Row count per (source IP, destination port) and per (destination IP, source port) pair.
ip_src_port_dst = df_UNSW.groupby(['Src IP Addr', 'Dst Pt']).size()
ip_dst_port_src = df_UNSW.groupby(['Dst IP Addr', 'Src Pt']).size()
# Append the four Same_* columns to df_UNSW (in place).
calculoNewsColumns(df_UNSW)

**Función que nos permite tener la clase (o tag) de tipo de traza en valor numérico (0:Normal, 1:Ataque)**

In [8]:
def create_class_numeric(dataframe):
    """Add a 'class_numeric' column that encodes 'class' as 0 / 1.

    'Normal' becomes 0 and 'Attack' becomes 1; any other value found in
    'class' is carried over unchanged. The 'class' column itself is not
    modified.

    Args:
        dataframe: pandas DataFrame with a 'class' column. The new column is
            added to it in place.

    Returns:
        None.
    """
    # Build the encoded Series and assign it back to the frame. Calling
    # replace(..., inplace=True) on the temporary returned by
    # .loc[:, 'class_numeric'] is a chained in-place update, which does not
    # reach the frame when pandas Copy-on-Write is active.
    encoded = dataframe['class'].replace(['Normal'], 0).replace(['Attack'], 1)
    # infer_objects() gives an all-integer result a real integer dtype rather
    # than leaving it as generic Python objects.
    dataframe['class_numeric'] = encoded.infer_objects()

In [9]:
# Add the 0/1 'class_numeric' column to df_UNSW (the frame is modified in place).
create_class_numeric(df_UNSW)

In [10]:
# Preview the first five rows of the processed frame (head() defaults to n=5).
df_UNSW.head()

Unnamed: 0,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,class,Same_ip_src,Same_ip_dst,Same_ip_src_port_dst,Same_ip_dst_port_src,class_numeric
0,59.166.0.9,7045,149.171.126.7,25,Normal,26398,26075,1120,1,0
1,59.166.0.9,9685,149.171.126.2,80,Normal,26398,26990,2705,2,0
2,59.166.0.2,1421,149.171.126.4,53,Normal,27050,27120,5249,1,0
3,59.166.0.2,21553,149.171.126.2,25,Normal,27050,26990,1138,1,0
4,59.166.0.8,45212,149.171.126.4,53,Normal,26024,27120,5072,1,0


In [11]:
# Persist the processed frame; the row index is written as the first column.
df_UNSW.to_csv(path_or_buf='UNSW_trabajados.csv', index=True)