#### Import packages

In [None]:
import os
import re
import io

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import socket

from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer

from ipwhois import IPWhois

## Reads the dataset

#### Datasets directly from Netflow

In [None]:
filePath = "../../project_course_data/"
fileName = "owndata.txt"

# Read the Netflow text dump and normalise it before parsing: the "->"
# direction arrow is blanked out and a detached unit letter is glued onto the
# number before it (" K " -> "K ", likewise M and G), presumably so that each
# field ends up as a single whitespace-delimited token.
with open(filePath + fileName, "r") as f:
    content = f.read().replace("->", " ").replace(" K ", "K ").replace(" M ", "M ").replace(" G ", "G ")

csvStringIO = io.StringIO(content)
columnNames = ["Datetime", "Time", "Duration",  "Proto", "Src IP Addr:Port", "Dst IP Addr:Port", "Packets", "Bytes", "Flows"]

# The separator is a regular expression, so it is written as a raw string:
# "\s" is not a valid Python string escape.
data = pd.read_csv(csvStringIO, sep=r'\s+', names=columnNames, header=None, usecols=range(len(columnNames)), parse_dates=True, engine="python")

# Removes the summary lines and col names. The explicit copy makes `data` an
# independent frame, so the column writes below never target a slice.
data = data.iloc[1:-4].copy()

# Merge the date token and the time-of-day token into one string column
data["Datetime"] = data["Datetime"] + " " + data["Time"]
data = data.drop(columns=["Time"])


## SORTING AND SPLITTING AND COMBINING

In [None]:
 # Convert the datetime column to pandas datetime format
# Parse the timestamps, then order the flows chronologically
data = data.assign(Datetime=pd.to_datetime(data['Datetime'])).sort_values(by='Datetime')

# Each endpoint is stored as "<address>:<port>"; break it apart at the first
# colon into an address column and a port column
endpoint_columns = [
    ('Src IP Addr:Port', ['Src IP Addr', 'Src Port']),
    ('Dst IP Addr:Port', ['Dst IP Addr', 'Dst Port']),
]
for combined_column, split_columns in endpoint_columns:
    data[split_columns] = data[combined_column].str.split(':', n=1, expand=True)

# The combined columns are no longer needed
data = data.drop(columns=[combined_column for combined_column, _ in endpoint_columns])

print(data)

#### Preprocess data

In [None]:
# Changes Duration column to float
data['Duration'] = data['Duration'].astype(float)
# Drop outliers: every flow whose duration exceeds 10000 is discarded
data = data[~(data['Duration'] > 10000)]

# Find clients IP address: the first three octets of the most frequent
# source address are taken as the client network
client = data['Src IP Addr'].value_counts().idxmax().split('.')
client = '.'.join(client[:3])

# Match the client network as an anchored prefix including the trailing dot.
# A plain substring test would also accept e.g. "192.168.10.7" or
# "10.192.168.1" for the client network "192.168.1".
client_prefix = client + '.'
src_is_client = data['Src IP Addr'].str.startswith(client_prefix, na=False)
dst_is_client = data['Dst IP Addr'].str.startswith(client_prefix, na=False)

# Removes all internal flows (both endpoints inside the client network)
keep = ~(src_is_client & dst_is_client)
data = data[keep].copy()
dst_is_client = dst_is_client[keep]

# 'Host IP' is the non-client side of the flow, 'Client IP' the other side:
# when the destination is the client the host is the source, otherwise the
# host is the destination.
data['Host IP'] = data['Src IP Addr'].where(dst_is_client, data['Dst IP Addr'])
data['Client IP'] = data['Dst IP Addr'].where(dst_is_client, data['Src IP Addr'])

data = data.reset_index(drop=True)
print(data)

#### Prefix processing

In [None]:
def convert_bytes(value):
    """Convert a byte count with an optional K/M/G suffix into a plain float.

    Args:
        value: Byte count such as ``"512"``, ``"5K"`` or ``"1.5M"``. Values
            that are not strings are converted with ``str()`` first.

    Returns:
        The byte count as a float, scaled with decimal multipliers
        (K = 1000, M = 1000000, G = 1000000000). Anything following the
        number that is not a K, M or G suffix is ignored and the number is
        returned unscaled.

    Raises:
        ValueError: If ``value`` does not start with a number and cannot be
            parsed by ``float()`` either.
    """
    multipliers = {'K': 1000, 'M': 1000000, 'G': 1000000000}

    text = str(value).strip()

    # The fractional part has to be captured together with the integer part:
    # splitting on digit runs alone reads "1.5M" as the number 1 followed by
    # the suffix ".", which silently loses both the fraction and the unit.
    match = re.match(r'(\d+(?:\.\d+)?)\s*([KMG]?)', text)
    if match is None:
        # No leading number; let float() accept or reject the text
        return float(text)

    number, suffix = match.groups()
    return float(number) * multipliers.get(suffix, 1)

# Run every entry of the 'Bytes' column through convert_bytes so the column
# holds plain numbers instead of suffixed text
data['Bytes'] = data['Bytes'].map(convert_bytes)
print(data)

#### Reverse DNS-lookup

In [None]:
# Find all unique addresses
unique_ip = data['Host IP'].unique()

# Reverse DNS lookup for every unique host address; addresses that cannot be
# resolved get None as host name
host = []
for i, ip in enumerate(unique_ip):
    try:
        host.append(socket.gethostbyaddr(ip)[0])
    except (socket.herror, socket.gaierror):
        # herror: no reverse record for the address.
        # gaierror: the address string itself could not be interpreted.
        # Either way one bad address must not abort the whole run.
        host.append(None)

    if i % 100 == 0: # Used to keep track how far along we've come
        print(f"{i} / {len(unique_ip)}")

# Pairs every IP address with its domain name (or None)
data_DNS = pd.DataFrame({'IP': unique_ip, 'Host': host})
# data_DNS.to_csv('./host_names', index=False)

print(data_DNS)

In [None]:
# Lookup table from IP address to resolved host name
DNS_dict = dict(zip(data_DNS["IP"], data_DNS["Host"]))

print(DNS_dict)


# Attach the host name belonging to each flow's 'Host IP'
dataWithDomains = data.copy()
dataWithDomains["Domain Name"] = [DNS_dict[ip] for ip in dataWithDomains["Host IP"]]

print(dataWithDomains)


#### Changes Protocol into feature-columns

In [None]:
# One indicator column (0/1) per distinct protocol value
dummies = pd.get_dummies(dataWithDomains['Proto']).astype(int)

# Replace the raw 'Proto' column with its indicator columns, then fill every
# remaining missing value with the string '0'
dataWithDummies = pd.concat(
    [dataWithDomains.drop(columns=['Proto']), dummies],
    axis=1,
).fillna('0')

print(dataWithDummies)

### Change PORT into feature columns

In [None]:
# The host port is the port on whichever side of the flow carries the host
# IP: the source port when the host is the source, the destination port
# otherwise
host_is_source = dataWithDummies['Host IP'] == dataWithDummies['Src IP Addr']
dataWithDummies['Host Port'] = np.where(
    host_is_source,
    dataWithDummies['Src Port'],
    dataWithDummies['Dst Port'],
)

# One indicator column (0/1) per distinct host port
dummies = pd.get_dummies(dataWithDummies["Host Port"]).astype(int)

# Append the indicator columns and fill missing values with the string '0'
dataWithDummies = pd.concat([dataWithDummies, dummies], axis=1).fillna('0')

print(dataWithDummies)
        

### BAG OF WORDS FOR THE DOMAIN NAMES

In [None]:
dataWithBOW = dataWithDummies.copy()

# Binary bag of words: a token column is 1 when the token occurs in the
# flow's domain name, 0 otherwise
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(dataWithBOW['Domain Name'])

# Dense frame with one column per token
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Tokens that consist of digits only are discarded
numeric_columns = df_bow.columns[df_bow.columns.str.isnumeric()]
df_bow = df_bow.drop(columns=numeric_columns)

# Give both frames a fresh 0..n-1 index so they line up row by row, then
# place the token columns next to the existing features
dataWithBOW = pd.concat(
    [dataWithBOW.reset_index(drop=True), df_bow.reset_index(drop=True)],
    axis=1,
)

### WHOIS (RDAP) LOOKUP

In [None]:
# WHOIS FETCH FUNCTION
def get_ip_info(ip):
    """Fetch the network name and ASN country code of an IP address.

    Performs an RDAP lookup through ``ipwhois``. Besides the two values
    returned here, the lookup result also offers the keys 'query', 'asn',
    'asn_cidr' and 'asn_description'.

    Args:
        ip: The IP address to look up.

    Returns:
        A dict with the keys "name" (the result's network name) and
        "country" (the result's ASN country code).
    """
    rdap_result = IPWhois(ip).lookup_rdap()

    network_name = rdap_result['network']['name']
    country_code = rdap_result['asn_country_code']

    return {"name": network_name, "country": country_code}

In [None]:
dataWithWhoIs = dataWithBOW.copy()

sz = len(dataWithWhoIs)
memo = {}            # successful WHOIS results, keyed by host IP
failed_ips = set()   # host IPs whose lookup raised; they are not retried
missing_info = {"name": np.nan, "country": np.nan}  # default option

## WHOIS LOOKUP OF EVERY ROW'S HOST IP (ONE NETWORK REQUEST PER DISTINCT IP)
ipwhois_names = []
ipwhois_countries = []
for t, host_IP in enumerate(dataWithWhoIs["Host IP"], start=1):

    if host_IP in memo: # IP ALREADY CHECKED
        whoIsResult = memo[host_IP]
    elif host_IP in failed_ips: # IP ALREADY FAILED
        # Remembering the failure avoids repeating the same failing network
        # request for every further row that shares this IP.
        whoIsResult = missing_info
    else: # NEW IP
        print("not in")
        try:
            # fetch ipwhois info
            whoIsResult = get_ip_info(host_IP)
        except Exception:
            # Any lookup error falls back to the default. Exception (rather
            # than a bare except) keeps Ctrl-C able to stop the loop.
            failed_ips.add(host_IP)
            whoIsResult = missing_info
        else:
            # save for later
            memo[host_IP] = whoIsResult

    # collect info
    ipwhois_names.append(whoIsResult["name"])
    ipwhois_countries.append(whoIsResult["country"])

    print(f"{t} / {sz}")

# Write both columns in one go instead of cell by cell
dataWithWhoIs["ipwhois_name"] = ipwhois_names
dataWithWhoIs["ipwhois_country"] = ipwhois_countries

In [None]:
# MAKE DUMMIES FROM WHOIS DATA

# One indicator column (0/1) per distinct network name and per country code
dummies_name = pd.get_dummies(dataWithWhoIs["ipwhois_name"]).astype(int)
dummies_country = pd.get_dummies(dataWithWhoIs["ipwhois_country"]).astype(int)

# Swap the two raw WHOIS columns for their indicator columns (names first,
# then countries) and fill every remaining missing value with 0
dataWithWhoIs = pd.concat(
    [
        dataWithWhoIs.drop(columns=["ipwhois_name", "ipwhois_country"]),
        dummies_name,
        dummies_country,
    ],
    axis=1,
).fillna(0)

### SAVE TO FILE

In [None]:
# Write the finished feature table next to the input data as a
# tab-separated file without the index column
newFileName = "preprocessedData.csv"
outputPath = filePath + newFileName
dataWithWhoIs.to_csv(outputPath, index=False, sep="\t")