# Library Import


In [3]:
import pandas as pd

In [None]:
import pandas as pd
import polars as pl 
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# DATA EXPLORATION AND ANALYSIS


##  Load the dataset

In [8]:
# Read the raw attack log (CSV path is relative to this notebook) into a DataFrame.
path = "../data/raw/cybersecurity_attacks.csv"
df = pd.read_csv(filepath_or_buffer=path)

## profile report

In [None]:

# Build the automated profiling report for the raw frame and embed it in the notebook.
profile = ProfileReport(
    df,
    title="Cyber Security Attack Type Detection",
)
profile.to_notebook_iframe()

## columns inspection


- The profile report indicates that there are three different kinds of attacks: DDoS, Malware and Intrusion. Furthermore, approximately half of the data has a malware indicator, and approximately half the data has an alert.
- The source IP address is a unique variable as is the destination IP address. 
- There are no correlations between the variables. 
- Alerts are triggered by the attack itself, so we can assume that this variable cannot be used to predict the attack type: alerts only arrive after the attack has been deployed.
- The same can be said for the action taken after the attack

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()


Unnamed: 0,Source Port,Destination Port,Packet Length,Anomaly Scores
count,40000.0,40000.0,40000.0,40000.0
mean,32970.35645,33150.86865,781.452725,50.113473
std,18560.425604,18574.668842,416.044192,28.853598
min,1027.0,1024.0,64.0,0.0
25%,16850.75,17094.75,420.0,25.15
50%,32856.0,33004.5,782.0,50.345
75%,48928.25,49287.0,1143.0,75.03
max,65530.0,65535.0,1500.0,100.0


- half of the entries have duplicated user information, but with different source and destination IP addresses (unique variables)


In [12]:
# Rows whose "User Information" value occurs more than once in the frame;
# keep=False flags every occurrence, not just the repeats. Show the first 5.
is_repeated_user = df["User Information"].duplicated(keep=False)
df.loc[is_repeated_user].head(5)


Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Proxy Information,Firewall Logs,IDS/IPS Alerts,Log Source
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",,Log Data,,Firewall
9,2021-08-15 22:29:04,114.109.149.113,160.88.194.172,37918,50039,UDP,224,Data,HTTP,Consequatur ipsum autem reprehenderit quae. Do...,...,Blocked,Medium,Mishti Chaudhuri,Mozilla/5.0 (Windows; U; Windows NT 6.0) Apple...,Segment A,"Rampur, Mizoram",87.128.245.244,,,Server
10,2022-07-20 13:28:50,177.21.83.200,196.218.124.169,35538,35006,ICMP,661,Data,HTTP,Sequi maxime voluptate ea. Eius officiis eaque...,...,Ignored,Medium,Hunar Sem,Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3 like M...,Segment B,"Gangtok, Haryana",29.161.99.247,,,Server
11,2022-06-26 15:15:50,92.4.25.171,112.43.185.24,10903,36817,TCP,281,Control,HTTP,Nihil praesentium asperiores omnis ullam liber...,...,Ignored,Low,Mehul Raj,Opera/8.38.(X11; Linux x86_64; pt-BR) Presto/2...,Segment B,"Nandyal, Mizoram",,,,Firewall
12,2020-09-30 21:35:31,57.91.207.84,98.96.110.38,53471,38048,ICMP,64,Control,DNS,Earum sit est et eaque ipsam. Vero repellendus...,...,Blocked,Medium,Vaibhav Kala,Opera/8.54.(Windows NT 6.0; tg-TJ) Presto/2.9....,Segment B,"Silchar, Kerala",,Log Data,Alert Data,Server


### Inspect action taken for attack type

In [None]:
# Plot the action taken for each attack type, one colour per action.
action_colours = ["red", "blue", "green"]

fig = px.bar(
    df,
    x="Attack Type",
    color="Action Taken",
    barmode="group",
    color_discrete_sequence=action_colours,
)
fig.show()

In [None]:
# One panel per attack type showing how often each action was taken.
attack_types = ["DDoS", "Malware", "Intrusion"]
fig, ax = plt.subplots(1, len(attack_types), figsize=(18, 5))

for axis, attack_type in zip(ax, attack_types):
    # value_counts() orders by frequency while unique() orders by first appearance,
    # so the labels must come from the counts' own index to match the bar heights.
    # sort_index() keeps the category order identical across the three panels.
    counts = (
        df.loc[df["Attack Type"] == attack_type, "Action Taken"]
        .value_counts()
        .sort_index()
    )
    bar_container = axis.bar(counts.index.astype(str), counts.values)
    axis.set_title(attack_type)
    axis.set_xlabel("Action Taken")
    axis.set_ylabel("Number of records")

In [None]:
# Number of rows recorded for each attack type
df["Attack Type"].value_counts()

### Inspect alerts triggered by attack type

In [None]:
# Tabulate the alerts triggered for each attack type.
# Series.count() ignores NaN, so it only counts rows where an alert was recorded;
# this cell must therefore run before the NaN values in "Alerts/Warnings" are filled.
# The message is assembled from per-type fragments rather than nesting double quotes
# inside a double-quoted f-string, which is a SyntaxError on Python < 3.12.
alert_fragments = []
for attack_type in ("DDoS", "Malware", "Intrusion"):
    is_attack = df["Attack Type"] == attack_type
    n_alerts = df.loc[is_attack, "Alerts/Warnings"].count()
    n_attacks = int(is_attack.sum())
    alert_fragments.append(f" {attack_type}: {n_alerts} for {n_attacks} attacks")

print("number of alerts triggered for each attack type:" + ",".join(alert_fragments))

- Half of the actions taken are not triggered by an alert/warning

In [None]:
# (number of actions taken on rows with no alert recorded, total number of actions taken)
no_alert = df["Alerts/Warnings"].isna()
df.loc[no_alert, "Action Taken"].count(), df["Action Taken"].count()

### Inspect protocol and attack type

In [None]:
# One panel per attack type showing how often each protocol was used.
attack_types = ["DDoS", "Malware", "Intrusion"]
fig, ax = plt.subplots(1, len(attack_types), figsize=(18, 5))

for axis, attack_type in zip(ax, attack_types):
    # value_counts() orders by frequency while unique() orders by first appearance,
    # so the labels must come from the counts' own index to match the bar heights.
    # sort_index() keeps the protocol order identical across the three panels.
    counts = (
        df.loc[df["Attack Type"] == attack_type, "Protocol"]
        .value_counts()
        .sort_index()
    )
    bar_container = axis.bar(counts.index.astype(str), counts.values)
    axis.set_title(attack_type)
    axis.set_xlabel("Protocol")
    axis.set_ylabel("Number of records")

### Inspect packets

In [None]:
# Distribution of packet length, one histogram per packet type.
packet_types = ("Control", "Data")
fig, ax = plt.subplots(1, len(packet_types), figsize=(16, 5))

for axis, packet_type in zip(ax, packet_types):
    lengths = df.loc[df["Packet Type"] == packet_type, "Packet Length"]
    axis.hist(lengths, bins=50)
    axis.set_title(f"packet type: {packet_type}")
    # Axis labels so each panel can be read on its own
    axis.set_xlabel("Packet Length")
    axis.set_ylabel("Number of packets")

# DATA CLEANING

## information about the table 

- It has 40,000 rows and 25 columns.
- Timestamp is a date/time column
- Several columns have missing values (e.g., Malware Indicators, Alerts/Warnings, Proxy Information,Firewall logs, IDS/IPS Alerts).
- Numerical columns (Packet Length, Anomaly Scores): numerical analyses can be performed on them
- string column (Source IP Address, Destination IP Address, Source Port, Destination Port, Payload Data, User information, Device Information, Geo-location Data )
- Some categorical features (e.g., Protocol, Packet Type, Traffic Type,Attack Type,Attack Signature,Action Taken,  Severity Level, Network Segment, Log Source) will require encoding for ML.
- Columns we will transform to categorical (Malware Indicators, Alerts/Warnings, Firewall Logs, IDS/IPS Alerts)
- Proxy Information is a string column, but it will be removed: it is not relevant for our prediction and more than half of its values are NaN

## Missing values

### Columns we need to add values

- Malware Indicator
- Alerts/Warnings
- FireWall Logs
- IDS/IPS Alerts



In [None]:
# In these columns a NaN means "nothing was recorded", so each NaN is replaced
# by an explicit placeholder label instead of being left missing.
missing_value_labels = {
    'Malware Indicators': 'No IoC Detected',
    'Alerts/Warnings': 'No Alert Triggered',
    'Firewall Logs': 'No Log Data',
    'IDS/IPS Alerts': 'No Alert Data',
}

for column_name, placeholder in missing_value_labels.items():
    df[column_name] = df[column_name].fillna(placeholder)

### Drop columns


- Proxy Information: this column is not really relevant for our model and has many NaN values


In [None]:
# Delete the Proxy Information column.
# errors="ignore" keeps the cell re-runnable: once the column is gone, running it
# again is a no-op instead of raising KeyError.
df = df.drop(columns="Proxy Information", errors="ignore")

### casting data type

In [15]:
# Column names, non-null counts and dtypes of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Timestamp               40000 non-null  object  
 1   Source IP Address       40000 non-null  object  
 2   Destination IP Address  40000 non-null  object  
 3   Source Port             40000 non-null  int64   
 4   Destination Port        40000 non-null  int64   
 5   Protocol                40000 non-null  category
 6   Packet Length           40000 non-null  int64   
 7   Packet Type             40000 non-null  category
 8   Traffic Type            40000 non-null  category
 9   Payload Data            40000 non-null  object  
 10  Malware Indicators      20000 non-null  category
 11  Anomaly Scores          40000 non-null  float64 
 13  Attack Type             40000 non-null  category
 14  Attack Signature        40000 non-null  category
 15  Action Taken          

In [14]:
# Target dtype for every column that gets cast; columns not listed keep their dtype.
column_dtypes_for_new = {
    "Protocol": "category",
    "Packet Length": int,
    "Packet Type": "category",
    "Traffic Type": "category",
    "Malware Indicators": "category",
    "Anomaly Scores": float,
    "Alerts/Warnings": "category",
    "Attack Type": "category",
    "Attack Signature": "category",
    "Action Taken": "category",
    "Severity Level": "category",
    "Network Segment": "category",
    "Firewall Logs": "category",
    "IDS/IPS Alerts": "category",
    "Log Source": "category",
    # "attack_Index": int
}

# Cast column by column; a plain `str` entry is mapped to pandas' dedicated
# "string" dtype, every other entry is passed to astype() unchanged.
for column_name, wanted_dtype in column_dtypes_for_new.items():
    target_dtype = "string" if wanted_dtype == str else wanted_dtype
    df[column_name] = df[column_name].astype(target_dtype)

<b> Data stored in folder data/processed </b>

-   40 000 rows * 24 columns 
