# clean data 

## importing library

In [2]:
import pandas as pd


In [3]:
# load the csv file 
df = pd.read_csv(r"C:\Users\Vivian\Documents\Project 1\Project 1\cybersecurity_attacks.csv")

## display information 

In [6]:
df.head(5)



In [7]:
df.describe()



In [8]:
df.info()



## information about the table 

- It has 40,000 rows and 25 columns.
- TimeStamps is a data type column 
- Several columns have missing values (e.g., Malware Indicators, Alerts/Warnings, Proxy Information,Firewall logs, IDS/IPS Alerts).
- numerical column ( Packet Length, Anamaly Scores) numerical analyses can be made on it 
- string column (Source IP Address, Destination IP Address, Source Port, Destination Port, Payload Data, User information, Device Information, Geo-location Data )
- Some categorical features (e.g., Protocol, Packet Type, Traffic Type,Attack Type,Attack Signature,Action Taken,  Severity Level, Network Segment, Log Source) will require encoding for ML.
- columns we will transform to categorial (Malware Indicators, Alerts/warnings, Firewall logs, IDS/IPS Alerts) 
- Proxy Information is a string column , but will be removed not relevant for our prediction and has many NaN value more than half of our set  

## missing column values

### Malware Indicator


In [4]:
## ading a value "not ioc detected" to the Malware Indicators column where the value is NaN

df['Malware Indicators'] = df['Malware Indicators'].fillna('No IoC Detected')

### Alerts/Warnings

In [5]:
# adding a value "No Alert triggered" to the Alerts/Warnings column where the value is NaN

df['Alerts/Warnings'] = df['Alerts/Warnings'].fillna('No Alert Triggered')

### Proxy Information 
     This column is not really relevant for our model and has many nan 

In [6]:
# Delete Proxy Information column

df = df.drop('Proxy Information', axis=1)

### FireWall Logs


In [7]:
# adding a value "No Log Data" to the Firewall logs column where the value is NaN

df['Firewall Logs'] = df['Firewall Logs'].fillna('No Log Data')

### IDS/IPS Alerts 

In [8]:
# adding a value "No Alert Data" to the IDS/IPS Alerts column where the value is NaN

df['IDS/IPS Alerts'] = df['IDS/IPS Alerts'].fillna('No Alert Data')

In [9]:
## we verify that the changes have been made

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               40000 non-null  object 
 1   Source IP Address       40000 non-null  object 
 2   Destination IP Address  40000 non-null  object 
 3   Source Port             40000 non-null  int64  
 4   Destination Port        40000 non-null  int64  
 5   Protocol                40000 non-null  object 
 6   Packet Length           40000 non-null  int64  
 7   Packet Type             40000 non-null  object 
 8   Traffic Type            40000 non-null  object 
 9   Payload Data            40000 non-null  object 
 10  Malware Indicators      40000 non-null  object 
 11  Anomaly Scores          40000 non-null  float64
 13  Attack Type             40000 non-null  object 
 14  Attack Signature        40000 non-null  object 
 15  Action Taken            40000 non-null

## Data conversion 

### timestamps column 


In [10]:
## timestamp column is not in the right format, we will convert it to datetime format, done separately for pandas 

df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [18]:
df.info()



In [11]:
# Dictionary defining column names and their data types
column_dtypes = {   
    "Source IP Address":'string',
    "Destination IP Address": 'string',
    "Source Port": 'string',
    "Destination Port": 'string',
    "Protocol": "category",
    "Packet Length": int,
    "Packet Type": "category",
    "Traffic Type": "category",
    "Payload Data": 'string',
    "Malware Indicators": "category",
    "Anomaly Scores": float,
    "Alerts/Warnings": "category",
    "Attack Type": "category",
    "Attack Signature": "category",
    "Action Taken": "category",
    "Severity Level": "category",
    "User Information": 'string',
    "Device Information": 'string',
    "Network Segment": "category",
    "Geo-location Data": 'string',  
    "Firewall Logs": "category",
    "IDS/IPS Alerts": "category",
    "Log Source": "category"
}

# Convert all other columns based on dictionary
for col, dtype in column_dtypes.items():
    if dtype == str:
        df[col] = df[col].astype("string")  # Explicitly enforce str type
    else:
        df[col] = df[col].astype(dtype)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Timestamp               40000 non-null  datetime64[ns]
 1   Source IP Address       40000 non-null  string        
 2   Destination IP Address  40000 non-null  string        
 3   Source Port             40000 non-null  string        
 4   Destination Port        40000 non-null  string        
 5   Protocol                40000 non-null  category      
 6   Packet Length           40000 non-null  int32         
 7   Packet Type             40000 non-null  category      
 8   Traffic Type            40000 non-null  category      
 9   Payload Data            40000 non-null  string        
 10  Malware Indicators      40000 non-null  category      
 11  Anomaly Scores          40000 non-null  float64       
 13  Attack Type             40000 non-null  catego

## storing the cleaned data frame in csv 

In [13]:
## we will now store the cleaned data in a new csv file

df.to_csv("cleaned_cybersecurity_attacks.csv", index=False)

# Outliers 

## graphs to detect some outliers 

# Feature Eng

## import the cleaned csv 

In [14]:
df_cleaned = pd.read_csv("cleaned_cybersecurity_attacks.csv")
df_cleaned.head(5)

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Attack Signature,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Firewall Logs,IDS/IPS Alerts,Log Source
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,Known Pattern B,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",Log Data,No Alert Data,Server
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Known Pattern A,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",Log Data,No Alert Data,Firewall
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,Known Pattern B,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",Log Data,Alert Data,Firewall
3,2023-07-02 10:38:46,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,Known Pattern B,Blocked,Medium,Fateh Kibe,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ...,Segment B,"Jaunpur, Rajasthan",No Log Data,Alert Data,Firewall
4,2023-07-16 13:11:07,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,Known Pattern B,Blocked,Low,Dhanush Chad,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...,Segment C,"Anantapur, Tripura",No Log Data,Alert Data,Firewall


In [15]:
df_cleaned_copy = df_cleaned.copy()
df_cleaned_copy.head(5)

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Attack Signature,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Firewall Logs,IDS/IPS Alerts,Log Source
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,Known Pattern B,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",Log Data,No Alert Data,Server
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Known Pattern A,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",Log Data,No Alert Data,Firewall
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,Known Pattern B,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",Log Data,Alert Data,Firewall
3,2023-07-02 10:38:46,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,Known Pattern B,Blocked,Medium,Fateh Kibe,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ...,Segment B,"Jaunpur, Rajasthan",No Log Data,Alert Data,Firewall
4,2023-07-16 13:11:07,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,Known Pattern B,Blocked,Low,Dhanush Chad,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...,Segment C,"Anantapur, Tripura",No Log Data,Alert Data,Firewall


add a column attack_id

In [16]:
df_cleaned_copy['attack_Index'] = df_cleaned_copy.index
df_cleaned_copy.head(5)

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Firewall Logs,IDS/IPS Alerts,Log Source,attack_Index
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",Log Data,No Alert Data,Server,0
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",Log Data,No Alert Data,Firewall,1
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",Log Data,Alert Data,Firewall,2
3,2023-07-02 10:38:46,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,Blocked,Medium,Fateh Kibe,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ...,Segment B,"Jaunpur, Rajasthan",No Log Data,Alert Data,Firewall,3
4,2023-07-16 13:11:07,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,Blocked,Low,Dhanush Chad,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...,Segment C,"Anantapur, Tripura",No Log Data,Alert Data,Firewall,4


for our FE, to feed in numerical data to our model we will need to drop some non numerical columns like :
- Timestamp
- Source IP Address
- Destination Address
- Source Port 
- Destination Port
- Pay Load 
- User Information
- Device Information
- Geo-location Data


In [17]:
def drop_columns(df):
    columns_to_drop = ["Timestamp", "Source IP Address", "Destination IP Address", "Source Port", "Destination Port", "Payload Data", "User Information", "Device Information", "Geo-location Data"]
    df = df.drop(columns=columns_to_drop)
    return df

df_cleaned_copy = drop_columns(df_cleaned_copy)
df_cleaned_copy.head(5)

Unnamed: 0,Protocol,Packet Length,Packet Type,Traffic Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Type,Attack Signature,Action Taken,Severity Level,Network Segment,Firewall Logs,IDS/IPS Alerts,Log Source,attack_Index
0,ICMP,503,Data,HTTP,IoC Detected,28.67,No Alert Triggered,Malware,Known Pattern B,Logged,Low,Segment A,Log Data,No Alert Data,Server,0
1,ICMP,1174,Data,HTTP,IoC Detected,51.5,No Alert Triggered,Malware,Known Pattern A,Blocked,Low,Segment B,Log Data,No Alert Data,Firewall,1
2,UDP,306,Control,HTTP,IoC Detected,87.42,Alert Triggered,DDoS,Known Pattern B,Ignored,Low,Segment C,Log Data,Alert Data,Firewall,2
3,UDP,385,Data,HTTP,No IoC Detected,15.79,Alert Triggered,Malware,Known Pattern B,Blocked,Medium,Segment B,No Log Data,Alert Data,Firewall,3
4,TCP,1462,Data,DNS,No IoC Detected,0.52,Alert Triggered,DDoS,Known Pattern B,Blocked,Low,Segment C,No Log Data,Alert Data,Firewall,4


In [23]:
df_cleaned_copy = df_cleaned_copy.drop(columns=["attack_Index"])

KeyError: "['attack_Index'] not found in axis"

In [22]:
df_cleaned_copy.to_csv("cybersecurity_attacks_index.csv", index=False)

In [21]:
new_df = pd.read_csv("cybersecurity_attacks_index.csv", index_col="attack_Index")
new_df.head(5)

ValueError: Index attack_Index invalid


implicite Int data type for Packet Length  , no extreme outliers for this column
implicite Int casting for Anomaly Score


In [24]:
def cast_int_columns(df): ## didn't use this one beccause it takes the attack_Index as a column 
    df['Packet Length'] = df['Packet Length'].astype(int)
    df['Anomaly Scores'] = df['Anomaly Scores'].astype(float)
    return df

# Apply the function to the dataframe
new_df = cast_int_columns(df_cleaned_copy)
new_df.head(5)

Unnamed: 0,Protocol,Packet Length,Packet Type,Traffic Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Type,Attack Signature,Action Taken,Severity Level,Network Segment,Firewall Logs,IDS/IPS Alerts,Log Source
0,ICMP,503,Data,HTTP,IoC Detected,28.67,No Alert Triggered,Malware,Known Pattern B,Logged,Low,Segment A,Log Data,No Alert Data,Server
1,ICMP,1174,Data,HTTP,IoC Detected,51.5,No Alert Triggered,Malware,Known Pattern A,Blocked,Low,Segment B,Log Data,No Alert Data,Firewall
2,UDP,306,Control,HTTP,IoC Detected,87.42,Alert Triggered,DDoS,Known Pattern B,Ignored,Low,Segment C,Log Data,Alert Data,Firewall
3,UDP,385,Data,HTTP,No IoC Detected,15.79,Alert Triggered,Malware,Known Pattern B,Blocked,Medium,Segment B,No Log Data,Alert Data,Firewall
4,TCP,1462,Data,DNS,No IoC Detected,0.52,Alert Triggered,DDoS,Known Pattern B,Blocked,Low,Segment C,No Log Data,Alert Data,Firewall


these following columns will be encoded:
- Protocol : vector encoding 
- Packet type : vector encoding 
- Traffic Type : vector encoding
- Malware Indicator : vector encoding
- Alerts/Warnings : vector encoding 
- Attack Type : vector encoding 
- Attack Signature : vector encoding 
- Action Taken : vector encoding 
- Severity Level : ordinal 
- Network Segment : vector encoding
- Firewall Logs : vector encoding
- IDS/IPS Alerts : vector encoding 
- Log Source : vector encoding



In [25]:
column_dtypes_for_new = {   
    "Protocol": "category",
    "Packet Length": int,
    "Packet Type": "category",
    "Traffic Type": "category",
    "Malware Indicators": "category",
    "Anomaly Scores": float,
    "Alerts/Warnings": "category",
    "Attack Type": "category",
    "Attack Signature": "category",
    "Action Taken": "category",
    "Severity Level": "category",
    "Network Segment": "category",
    "Firewall Logs": "category",
    "IDS/IPS Alerts": "category",
    "Log Source": "category",
    #"attack_Index": int
    }

# Convert all other columns based on dictionary
for col, dtype in column_dtypes_for_new.items():
    if dtype == str:
        new_df[col] = new_df[col].astype("string")  # Explicitly enforce str type
    else:
        new_df[col] = new_df[col].astype(dtype)

In [26]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Protocol            40000 non-null  category
 1   Packet Length       40000 non-null  int32   
 2   Packet Type         40000 non-null  category
 3   Traffic Type        40000 non-null  category
 4   Malware Indicators  40000 non-null  category
 5   Anomaly Scores      40000 non-null  float64 
 7   Attack Type         40000 non-null  category
 8   Attack Signature    40000 non-null  category
 9   Action Taken        40000 non-null  category
 10  Severity Level      40000 non-null  category
 11  Network Segment     40000 non-null  category
 12  Firewall Logs       40000 non-null  category
 13  IDS/IPS Alerts      40000 non-null  category
 14  Log Source          40000 non-null  category
dtypes: category(13), float64(1), int32(1)
memory usage: 978.3 KB


## Vector encoding 

In [27]:
# Encoding categorical columns using get_dummies
categorical_columns = ['Protocol', 'Packet Type', 'Traffic Type', 'Malware Indicators', 'Alerts/Warnings', 'Attack Signature', 'Action Taken', 'Network Segment', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source']

#df_encoded = new_df.copy()

for col in categorical_columns:
    dummies = pd.get_dummies(new_df[col], prefix=col, dtype=int)
    new_df = pd.concat([new_df, dummies], axis=1)
    #df_encoded = df_encoded.drop(columns=[col])



In [28]:
new_df.head(5)


Unnamed: 0,Protocol,Packet Length,Packet Type,Traffic Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Type,Attack Signature,Action Taken,...,Action Taken_Logged,Network Segment_Segment A,Network Segment_Segment B,Network Segment_Segment C,Firewall Logs_Log Data,Firewall Logs_No Log Data,IDS/IPS Alerts_Alert Data,IDS/IPS Alerts_No Alert Data,Log Source_Firewall,Log Source_Server
0,ICMP,503,Data,HTTP,IoC Detected,28.67,No Alert Triggered,Malware,Known Pattern B,Logged,...,1,1,0,0,1,0,0,1,0,1
1,ICMP,1174,Data,HTTP,IoC Detected,51.5,No Alert Triggered,Malware,Known Pattern A,Blocked,...,0,0,1,0,1,0,0,1,1,0
2,UDP,306,Control,HTTP,IoC Detected,87.42,Alert Triggered,DDoS,Known Pattern B,Ignored,...,0,0,0,1,1,0,1,0,1,0
3,UDP,385,Data,HTTP,No IoC Detected,15.79,Alert Triggered,Malware,Known Pattern B,Blocked,...,0,0,1,0,0,1,1,0,1,0
4,TCP,1462,Data,DNS,No IoC Detected,0.52,Alert Triggered,DDoS,Known Pattern B,Blocked,...,0,0,0,1,0,1,1,0,1,0


## ordinal encoding

In [29]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the "Severity Level" column
new_df['Severity Level'] = label_encoder.fit_transform(new_df['Severity Level'])



In [29]:
# Display the first few rows to verify the encoding
new_df.head(20)

Unnamed: 0_level_0,Protocol,Packet Length,Packet Type,Traffic Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Type,Attack Signature,Action Taken,...,Action Taken_Logged,Network Segment_Segment A,Network Segment_Segment B,Network Segment_Segment C,Firewall Logs_Log Data,Firewall Logs_No Log Data,IDS/IPS Alerts_Alert Data,IDS/IPS Alerts_No Alert Data,Log Source_Firewall,Log Source_Server
attack_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,ICMP,503,Data,HTTP,IoC Detected,28.67,No Alert Triggered,Malware,Known Pattern B,Logged,...,1,1,0,0,1,0,0,1,0,1
1,ICMP,1174,Data,HTTP,IoC Detected,51.5,No Alert Triggered,Malware,Known Pattern A,Blocked,...,0,0,1,0,1,0,0,1,1,0
2,UDP,306,Control,HTTP,IoC Detected,87.42,Alert Triggered,DDoS,Known Pattern B,Ignored,...,0,0,0,1,1,0,1,0,1,0
3,UDP,385,Data,HTTP,No IoC Detected,15.79,Alert Triggered,Malware,Known Pattern B,Blocked,...,0,0,1,0,0,1,1,0,1,0
4,TCP,1462,Data,DNS,No IoC Detected,0.52,Alert Triggered,DDoS,Known Pattern B,Blocked,...,0,0,0,1,0,1,1,0,1,0
5,UDP,1423,Data,HTTP,No IoC Detected,5.76,No Alert Triggered,Malware,Known Pattern A,Logged,...,1,0,0,1,0,1,0,1,0,1
6,TCP,379,Data,DNS,No IoC Detected,31.55,No Alert Triggered,DDoS,Known Pattern B,Ignored,...,0,1,0,0,1,0,0,1,0,1
7,ICMP,1022,Data,DNS,IoC Detected,54.05,Alert Triggered,Intrusion,Known Pattern A,Logged,...,1,1,0,0,1,0,1,0,1,0
8,TCP,1281,Control,FTP,IoC Detected,56.34,Alert Triggered,Intrusion,Known Pattern A,Blocked,...,0,0,1,0,1,0,1,0,0,1
9,UDP,224,Data,HTTP,No IoC Detected,16.51,Alert Triggered,Malware,Known Pattern B,Blocked,...,0,1,0,0,0,1,0,1,0,1


## dropping useless columns


In [30]:
columns_to_drop = ['Protocol', 'Packet Type', 'Traffic Type', 'Malware Indicators', 'Alerts/Warnings', 'Attack Signature', 'Action Taken', 'Network Segment', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source']
new_df = new_df.drop(columns=columns_to_drop)
new_df.head(5)

Unnamed: 0,Packet Length,Anomaly Scores,Attack Type,Severity Level,Protocol_ICMP,Protocol_TCP,Protocol_UDP,Packet Type_Control,Packet Type_Data,Traffic Type_DNS,...,Action Taken_Logged,Network Segment_Segment A,Network Segment_Segment B,Network Segment_Segment C,Firewall Logs_Log Data,Firewall Logs_No Log Data,IDS/IPS Alerts_Alert Data,IDS/IPS Alerts_No Alert Data,Log Source_Firewall,Log Source_Server
0,503,28.67,Malware,1,1,0,0,0,1,0,...,1,1,0,0,1,0,0,1,0,1
1,1174,51.5,Malware,1,1,0,0,0,1,0,...,0,0,1,0,1,0,0,1,1,0
2,306,87.42,DDoS,1,0,0,1,1,0,0,...,0,0,0,1,1,0,1,0,1,0
3,385,15.79,Malware,2,0,0,1,0,1,0,...,0,0,1,0,0,1,1,0,1,0
4,1462,0.52,DDoS,1,0,1,0,0,1,1,...,0,0,0,1,0,1,1,0,1,0


In [31]:
new_df = new_df.drop(columns=['attack_Index']) ## see the part of new df loading making this column the index column 
new_df.head(5)

KeyError: "['attack_Index'] not found in axis"

In [32]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 30 columns):
 #   Column                              Non-Null Count  Dtype   
---  ------                              --------------  -----   
 0   Packet Length                       40000 non-null  int32   
 1   Anomaly Scores                      40000 non-null  float64 
 2   Attack Type                         40000 non-null  category
 3   Severity Level                      40000 non-null  int32   
 4   Protocol_ICMP                       40000 non-null  int32   
 5   Protocol_TCP                        40000 non-null  int32   
 6   Protocol_UDP                        40000 non-null  int32   
 7   Packet Type_Control                 40000 non-null  int32   
 8   Packet Type_Data                    40000 non-null  int32   
 9   Traffic Type_DNS                    40000 non-null  int32   
 10  Traffic Type_FTP                    40000 non-null  int32   
 11  Traffic Type_HTTP           

# Model Training 


## dataset splitting 

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
df_train, df_test = train_test_split(new_df,test_size = 0.2, random_state=42)

lets see the length of our different datasets 

In [35]:
len(df_train), len(df_test)

(32000, 8000)

In [36]:
print(new_df['Severity Level'].mean())
print(df_train['Severity Level'].mean())
print(df_test['Severity Level'].mean())

1.001325
0.9975625
1.016375


In [37]:
df_train.columns


Index(['Packet Length', 'Anomaly Scores', 'Attack Type', 'Severity Level',
       'Protocol_ICMP', 'Protocol_TCP', 'Protocol_UDP', 'Packet Type_Control',
       'Packet Type_Data', 'Traffic Type_DNS', 'Traffic Type_FTP',
       'Traffic Type_HTTP', 'Malware Indicators_IoC Detected',
       'Attack Signature_Known Pattern A', 'Attack Signature_Known Pattern B',
       'Action Taken_Blocked', 'Action Taken_Ignored', 'Action Taken_Logged',
       'Network Segment_Segment A', 'Network Segment_Segment B',
       'Network Segment_Segment C', 'Firewall Logs_Log Data',
       'Firewall Logs_No Log Data', 'IDS/IPS Alerts_Alert Data',
       'IDS/IPS Alerts_No Alert Data', 'Log Source_Firewall',
       'Log Source_Server'],
      dtype='object')

In [38]:
df_test.columns

Index(['Packet Length', 'Anomaly Scores', 'Attack Type', 'Severity Level',
       'Protocol_ICMP', 'Protocol_TCP', 'Protocol_UDP', 'Packet Type_Control',
       'Packet Type_Data', 'Traffic Type_DNS', 'Traffic Type_FTP',
       'Traffic Type_HTTP', 'Malware Indicators_IoC Detected',
       'Attack Signature_Known Pattern A', 'Attack Signature_Known Pattern B',
       'Action Taken_Blocked', 'Action Taken_Ignored', 'Action Taken_Logged',
       'Network Segment_Segment A', 'Network Segment_Segment B',
       'Network Segment_Segment C', 'Firewall Logs_Log Data',
       'Firewall Logs_No Log Data', 'IDS/IPS Alerts_Alert Data',
       'IDS/IPS Alerts_No Alert Data', 'Log Source_Firewall',
       'Log Source_Server'],
      dtype='object')

dividing our target and feature columns

In [39]:
# list of columns features to be used for training
features = [
    'Packet Length', 
    'Anomaly Scores', 
    'Severity Level', 
    'Protocol_ICMP',
    'Protocol_TCP', 
    'Protocol_UDP', 
    'Packet Type_Control',
    'Packet Type_Data', 
    'Traffic Type_DNS', 
    'Traffic Type_FTP',
    'Traffic Type_HTTP', 
    'Malware Indicators_IoC Detected',
    'Malware Indicators_No IoC Detected', 
    'Alerts/Warnings_Alert Triggered',
    'Alerts/Warnings_No Alert Triggered', 
    'Attack Signature_Known Pattern A', 
    'Attack Signature_Known Pattern B',
    'Action Taken_Blocked', 
    'Action Taken_Ignored',
    'Action Taken_Logged',
    'Network Segment_Segment A', 
    'Network Segment_Segment B',
    'Network Segment_Segment C',
    'Firewall Logs_Log Data',
    'Firewall Logs_No Log Data',
    'IDS/IPS Alerts_Alert Data',
    'IDS/IPS Alerts_No Alert Data',
    'Log Source_Firewall',
    'Log Source_Server',
    'Protocol_ICMP',
    'Protocol_TCP',
    'Protocol_UDP',
    'Packet Type_Control',
    'Packet Type_Data',
    'Traffic Type_DNS',
    'Traffic Type_FTP',
    'Traffic Type_HTTP',
    'Malware Indicators_IoC Detected',
    'Malware Indicators_No IoC Detected',
    'Alerts/Warnings_Alert Triggered',
    'Alerts/Warnings_No Alert Triggered',
    'Attack Signature_Known Pattern A', 
    'Attack Signature_Known Pattern B',
    'Action Taken_Blocked', 
    'Action Taken_Ignored', 'Action Taken_Logged',
    'Network Segment_Segment A', 
    'Network Segment_Segment B',
    'Network Segment_Segment C', 
    'Firewall Logs_Log Data',
    'Firewall Logs_No Log Data', 
    'IDS/IPS Alerts_Alert Data',
    'IDS/IPS Alerts_No Alert Data',
    'Log Source_Firewall',
    'Log Source_Server'
]

In [40]:
target_column = 'attack type'

In [41]:
df_train.head(20)

Unnamed: 0,Packet Length,Anomaly Scores,Attack Type,Severity Level,Protocol_ICMP,Protocol_TCP,Protocol_UDP,Packet Type_Control,Packet Type_Data,Traffic Type_DNS,...,Action Taken_Logged,Network Segment_Segment A,Network Segment_Segment B,Network Segment_Segment C,Firewall Logs_Log Data,Firewall Logs_No Log Data,IDS/IPS Alerts_Alert Data,IDS/IPS Alerts_No Alert Data,Log Source_Firewall,Log Source_Server
14307,729,7.17,Intrusion,2,1,0,0,1,0,0,...,0,0,0,1,0,1,1,0,0,1
17812,1447,54.0,Malware,0,0,1,0,1,0,0,...,1,1,0,0,0,1,1,0,0,1
11020,1119,82.75,Malware,2,0,1,0,0,1,1,...,0,1,0,0,0,1,0,1,1,0
15158,706,86.47,Malware,0,1,0,0,0,1,0,...,1,0,0,1,1,0,1,0,0,1
24990,1202,63.21,Intrusion,0,1,0,0,0,1,0,...,0,0,1,0,1,0,0,1,1,0
5980,1017,12.49,Intrusion,2,0,0,1,0,1,1,...,0,0,0,1,0,1,1,0,0,1
30334,97,41.31,DDoS,1,0,1,0,1,0,0,...,1,1,0,0,0,1,0,1,1,0
26175,485,70.27,DDoS,2,1,0,0,0,1,1,...,1,1,0,0,0,1,0,1,0,1
11195,334,94.28,Malware,0,0,1,0,1,0,1,...,0,1,0,0,0,1,0,1,1,0
20033,129,78.77,Malware,2,0,0,1,1,0,0,...,0,1,0,0,1,0,0,1,1,0


###  get the values of the features columns for the training data


In [42]:
X_train = df_train.loc[:,features].values
y_train = df_train['Attack Type'].values

### get the values of the columns for the test data


In [43]:
x_test = df_test.loc[:,features].values
y_test = df_test['Attack Type'].values

## Testing models 

In [44]:
from sklearn.linear_model import LogisticRegression

In [45]:
# here we initialize the model
lr_model = LogisticRegression(random_state=42,max_iter=1500)

In [46]:
# here we train the model on the training data
lr_model.fit(X=X_train, y=y_train)

In [47]:
y_test_predicted = lr_model.predict(x_test)
y_test_predicted

array(['Malware', 'Intrusion', 'DDoS', ..., 'DDoS', 'DDoS', 'Intrusion'],
      dtype=object)

In [48]:
from sklearn.metrics import confusion_matrix

In [49]:
cf = pd.DataFrame(
    columns=["y_test_DDoS","y_test_Intrusion","y_test_Malware"],index=["y_predicted_DDoS","y_predicted_Intrusion","y_predicted_Malware"]
)

In [50]:
cf.loc[:,:] = confusion_matrix(y_true= y_test,y_pred= y_test_predicted)
cf

Unnamed: 0,y_test_DDoS,y_test_Intrusion,y_test_Malware
y_predicted_DDoS,1126,625,885
y_predicted_Intrusion,1120,657,944
y_predicted_Malware,1033,643,967


In [51]:
cf/len(y_test)

Unnamed: 0,y_test_DDoS,y_test_Intrusion,y_test_Malware
y_predicted_DDoS,0.14075,0.078125,0.110625
y_predicted_Intrusion,0.14,0.082125,0.118
y_predicted_Malware,0.129125,0.080375,0.120875


In [52]:
from sklearn.metrics import recall_score, precision_score, f1_score

In [53]:
from sklearn.metrics import classification_report

In [54]:
report =classification_report(y_true=y_test, y_pred=y_test_predicted)

In [55]:
print(report)

              precision    recall  f1-score   support

        DDoS       0.34      0.43      0.38      2636
   Intrusion       0.34      0.24      0.28      2721
     Malware       0.35      0.37      0.36      2643

    accuracy                           0.34      8000
   macro avg       0.34      0.34      0.34      8000
weighted avg       0.34      0.34      0.34      8000



In [56]:
from sklearn.tree import DecisionTreeClassifier

In [57]:
dt_model = DecisionTreeClassifier()

In [58]:
dt_model.fit(X=X_train,y=y_train)

In [59]:
y_test_predicted_dt = dt_model.predict(x_test)

In [60]:
report_dt = classification_report(y_pred=y_test_predicted_dt,y_true=y_test)
print(report_dt)

              precision    recall  f1-score   support

        DDoS       0.33      0.33      0.33      2636
   Intrusion       0.35      0.34      0.34      2721
     Malware       0.33      0.33      0.33      2643

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000



In [65]:
from sklearn.ensemble import RandomForestClassifier
rm_model = RandomForestClassifier(random_state=42,n_estimators=100)
lr_model = LogisticRegression(random_state=42,max_iter=1500)
rm_model.fit(X=X_train, y=y_train)

In [69]:
y_test_predicted = rm_model.predict(x_test)
y_test_predicted

array(['Intrusion', 'Intrusion', 'Intrusion', ..., 'Malware', 'Malware',
       'DDoS'], dtype=object)

In [70]:
from sklearn.metrics import confusion_matrix

In [71]:
cf = pd.DataFrame(
    columns=["y_test_DDoS","y_test_Intrusion","y_test_Malware"],index=["y_predicted_DDoS","y_predicted_Intrusion","y_predicted_Malware"]
)

In [72]:
cf.loc[:,:] = confusion_matrix(y_true= y_test,y_pred= y_test_predicted)
cf

Unnamed: 0,y_test_DDoS,y_test_Intrusion,y_test_Malware
y_predicted_DDoS,907,844,885
y_predicted_Intrusion,935,873,913
y_predicted_Malware,903,851,889


In [74]:
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
report =classification_report(y_true=y_test, y_pred=y_test_predicted)
print(report)

              precision    recall  f1-score   support

        DDoS       0.33      0.34      0.34      2636
   Intrusion       0.34      0.32      0.33      2721
     Malware       0.33      0.34      0.33      2643

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000

