## importing library

In [2]:
import pandas as pd
from pyspark.sql import SparkSession

In [3]:
# Read the raw attack log into a pandas DataFrame (path is relative to the notebook).
df = pd.read_csv(filepath_or_buffer="cybersecurity_attacks.csv")

## display information 

In [6]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Proxy Information,Firewall Logs,IDS/IPS Alerts,Log Source
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",150.9.97.135,Log Data,,Server
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",,Log Data,,Firewall
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",114.133.48.179,Log Data,Alert Data,Firewall
3,2023-07-02 10:38:46,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,Blocked,Medium,Fateh Kibe,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ...,Segment B,"Jaunpur, Rajasthan",,,Alert Data,Firewall
4,2023-07-16 13:11:07,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,Blocked,Low,Dhanush Chad,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...,Segment C,"Anantapur, Tripura",149.6.110.119,,Alert Data,Firewall


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Source Port,Destination Port,Packet Length,Anomaly Scores
count,40000.0,40000.0,40000.0,40000.0
mean,32970.35645,33150.86865,781.452725,50.113473
std,18560.425604,18574.668842,416.044192,28.853598
min,1027.0,1024.0,64.0,0.0
25%,16850.75,17094.75,420.0,25.15
50%,32856.0,33004.5,782.0,50.345
75%,48928.25,49287.0,1143.0,75.03
max,65530.0,65535.0,1500.0,100.0


In [8]:
# Column names, non-null counts and dtypes of the raw data; used to spot
# columns with missing values and columns that need a dtype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               40000 non-null  object 
 1   Source IP Address       40000 non-null  object 
 2   Destination IP Address  40000 non-null  object 
 3   Source Port             40000 non-null  int64  
 4   Destination Port        40000 non-null  int64  
 5   Protocol                40000 non-null  object 
 6   Packet Length           40000 non-null  int64  
 7   Packet Type             40000 non-null  object 
 8   Traffic Type            40000 non-null  object 
 9   Payload Data            40000 non-null  object 
 10  Malware Indicators      20000 non-null  object 
 11  Anomaly Scores          40000 non-null  float64
 13  Attack Type             40000 non-null  object 
 14  Attack Signature        40000 non-null  object 
 15  Action Taken            40000 non-null

## information about the table 

- It has 40,000 rows and 25 columns.
- `Timestamp` is a date/time column, but it is loaded as `object` (text) and has to be converted to datetime.
- Several columns have missing values (e.g., Malware Indicators, Alerts/Warnings, Proxy Information,Firewall logs, IDS/IPS Alerts).
- Numerical columns (Packet Length, Anomaly Scores): numerical analyses can be performed on them.
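- String columns (Source IP Address, Destination IP Address, Source Port, Destination Port, Payload Data, User Information, Device Information, Geo-location Data). Source Port and Destination Port are loaded as `int64` but are converted to strings later in this notebook.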
- Some categorical features (e.g., Protocol, Packet Type, Traffic Type,Attack Type,Attack Signature,Action Taken,  Severity Level, Network Segment, Log Source) will require encoding for ML.
- Columns we will transform to categorical (Malware Indicators, Alerts/Warnings, Firewall Logs, IDS/IPS Alerts).
- Proxy Information is a string column, but it will be removed: it is not relevant for our prediction and more than half of its values are NaN.

## missing column values

### Malware Indicators


In [11]:
# A missing Malware Indicators value is replaced by the explicit label
# "No IoC Detected" so the column no longer contains NaN.
df = df.assign(
    **{"Malware Indicators": df["Malware Indicators"].fillna("No IoC Detected")}
)

### Alerts/Warnings

In [12]:
# A missing Alerts/Warnings value is replaced by the explicit label
# "No Alert Triggered" so the column no longer contains NaN.
df = df.assign(
    **{"Alerts/Warnings": df["Alerts/Warnings"].fillna("No Alert Triggered")}
)

### Proxy Information 
This column is not really relevant for our model and has many NaN values, so we drop it.

In [13]:
# Remove the Proxy Information column from the frame.
# Note: re-running this cell on the same kernel raises KeyError because the
# column is already gone.
df = df.drop(columns=["Proxy Information"])

### Firewall Logs


In [14]:
# A missing Firewall Logs value is replaced by the explicit label
# "No Log Data" so the column no longer contains NaN.
df = df.assign(
    **{"Firewall Logs": df["Firewall Logs"].fillna("No Log Data")}
)

### IDS/IPS Alerts 

In [15]:
# A missing IDS/IPS Alerts value is replaced by the explicit label
# "No Alert Data" so the column no longer contains NaN.
df = df.assign(
    **{"IDS/IPS Alerts": df["IDS/IPS Alerts"].fillna("No Alert Data")}
)

In [16]:
# Re-inspect the non-null counts and the column list to verify that the
# fills and the column drop above took effect.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               40000 non-null  object 
 1   Source IP Address       40000 non-null  object 
 2   Destination IP Address  40000 non-null  object 
 3   Source Port             40000 non-null  int64  
 4   Destination Port        40000 non-null  int64  
 5   Protocol                40000 non-null  object 
 6   Packet Length           40000 non-null  int64  
 7   Packet Type             40000 non-null  object 
 8   Traffic Type            40000 non-null  object 
 9   Payload Data            40000 non-null  object 
 10  Malware Indicators      40000 non-null  object 
 11  Anomaly Scores          40000 non-null  float64
 13  Attack Type             40000 non-null  object 
 14  Attack Signature        40000 non-null  object 
 15  Action Taken            40000 non-null

## Data conversion 

### timestamps column 


In [17]:
# Timestamp is read from the CSV as plain text (object dtype); parse it into
# datetime64 so that date/time operations can be used on it.
df = df.assign(Timestamp=pd.to_datetime(df["Timestamp"]))

In [18]:
# Confirm that Timestamp is now reported with a datetime64 dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Timestamp               40000 non-null  datetime64[ns]
 1   Source IP Address       40000 non-null  object        
 2   Destination IP Address  40000 non-null  object        
 3   Source Port             40000 non-null  int64         
 4   Destination Port        40000 non-null  int64         
 5   Protocol                40000 non-null  object        
 6   Packet Length           40000 non-null  int64         
 7   Packet Type             40000 non-null  object        
 8   Traffic Type            40000 non-null  object        
 9   Payload Data            40000 non-null  object        
 10  Malware Indicators      40000 non-null  object        
 11  Anomaly Scores          40000 non-null  float64       
 13  Attack Type             40000 non-null  object

In [24]:
# Target dtype for each column listed here (Timestamp is handled separately
# by pd.to_datetime):
#   - "string"    : free-text / identifier columns
#   - "category"  : categorical features
#   - int / float : numeric columns
column_dtypes = {
    "Source IP Address": "string",
    "Destination IP Address": "string",
    "Source Port": "string",
    "Destination Port": "string",
    "Protocol": "category",
    "Packet Length": int,
    "Packet Type": "category",
    "Traffic Type": "category",
    "Payload Data": "string",
    "Malware Indicators": "category",
    "Anomaly Scores": float,
    "Alerts/Warnings": "category",
    "Attack Type": "category",
    "Attack Signature": "category",
    "Action Taken": "category",
    "Severity Level": "category",
    "User Information": "string",
    "Device Information": "string",
    "Network Segment": "category",
    "Geo-location Data": "string",
    "Firewall Logs": "category",
    "IDS/IPS Alerts": "category",
    "Log Source": "category",
}

# DataFrame.astype accepts a {column: dtype} mapping, so all listed columns are
# cast in one call and no per-column loop or special case is needed. Columns
# not listed keep their current dtype; a listed column that is missing from
# the frame raises KeyError.
df = df.astype(column_dtypes)

In [25]:
# Confirm that the columns now carry the string / category / numeric dtypes
# requested in column_dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Timestamp               40000 non-null  datetime64[ns]
 1   Source IP Address       40000 non-null  string        
 2   Destination IP Address  40000 non-null  string        
 3   Source Port             40000 non-null  string        
 4   Destination Port        40000 non-null  string        
 5   Protocol                40000 non-null  category      
 6   Packet Length           40000 non-null  int64         
 7   Packet Type             40000 non-null  category      
 8   Traffic Type            40000 non-null  category      
 9   Payload Data            40000 non-null  string        
 10  Malware Indicators      40000 non-null  category      
 11  Anomaly Scores          40000 non-null  float64       
 13  Attack Type             40000 non-null  catego

## storing the cleaned data frame in csv 

In [27]:
# Persist the cleaned frame to a new CSV file; the row index is not written.
# NOTE(review): CSV stores plain text only, so the datetime / string / category
# dtypes set above are not kept in the file and have to be re-applied after
# reading it back.
df.to_csv(path_or_buf="cleaned_cybersecurity_attacks.csv", index=False)