# Cybersecurity Data Analysis with Pandas
<h3>Network Traffic Data Analysis</h3>

# 1. Setting Up: Installing and Importing Pandas

In [7]:
!pip install pandas



You should consider upgrading via the 'C:\Users\DUP\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [8]:
import pandas as pd

# 2. Load and Import Data from GitHub

In [63]:
# Raw-file URL of the network traffic CSV hosted on GitHub
url = "https://raw.githubusercontent.com/ritaafrica/data/refs/heads/main/network_traffic_data.csv"

# Download the CSV and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the top five records
df.head(5)

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked,Low
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,ICMP,443.0,4999,11808,Allowed,Medium
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,HTTP,443.0,6360,10852,Allowed,Medium
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,TCP,,4011,14314,Blocked,Low
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked,Medium


# 3. Preliminary Data Examination

In [64]:
# Dimensions of the frame as a (rows, columns) tuple
print(f"Dataset Shape: {df.shape}")

# Column labels, printed on the line after their caption
print("\nColumns Names: ", df.columns, sep="\n")

# Per-column dtypes and non-null counts; info() writes its report straight to stdout
print("\nDataset info: ")
df.info()

# Descriptive statistics (count, mean, spread, quartiles) for the numeric columns
print("\nSummary Statistics: ", df.describe(), sep="\n")

Dataset Shape: (1000, 9)

Columns Names: 
Index(['Timestamp', 'Source_IP', 'Destination_IP', 'Protocol', 'Port',
       'Bytes_Sent', 'Bytes_Received', 'Status', 'Threat_Level'],
      dtype='object')

Dataset info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Timestamp       1000 non-null   object 
 1   Source_IP       1000 non-null   object 
 2   Destination_IP  1000 non-null   object 
 3   Protocol        1000 non-null   object 
 4   Port            874 non-null    float64
 5   Bytes_Sent      1000 non-null   int64  
 6   Bytes_Received  1000 non-null   int64  
 7   Status          1000 non-null   object 
 8   Threat_Level    1000 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 70.4+ KB

Summary Statistics: 
              Port   Bytes_Sent  Bytes_Received
count   874.000000  1000.000000     1000.000000
me

# 4. Extracting Key Columns for Analysis

In [65]:
# Keep every row, but only the columns needed for this part of the analysis
selected_columns = df.loc[:, ["Timestamp", "Source_IP", "Destination_IP", "Status"]]

# Preview the top five records
selected_columns.head(5)

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Status
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,Blocked
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,Allowed
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,Allowed
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,Blocked
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,Blocked


# 5. Extracting and Organizing Data with Pandas Series and DataFrames

In [47]:
# A single column label yields a one-dimensional Series
source_ips = df.loc[:, "Source_IP"]

# A list of column labels yields a two-dimensional DataFrame
network_activity = df.loc[:, ["Source_IP", "Destination_IP", "Protocol", "Threat_Level"]]

# Preview the top five records
network_activity.head(5)

Unnamed: 0,Source_IP,Destination_IP,Protocol,Threat_Level
0,10.0.0.15,192.168.1.20,TCP,Low
1,192.168.1.13,172.217.169.46,ICMP,Medium
2,10.0.0.5,203.0.113.99,HTTP,Medium
3,10.0.0.9,192.168.1.20,TCP,Low
4,192.168.1.4,172.217.169.46,FTP,Medium


# 6. Filtering Data - Filtering and Storing Blocked Network Traffic

In [49]:
# Keep the rows whose Status is exactly "Blocked" (boolean-mask row selection)
blocked_traffic = df.loc[df["Status"].eq("Blocked")]

# Select the key details needed to analyse the blocked connections
blocked_summary = blocked_traffic.loc[:, ["Timestamp", "Source_IP", "Destination_IP", "Threat_Level"]]

# Preview the top five records (the original row labels are preserved)
blocked_summary.head(5)

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Threat_Level
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,Low
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,Low
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,Medium
9,2025-03-19 12:59:40,10.0.0.43,10.0.0.5,Low
10,2025-03-19 12:59:10,10.0.0.33,203.0.113.99,Medium


# 7. Filtering Suspicious Network Traffic

In [50]:
# Keep only the rows whose Threat_Level is exactly "Critical"
high_risk_traffic = df.loc[df["Threat_Level"].eq("Critical")]

# Preview the top five critical records
high_risk_traffic.head(5)

Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
59,2025-03-19 12:34:40,10.0.0.47,192.168.1.20,ICMP,,5885,463,Allowed,Critical
96,2025-03-19 12:16:10,192.168.1.35,203.0.113.99,FTP,8080.0,9371,7189,Allowed,Critical
134,2025-03-19 11:57:10,192.168.1.17,172.217.169.46,DNS,22.0,6714,13124,Blocked,Critical
150,2025-03-19 11:49:10,192.168.1.42,10.0.0.5,HTTP,53.0,2702,634,Allowed,Critical
209,2025-03-19 11:19:40,10.0.0.17,203.0.113.99,TCP,3389.0,5085,10014,Blocked,Critical


# 8. Filtering Traffic with High Data Transfer (where Bytes Sent is greater than 5000)

In [51]:
# Keep the rows that sent strictly more than 5000 bytes
high_data_transfer = df.loc[df["Bytes_Sent"].gt(5000)]

# Report how many rows matched the filter
print(f"Number of high-data transfers: {high_data_transfer.shape[0]}")

# Preview the top five matching records
high_data_transfer.head(5)

Number of high-data transfers: 518


Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked,Low
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,HTTP,443.0,6360,10852,Allowed,Medium
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked,Medium
5,2025-03-19 13:01:40,10.0.0.43,172.217.169.46,DNS,53.0,6915,12981,Allowed,Low
7,2025-03-19 13:00:40,192.168.1.36,192.168.1.20,TCP,21.0,5655,119,Allowed,Medium


# 9. Splitting the Dataset into X (Features) and y (Target Variable)

In [52]:
# Target variable (y): the column to be predicted
y = df["Threat_Level"]

# Features (X): every column except the target
X = df.drop(labels=["Threat_Level"], axis="columns")

# Preview the top five rows of the feature matrix
print("Features (X): ")
X.head(5)

Features (X): 


Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status
0,2025-03-19 13:04:10,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked
1,2025-03-19 13:03:40,192.168.1.13,172.217.169.46,ICMP,443.0,4999,11808,Allowed
2,2025-03-19 13:03:10,10.0.0.5,203.0.113.99,HTTP,443.0,6360,10852,Allowed
3,2025-03-19 13:02:40,10.0.0.9,192.168.1.20,TCP,,4011,14314,Blocked
4,2025-03-19 13:02:10,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked


In [53]:
# Caption for the target preview, preceded by a blank line
print("\nTarget Variable (y): ")

# Preview the top five target labels
y.head(5)


Target Variable (y): 


0       Low
1    Medium
2    Medium
3       Low
4    Medium
Name: Threat_Level, dtype: object

# 10. Removing a Column

In [58]:
# Remove the 'Timestamp' column.
# `df` is reassigned here, so on a second execution of this cell the column is
# already gone; errors="ignore" makes the drop a no-op in that case instead of
# raising KeyError, keeping the cell safe to re-run.
df = df.drop(columns=["Timestamp"], errors="ignore")

# Display the first few rows to confirm the column is no longer present
df.head()

Unnamed: 0,Source_IP,Destination_IP,Protocol,Port,Bytes_Sent,Bytes_Received,Status,Threat_Level
0,10.0.0.15,192.168.1.20,TCP,,5411,8989,Blocked,Low
1,192.168.1.13,172.217.169.46,ICMP,443.0,4999,11808,Allowed,Medium
2,10.0.0.5,203.0.113.99,HTTP,443.0,6360,10852,Allowed,Medium
3,10.0.0.9,192.168.1.20,TCP,,4011,14314,Blocked,Low
4,192.168.1.4,172.217.169.46,FTP,,5254,8718,Blocked,Medium
