In [6]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# 2. Imports
import os
import pandas as pd


In [8]:
# 3. Path to your CICIoT2023 folder (which contains all 171 CSV files)
CICIOT_PATH = "/content/drive/MyDrive/Datasets/CICIoT2023/"
OTHER_DRIVE_PATH = "/content/drive/MyDrive/Shared with me/IDS_DATASETS"



In [9]:
ciciot_files = [f for f in os.listdir(CICIOT_PATH) if f.endswith(".csv")]
print("Total CICIoT2023 CSV files:", len(ciciot_files))
print(ciciot_files[:5])


Total CICIoT2023 CSV files: 140
['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']


In [11]:
# 5. (Optional but recommended) Check if all CSVs have the same columns
base_cols = set(pd.read_csv(os.path.join(CICIOT_PATH, ciciot_files[0]), nrows=1).columns)

for f in ciciot_files:
    cols = set(pd.read_csv(os.path.join(CICIOT_PATH, f), nrows=1).columns)
    if cols != base_cols:
        print("Column mismatch found in:", f)

print("Column consistency check completed.")


Column consistency check completed.


In [12]:
# 6. Merge all 171 CSV files into one big CSV (memory-safe using chunks)
CICIOT_OUTPUT = "/content/CICIoT2023_FULL.csv"

first = True
for f in ciciot_files:
    file_path = os.path.join(CICIOT_PATH, f)
    print("Merging:", f)

    for chunk in pd.read_csv(file_path, chunksize=100000):
        if first:
            chunk.to_csv(CICIOT_OUTPUT, index=False, mode="w")
            first = False
        else:
            chunk.to_csv(CICIOT_OUTPUT, index=False, mode="a", header=False)

print("CICIoT2023 merged file created at:", CICIOT_OUTPUT)


Merging: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: part-00016-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Merging: p

In [13]:
import os

file_path = "/content/CICIoT2023_FULL.csv"
size_bytes = os.path.getsize(file_path)

size_mb = size_bytes / (1024 * 1024)
size_gb = size_mb / 1024

print(f"File size: {size_mb:.2f} MB")
print(f"File size: {size_gb:.2f} GB")


File size: 10669.46 MB
File size: 10.42 GB


In [1]:
import pandas as pd

input_file = "/content/CICIoT2023_FULL.csv"
output_file = "/content/CICIoT2023_SMALL.csv"

chunksize = 200_000
sample_fraction = 0.02   # 2%

first = True
for chunk in pd.read_csv(input_file, chunksize=chunksize):
    sampled = chunk.sample(frac=sample_fraction, random_state=42)

    if first:
        sampled.to_csv(output_file, index=False, mode="w")
        first = False
    else:
        sampled.to_csv(output_file, index=False, mode="a", header=False)

print("Reduced dataset created:", output_file)


Reduced dataset created: /content/CICIoT2023_SMALL.csv


In [2]:
import pandas as pd

# Load merged dataset
df = pd.read_csv("/content/CICIoT2023_SMALL.csv")

# Number of rows and columns
print("Shape of dataset (rows, columns):", df.shape)

# Total number of features (excluding label)
if "Label" in df.columns:
    print("Total columns:", len(df.columns))
    print("Total features (excluding Label):", len(df.columns) - 1)
else:
    print("Total columns:", len(df.columns))

# Column names (features)
print("\nColumns / Features:")
for col in df.columns:
    print(col)

# Quick info
print("\nDataset Info:")
print(df.info())

# View first 5 rows
print("\nSample data:")
df.head()


Shape of dataset (rows, columns): (759515, 47)
Total columns: 47

Columns / Features:
flow_duration
Header_Length
Protocol Type
Duration
Rate
Srate
Drate
fin_flag_number
syn_flag_number
rst_flag_number
psh_flag_number
ack_flag_number
ece_flag_number
cwr_flag_number
ack_count
syn_count
fin_count
urg_count
rst_count
HTTP
HTTPS
DNS
Telnet
SMTP
SSH
IRC
TCP
UDP
DHCP
ARP
ICMP
IPv
LLC
Tot sum
Min
Max
AVG
Std
Tot size
IAT
Number
Magnitue
Radius
Covariance
Variance
Weight
label

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 759515 entries, 0 to 759514
Data columns (total 47 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   flow_duration    759515 non-null  float64
 1   Header_Length    759515 non-null  float64
 2   Protocol Type    759515 non-null  float64
 3   Duration         759515 non-null  float64
 4   Rate             759515 non-null  float64
 5   Srate            759515 non-null  float64
 6   Drate            75

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.0,54.0,6.0,64.0,2.215436,2.215436,0.0,0.0,1.0,0.0,...,0.0,54.0,82973550.0,9.5,10.392305,0.0,0.0,0.0,141.55,DoS-SYN_Flood
1,0.0,0.0,1.0,64.0,4.239848,4.239848,0.0,0.0,0.0,0.0,...,0.0,42.0,83149390.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood
2,0.041094,14025.0,17.0,64.0,7679.553787,7679.553787,0.0,0.0,0.0,0.0,...,0.0,50.0,83098540.0,9.5,10.0,0.0,0.0,0.0,141.55,DDoS-UDP_Flood
3,0.001894,55.56,6.0,63.63,9.699986,9.699986,0.0,0.0,0.0,0.0,...,0.192336,54.24,83076390.0,9.5,10.397587,0.273495,0.853467,0.05,141.55,DDoS-TCP_Flood
4,0.716513,75.6,6.0,64.0,1.116023,1.116023,0.0,0.0,1.0,0.0,...,0.0,54.0,83362200.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SynonymousIP_Flood


In [3]:
from google.colab import files
files.download("/content/CICIoT2023_SMALL.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>