In [1]:
import torch
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, AdamW
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
file_list={"DD11":['./small/AD-S1.csv','./small/DI-S1.csv','./small/DI-S2.csv'],
"DD11T":['./small/DI-S1.csv','./small/AD-S1.csv','./small/AD-S2.csv'],
"DD12":['./small/AD-S1.csv','./small/DI-S2.csv','./small/DI-S1.csv'],
"DD12T":['./small/DI-S2.csv','./small/AD-S1.csv','./small/AD-S2.csv'],
"DD22":['./small/AD-S2.csv','./small/DI-S2.csv','./small/DI-S1.csv'],
"DD22T":['./small/DI-S2.csv','./small/AD-S2.csv','./small/AD-S1.csv'],
"DD21":['./small/AD-S2.csv','./small/DI-S1.csv','./small/DI-S2.csv'],
"DD21T":['./small/DI-S1.csv','./small/AD-S2.csv','./small/AD-S1.csv']}
#file_list={"DD11":['./small/AD-S1.csv','./small/DI-S1.csv','./small/DI-S2.csv']}

In [None]:
alltime=time.time()

# Define the Transformer-based model
class IoTClassifier(torch.nn.Module):
    def __init__(self, input_dim, num_classes):
        super(IoTClassifier, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, 768)  # Map input to 768-dim
        self.transformer = BertModel.from_pretrained('bert-base-uncased')
        self.fc2 = torch.nn.Linear(768, num_classes)  # Final output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))  
        x = self.transformer(inputs_embeds=x.unsqueeze(1)).last_hidden_state
        x = torch.mean(x, dim=1)  # Pooling
        x = self.fc2(x)  
        return x


# Define custom Dataset class
class IoTDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
# Feature selection and data loading
feature = [  
    'dstport', 'dstport_class', 'http.chat', 'http.notification', 'http.request.method',
    'ip.flags.df', 'ip.len', 'ip.proto', 'ip.ttl', 'srcport', 'srcport_class', 'tcp.ack',
    'tcp.analysis.ack_rtt', 'tcp.analysis.bytes_in_flight', 'tcp.analysis.initial_rtt',
    'tcp.analysis.push_bytes_sent', 'tcp.completeness', 'tcp.dstport', 'tcp.flags',
    'tcp.flags.push', 'tcp.flags.str', 'tcp.flags.syn', 'tcp.hdr_len', 'tcp.len',
    'tcp.nxtseq', 'tcp.srcport', 'tcp.stream', 'tcp.time_delta', 'tcp.time_relative',
    'tcp.window_size', 'tcp.window_size_scalefactor', 'tcp.window_size_value',
    'tls.record.length', 'udp.checksum.status', 'udp.dstport', 'udp.srcport',
    'udp.time_delta', 'udp.time_relative', "Label"
]


for f in file_list:
    print(f"Train: {file_list[f][0]} Test: {file_list[f][1]}")
    train = pd.read_csv(file_list[f][0], usecols=feature)
    test = pd.read_csv(file_list[f][1], usecols=feature)
    # Handle missing or invalid values
    train = train.replace(-9999, 0).fillna(0)
    test = test.replace(-9999, 0).fillna(0)
    # Separate features and labels
    X_train = train.drop(['Label'], axis=1)
    y_train = train['Label']
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    # Separate features and labels
    X_test = test.drop(['Label'], axis=1)
    y_test = test['Label']
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_test = label_encoder.fit_transform(y_test)
    scaler = StandardScaler()
    X_test = scaler.fit_transform(X_test)
    
    
    
    # Create DataLoaders
    batch_size = 32
    train_dataset = IoTDataset(X_train, y_train)
    test_dataset = IoTDataset(X_test, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    
    
    # Initialize model, loss function, and optimizer
    input_dim = X_train.shape[1]
    num_classes = len(label_encoder.classes_)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = IoTClassifier(input_dim, num_classes).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    # Training loop
    num_epochs = 5
    
    second=time.time()
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
    
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
    
            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
    
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()
    
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}')
    train_time=(float((time.time()-second)) )
    print(f"train_time: {train_time}")
    second=time.time()
    # Evaluation loop with F1-score calculation
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
    
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    test_time=(float((time.time()-second)) )
    print(f"test_time: {test_time}")
    # Calculate F1-score
    f1 = f1_score(all_labels, all_preds, average='macro')
    print(f'macro F1-Score: {f1:.4f}')
    
    # Optional: Calculate and print test accuracy
    correct = sum(p == l for p, l in zip(all_preds, all_labels))
    accuracy = 100 * correct / len(all_labels)
    print(f'Test Accuracy: {accuracy:.2f}%')
print(time.time()-alltime)

Train: ./small/AD-S1.csv Test: ./small/DI-S1.csv
Epoch [1/5], Loss: 1.8399
Epoch [2/5], Loss: 0.9343
Epoch [3/5], Loss: 0.7416
Epoch [4/5], Loss: 0.6518
Epoch [5/5], Loss: 0.6067
train_time: 5045.848743915558
test_time: 288.86303997039795
macro F1-Score: 0.5577
Test Accuracy: 59.38%
Train: ./small/DI-S1.csv Test: ./small/AD-S1.csv
Epoch [1/5], Loss: 1.9458
Epoch [2/5], Loss: 1.0517
Epoch [3/5], Loss: 0.8885
Epoch [4/5], Loss: 0.8190
Epoch [5/5], Loss: 0.7671
train_time: 4904.111703872681
test_time: 348.27076864242554
macro F1-Score: 0.5125
Test Accuracy: 54.12%
Train: ./small/AD-S1.csv Test: ./small/DI-S2.csv
Epoch [1/5], Loss: 1.7928
Epoch [2/5], Loss: 0.9339
Epoch [3/5], Loss: 0.7438
Epoch [4/5], Loss: 0.6603
Epoch [5/5], Loss: 0.6047
train_time: 3760.159427881241
test_time: 269.14605355262756
macro F1-Score: 0.6137
Test Accuracy: 62.24%
Train: ./small/DI-S2.csv Test: ./small/AD-S1.csv
Epoch [1/5], Loss: 1.9664
Epoch [2/5], Loss: 1.1114
Epoch [3/5], Loss: 0.9260
Epoch [4/5], Loss: 0.

In [3]:
file_list={#"DD11":['./small/AD-S1.csv','./small/DI-S1.csv','./small/DI-S2.csv'],
#"DD11T":['./small/DI-S1.csv','./small/AD-S1.csv','./small/AD-S2.csv'],
#"DD12":['./small/AD-S1.csv','./small/DI-S2.csv','./small/DI-S1.csv'],
#"DD12T":['./small/DI-S2.csv','./small/AD-S1.csv','./small/AD-S2.csv'],
#"DD22":['./small/AD-S2.csv','./small/DI-S2.csv','./small/DI-S1.csv'],
#"DD22T":['./small/DI-S2.csv','./small/AD-S2.csv','./small/AD-S1.csv'],
"DD21":['./small/AD-S2.csv','./small/DI-S1.csv','./small/DI-S2.csv'],
"DD21T":['./small/DI-S1.csv','./small/AD-S2.csv','./small/AD-S1.csv']}

In [4]:
alltime=time.time()

# Define the Transformer-based model
class IoTClassifier(torch.nn.Module):
    def __init__(self, input_dim, num_classes):
        super(IoTClassifier, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, 768)  # Map input to 768-dim
        self.transformer = BertModel.from_pretrained('bert-base-uncased')
        self.fc2 = torch.nn.Linear(768, num_classes)  # Final output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))  
        x = self.transformer(inputs_embeds=x.unsqueeze(1)).last_hidden_state
        x = torch.mean(x, dim=1)  # Pooling
        x = self.fc2(x)  
        return x


# Define custom Dataset class
class IoTDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
# Feature selection and data loading
feature = [  
    'dstport', 'dstport_class', 'http.chat', 'http.notification', 'http.request.method',
    'ip.flags.df', 'ip.len', 'ip.proto', 'ip.ttl', 'srcport', 'srcport_class', 'tcp.ack',
    'tcp.analysis.ack_rtt', 'tcp.analysis.bytes_in_flight', 'tcp.analysis.initial_rtt',
    'tcp.analysis.push_bytes_sent', 'tcp.completeness', 'tcp.dstport', 'tcp.flags',
    'tcp.flags.push', 'tcp.flags.str', 'tcp.flags.syn', 'tcp.hdr_len', 'tcp.len',
    'tcp.nxtseq', 'tcp.srcport', 'tcp.stream', 'tcp.time_delta', 'tcp.time_relative',
    'tcp.window_size', 'tcp.window_size_scalefactor', 'tcp.window_size_value',
    'tls.record.length', 'udp.checksum.status', 'udp.dstport', 'udp.srcport',
    'udp.time_delta', 'udp.time_relative', "Label"
]


for f in file_list:
    print(f"Train: {file_list[f][0]} Test: {file_list[f][1]}")
    train = pd.read_csv(file_list[f][0], usecols=feature)
    test = pd.read_csv(file_list[f][1], usecols=feature)
    # Handle missing or invalid values
    train = train.replace(-9999, 0).fillna(0)
    test = test.replace(-9999, 0).fillna(0)
    # Separate features and labels
    X_train = train.drop(['Label'], axis=1)
    y_train = train['Label']
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    # Separate features and labels
    X_test = test.drop(['Label'], axis=1)
    y_test = test['Label']
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_test = label_encoder.fit_transform(y_test)
    scaler = StandardScaler()
    X_test = scaler.fit_transform(X_test)
    
    
    
    # Create DataLoaders
    batch_size = 32
    train_dataset = IoTDataset(X_train, y_train)
    test_dataset = IoTDataset(X_test, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    
    
    # Initialize model, loss function, and optimizer
    input_dim = X_train.shape[1]
    num_classes = len(label_encoder.classes_)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = IoTClassifier(input_dim, num_classes).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    # Training loop
    num_epochs = 5
    
    second=time.time()
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
    
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
    
            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
    
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()
    
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}')
    train_time=(float((time.time()-second)) )
    print(f"train_time: {train_time}")
    second=time.time()
    # Evaluation loop with F1-score calculation
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
    
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    test_time=(float((time.time()-second)) )
    print(f"test_time: {test_time}")
    # Calculate F1-score
    f1 = f1_score(all_labels, all_preds, average='macro')
    print(f'macro F1-Score: {f1:.4f}')
    
    # Optional: Calculate and print test accuracy
    correct = sum(p == l for p, l in zip(all_preds, all_labels))
    accuracy = 100 * correct / len(all_labels)
    print(f'Test Accuracy: {accuracy:.2f}%')
print(time.time()-alltime)

Train: ./small/AD-S2.csv Test: ./small/DI-S1.csv
Epoch [1/5], Loss: 1.8773
Epoch [2/5], Loss: 0.9709
Epoch [3/5], Loss: 0.7662
Epoch [4/5], Loss: 0.6814
Epoch [5/5], Loss: 0.6344
train_time: 4995.166686058044
test_time: 520.5478582382202
macro F1-Score: 0.5858
Test Accuracy: 59.55%
Train: ./small/DI-S1.csv Test: ./small/AD-S2.csv
Epoch [1/5], Loss: 1.9114
Epoch [2/5], Loss: 1.0448
Epoch [3/5], Loss: 0.8786
Epoch [4/5], Loss: 0.8216
Epoch [5/5], Loss: 0.7807
train_time: 5506.823732376099
test_time: 468.919801235199
macro F1-Score: 0.5811
Test Accuracy: 58.84%
11496.089537382126


In [5]:
file_list={"DD11":['./small/AD-S1.csv','./small/DI-S1.csv','./small/DI-S2.csv'],
"DD11T":['./small/DI-S1.csv','./small/AD-S1.csv','./small/AD-S2.csv'],
"DD12":['./small/AD-S1.csv','./small/DI-S2.csv','./small/DI-S1.csv'],
"DD12T":['./small/DI-S2.csv','./small/AD-S1.csv','./small/AD-S2.csv'],
"DD22":['./small/AD-S2.csv','./small/DI-S2.csv','./small/DI-S1.csv'],
"DD22T":['./small/DI-S2.csv','./small/AD-S2.csv','./small/AD-S1.csv'],
"DD21":['./small/AD-S2.csv','./small/DI-S1.csv','./small/DI-S2.csv'],
"DD21T":['./small/DI-S1.csv','./small/AD-S2.csv','./small/AD-S1.csv']}

In [None]:
alltime=time.time()

# Define the Transformer-based model
class IoTClassifier(torch.nn.Module):
    def __init__(self, input_dim, num_classes):
        super(IoTClassifier, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, 768)  # Map input to 768-dim
        self.transformer = BertModel.from_pretrained('bert-base-uncased')
        self.fc2 = torch.nn.Linear(768, num_classes)  # Final output layer

    def forward(self, x):
        x = torch.relu(self.fc1(x))  
        x = self.transformer(inputs_embeds=x.unsqueeze(1)).last_hidden_state
        x = torch.mean(x, dim=1)  # Pooling
        x = self.fc2(x)  
        return x


# Define custom Dataset class
class IoTDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
# Feature selection and data loading
feature = [  
    'dstport', 'dstport_class', 'http.chat', 'http.notification', 'http.request.method',
    'ip.flags.df', 'ip.len', 'ip.proto', 'ip.ttl', 'srcport', 'srcport_class', 'tcp.ack',
    'tcp.analysis.ack_rtt', 'tcp.analysis.bytes_in_flight', 'tcp.analysis.initial_rtt',
    'tcp.analysis.push_bytes_sent', 'tcp.completeness', 'tcp.dstport', 'tcp.flags',
    'tcp.flags.push', 'tcp.flags.str', 'tcp.flags.syn', 'tcp.hdr_len', 'tcp.len',
    'tcp.nxtseq', 'tcp.srcport', 'tcp.stream', 'tcp.time_delta', 'tcp.time_relative',
    'tcp.window_size', 'tcp.window_size_scalefactor', 'tcp.window_size_value',
    'tls.record.length', 'udp.checksum.status', 'udp.dstport', 'udp.srcport',
    'udp.time_delta', 'udp.time_relative', "Label"
]


for f in file_list:
    print(f"Train: {file_list[f][0]} Test: {file_list[f][1]}")
    train = pd.read_csv(file_list[f][0], usecols=feature)
    test = pd.read_csv(file_list[f][1], usecols=feature)
    # Handle missing or invalid values
    train = train.replace(-9999, 0).fillna(0)
    test = test.replace(-9999, 0).fillna(0)
    # Separate features and labels
    X_train = train.drop(['Label'], axis=1)
    y_train = train['Label']
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    # Separate features and labels
    X_test = test.drop(['Label'], axis=1)
    y_test = test['Label']
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_test = label_encoder.fit_transform(y_test)
    scaler = StandardScaler()
    X_test = scaler.fit_transform(X_test)
    
    
    
    # Create DataLoaders
    batch_size = 32
    train_dataset = IoTDataset(X_train, y_train)
    test_dataset = IoTDataset(X_test, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    
    
    # Initialize model, loss function, and optimizer
    input_dim = X_train.shape[1]
    num_classes = len(label_encoder.classes_)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = IoTClassifier(input_dim, num_classes).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=1e-5)
    
    # Training loop
    num_epochs = 5
    
    second=time.time()
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
    
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
    
            # Forward pass
            outputs = model(features)
            loss = criterion(outputs, labels)
    
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()
    
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}')
    train_time=(float((time.time()-second)) )
    print(f"train_time: {train_time}")
    second=time.time()
    # Evaluation loop with F1-score calculation
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
    
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    test_time=(float((time.time()-second)) )
    print(f"test_time: {test_time}")
    # Calculate F1-score
    f1 = f1_score(all_labels, all_preds, average='macro')
    print(f'macro F1-Score: {f1:.4f}')
    
    # Optional: Calculate and print test accuracy
    correct = sum(p == l for p, l in zip(all_preds, all_labels))
    accuracy = 100 * correct / len(all_labels)
    print(f'Test Accuracy: {accuracy:.2f}%')
print(time.time()-alltime)

Train: ./small/AD-S1.csv Test: ./small/DI-S1.csv
Epoch [1/5], Loss: 1.8603
Epoch [2/5], Loss: 0.9579
Epoch [3/5], Loss: 0.7518
Epoch [4/5], Loss: 0.6624


In [20]:
for f in file_list:
    train = pd.read_csv(file_list[f][0], usecols=feature)
    print(len(train.groupby("Label").size()))
    

21
21
21
21
21
21
21
21


In [15]:
import numpy as np

In [17]:
for f in file_list:
    print(f"Train: {file_list[f][0]} Test: {file_list[f][1]}")
    train = pd.read_csv(file_list[f][0], usecols=feature)
    test = pd.read_csv(file_list[f][1], usecols=feature)
    # Handle missing or invalid values
    train = train.replace(-9999, 0).fillna(0)
    test = test.replace(-9999, 0).fillna(0)
    # Separate features and labels
    X_train = train.drop(['Label'], axis=1)
    y_train = train['Label']
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    # Separate features and labels
    X_test = test.drop(['Label'], axis=1)
    y_test = test['Label']
    print(len(train.groupby("Label").size()))
    print(len(test.groupby("Label").size()))
    # Encode labels and standardize features
    label_encoder = LabelEncoder()
    y_test = label_encoder.fit_transform(y_test)
    scaler = StandardScaler()
    X_test = scaler.fit_transform(X_test)
    print(len(np.unique(y_test)))
    print(len(np.unique(y_train)))
    print("____________________________")

Train: ./small/AD-S1.csv Test: ./small/DI-S1.csv
16
19
19
16
____________________________
Train: ./small/DI-S1.csv Test: ./small/AD-S1.csv
19
16
16
19
____________________________
Train: ./small/AD-S1.csv Test: ./small/DI-S2.csv
16
18
18
16
____________________________
Train: ./small/DI-S2.csv Test: ./small/AD-S1.csv
18
16
16
18
____________________________
Train: ./small/AD-S2.csv Test: ./small/DI-S2.csv
18
18
18
18
____________________________
Train: ./small/DI-S2.csv Test: ./small/AD-S2.csv
18
18
18
18
____________________________
Train: ./small/AD-S2.csv Test: ./small/DI-S1.csv
18
19
19
18
____________________________
Train: ./small/DI-S1.csv Test: ./small/AD-S2.csv
19
18
18
19
____________________________


In [48]:
for f in file_list:
    train = pd.read_csv(file_list[f][0], usecols=feature)
    for i in train.columns[:-1]:
        try:
            np.mean(train[i])
        except:
            print(file_list[f][0],i)
            print(train.groupby(i).size())
    print("\n\n\n")



































In [47]:
for f in file_list:
    df = pd.read_csv(file_list[f][0], usecols=feature)
    for i in df.columns[:-1]:
        try:
            np.mean(df[i])
        except:
            print(file_list[f][0],i)
            
            df[i] = df[i].replace('SUBSCRIBE', 6)
            df[i] = df[i].replace('UNSUBSCRIBE',7)
            df[i] = df[i].replace('····CE····S·', 9)
            df[i] = pd.to_numeric(df[i], errors='coerce')
            print(df.groupby(i).size())
    df.to_csv(file_list[f][0],index=False)
    print("\n\n\n")

./small/AD-S1.csv http.request.method
http.request.method
-9999.0    15541
 2.0          34
 3.0          83
 4.0        1492
 5.0           2
 6.0           3
dtype: int64
./small/AD-S1.csv tcp.flags.str
tcp.flags.str
-9999.0    6759
 1.0         58
 2.0       4270
 3.0         29
 4.0        194
 5.0        318
 6.0       5273
 7.0         21
 8.0        227
 9.0          6
dtype: int64




./small/DI-S1.csv http.request.method
http.request.method
-9999    16813
 2         150
 3          16
 4        1016
 5          21
 6           5
 7           4
dtype: int64








./small/DI-S2.csv http.request.method
http.request.method
-9999    16900
 2         189
 3          22
 4         810
 5          59
 6           6
 7           3
dtype: int64
./small/DI-S2.csv tcp.flags.str
tcp.flags.str
-9999.0    6853
 1.0        177
 2.0       4145
 3.0         14
 4.0        190
 5.0        380
 6.0       5775
 7.0         30
 8.0        424
 9.0          1
dtype: int64




./small/AD-S2.csv htt

In [None]:

for f in file_list:
    train = pd.read_csv(file_list[f][0], usecols=feature)

    

# 3. Tüm değerleri numerik yapma, hatalı verileri NaN yapar


# 4. NaN olan verileri kontrol etme (gerekirse doldurma)
print(df['Value'].isna().sum(), "adet NaN değer bulundu.")  # Hata kontrolü

# 5. NaN değerleri sıfır ile doldurmak istersek (opsiyonel)
df['Value'] = df['Value'].fillna(0)

# Sonuç
print(df)