In [4]:
# To test the middle-padding method, import from `tabula_middle_padding` instead of `tabula` below.
import os
# Expose only the GPU with index 4 to CUDA. This is set before the `tabula` import on purpose:
# the variable must already be in the environment when CUDA is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "4" 
from tabula import Tabula 
import pandas as pd

In [5]:
# Step 1: load the dataset and print its column names so the categorical list
# below can be checked against them character by character.
data = pd.read_csv("Real_Datasets/Intrusion_compressed.csv")
print("数据集所有列名：", data.columns.tolist())

# Step 2: columns to treat as categorical.
# This list holds 19 feature columns plus "label", i.e. 20 names in total.
categorical_columns = [
    "protocol_type", "service", "flag", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "label"
]

# Fail fast, before any column is modified, if a name is misspelled or absent.
# Without this check the loop below would raise a bare KeyError part-way through,
# leaving `data` with only some of its columns converted.
missing_columns = [col for col in categorical_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"Categorical columns not found in the dataset: {missing_columns}"
    )

# Step 3: force the categorical columns to `object` dtype (must be re-run after
# every reload of the CSV, because read_csv infers numeric dtypes for most of them).
for col in categorical_columns:
    data[col] = data[col].astype("object")
    print(f"列 {col} 类型：{data[col].dtype}")  # confirm the cast took effect

# Step 4: report every column that now has `object` dtype.
# With the list above the expected count is 20 (not 21), provided no other
# column was already read in as `object`.
new_cat_cols = data.select_dtypes(include=["object"]).columns.tolist()
print("最终分类列数量：", len(new_cat_cols))
print("最终分类列列表：", new_cat_cols)

数据集所有列名： ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label']
列 protocol_type 类型：object
列 service 类型：object
列 flag 类型：object
列 land 类型：object
列 wrong_fragment 类型：object
列 urgent 类型：object
列 hot 类型：object
列 num_failed_logins 类型：object
列 logged_in 类型：object
列 num_compromised 类型：object
列 root_shell 类型：object
列 su_att

In [None]:
# 1. Columns that Tabula should treat as categorical:
#    19 feature columns followed by the "label" column (20 names in total).
categorical_columns = [
    "protocol_type",
    "service",
    "flag",
    "land",
    "logged_in",
    "is_host_login",
    "is_guest_login",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "label",
]

# 2. Build the model.
#    - epochs=50: per the original author's note, the documentation asks for
#      50 epochs when training on a large dataset.
#    - batch_size=32: lower to 16 if the run hits out-of-memory errors.
tabula_kwargs = {
    "llm": "distilgpt2",
    "experiment_dir": "intrusion_training",
    "batch_size": 32,
    "epochs": 50,
    "categorical_columns": categorical_columns,
}
model = Tabula(**tabula_kwargs)

In [9]:
# Optional warm start from the pretrained Tabula checkpoint.
# Comment this cell out to train Tabula from a randomly initialised model,
# and also comment it out when using tabula_middle_padding.
import torch

# map_location="cpu" deserialises the checkpoint onto the CPU regardless of the
# device it was saved from, so loading does not fail when that device index is not
# visible in this process (CUDA_VISIBLE_DEVICES exposes a single GPU here).
# load_state_dict then copies the values into the model's existing parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider passing weights_only=True on PyTorch versions that support it.
model.model.load_state_dict(
    torch.load("pretrained-model/tabula_pretrained_model.pt", map_location="cpu")
)

<All keys matched successfully>

In [None]:
# Train the Tabula model on the DataFrame prepared above.
model.fit(data)

  tabula_trainer = TabulaTrainer(self.model, training_args, train_dataset=tabula_ds, tokenizer=self.tokenizer,
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,0.3855
1000,0.3755
1500,0.3751
2000,0.3779
2500,0.3773
3000,0.3769
3500,0.3759
4000,0.376
4500,0.376
5000,0.3754


In [None]:

# Save the weights of the trained model.
# torch is imported here as well because the only other `import torch` lives in the
# optional pretrained-weights cell, which is commented out for some experiments;
# without this import such runs would end in a NameError after training.
import os
import torch

# Make sure the target directory exists so a long training run is not lost
# to a missing-directory error at save time.
os.makedirs("intrusion_training", exist_ok=True)
torch.save(model.model.state_dict(), "intrusion_training/model_50epoch.pt")



  0%|          | 0/50000 [00:00<?, ?it/s]


TypeError: object of type 'NoneType' has no len()

In [None]:
# Generate synthetic data and write it to CSV.
# Per the original author's note: max_length=200 is meant to cover the token length
# of an Intrusion row, and n_samples=50000 is meant to match the original data size.
# NOTE(review): the output recorded in this notebook shows a sampling run stopping at
# 0/50000 with "TypeError: object of type 'NoneType' has no len()". The cause is not
# visible from this notebook; verify that model.fit(data) ran to completion in the
# same kernel session before this cell is executed.
N_SYNTHETIC_ROWS = 50000
MAX_TOKEN_LENGTH = 200

synthetic_data = model.sample(
    n_samples=N_SYNTHETIC_ROWS,
    max_length=MAX_TOKEN_LENGTH,
)
synthetic_data.to_csv("intrusion_50epoch.csv", index=False)

