In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Load the uploaded events dataset and preview its first rows.
events = pd.read_csv('events.csv')
print(events.head(n=5))

       timestamp  visitorid event  itemid  transactionid
0  1433221332117     257597  view  355908            NaN
1  1433224214164     992329  view  248676            NaN
2  1433221999827     111016  view  318965            NaN
3  1433221955914     483717  view  253185            NaN
4  1433221337106     951259  view  367447            NaN


In [2]:
# Convert the epoch-millisecond timestamps into pandas datetimes.
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

# Keep only active visitors: those with strictly more events than the threshold.
MIN_EVENTS_PER_USER = 10
event_counts = events['visitorid'].value_counts()
active_users = event_counts[event_counts > MIN_EVENTS_PER_USER].index
events = events[events['visitorid'].isin(active_users)]

# Order each visitor's events chronologically so that windows built from this
# frame follow the order in which the events happened.
events = events.sort_values(['visitorid', 'timestamp'])

# Encode the event type as an integer class id.
# (LabelEncoder is already imported in the notebook's import cell.)
label_encoder = LabelEncoder()
events['event_encoded'] = label_encoder.fit_transform(events['event'])

# Human-readable mapping from event name to its encoded id.
event_to_idx = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("事件类别映射：", event_to_idx)

事件类别映射： {'addtocart': 0, 'transaction': 1, 'view': 2}


In [3]:
SEQ_LEN = 5  # number of past events used to predict the next one

# Slide a window of SEQ_LEN events over every visitor's encoded history:
# each window is one input sample and the event right after it is its label.
X, y = [], []

for _, visitor_events in events.groupby('visitorid'):
    codes = visitor_events['event_encoded'].tolist()
    # codes[SEQ_LEN:] holds exactly the events that have a full window before them.
    for start, next_event in enumerate(codes[SEQ_LEN:]):
        X.append(codes[start:start + SEQ_LEN])
        y.append(next_event)

X = np.array(X)
y = np.array(y)

print("样本数：", len(X))

样本数： 448804


In [4]:
class SequenceDataset(Dataset):
    """Dataset pairing each input window with its target label.

    Both ``X`` and ``y`` are converted to ``torch.long`` tensors on construction
    and exposed as the ``X`` and ``y`` attributes.
    """

    def __init__(self, X, y):
        windows = torch.tensor(X, dtype=torch.long)
        labels = torch.tensor(y, dtype=torch.long)
        self.X, self.y = windows, labels

    def __len__(self):
        """Return the number of entries along the first dimension of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(window, label)`` pair stored at ``idx``."""
        window = self.X[idx]
        label = self.y[idx]
        return window, label

# Hold out 20% of the windows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap each split in a dataset and batch it; only the training batches are shuffled.
train_dataset, val_dataset = SequenceDataset(X_train, y_train), SequenceDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=128)

In [5]:
class BehaviorLSTM(nn.Module):
    """Embedding -> LSTM -> linear classifier over integer-encoded event windows.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of scores produced per input window.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear layer's scores computed from the final hidden state."""
        embedded = self.embedding(x)
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _, (hidden_state, _) = self.lstm(embedded)
        last_layer_hidden = hidden_state[-1]
        return self.fc(last_layer_hidden)

# Seed torch's global RNG so the randomly initialised weights below are the
# same on every run (same seed value as the train/validation split).
torch.manual_seed(42)

input_dim = len(label_encoder.classes_)  # number of distinct event types
model = BehaviorLSTM(input_dim, embedding_dim=32, hidden_dim=64, output_dim=input_dim)

In [7]:
# Prefer the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Cross-entropy loss, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the module-level ``device``, ``criterion`` and ``optimizer``. The
    returned value is the sum of per-batch losses divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []
    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

def evaluate(model, loader):
    """Score ``model`` on ``loader`` and return ``(accuracy, macro_f1)``.

    Runs in eval mode with gradients disabled, taking the arg-max over
    dimension 1 of the model output as the predicted label. Inputs are moved
    to the module-level ``device``; labels are read from the batch as-is.
    """
    model.eval()
    predicted_labels = []
    true_labels = []
    with torch.no_grad():
        for inputs, labels in loader:
            logits = model(inputs.to(device))
            batch_preds = logits.argmax(dim=1)
            predicted_labels.extend(batch_preds.cpu().numpy())
            true_labels.extend(labels.numpy())
    accuracy = accuracy_score(true_labels, predicted_labels)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    return accuracy, macro_f1

# Train for a fixed number of epochs, recording the training loss and
# reporting validation accuracy / macro-F1 after each one.
NUM_EPOCHS = 5
train_losses = []
for epoch in range(NUM_EPOCHS):
    loss = train(model, train_loader)
    train_losses.append(loss)
    acc, f1 = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}: Loss={loss:.4f}, Val_Acc={acc:.4f}, F1={f1:.4f}")

Epoch 1: Loss=0.2519, Val_Acc=0.9185, F1=0.4837
Epoch 2: Loss=0.2518, Val_Acc=0.9191, F1=0.5041
Epoch 3: Loss=0.2517, Val_Acc=0.9191, F1=0.5156
Epoch 4: Loss=0.2517, Val_Acc=0.9188, F1=0.5170
Epoch 5: Loss=0.2516, Val_Acc=0.9190, F1=0.5147


In [9]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.18.0


In [10]:
# Export the model to ONNX, tracing it with a single dummy window of SEQ_LEN
# event ids and marking the first axis of the input and output as dynamic.
dummy_input = torch.randint(low=0, high=input_dim, size=(1, SEQ_LEN)).to(device)
torch.onnx.export(
    model,
    dummy_input,
    "behavior_model.onnx",
    opset_version=11,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={tensor_name: {0: "batch_size"} for tensor_name in ("input", "output")},
)
print("模型已导出为behavior_model.onnx")

模型已导出为behavior_model.onnx


